commit 9eead317bd70c4dd4b4ca277f02a66d0ce437507 Author: BadStorm Date: Sat Nov 1 15:38:32 2025 +0100 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..af648ef --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Ignora file temporanei e di sistema +*.log +*.tmp +*.swp +.DS_Store + +# Ignora file di build o output comuni +build/ +dist/ +tmp/ +*.out + +# Ignora file di configurazione locali +.env +*.local + +# Ignora file container qadlet +*.qadlet + +# (Facoltativo) Ignora tutti i file Markdown tranne README.md +# *.md +# !README.md + +# (Facoltativo) Ignora file Containerfile se non vuoi versionarli +# Containerfile diff --git a/BadAI/badai b/BadAI/badai new file mode 100644 index 0000000..68d8d25 --- /dev/null +++ b/BadAI/badai @@ -0,0 +1,142 @@ +#!/usr/bin/env bash +# DuckAI command-line tool +# Usage: duckai restart [--sleep N] + +set -e # Exit on error + +sleep_time=3 # Default sleep time in seconds + +# Parse arguments +if [[ "$1" == "restart" ]]; then + name="" + if [[ "$2" == "--sleep" && -n "$3" && "$3" =~ ^[0-9]+$ ]]; then + sleep_time="$3" + name="$4" + elif [[ -n "$2" ]]; then + name="$2" + fi + + echo "Reloading systemd user daemon..." + systemctl --user daemon-reload + + if [[ -n "$name" ]]; then + # Riavvia servizio specifico + # Cerca container + container_file="" + for file in ~/.config/containers/systemd/*.container; do + if [[ -f "$file" ]]; then + if [[ "$(basename "$file")" =~ ^99_ ]]; then continue; fi + service_name=$(basename "$file" .container) + display_name=$(echo "$service_name" | sed 's/^[0-9]*_//') + if [[ "$display_name" == "$name" ]]; then + container_file="$file" + break + fi + fi + done + if [[ -n "$container_file" ]]; then + service_name=$(basename "$container_file" .container) + echo "Restarting container $name..." + if systemctl --user restart "$service_name" 2>/dev/null; then + echo " ✓ $name restarted successfully" + else + echo " ✗ Failed to restart $name" + fi + else + # Cerca network + network_file="" + for file in ~/.config/containers/systemd/*.network; do + if [[ -f "$file" ]]; then + if [[ "$(basename "$file")" =~ ^99_ ]]; then continue; fi + service_name=$(basename "$file" .network)-network + display_name=$(echo "$service_name" | sed 's/^[0-9]*_//') + if [[ "$display_name" == "$name" ]]; then + network_file="$file" + break + fi + fi + done + if [[ -n "$network_file" ]]; then + service_name=$(basename "$network_file" .network)-network + echo "Restarting network $name..." + if systemctl --user try-restart "$service_name" 2>/dev/null || systemctl --user start "$service_name" 2>/dev/null; then + echo " ✓ $name restarted successfully" + else + echo " ✗ Failed to restart $name" + fi + else + echo "Service $name not found." + exit 1 + fi + fi + # Riavvia nginx + nginx_file="" + for file in ~/.config/containers/systemd/*nginx*.container; do + if [[ -f "$file" ]]; then + nginx_file="$file" + break + fi + done + if [[ -n "$nginx_file" ]]; then + service_name=$(basename "$nginx_file" .container) + echo "Restarting nginx..." + if systemctl --user restart "$service_name" 2>/dev/null; then + echo " ✓ nginx restarted successfully" + else + echo " ✗ Failed to restart nginx" + fi + fi + else + # Riavvia tutti + echo "Restarting all quadlet networks..." 
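+    # Note: Quadlet exposes each *.network file as a "<basename>-network.service"
+    # unit, hence the "-network" suffix appended below; files prefixed with "99_"
+    # are deliberately skipped, as in the single-service branch above.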
+ for file in ~/.config/containers/systemd/*.network; do + if [[ -f "$file" ]]; then + if [[ "$(basename "$file")" =~ ^99_ ]]; then continue; fi + service_name=$(basename "$file" .network)-network + display_name=$(echo "$service_name" | sed 's/^[0-9]*_//') + echo "Restarting $display_name..." + if systemctl --user try-restart "$service_name" 2>/dev/null || systemctl --user start "$service_name" 2>/dev/null; then + echo " ✓ $display_name restarted successfully" + else + echo " ✗ Failed to restart $display_name" + fi + sleep "$sleep_time" + fi + done + + echo "Restarting all quadlet containers..." + for file in ~/.config/containers/systemd/*.container; do + if [[ -f "$file" ]]; then + if [[ "$(basename "$file")" =~ ^99_ ]]; then continue; fi + service_name=$(basename "$file" .container) + display_name=$(echo "$service_name" | sed 's/^[0-9]*_//') + echo "Restarting $display_name..." + if systemctl --user restart "$service_name" 2>/dev/null; then + echo " ✓ $display_name restarted successfully" + else + echo " ✗ Failed to restart $display_name" + fi + sleep "$sleep_time" + fi + done + + echo "All services restarted successfully." + fi +elif [[ "$1" == "help" || -z "$1" ]]; then + cat <<'EOF' +D U C K A I - C O M M A N D L I N E +-------------------------------------------------------------------------------- + +Usage: duckai [options] + +Commands: + restart [--sleep N] [name] Restart all quadlet containers and networks, or a specific one by name (without prefix), reload systemd user daemon + help Show this help message + +Options: + --sleep N Sleep N seconds between restarts (default: 3) +EOF +else + echo "Unknown command: $1" + echo "Use 'duckai help' for usage." +fi \ No newline at end of file diff --git a/BadAI/badai.go b/BadAI/badai.go new file mode 100644 index 0000000..25f58d9 --- /dev/null +++ b/BadAI/badai.go @@ -0,0 +1,213 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "regexp" + "strconv" + "strings" + "time" +) + +func main() { + if len(os.Args) < 2 { + printHelp() + return + } + + command := os.Args[1] + + switch command { + case "restart": + name := "" + if len(os.Args) > 2 { + if os.Args[2] == "--sleep" && len(os.Args) > 3 { + // Ignora per ora, ma in handleRestart parsare + } else { + name = os.Args[2] + } + } + handleRestart(name) + case "help": + printHelp() + default: + fmt.Printf("Unknown command: %s\n", command) + fmt.Println("Use 'duckai help' for usage.") + os.Exit(1) + } +} + +func handleRestart(name string) { + sleepTime := 3 + args := os.Args[2:] + if len(args) > 0 && args[0] == "--sleep" && len(args) > 1 { + if s, err := strconv.Atoi(args[1]); err == nil { + sleepTime = s + } + if len(args) > 2 { + name = args[2] + } + } else if name == "" && len(args) > 0 { + name = args[0] + } + + fmt.Println("Reloading systemd user daemon...") + runCommand("systemctl", "--user", "daemon-reload") + + if name != "" { + // Riavvia servizio specifico + containerFile := findServiceFile("*.container", name) + if containerFile != "" { + serviceName := strings.TrimSuffix(filepath.Base(containerFile), ".container") + fmt.Printf("Restarting container %s...\n", name) + if runCommand("systemctl", "--user", "restart", serviceName) { + fmt.Printf(" ✓ %s restarted successfully\n", name) + } else { + fmt.Printf(" ✗ Failed to restart %s\n", name) + } + } else { + networkFile := findServiceFile("*.network", name) + if networkFile != "" { + serviceName := strings.TrimSuffix(filepath.Base(networkFile), ".network") + "-network" + fmt.Printf("Restarting network %s...\n", 
name)
+				if runCommand("systemctl", "--user", "try-restart", serviceName) || runCommand("systemctl", "--user", "start", serviceName) {
+					fmt.Printf(" ✓ %s restarted successfully\n", name)
+				} else {
+					fmt.Printf(" ✗ Failed to restart %s\n", name)
+				}
+			} else {
+				fmt.Printf("Service %s not found.\n", name)
+				os.Exit(1)
+			}
+		}
+		// Restart nginx
+		restartNginx()
+	} else {
+		fmt.Println("Restarting all quadlet networks...")
+		restartNetworks(sleepTime)
+
+		fmt.Println("Restarting all quadlet containers...")
+		restartContainers(sleepTime)
+
+		fmt.Println("All services restarted successfully.")
+	}
+}
+
+func findServiceFile(pattern, name string) string {
+	home, _ := os.UserHomeDir()
+	dir := filepath.Join(home, ".config", "containers", "systemd")
+	files, _ := filepath.Glob(filepath.Join(dir, pattern))
+	re := regexp.MustCompile(`^[0-9]+_`)
+	for _, file := range files {
+		if strings.HasPrefix(filepath.Base(file), "99_") {
+			continue
+		}
+		serviceName := strings.TrimSuffix(filepath.Base(file), filepath.Ext(file))
+		if pattern == "*.network" {
+			serviceName += "-network"
+		}
+		displayName := re.ReplaceAllString(serviceName, "")
+		if displayName == name {
+			return file
+		}
+	}
+	return ""
+}
+
+func restartNetworks(sleepTime int) {
+	home, _ := os.UserHomeDir()
+	dir := filepath.Join(home, ".config", "containers", "systemd")
+	files, _ := filepath.Glob(filepath.Join(dir, "*.network"))
+	re := regexp.MustCompile(`^[0-9]+_`)
+	for _, file := range files {
+		if strings.HasPrefix(filepath.Base(file), "99_") {
+			continue
+		}
+		serviceName := strings.TrimSuffix(filepath.Base(file), ".network") + "-network"
+		displayName := re.ReplaceAllString(serviceName, "")
+		fmt.Printf("Restarting %s...\n", displayName)
+		if runCommand("systemctl", "--user", "try-restart", serviceName) || runCommand("systemctl", "--user", "start", serviceName) {
+			fmt.Printf(" ✓ %s restarted successfully\n", displayName)
+		} else {
+			fmt.Printf(" ✗ Failed to restart %s\n", displayName)
+		}
+		time.Sleep(time.Duration(sleepTime) * time.Second)
+	}
+}
+
+func restartContainers(sleepTime int) {
+	home, _ := os.UserHomeDir()
+	dir := filepath.Join(home, ".config", "containers", "systemd")
+	files, _ := filepath.Glob(filepath.Join(dir, "*.container"))
+	re := regexp.MustCompile(`^[0-9]+_`)
+	for _, file := range files {
+		if strings.HasPrefix(filepath.Base(file), "99_") {
+			continue
+		}
+		serviceName := strings.TrimSuffix(filepath.Base(file), ".container")
+		displayName := re.ReplaceAllString(serviceName, "")
+		fmt.Printf("Restarting %s...\n", displayName)
+		if runCommand("systemctl", "--user", "restart", serviceName) {
+			fmt.Printf(" ✓ %s restarted successfully\n", displayName)
+		} else {
+			fmt.Printf(" ✗ Failed to restart %s\n", displayName)
+		}
+		time.Sleep(time.Duration(sleepTime) * time.Second)
+	}
+}
+
+func restartNginx() {
+	home, _ := os.UserHomeDir()
+	dir := filepath.Join(home, ".config", "containers", "systemd")
+	files, _ := filepath.Glob(filepath.Join(dir, "*nginx*.container"))
+	if 
len(files) > 0 { + serviceName := strings.TrimSuffix(filepath.Base(files[0]), ".container") + fmt.Println("Restarting nginx...") + if runCommand("systemctl", "--user", "restart", serviceName) { + fmt.Println(" ✓ nginx restarted successfully") + } else { + fmt.Println(" ✗ Failed to restart nginx") + } + } +} + +func runCommand(name string, args ...string) bool { + cmd := exec.Command(name, args...) + err := cmd.Run() + return err == nil +} + +func printHelp() { + fmt.Println(`D U C K A I - C O M M A N D L I N E +-------------------------------------------------------------------------------- + +Usage: duckai [options] + +Commands: + restart [--sleep N] [name] Restart all quadlet containers and networks, or a specific one by name (without prefix), reload systemd user daemon + help Show this help message + +Options: + --sleep N Sleep N seconds between restarts (default: 3)`) +} \ No newline at end of file diff --git a/BadAI/banner.sh b/BadAI/banner.sh new file mode 100644 index 0000000..0701a39 --- /dev/null +++ b/BadAI/banner.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +# Lightweight banner for BadAI host system — Ubuntu with AMDGPU drivers + +oem_info() { + local v="" m="" d lv lm + for d in /sys/class/dmi/id /sys/devices/virtual/dmi/id; do + [[ -r "$d/sys_vendor" ]] && v=$(<"$d/sys_vendor") + [[ -r "$d/product_name" ]] && m=$(<"$d/product_name") + [[ -n "$v" || -n "$m" ]] && break + done + # ARM/SBC fallback + if [[ -z "$v" && -z "$m" && -r /proc/device-tree/model ]]; then + tr -d '\0' /dev/null 2>&1; then + name=$(lspci -nn 2>/dev/null | grep -Ei 'vga|display|gpu' | grep -i amd | head -n1 | cut -d: -f3-) + fi + name=$(printf '%s' "$name" | sed -e 's/^[[:space:]]\+//' -e 's/[[:space:]]\+$//' -e 's/[[:space:]]\{2,\}/ /g') + printf '%s\n' "${name:-Unknown AMD GPU}" +} + +ubuntu_version() { + lsb_release -d 2>/dev/null | cut -f2 || uname -a +} + +system_load() { + uptime | awk -F'load average:' '{ print $2 }' | sed 's/,//g' +} + +memory_usage() { + free -h | awk 'NR==2{printf "%.0f%%", $3*100/$2 }' +} + +updates_info() { + if command -v apt >/dev/null 2>&1; then + local upgradable=$(apt list --upgradable 2>/dev/null | grep -v '^Listing' | grep -c '^[^/]*$') + local security=$(apt list --upgradable 2>/dev/null | grep -c 'security') + printf '%d updates can be applied immediately.\n' "$upgradable" + if [[ $security -gt 0 ]]; then + printf '%d additional security updates can be applied.\n' "$security" + fi + else + printf 'Updates info not available.\n' + fi +} + +MACHINE="$(oem_info)" +GPU="$(gpu_name)" +UBUNTU_VER="$(ubuntu_version)" +LOAD="$(system_load)" +MEM="$(memory_usage)" + +echo +cat <<'ASCII' + +__________ .___ _____ .___ +\______ \_____ __| _/ / _ \ | | + | | _/\__ \ / __ | / /_\ \| | + | | \ / __ \_/ /_/ | / | \ | + |______ /(____ /\____ | \____|__ /___| + \/ \/ \/ \/ + + +B A D A I - H O S T ( U B U N T U , A M D G P U ) + +ASCII + +echo "--------------------------------------------------------------------------------" +printf 'Machine: %s\n' "$MACHINE" +printf 'GPU : %s\n' "$GPU" +printf 'OS : %s\n' "$UBUNTU_VER" +printf 'Load : %s\n' "$LOAD" +printf 'Memory : %s\n' "$MEM" + +echo +echo "--------------------------------------------------------------------------------" +updates_info + +echo +echo "--------------------------------------------------------------------------------" +printf 'Usage:\n' +printf ' - %-24s → %s\n' "podman ps" "List running containers" +printf ' - %-24s → %s\n' "podman logs " "View container logs" +printf ' - %-24s → %s\n' "podman exec -it bash" "Access 
container shell" +printf ' - %-24s → %s\n' "radentop" "Monitor AMD GPU usage (if installed)" +printf ' - %-24s → %s\n' "htop" "Monitor system processes and resources" +printf ' - %-24s → %s\n' "duckai restart" "Restart all services" +printf ' - %-24s → %s\n' "duckai help" "Show DuckAI commands" \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f604a1b --- /dev/null +++ b/README.md @@ -0,0 +1,45 @@ +# Guida Installazione Container Rootless - Ubuntu + +Guida completa per configurare un'infrastruttura AI containerizzata con Podman rootless su Ubuntu. + +## 1. Prerequisiti + +### Sistema Operativo +- Ubuntu 22.04 LTS o superiore +- Utente non-root con privilegi sudo + +### Partizionamento +/boot → 1 GB → ext4 +/ → 40 GB → ext4 +/home → 50 GB → ext4 (se usi Podman rootless) +/srv → tutto il resto → xfs (per dati, modelli AI, volumi bind) +/swap → 16 GB → swap + +## Installazione + +Per installare automaticamente tutto il necessario, esegui: + +```bash +curl -fsSL https://tuo-repo/install.sh | sh +``` + +Sostituisci `https://tuo-repo/install.sh` con l'URL del tuo repository remoto (ad esempio, `https://raw.githubusercontent.com/tuo-username/tuo-repo/main/install.sh`). + +Questo script eseguirà tutti i passi di configurazione, inclusi aggiornamenti di sistema, installazione di Podman, configurazione di systemd e riavvio finale. + +## Utilizzo dopo l'installazione + +Dopo l'installazione e il riavvio, usa il comando `badai` per gestire i servizi AI containerizzati. + +### Comandi principali: +- `badai restart`: Riavvia tutti i container e le reti quadlet, ricarica il daemon systemd dell'utente. +- `badai restart [nome]`: Riavvia un servizio specifico per nome (senza prefisso numerico). +- `badai help`: Mostra il messaggio di aiuto. + +Esempi: +```bash +badai restart # Riavvia tutto +badai restart llamacpp # Riavvia solo il container llamacpp +``` + +I servizi includono container come `llamacpp` per modelli AI e `nginx` per il proxy, oltre alle reti interne. diff --git a/Services/certbot/certbot.container b/Services/certbot/certbot.container new file mode 100644 index 0000000..bb92b48 --- /dev/null +++ b/Services/certbot/certbot.container @@ -0,0 +1,9 @@ +[Unit] +Name=certbot + +[Container] +ContainerName=certbot +Image=certbot/certbot +AutoUpdate=registry + +Volume=/srv/containers/nginx/ssl:/etc/letsencrypt diff --git a/Services/llamacpp/DOCS.md b/Services/llamacpp/DOCS.md new file mode 100644 index 0000000..b5ef56c --- /dev/null +++ b/Services/llamacpp/DOCS.md @@ -0,0 +1,445 @@ +----- common params ----- + +-h, --help, --usage print usage and exit +--version show version and build info +--completion-bash print source-able bash completion script for llama.cpp +--verbose-prompt print a verbose prompt before generation (default: false) +-t, --threads N number of CPU threads to use during generation (default: -1) + (env: LLAMA_ARG_THREADS) +-tb, --threads-batch N number of threads to use during batch and prompt processing (default: + same as --threads) +-C, --cpu-mask M CPU affinity mask: arbitrarily long hex. Complements cpu-range + (default: "") +-Cr, --cpu-range lo-hi range of CPUs for affinity. Complements --cpu-mask +--cpu-strict <0|1> use strict CPU placement (default: 0) +--prio N set process/thread priority : low(-1), normal(0), medium(1), high(2), + realtime(3) (default: 0) +--poll <0...100> use polling level to wait for work (0 - no polling, default: 50) +-Cb, --cpu-mask-batch M CPU affinity mask: arbitrarily long hex. 
Complements cpu-range-batch + (default: same as --cpu-mask) +-Crb, --cpu-range-batch lo-hi ranges of CPUs for affinity. Complements --cpu-mask-batch +--cpu-strict-batch <0|1> use strict CPU placement (default: same as --cpu-strict) +--prio-batch N set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime + (default: 0) +--poll-batch <0|1> use polling to wait for work (default: same as --poll) +-c, --ctx-size N size of the prompt context (default: 4096, 0 = loaded from model) + (env: LLAMA_ARG_CTX_SIZE) +-n, --predict, --n-predict N number of tokens to predict (default: -1, -1 = infinity) + (env: LLAMA_ARG_N_PREDICT) +-b, --batch-size N logical maximum batch size (default: 2048) + (env: LLAMA_ARG_BATCH) +-ub, --ubatch-size N physical maximum batch size (default: 512) + (env: LLAMA_ARG_UBATCH) +--keep N number of tokens to keep from the initial prompt (default: 0, -1 = + all) +--swa-full use full-size SWA cache (default: false) + [(more + info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + (env: LLAMA_ARG_SWA_FULL) +--kv-unified, -kvu use single unified KV buffer for the KV cache of all sequences + (default: false) + [(more info)](https://github.com/ggml-org/llama.cpp/pull/14363) + (env: LLAMA_ARG_KV_SPLIT) +-fa, --flash-attn [on|off|auto] set Flash Attention use ('on', 'off', or 'auto', default: 'auto') + (env: LLAMA_ARG_FLASH_ATTN) +--no-perf disable internal libllama performance timings (default: false) + (env: LLAMA_ARG_NO_PERF) +-e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true) +--no-escape do not process escape sequences +--rope-scaling {none,linear,yarn} RoPE frequency scaling method, defaults to linear unless specified by + the model + (env: LLAMA_ARG_ROPE_SCALING_TYPE) +--rope-scale N RoPE context scaling factor, expands context by a factor of N + (env: LLAMA_ARG_ROPE_SCALE) +--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from + model) + (env: LLAMA_ARG_ROPE_FREQ_BASE) +--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N + (env: LLAMA_ARG_ROPE_FREQ_SCALE) +--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training + context size) + (env: LLAMA_ARG_YARN_ORIG_CTX) +--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full + interpolation) + (env: LLAMA_ARG_YARN_EXT_FACTOR) +--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: -1.0) + (env: LLAMA_ARG_YARN_ATTN_FACTOR) +--yarn-beta-slow N YaRN: high correction dim or alpha (default: -1.0) + (env: LLAMA_ARG_YARN_BETA_SLOW) +--yarn-beta-fast N YaRN: low correction dim or beta (default: -1.0) + (env: LLAMA_ARG_YARN_BETA_FAST) +-nkvo, --no-kv-offload disable KV offload + (env: LLAMA_ARG_NO_KV_OFFLOAD) +-nr, --no-repack disable weight repacking + (env: LLAMA_ARG_NO_REPACK) +--no-host bypass host buffer allowing extra buffers to be used + (env: LLAMA_ARG_NO_HOST) +-ctk, --cache-type-k TYPE KV cache data type for K + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (default: f16) + (env: LLAMA_ARG_CACHE_TYPE_K) +-ctv, --cache-type-v TYPE KV cache data type for V + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (default: f16) + (env: LLAMA_ARG_CACHE_TYPE_V) +-dt, --defrag-thold N KV cache defragmentation threshold (DEPRECATED) + (env: LLAMA_ARG_DEFRAG_THOLD) +-np, --parallel N number of parallel sequences to decode (default: 1) + (env: LLAMA_ARG_N_PARALLEL) +--mlock force system to keep model in 
RAM rather than swapping or compressing + (env: LLAMA_ARG_MLOCK) +--no-mmap do not memory-map model (slower load but may reduce pageouts if not + using mlock) + (env: LLAMA_ARG_NO_MMAP) +--numa TYPE attempt optimizations that help on some NUMA systems + - distribute: spread execution evenly over all nodes + - isolate: only spawn threads on CPUs on the node that execution + started on + - numactl: use the CPU map provided by numactl + if run without this previously, it is recommended to drop the system + page cache before using this + see https://github.com/ggml-org/llama.cpp/issues/1437 + (env: LLAMA_ARG_NUMA) +-dev, --device comma-separated list of devices to use for offloading (none = don't + offload) + use --list-devices to see a list of available devices + (env: LLAMA_ARG_DEVICE) +--list-devices print list of available devices and exit +--override-tensor, -ot =,... + override tensor buffer type +--cpu-moe, -cmoe keep all Mixture of Experts (MoE) weights in the CPU + (env: LLAMA_ARG_CPU_MOE) +--n-cpu-moe, -ncmoe N keep the Mixture of Experts (MoE) weights of the first N layers in the + CPU + (env: LLAMA_ARG_N_CPU_MOE) +-ngl, --gpu-layers, --n-gpu-layers N max. number of layers to store in VRAM (default: -1) + (env: LLAMA_ARG_N_GPU_LAYERS) +-sm, --split-mode {none,layer,row} how to split the model across multiple GPUs, one of: + - none: use one GPU only + - layer (default): split layers and KV across GPUs + - row: split rows across GPUs + (env: LLAMA_ARG_SPLIT_MODE) +-ts, --tensor-split N0,N1,N2,... fraction of the model to offload to each GPU, comma-separated list of + proportions, e.g. 3,1 + (env: LLAMA_ARG_TENSOR_SPLIT) +-mg, --main-gpu INDEX the GPU to use for the model (with split-mode = none), or for + intermediate results and KV (with split-mode = row) (default: 0) + (env: LLAMA_ARG_MAIN_GPU) +--check-tensors check model tensor data for invalid values (default: false) +--override-kv KEY=TYPE:VALUE advanced option to override model metadata by key. may be specified + multiple times. + types: int, float, bool, str. example: --override-kv + tokenizer.ggml.add_bos_token=bool:false +--no-op-offload disable offloading host tensor operations to device (default: false) +--lora FNAME path to LoRA adapter (can be repeated to use multiple adapters) +--lora-scaled FNAME SCALE path to LoRA adapter with user defined scaling (can be repeated to use + multiple adapters) +--control-vector FNAME add a control vector + note: this argument can be repeated to add multiple control vectors +--control-vector-scaled FNAME SCALE add a control vector with user defined scaling SCALE + note: this argument can be repeated to add multiple scaled control + vectors +--control-vector-layer-range START END + layer range to apply the control vector(s) to, start and end inclusive +-m, --model FNAME model path (default: `models/$filename` with filename from `--hf-file` + or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf) + (env: LLAMA_ARG_MODEL) +-mu, --model-url MODEL_URL model download url (default: unused) + (env: LLAMA_ARG_MODEL_URL) +-dr, --docker-repo [/][:quant] + Docker Hub model repository. repo is optional, default to ai/. quant + is optional, default to :latest. + example: gemma3 + (default: unused) + (env: LLAMA_ARG_DOCKER_REPO) +-hf, -hfr, --hf-repo /[:quant] + Hugging Face model repository; quant is optional, case-insensitive, + default to Q4_K_M, or falls back to the first file in the repo if + Q4_K_M doesn't exist. + mmproj is also downloaded automatically if available. 
to disable, add + --no-mmproj + example: unsloth/phi-4-GGUF:q4_k_m + (default: unused) + (env: LLAMA_ARG_HF_REPO) +-hfd, -hfrd, --hf-repo-draft /[:quant] + Same as --hf-repo, but for the draft model (default: unused) + (env: LLAMA_ARG_HFD_REPO) +-hff, --hf-file FILE Hugging Face model file. If specified, it will override the quant in + --hf-repo (default: unused) + (env: LLAMA_ARG_HF_FILE) +-hfv, -hfrv, --hf-repo-v /[:quant] + Hugging Face model repository for the vocoder model (default: unused) + (env: LLAMA_ARG_HF_REPO_V) +-hffv, --hf-file-v FILE Hugging Face model file for the vocoder model (default: unused) + (env: LLAMA_ARG_HF_FILE_V) +-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment + variable) + (env: HF_TOKEN) +--log-disable Log disable +--log-file FNAME Log to file +--log-colors [on|off|auto] Set colored logging ('on', 'off', or 'auto', default: 'auto') + 'auto' enables colors when output is to a terminal + (env: LLAMA_LOG_COLORS) +-v, --verbose, --log-verbose Set verbosity level to infinity (i.e. log all messages, useful for + debugging) +--offline Offline mode: forces use of cache, prevents network access + (env: LLAMA_OFFLINE) +-lv, --verbosity, --log-verbosity N Set the verbosity threshold. Messages with a higher verbosity will be + ignored. + (env: LLAMA_LOG_VERBOSITY) +--log-prefix Enable prefix in log messages + (env: LLAMA_LOG_PREFIX) +--log-timestamps Enable timestamps in log messages + (env: LLAMA_LOG_TIMESTAMPS) +-ctkd, --cache-type-k-draft TYPE KV cache data type for K for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (default: f16) + (env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) +-ctvd, --cache-type-v-draft TYPE KV cache data type for V for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (default: f16) + (env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) + + +----- sampling params ----- + +--samplers SAMPLERS samplers that will be used for generation in the order, separated by + ';' + (default: + penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) +-s, --seed SEED RNG seed (default: -1, use random seed for -1) +--sampling-seq, --sampler-seq SEQUENCE + simplified sequence for samplers that will be used (default: + edskypmxt) +--ignore-eos ignore end of stream token and continue generating (implies + --logit-bias EOS-inf) +--temp N temperature (default: 0.8) +--top-k N top-k sampling (default: 40, 0 = disabled) +--top-p N top-p sampling (default: 0.9, 1.0 = disabled) +--min-p N min-p sampling (default: 0.1, 0.0 = disabled) +--top-nsigma N top-n-sigma sampling (default: -1.0, -1.0 = disabled) +--xtc-probability N xtc probability (default: 0.0, 0.0 = disabled) +--xtc-threshold N xtc threshold (default: 0.1, 1.0 = disabled) +--typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) +--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 + = ctx_size) +--repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) +--presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled) +--frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) +--dry-multiplier N set DRY sampling multiplier (default: 0.0, 0.0 = disabled) +--dry-base N set DRY sampling base value (default: 1.75) +--dry-allowed-length N set allowed length for DRY sampling (default: 2) +--dry-penalty-last-n N set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = + 
context size) +--dry-sequence-breaker STRING add sequence breaker for DRY sampling, clearing out default breakers + ('\n', ':', '"', '*') in the process; use "none" to not use any + sequence breakers +--dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled) +--dynatemp-exp N dynamic temperature exponent (default: 1.0) +--mirostat N use Mirostat sampling. + Top K, Nucleus and Locally Typical samplers are ignored if used. + (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) +--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1) +--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0) +-l, --logit-bias TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion, + i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', + or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' +--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ + dir) (default: '') +--grammar-file FNAME file to read grammar from +-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. + `{}` for any JSON object + For schemas w/ external $refs, use --grammar + + example/json_schema_to_grammar.py instead +-jf, --json-schema-file FILE File containing a JSON schema to constrain generations + (https://json-schema.org/), e.g. `{}` for any JSON object + For schemas w/ external $refs, use --grammar + + example/json_schema_to_grammar.py instead + + +----- example-specific params ----- + +--ctx-checkpoints, --swa-checkpoints N + max number of context checkpoints to create per slot (default: 8) + [(more info)](https://github.com/ggml-org/llama.cpp/pull/15293) + (env: LLAMA_ARG_CTX_CHECKPOINTS) +--cache-ram, -cram N set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - + disable) + [(more info)](https://github.com/ggml-org/llama.cpp/pull/16391) + (env: LLAMA_ARG_CACHE_RAM) +--no-context-shift disables context shift on infinite text generation (default: enabled) + (env: LLAMA_ARG_NO_CONTEXT_SHIFT) +--context-shift enables context shift on infinite text generation (default: disabled) + (env: LLAMA_ARG_CONTEXT_SHIFT) +-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode +-sp, --special special tokens output enabled (default: false) +--no-warmup skip warming up the model with an empty run +--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of + Prefix/Suffix/Middle) as some models prefer this. (default: disabled) +--pooling {none,mean,cls,last,rank} pooling type for embeddings, use model default if unspecified + (env: LLAMA_ARG_POOLING) +-cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled) + (env: LLAMA_ARG_CONT_BATCHING) +-nocb, --no-cont-batching disable continuous batching + (env: LLAMA_ARG_NO_CONT_BATCHING) +--mmproj FILE path to a multimodal projector file. see tools/mtmd/README.md + note: if -hf is used, this argument can be omitted + (env: LLAMA_ARG_MMPROJ) +--mmproj-url URL URL to a multimodal projector file. see tools/mtmd/README.md + (env: LLAMA_ARG_MMPROJ_URL) +--no-mmproj explicitly disable multimodal projector, useful when using -hf + (env: LLAMA_ARG_NO_MMPROJ) +--no-mmproj-offload do not offload multimodal projector to GPU + (env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) +--override-tensor-draft, -otd =,... 
+ override tensor buffer type for draft model +--cpu-moe-draft, -cmoed keep all Mixture of Experts (MoE) weights in the CPU for the draft + model + (env: LLAMA_ARG_CPU_MOE_DRAFT) +--n-cpu-moe-draft, -ncmoed N keep the Mixture of Experts (MoE) weights of the first N layers in the + CPU for the draft model + (env: LLAMA_ARG_N_CPU_MOE_DRAFT) +-a, --alias STRING set alias for model name (to be used by REST API) + (env: LLAMA_ARG_ALIAS) +--host HOST ip address to listen, or bind to an UNIX socket if the address ends + with .sock (default: 127.0.0.1) + (env: LLAMA_ARG_HOST) +--port PORT port to listen (default: 8080) + (env: LLAMA_ARG_PORT) +--path PATH path to serve static files from (default: ) + (env: LLAMA_ARG_STATIC_PATH) +--api-prefix PREFIX prefix path the server serves from, without the trailing slash + (default: ) + (env: LLAMA_ARG_API_PREFIX) +--no-webui Disable the Web UI (default: enabled) + (env: LLAMA_ARG_NO_WEBUI) +--embedding, --embeddings restrict to only support embedding use case; use only with dedicated + embedding models (default: disabled) + (env: LLAMA_ARG_EMBEDDINGS) +--reranking, --rerank enable reranking endpoint on server (default: disabled) + (env: LLAMA_ARG_RERANKING) +--api-key KEY API key to use for authentication (default: none) + (env: LLAMA_API_KEY) +--api-key-file FNAME path to file containing API keys (default: none) +--ssl-key-file FNAME path to file a PEM-encoded SSL private key + (env: LLAMA_ARG_SSL_KEY_FILE) +--ssl-cert-file FNAME path to file a PEM-encoded SSL certificate + (env: LLAMA_ARG_SSL_CERT_FILE) +--chat-template-kwargs STRING sets additional params for the json template parser + (env: LLAMA_CHAT_TEMPLATE_KWARGS) +-to, --timeout N server read/write timeout in seconds (default: 600) + (env: LLAMA_ARG_TIMEOUT) +--threads-http N number of threads used to process HTTP requests (default: -1) + (env: LLAMA_ARG_THREADS_HTTP) +--cache-reuse N min chunk size to attempt reusing from the cache via KV shifting + (default: 0) + [(card)](https://ggml.ai/f0.png) + (env: LLAMA_ARG_CACHE_REUSE) +--metrics enable prometheus compatible metrics endpoint (default: disabled) + (env: LLAMA_ARG_ENDPOINT_METRICS) +--props enable changing global properties via POST /props (default: disabled) + (env: LLAMA_ARG_ENDPOINT_PROPS) +--slots enable slots monitoring endpoint (default: enabled) + (env: LLAMA_ARG_ENDPOINT_SLOTS) +--no-slots disables slots monitoring endpoint + (env: LLAMA_ARG_NO_ENDPOINT_SLOTS) +--slot-save-path PATH path to save slot kv cache (default: disabled) +--jinja use jinja template for chat (default: disabled) + (env: LLAMA_ARG_JINJA) +--reasoning-format FORMAT controls whether thought tags are allowed and/or extracted from the + response, and in which format they're returned; one of: + - none: leaves thoughts unparsed in `message.content` + - deepseek: puts thoughts in `message.reasoning_content` + - deepseek-legacy: keeps `` tags in `message.content` while + also populating `message.reasoning_content` + (default: auto) + (env: LLAMA_ARG_THINK) +--reasoning-budget N controls the amount of thinking allowed; currently only one of: -1 for + unrestricted thinking budget, or 0 to disable thinking (default: -1) + (env: LLAMA_ARG_THINK_BUDGET) +--chat-template JINJA_TEMPLATE set custom jinja chat template (default: template taken from model's + metadata) + if suffix/prefix are specified, template will be disabled + only commonly used templates are accepted (unless --jinja is set + before this flag): + list of built-in templates: + bailing, bailing-think, bailing2, 
chatglm3, chatglm4, chatml, + command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, + gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, + hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, + llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, + mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, + openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, + vicuna-orca, yandex, zephyr + (env: LLAMA_ARG_CHAT_TEMPLATE) +--chat-template-file JINJA_TEMPLATE_FILE + set custom jinja chat template file (default: template taken from + model's metadata) + if suffix/prefix are specified, template will be disabled + only commonly used templates are accepted (unless --jinja is set + before this flag): + list of built-in templates: + bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, + command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, + gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, + hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, + llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, + mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, + openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, + vicuna-orca, yandex, zephyr + (env: LLAMA_ARG_CHAT_TEMPLATE_FILE) +--no-prefill-assistant whether to prefill the assistant's response if the last message is an + assistant message (default: prefill enabled) + when this flag is set, if the last message is an assistant message + then it will be treated as a full message and not prefilled + + (env: LLAMA_ARG_NO_PREFILL_ASSISTANT) +-sps, --slot-prompt-similarity SIMILARITY + how much the prompt of a request must match the prompt of a slot in + order to use that slot (default: 0.10, 0.0 = disabled) +--lora-init-without-apply load LoRA adapters without applying them (apply later via POST + /lora-adapters) (default: disabled) +-td, --threads-draft N number of threads to use during generation (default: same as + --threads) +-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: + same as --threads-draft) +--draft-max, --draft, --draft-n N number of tokens to draft for speculative decoding (default: 16) + (env: LLAMA_ARG_DRAFT_MAX) +--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding + (default: 0) + (env: LLAMA_ARG_DRAFT_MIN) +--draft-p-min P minimum speculative decoding probability (greedy) (default: 0.8) + (env: LLAMA_ARG_DRAFT_P_MIN) +-cd, --ctx-size-draft N size of the prompt context for the draft model (default: 0, 0 = loaded + from model) + (env: LLAMA_ARG_CTX_SIZE_DRAFT) +-devd, --device-draft comma-separated list of devices to use for offloading the draft model + (none = don't offload) + use --list-devices to see a list of available devices +-ngld, --gpu-layers-draft, --n-gpu-layers-draft N + number of layers to store in VRAM for the draft model + (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) +-md, --model-draft FNAME draft model for speculative decoding (default: unused) + (env: LLAMA_ARG_MODEL_DRAFT) +--spec-replace TARGET DRAFT translate the string in TARGET into DRAFT if the draft model and main + model are not compatible +-mv, --model-vocoder FNAME vocoder model for audio generation (default: unused) +--tts-use-guide-tokens Use guide tokens to improve TTS word recall +--embd-gemma-default use default EmbeddingGemma model (note: can download weights from the + internet) +--fim-qwen-1.5b-default use 
default Qwen 2.5 Coder 1.5B (note: can download weights from the + internet) +--fim-qwen-3b-default use default Qwen 2.5 Coder 3B (note: can download weights from the + internet) +--fim-qwen-7b-default use default Qwen 2.5 Coder 7B (note: can download weights from the + internet) +--fim-qwen-7b-spec use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can + download weights from the internet) +--fim-qwen-14b-spec use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: + can download weights from the internet) +--fim-qwen-30b-default use default Qwen 3 Coder 30B A3B Instruct (note: can download weights + from the internet) +--gpt-oss-20b-default use gpt-oss-20b (note: can download weights from the internet) +--gpt-oss-120b-default use gpt-oss-120b (note: can download weights from the internet) +--vision-gemma-4b-default use Gemma 3 4B QAT (note: can download weights from the internet) +--vision-gemma-12b-default use Gemma 3 12B QAT (note: can download weights from the internet) \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startBaseMedium.sh b/Services/llamacpp/Scripts/startBaseMedium.sh new file mode 100755 index 0000000..85b8460 --- /dev/null +++ b/Services/llamacpp/Scripts/startBaseMedium.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Report descrittivi: 0.6 ok; 0.55 più stabile +TEMP=${BASE_TEMP:-0.6} +exec /app/llama-server $BASE_MEDIUM_MODEL \ + -c $BASE_CONTEXT_SIZE -ngl $BASE_GPU_LAYERS -n $BASE_MAX_TOKENS \ + --temp $TEMP --top-p 0.9 --top-k 40 --repeat-penalty 1.1 \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --jinja \ + --timeout 600 --host 0.0.0.0 --port 8092 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startBaseMini.sh b/Services/llamacpp/Scripts/startBaseMini.sh new file mode 100755 index 0000000..6c1da24 --- /dev/null +++ b/Services/llamacpp/Scripts/startBaseMini.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +exec /app/llama-server $BASE_MINI_MODEL \ + -c 4096 -n 128 \ + --temp 0.2 --top-p 0.9 --top-k 40 --repeat-penalty 1.05 \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --jinja \ + --timeout 600 --host 0.0.0.0 --port 8091 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startBaseTop.sh b/Services/llamacpp/Scripts/startBaseTop.sh new file mode 100755 index 0000000..ff0fd17 --- /dev/null +++ b/Services/llamacpp/Scripts/startBaseTop.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +exec /app/llama-server $BASE_TOP_MODEL \ + -c $BASE_CONTEXT_SIZE -ngl $BASE_GPU_LAYERS -n $BASE_MAX_TOKENS \ + --temp 0.5 --top-p 0.9 --top-k 40 --repeat-penalty 1.1 \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --jinja \ + --timeout 900 --host 0.0.0.0 --port 8093 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." 
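+    # Forward the stop request to the backgrounded llama-server, wait for it to
+    # exit, then return success so the caller sees a clean shutdown.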
+ kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startChat.sh b/Services/llamacpp/Scripts/startChat.sh new file mode 100755 index 0000000..a4d98a6 --- /dev/null +++ b/Services/llamacpp/Scripts/startChat.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Report descrittivi: 0.6 ok; 0.55 più stabile +TEMP=${GENERAL_TEMP:-0.6} +exec /app/llama-server $CHAT_MODEL \ + -c $GENERAL_CONTEXT_SIZE -ngl $GENERAL_GPU_LAYERS -n $GENERAL_MAX_TOKENS \ + --temp $TEMP --top-p 0.9 --top-k 40 --repeat-penalty 1.1 \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --jinja \ + --timeout 600 --host 0.0.0.0 --port 8093 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startCoder.sh b/Services/llamacpp/Scripts/startCoder.sh new file mode 100755 index 0000000..5bc4729 --- /dev/null +++ b/Services/llamacpp/Scripts/startCoder.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +exec /app/llama-server $CODER_MODEL \ + -c $CODER_CONTEXT_SIZE -n $CODER_MAX_TOKENS \ + --temp 0.3 --top-p 0.9 --top-k 40 --repeat-penalty 1.05 \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --jinja \ + --timeout 600 --host 0.0.0.0 --port 8094 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startCoderMedium.sh b/Services/llamacpp/Scripts/startCoderMedium.sh new file mode 100755 index 0000000..0480fbf --- /dev/null +++ b/Services/llamacpp/Scripts/startCoderMedium.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Prefer Q6 + mmap; fallback to no-mmap only if explicitly requested +EXTRA="" +if [[ "$FORCE_NO_MMAP_CODER" == "1" ]]; then EXTRA="--no-mmap"; fi +exec /app/llama-server $CODER_MEDIUM_MODEL \ + -c $CODER_CONTEXT_SIZE -ngl $CODER_GPU_LAYERS -n $CODER_MAX_TOKENS \ + --temp 0.5 --top-p 0.9 --top-k 40 --repeat-penalty 1.1 $EXTRA \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --jinja \ + --timeout 900 --host 0.0.0.0 --port 8095 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startCoderMini.sh b/Services/llamacpp/Scripts/startCoderMini.sh new file mode 100755 index 0000000..2c1f13a --- /dev/null +++ b/Services/llamacpp/Scripts/startCoderMini.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +exec /app/llama-server $CODER_MINI_MODEL \ + -c 4096 -n 256 \ + --temp 0.3 --top-p 0.9 --top-k 40 --repeat-penalty 1.05 \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --timeout 600 --host 0.0.0.0 --port 8094 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." 
+ kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startCoderTop.sh b/Services/llamacpp/Scripts/startCoderTop.sh new file mode 100755 index 0000000..55a04dd --- /dev/null +++ b/Services/llamacpp/Scripts/startCoderTop.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Large model: auto-select mmap based on GPU layers +# <= 45 layers: use mmap (less VRAM usage, faster startup) +# > 45 layers: disable mmap (avoids SVM limits) +LAYERS=${CODER_TOP_GPU_LAYERS:-55} +MMAP_OPT="" +if [ "$LAYERS" -gt 45 ]; then + MMAP_OPT="--no-mmap" + echo "Using --no-mmap (layers=$LAYERS > 45)" +else + echo "Using mmap (layers=$LAYERS <= 45)" +fi +exec /app/llama-server $CODER_TOP_MODEL \ + -c $CODER_CONTEXT_SIZE -ngl $LAYERS -n $CODER_MAX_TOKENS \ + --temp 0.45 --top-p 0.9 --top-k 40 --repeat-penalty 1.12 \ + $MMAP_OPT \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --jinja \ + --timeout 1200 --host 0.0.0.0 --port 8096 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startEmbedding.sh b/Services/llamacpp/Scripts/startEmbedding.sh new file mode 100755 index 0000000..0950ef4 --- /dev/null +++ b/Services/llamacpp/Scripts/startEmbedding.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +exec /app/llama-server $EMBEDDING_MODEL \ + --embeddings --pooling mean \ + --flash-attn auto --threads -1 --threads-http -1 \ + --timeout 600 --host 0.0.0.0 --port 8096 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startEmbeddingFast.sh b/Services/llamacpp/Scripts/startEmbeddingFast.sh new file mode 100755 index 0000000..854672d --- /dev/null +++ b/Services/llamacpp/Scripts/startEmbeddingFast.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +exec /app/llama-server $EMBEDDING_FAST_MODEL \ + --embeddings --pooling mean \ + --flash-attn auto --threads -1 --threads-http -1 \ + --timeout 600 --host 0.0.0.0 --port 8095 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startEmbeddingMedium.sh b/Services/llamacpp/Scripts/startEmbeddingMedium.sh new file mode 100755 index 0000000..6b6a116 --- /dev/null +++ b/Services/llamacpp/Scripts/startEmbeddingMedium.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +exec /app/llama-server $EMBEDDING_MEDIUM_MODEL \ + --embeddings --pooling mean \ + --flash-attn auto --threads -1 --threads-http -1 \ + --timeout 600 --host 0.0.0.0 --port 8098 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startEmbeddingMini.sh b/Services/llamacpp/Scripts/startEmbeddingMini.sh new file mode 100755 index 0000000..108808c --- /dev/null +++ b/Services/llamacpp/Scripts/startEmbeddingMini.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +exec /app/llama-server $EMBEDDING_MINI_MODEL \ + --embeddings --pooling mean \ + --flash-attn auto --threads -1 --threads-http -1 \ + --timeout 600 --host 0.0.0.0 --port 8097 & +PID=$! 
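+# The server runs in the background so this wrapper script keeps running and
+# can forward SIGTERM/SIGINT to it via the trap/cleanup handler below.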
+ +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startEmbeddingTop.sh b/Services/llamacpp/Scripts/startEmbeddingTop.sh new file mode 100755 index 0000000..f19476c --- /dev/null +++ b/Services/llamacpp/Scripts/startEmbeddingTop.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +exec /app/llama-server $EMBEDDING_TOP_MODEL \ + --embeddings --pooling mean \ + --flash-attn auto --threads -1 --threads-http -1 \ + --timeout 600 --host 0.0.0.0 --port 8099 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startGeneral.sh b/Services/llamacpp/Scripts/startGeneral.sh new file mode 100755 index 0000000..d622050 --- /dev/null +++ b/Services/llamacpp/Scripts/startGeneral.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Report descrittivi: 0.6 ok; 0.55 più stabile +TEMP=${GENERAL_TEMP:-0.6} +exec /app/llama-server $GENERAL_MODEL \ + -c $GENERAL_CONTEXT_SIZE -ngl $GENERAL_GPU_LAYERS -n $GENERAL_MAX_TOKENS \ + --temp $TEMP --top-p 0.9 --top-k 40 --repeat-penalty 1.1 \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --jinja \ + --timeout 600 --host 0.0.0.0 --port 8092 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." + kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/startGeneralFast.sh b/Services/llamacpp/Scripts/startGeneralFast.sh new file mode 100755 index 0000000..89f15c8 --- /dev/null +++ b/Services/llamacpp/Scripts/startGeneralFast.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +exec /app/llama-server $GENERAL_FAST_MODEL \ + -c $GENERAL_CONTEXT_SIZE -n 128 \ + --temp 0.6 --top-p 0.9 --top-k 40 --repeat-penalty 1.05 \ + --flash-attn auto --threads -1 --threads-batch -1 --threads-http -1 \ + --jinja \ + --timeout 600 --host 0.0.0.0 --port 8091 & +PID=$! + +cleanup() { + echo "Stopping llama-server..." 
+ kill $PID 2>/dev/null + wait $PID 2>/dev/null + exit 0 +} + +trap cleanup SIGTERM SIGINT + +wait $PID \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopBaseMedium.sh b/Services/llamacpp/Scripts/stopBaseMedium.sh new file mode 100755 index 0000000..1bac166 --- /dev/null +++ b/Services/llamacpp/Scripts/stopBaseMedium.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pkill -f "llama-server.*8092" || true \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopBaseMini.sh b/Services/llamacpp/Scripts/stopBaseMini.sh new file mode 100755 index 0000000..e2372cb --- /dev/null +++ b/Services/llamacpp/Scripts/stopBaseMini.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pkill -f "llama-server.*8091" || true \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopBaseTop.sh b/Services/llamacpp/Scripts/stopBaseTop.sh new file mode 100755 index 0000000..e8f0af9 --- /dev/null +++ b/Services/llamacpp/Scripts/stopBaseTop.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pkill -f "llama-server.*8093" || true \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopChat.sh b/Services/llamacpp/Scripts/stopChat.sh new file mode 100755 index 0000000..4f08201 --- /dev/null +++ b/Services/llamacpp/Scripts/stopChat.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Stop llama-server per DuckAi-Chat +pkill -f "llama-server.*--port 8093" \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopCoder.sh b/Services/llamacpp/Scripts/stopCoder.sh new file mode 100755 index 0000000..b746e7d --- /dev/null +++ b/Services/llamacpp/Scripts/stopCoder.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Stop llama-server per DuckAi-Coder +pkill -f "llama-server.*--port 8094" \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopCoderMedium.sh b/Services/llamacpp/Scripts/stopCoderMedium.sh new file mode 100755 index 0000000..0bcc58f --- /dev/null +++ b/Services/llamacpp/Scripts/stopCoderMedium.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pkill -f "llama-server.*8095" || true \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopCoderMini.sh b/Services/llamacpp/Scripts/stopCoderMini.sh new file mode 100755 index 0000000..ac34c3c --- /dev/null +++ b/Services/llamacpp/Scripts/stopCoderMini.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pkill -f "llama-server.*8094" || true \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopCoderTop.sh b/Services/llamacpp/Scripts/stopCoderTop.sh new file mode 100755 index 0000000..67c94d1 --- /dev/null +++ b/Services/llamacpp/Scripts/stopCoderTop.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pkill -f "llama-server.*8096" || true \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopEmbedding.sh b/Services/llamacpp/Scripts/stopEmbedding.sh new file mode 100755 index 0000000..6ea4688 --- /dev/null +++ b/Services/llamacpp/Scripts/stopEmbedding.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Stop llama-server per DuckAi-Embedding +pkill -f "llama-server.*--port 8096" \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopEmbeddingFast.sh b/Services/llamacpp/Scripts/stopEmbeddingFast.sh new file mode 100755 index 0000000..1758b7d --- /dev/null +++ b/Services/llamacpp/Scripts/stopEmbeddingFast.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Stop llama-server per DuckAi-EmbeddingFast +pkill -f "llama-server.*--port 8095" \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopEmbeddingMedium.sh b/Services/llamacpp/Scripts/stopEmbeddingMedium.sh new file mode 100755 index 0000000..3f8eacc --- /dev/null +++ 
b/Services/llamacpp/Scripts/stopEmbeddingMedium.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pkill -f "llama-server.*8098" || true \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopEmbeddingMini.sh b/Services/llamacpp/Scripts/stopEmbeddingMini.sh new file mode 100755 index 0000000..54d1707 --- /dev/null +++ b/Services/llamacpp/Scripts/stopEmbeddingMini.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pkill -f "llama-server.*8097" || true \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopEmbeddingTop.sh b/Services/llamacpp/Scripts/stopEmbeddingTop.sh new file mode 100755 index 0000000..f2dcf3d --- /dev/null +++ b/Services/llamacpp/Scripts/stopEmbeddingTop.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pkill -f "llama-server.*8099" || true \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopGeneral.sh b/Services/llamacpp/Scripts/stopGeneral.sh new file mode 100755 index 0000000..2f4fda0 --- /dev/null +++ b/Services/llamacpp/Scripts/stopGeneral.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Stop llama-server per DuckAi-General +pkill -f "llama-server.*--port 8092" \ No newline at end of file diff --git a/Services/llamacpp/Scripts/stopGeneralFast.sh b/Services/llamacpp/Scripts/stopGeneralFast.sh new file mode 100755 index 0000000..754410c --- /dev/null +++ b/Services/llamacpp/Scripts/stopGeneralFast.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Stop llama-server per DuckAi-GeneralFast +pkill -f "llama-server.*--port 8091" \ No newline at end of file diff --git a/Services/llamacpp/config.preset.yaml b/Services/llamacpp/config.preset.yaml new file mode 100644 index 0000000..d096317 --- /dev/null +++ b/Services/llamacpp/config.preset.yaml @@ -0,0 +1,59 @@ +logLevel: info +healthCheckTimeout: 120 + +models: + DuckAi-GeneralFast: + proxy: http://localhost:8091 + cmd: /app/Scripts/startGeneralFast.sh + cmdStop: /app/Scripts/stopGeneralFast.sh + checkEndpoint: /health + + DuckAi-General: + proxy: http://localhost:8092 + cmd: /app/Scripts/startGeneral.sh + cmdStop: /app/Scripts/stopGeneral.sh + checkEndpoint: /health + ttl: 600 + + DuckAi-Chat: + proxy: http://localhost:8093 + cmd: /app/Scripts/startChat.sh + cmdStop: /app/Scripts/stopChat.sh + checkEndpoint: /health + ttl: 600 + + DuckAi-Coder: + proxy: http://localhost:8094 + cmd: /app/Scripts/startCoder.sh + cmdStop: /app/Scripts/stopCoder.sh + checkEndpoint: /health + ttl: 600 + + DuckAi-EmbeddingFast: + proxy: http://localhost:8095 + cmd: /app/Scripts/startEmbeddingFast.sh + cmdStop: /app/Scripts/stopEmbeddingFast.sh + checkEndpoint: /health + ttl: 600 + + DuckAi-Embedding: + proxy: http://localhost:8096 + cmd: /app/Scripts/startEmbedding.sh + cmdStop: /app/Scripts/stopEmbedding.sh + checkEndpoint: /health + ttl: 600 + +groups: + default-models: + swap: false + exclusive: false + persistent: true + members: + - DuckAi-GeneralFast + - DuckAi-Chat + - DuckAi-Embedding + +hooks: + on_startup: + preload: + - DuckAi-GeneralFast diff --git a/Services/llamacpp/entrypoint.sh b/Services/llamacpp/entrypoint.sh new file mode 100644 index 0000000..60a33c7 --- /dev/null +++ b/Services/llamacpp/entrypoint.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +CONFIG_FILE="/app/config.yaml" +PRESET_FILE="/app/config.preset.yaml" + +echo "Checking configuration..." + +# Se il file non esiste o è vuoto o non contiene 'models:', usa il preset +if [ ! -f "$CONFIG_FILE" ] || [ ! -s "$CONFIG_FILE" ] || ! grep -q "models:" "$CONFIG_FILE" 2>/dev/null; then + echo "Config file missing, empty, or invalid. Copying from preset..." 
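+    # Seed the config file from the bundled preset so llama-swap always starts
+    # with a valid model list.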
+ cp "$PRESET_FILE" "$CONFIG_FILE" + echo "Config file populated from preset." +else + echo "Config file found and valid." +fi + +exec /app/llama-swap -config "$CONFIG_FILE" -listen :8080 diff --git a/Services/llamacpp/lamacpp-nginx.conf b/Services/llamacpp/lamacpp-nginx.conf new file mode 100644 index 0000000..444b360 --- /dev/null +++ b/Services/llamacpp/lamacpp-nginx.conf @@ -0,0 +1,101 @@ +# Template Nginx per servizi containerizzati +# Sostituisci [DOMAIN], [UPSTREAM_NAME], [UPSTREAM_SERVER] con i valori appropriati + +server { + listen 80; + server_name models.ai.duckpage.net; + return 301 https://$server_name$request_uri; +} + +server { + listen 443 ssl; + listen [::]:443 ssl; + server_name models.ai.duckpage.net; + charset utf-8; + keepalive_timeout 70; + + # SSL + ssl_certificate /etc/nginx/ssl/live/ai.duckpage.net/fullchain.pem; + ssl_certificate_key /etc/nginx/ssl/live/ai.duckpage.net/privkey.pem; + + # Improve HTTPS performance with session resumption + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 10m; + + # SSL Protocols and Ciphers + ssl_protocols TLSv1.3; + ssl_prefer_server_ciphers off; + ssl_dhparam /etc/nginx/ssl/dhparam.pem; + ssl_ecdh_curve secp521r1:secp384r1; + + # Security Headers + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains"; + add_header X-Frame-Options SAMEORIGIN always; + add_header X-Content-Type-Options nosniff always; + add_header X-Xss-Protection "1; mode=block" always; + + # OCSP Stapling + ssl_stapling on; + ssl_stapling_verify on; + ssl_trusted_certificate /etc/nginx/ssl/live/ai.duckpage.net/fullchain.pem; + resolver 1.1.1.1 1.0.0.1 [2606:4700:4700::1111] [2606:4700:4700::1001] valid=300s; + resolver_timeout 5s; + + client_max_body_size 512M; + client_body_buffer_size 128k; + + # Gzip + gzip_types text/plain text/xml text/css application/xhtml+xml application/xml image/svg+xml application/rss+xml application/atom_xml application/javascript application/x-javascript application/x-httpd-php application/x-httpd-fastphp application/x-httpd-eruby; + + + # Main Proxy + location /v1 { + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + + proxy_connect_timeout 600; + proxy_send_timeout 600; + proxy_read_timeout 600; + send_timeout 600; + + proxy_redirect off; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_pass http://llamacpp:8080/v1; + } + + location / { + # Allow specific IPs (replace with your actual IPs) + allow 127.0.0.1; + allow ::1; + allow 10.50.210.0/24; + allow 10.0.80.0/24; + # Add more allow lines for specific IPs, e.g., allow 192.168.1.0/24; + deny all; + + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + + proxy_connect_timeout 600; + proxy_send_timeout 600; + proxy_read_timeout 600; + send_timeout 600; + + proxy_redirect off; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_pass http://llamacpp:8080; + } + + location ~ /\.ht { + deny all; + } +} diff --git a/Services/llamacpp/llama-vulkan.Containerfile b/Services/llamacpp/llama-vulkan.Containerfile new file mode 100644 index 0000000..8cefc96 --- /dev/null +++ b/Services/llamacpp/llama-vulkan.Containerfile @@ -0,0 +1,121 @@ +### LLaMACpp Builder 
Container with Vulkan for GPUs +### Multi-stage: download stage with pre-built binaries, runtime stage with only runtime libraries +### +### BUILD: podman build -t llamacpp:vulkan-amd64 -f llama-vulkan.Containerfile . +### Export: podman save -o /home/duckpage/llamacpp-vulkan-amd64.tar localhost/llamacpp:vulkan-amd64 + + +ARG UBUNTU_VERSION=24.04 + +### Download image +FROM ubuntu:${UBUNTU_VERSION} AS download + +RUN apt-get update \ + && apt-get install -y curl unzip grep sed \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /tmp + +RUN VERSION=$(curl -s -I https://github.com/ggml-org/llama.cpp/releases/latest | grep -i location | sed 's|.*/tag/||' | tr -d '\r') \ + && echo "Last llama.cpp version: $VERSION" \ + && curl -L https://github.com/ggml-org/llama.cpp/releases/download/${VERSION}/llama-${VERSION}-bin-ubuntu-vulkan-x64.zip -o llama.zip \ + && unzip llama.zip \ + && rm llama.zip \ + && if [ -d llama-* ]; then mv llama-*/* . && rmdir llama-*; elif [ -d build ]; then mv build/* . && rmdir build; fi \ + && if [ -d bin ]; then mv bin/* . && rmdir bin; fi # flatten further + +RUN mkdir -p /app/lib /app/full \ + && find . -name "*.so" -exec cp {} /app/lib \; \ + && cp -r * /app/full 2>/dev/null || true \ + && ls -la /app/full # list contents + +## Base image +FROM ubuntu:${UBUNTU_VERSION} AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl nano ca-certificates wget\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=download /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=download /app/full /app + +RUN chmod +x /app/llama-server + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + libvulkan-dev \ + git \ + python3-pip \ + python3 \ + python3-wheel\ + && pip install --break-system-packages --upgrade setuptools \ + && pip install --break-system-packages -U "huggingface_hub[cli]" \ + && if [ -f requirements.txt ]; then pip install --break-system-packages -r requirements.txt; fi \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +# -------- Model args (prefer Q6 to keep mmap on and avoid load issues) -------- +ARG GENERAL_FAST_MODEL="-m models/gemma-3-1b-it-Q5_K_M.gguf" +ARG GENERAL_MODEL="-m models/gpt-oss-20b-Q4_K_M.gguf" + +ARG CHAT_MODEL="-m models/Qwen3-VL-30B-A3B-Q4_K_S.gguf" + +ARG CODER_MODEL="-m models/Qwen3-Coder-30B-A3B-Instruct-Q6_K.gguf" + +ARG EMBEDDING_FAST_MODEL="-m models/embeddinggemma-300M-Q8_0.gguf" +ARG EMBEDDING_MODEL="-m models/bge-code-v1-q6_k.gguf" + +# -------- Runtime defaults -------- +ARG GENERAL_CONTEXT_SIZE=16384 +ARG GENERAL_GPU_LAYERS=99 +ARG GENERAL_MAX_TOKENS=512 + +ARG CODER_CONTEXT_SIZE=131072 +ARG CODER_GPU_LAYERS=99 +ARG CODER_MAX_TOKENS=512 + +ENV GENERAL_FAST_MODEL=${GENERAL_FAST_MODEL} +ENV GENERAL_MODEL=${GENERAL_MODEL} +ENV CODER_MODEL=${CODER_MODEL} +ENV EMBEDDING_FAST_MODEL=${EMBEDDING_FAST_MODEL} +ENV EMBEDDING_MODEL=${EMBEDDING_MODEL} + +ENV GENERAL_CONTEXT_SIZE=${GENERAL_CONTEXT_SIZE} +ENV GENERAL_GPU_LAYERS=${GENERAL_GPU_LAYERS} +ENV GENERAL_MAX_TOKENS=${GENERAL_MAX_TOKENS} +ENV CODER_CONTEXT_SIZE=${CODER_CONTEXT_SIZE} +ENV CODER_GPU_LAYERS=${CODER_GPU_LAYERS} +ENV CODER_MAX_TOKENS=${CODER_MAX_TOKENS} + +# -------- llama-swap -------- +RUN curl -L 
https://github.com/mostlygeek/llama-swap/releases/download/v165/llama-swap_165_linux_amd64.tar.gz -o /tmp/llama-swap.tar.gz \ + && tar -xzf /tmp/llama-swap.tar.gz -C /app \ + && rm /tmp/llama-swap.tar.gz + +# -------- start/stop scripts -------- +# Nota: usiamo --threads -1 --threads-batch -1 per lasciare a llama.cpp l'autotuning + +COPY ./Scripts/ /app/Scripts/ +RUN chmod +x /app/Scripts/*.sh + +# -------- Copy preset config file -------- +COPY ./config.preset.yaml /app/config.preset.yaml + +# -------- Copy entrypoint script -------- +COPY ./entrypoint.sh /app/entrypoint.sh +RUN chmod +x /app/entrypoint.sh + +ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/Services/llamacpp/llamacpp.container b/Services/llamacpp/llamacpp.container new file mode 100644 index 0000000..60f9964 --- /dev/null +++ b/Services/llamacpp/llamacpp.container @@ -0,0 +1,54 @@ +[Unit] +Name=llamacpp + +[Container] +ContainerName=llamacpp +Image=localhost/llamacpp:rocm-amd64 +Network=internal.network + +#PublishPort=8080:8080 + +# ROCm +AddDevice=/dev/kfd +AddDevice=/dev/dri +PodmanArgs=--userns=keep-id --group-add=keep-groups --ipc=host +SecurityLabelType=container_runtime_t + +# ROCm tuning +#Environment=HSA_OVERRIDE_GFX_VERSION=11.5.1 +#Environment=ROCR_VISIBLE_DEVICES=0 +#Environment=GPU_TARGETS=gfx1151 + +# API Key +#Environment=LLAMA_API_KEY="" + +# Models +Environment=GENERAL_FAST_MODEL="-m models/gemma-3-1b-it-Q5_K_M.gguf" +Environment=GENERAL_MODEL="-m models/gpt-oss-20b-Q4_K_M.gguf" + +Environment=CHAT_MODEL="-m models/Qwen3-VL-30B-A3B-Q4_K_S.gguf" + +Environment=CODER_MODEL="-m models/Qwen3-Coder-30B-A3B-Instruct-Q6_K.gguf" + +Environment=EMBEDDING_FAST_MODEL="-m models/embeddinggemma-300M-Q8_0.gguf" +Environment=EMBEDDING_MODEL="-m models/bge-code-v1-q6_k.gguf" + +Environment=GENERAL_CONTEXT_SIZE=262144 +Environment=GENERAL_GPU_LAYERS=99 +Environment=GENERAL_MAX_TOKENS=512 + +Environment=CODER_CONTEXT_SIZE=131072 +Environment=CODER_GPU_LAYERS=99 +Environment=CODER_MAX_TOKENS=512 + +# Mount points +Volume=/srv/containers/aitools/models/llamacpp:/home/ubuntu/.cache/llama.cpp +Volume=/srv/containers/aitools/models/llamacpp:/app/models +Volume=/srv/containers/aitools/llamacpp_config.yaml:/app/config.yaml + +[Service] +Restart=on-failure +TimeoutStartSec=15m + +[Install] +WantedBy=multi-user.target default.target diff --git a/Services/nginx/nginx.container b/Services/nginx/nginx.container new file mode 100644 index 0000000..f9e0544 --- /dev/null +++ b/Services/nginx/nginx.container @@ -0,0 +1,23 @@ +[Unit] +Name=nginx + +[Container] +ContainerName=nginx +Image=nginx:latest +#AutoUpdate=registry +Network=internal + +Volume=/srv/containers/nginx/conf:/etc/nginx/conf.d:ro +Volume=/srv/containers/nginx/html:/usr/share/nginx/html:ro + +Volume=/srv/containers/nginx/ssl:/etc/nginx/ssl:ro + +PublishPort=80:80 +PublishPort=443:443 + +[Service] +TimeoutStartSec=5m +Restart=always + +[Install] +WantedBy=multi-user.target default.target \ No newline at end of file diff --git a/Services/searxng/searxng.container b/Services/searxng/searxng.container new file mode 100644 index 0000000..98420ba --- /dev/null +++ b/Services/searxng/searxng.container @@ -0,0 +1,20 @@ +[Unit] +Name=searxng + +[Container] +ContainerName=searxng +Image=docker.io/searxng/searxng:latest +#AutoUpdate=registry +Network=internal.network +#PublishPort=8888:8080 + +# Production +Volume=/srv/containers/aitools/searxng/config:/etc/searxng +Volume=/srv/containers/aitools/searxng/data:/var/cache/searxng + +[Service] +TimeoutStartSec=5m +Restart=on-failure + 
+[Install] +WantedBy=multi-user.target default.target \ No newline at end of file diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..1a9f118 --- /dev/null +++ b/install.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +set -e + +# Set this variable to the raw URL of your remote repository +REPO_URL="https://raw.githubusercontent.com/yourusername/yourrepo/main" + +echo "Starting the BDI Podman Serverconf installation..." + +# 1. Update the Ubuntu system +echo "Updating the system..." +sudo apt update && sudo apt upgrade -y + +# 2. Add the user to the render and video groups +echo "Adding user to the render and video groups..." +sudo usermod -a -G render,video $LOGNAME + +# 3. Install podman +echo "Installing podman..." +sudo apt install -y podman + +# 4. Create directories for systemd containers +echo "Creating systemd container directories..." +mkdir -p ~/.config/containers/systemd + +# 5. Download and install internal.network +echo "Downloading internal.network..." +curl -fsSL $REPO_URL/internal.network -o ~/.config/containers/systemd/internal.network + +# 6. Add registries to /etc/containers/registries.conf +echo "Adding registries to registries.conf..." +echo -e "[registries.search]\nregistries = [\"docker.io\", \"quay.io\", \"ghcr.io\"]" | sudo tee -a /etc/containers/registries.conf > /dev/null + +# 7. Create /srv/containers and set permissions +echo "Creating /srv/containers and setting permissions..." +sudo mkdir -p /srv/containers +sudo chown -R $LOGNAME /srv/containers + +# 8. Create aitools subdirectories +echo "Creating aitools directories..." +mkdir -p /srv/containers/aitools/{models,.cache} + +# 9. Create an empty llamacpp_config.yaml +echo "Creating llamacpp_config.yaml..." +touch /srv/containers/aitools/llamacpp_config.yaml + +# 10. Update GRUB +echo "Updating GRUB..." +sudo sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT=.*/GRUB_CMDLINE_LINUX_DEFAULT="amdgpu.gttsize=24576 amdttm.pages_limit=27648000"/' /etc/default/grub +sudo update-grub + +# 11. Download and install banner.sh +echo "Downloading and installing the MOTD banner..." +sudo curl -fsSL $REPO_URL/banner.sh -o /etc/update-motd.d/99-duckai-banner +sudo chmod +x /etc/update-motd.d/99-duckai-banner + +# 12. Disable the other MOTD scripts +echo "Disabling other MOTD scripts..." +sudo bash -c 'for f in /etc/update-motd.d/*; do [[ "$f" != "/etc/update-motd.d/99-duckai-banner" ]] && mv "$f" "${f}.disabled"; done' || true + +# 13. Download and install badai +echo "Downloading and installing badai..." +sudo curl -fsSL $REPO_URL/BadAI/badai -o /usr/local/bin/badai +sudo chmod +x /usr/local/bin/badai + +# 14. Download container files +echo "Downloading container files..." +curl -fsSL $REPO_URL/Services/llamacpp/llamacpp.container -o ~/.config/containers/systemd/llamacpp.container +curl -fsSL $REPO_URL/Services/nginx/nginx.container -o ~/.config/containers/systemd/nginx.container + +# 15. Reboot the system +echo "Installation complete. Rebooting the system..." +sudo reboot \ No newline at end of file diff --git a/internal.network b/internal.network new file mode 100644 index 0000000..5ac3f97 --- /dev/null +++ b/internal.network @@ -0,0 +1,12 @@ +[Unit] +Description=Internal network for containers +After=network-online.target + +[Network] +NetworkName=internal +Subnet=10.10.0.0/24 +Gateway=10.10.0.1 +DNS=9.9.9.9 + +[Install] +WantedBy=default.target \ No newline at end of file
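
A few operational notes on the files above. The header comments in llama-vulkan.Containerfile already spell out the build and export commands; the sketch below strings them together with the load step on the target host. Note that llamacpp.container currently points at localhost/llamacpp:rocm-amd64, so until Image= is changed the Vulkan build has to be retagged; the retag shown here is an assumption, editing Image= works just as well, and the tarball path is illustrative.

# Build the Vulkan image on a build host (commands taken from the Containerfile header)
podman build -t llamacpp:vulkan-amd64 -f llama-vulkan.Containerfile .
podman save -o llamacpp-vulkan-amd64.tar localhost/llamacpp:vulkan-amd64

# On the server: load the image and make the tag match Image= in llamacpp.container
podman load -i llamacpp-vulkan-amd64.tar
podman tag localhost/llamacpp:vulkan-amd64 localhost/llamacpp:rocm-amd64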
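
install.sh drops internal.network, llamacpp.container and nginx.container into ~/.config/containers/systemd and then reboots. If you prefer to bring the stack up without rebooting, a minimal sequence looks like the following; the service names follow quadlet's usual file-to-unit mapping, and the linger step is optional, only needed if the user services must keep running without an active login session.

# Quadlet generates user services from the unit files:
#   internal.network   -> internal-network.service
#   llamacpp.container -> llamacpp.service
#   nginx.container    -> nginx.service
systemctl --user daemon-reload
systemctl --user start internal-network.service
systemctl --user start llamacpp.service nginx.service
systemctl --user status llamacpp.service --no-pager

# Optional: keep user services alive across logout/boot without a session
loginctl enable-linger "$USER"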
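
config.preset.yaml maps each DuckAi-* name to a start/stop script pair, and llama-swap launches the matching llama-server backend on demand when a request names that model. A rough smoke test, assuming the containers are running, the certificates are in place, llama-swap answers the usual OpenAI-style endpoints, and models.ai.duckpage.net (from lamacpp-nginx.conf) resolves to this host:

# List the configured models through the proxy chain (nginx -> llama-swap)
curl -s https://models.ai.duckpage.net/v1/models

# Trigger an on-demand load of one preset model; the name must match a key
# under "models:" in config.preset.yaml
curl -s https://models.ai.duckpage.net/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"DuckAi-GeneralFast","messages":[{"role":"user","content":"ping"}]}'

# From inside the container, llama-swap answers directly on :8080
podman exec llamacpp curl -s http://localhost:8080/v1/models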
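
entrypoint.sh treats /app/config.yaml as authoritative only when it exists, is non-empty and contains a models: key; otherwise it copies config.preset.yaml over it. Because llamacpp.container bind-mounts /srv/containers/aitools/llamacpp_config.yaml onto /app/config.yaml, resetting to the preset from the host is just the following (a sketch; paths taken from the Volume= lines):

# Wipe the host-side config so the entrypoint repopulates it from the preset
truncate -s 0 /srv/containers/aitools/llamacpp_config.yaml
systemctl --user restart llamacpp.service

# The entrypoint should log "Copying from preset..." on the next start
journalctl --user -u llamacpp.service -n 30 --no-pager | grep -i preset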