-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
60 lines (59 loc) · 2.08 KB
/
docker-compose.yml
File metadata and controls
60 lines (59 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# =============================================================================
# llama.cpp server -- RTX 4090 + RTX 5070 Ti
#
# Quick start (place the .gguf file in ./models, mounted read-only at /models):
#   MODEL=my-model.gguf docker compose up
#
# Override any variable below on the command line, in a .env file, or by
# editing the environment section directly.
# =============================================================================
services:
  # Single service running the llama.cpp server, built from the build
  # context in this directory.
  llama-server:
    build: .
    container_name: llama-server
    ports:
      # host:container -- kept quoted so YAML always reads it as a string.
      - "8080:8080"
    volumes:
      # Model files are mounted read-only; MODEL below is resolved under
      # /models inside the container.
      - ./models:/models:ro
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU available on the Docker host.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # Every entry uses Compose interpolation with a fallback default, so
      # a value can come from the shell or a .env file; the default applies
      # when the variable is unset or empty.
      # NOTE(review): these variables are presumably turned into
      # llama-server command-line flags by the image's entrypoint, which is
      # not part of this file -- confirm against the Dockerfile.
      # ---- Model ----
      - MODEL=/models/${MODEL:-model.gguf}
      # ---- Context ----
      - CTX_SIZE=${CTX_SIZE:-131072}
      # ---- GPU offload ----
      - N_GPU_LAYERS=${N_GPU_LAYERS:-auto}
      # ---- Split mode: layer (default), row, none ----
      - SPLIT_MODE=${SPLIT_MODE:-layer}
      # ---- Manual tensor split ratio (blank = use --fit auto) ----
      - TENSOR_SPLIT=${TENSOR_SPLIT:-}
      # ---- Main GPU (0 = RTX 4090, 1 = RTX 5070 Ti) ----
      - MAIN_GPU=${MAIN_GPU:-0}
      # ---- Flash attention ----
      - FLASH_ATTN=${FLASH_ATTN:-1}
      # ---- KV cache quantization ----
      - KV_CACHE_TYPE_K=${KV_CACHE_TYPE_K:-q8_0}
      - KV_CACHE_TYPE_V=${KV_CACHE_TYPE_V:-q8_0}
      # ---- Auto-fit (on by default, adjusts layers/ctx to fit VRAM) ----
      - FIT=${FIT:-on}
      # ---- Fit target: VRAM headroom in MiB per device ----
      # CUDA0 (4090): dedicated, nothing else running → minimal margin
      # CUDA1 (5070 Ti): shares with OS/display (~3 GB) → standard margin
      - FIT_TARGET=${FIT_TARGET:-128,1024}
      # ---- Fit minimum context size ----
      - FIT_CTX=${FIT_CTX:-}
      # ---- Extra flags (pass anything not covered above) ----
      - EXTRA_ARGS=${EXTRA_ARGS:-}
    healthcheck:
      # Probes the server's /health endpoint from inside the container;
      # -f makes curl exit non-zero on HTTP errors, -s keeps it quiet.
      # NOTE(review): requires curl to be installed in the image -- confirm.
      test: ["CMD", "curl", "-sf", "http://localhost:8080/health"]
      interval: 10s
      timeout: 5s
      # 60 consecutive failures at a 10s interval (about 10 minutes) before
      # the container is marked unhealthy.
      retries: 60
      # Failures during the first 120s do not count toward retries.
      start_period: 120s
    restart: unless-stopped