Skip to content

Commit 066fa4c

Browse files
author
Chris Warren-Smith
committed
LLM: plugin module - update build for GPU
1 parent 67db281 commit 066fa4c

File tree

8 files changed

+353
-116
lines changed

8 files changed

+353
-116
lines changed

llama/CMakeLists.txt

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,11 @@ set(CMAKE_C_STANDARD 11)
1111
# -----------------------------
1212
set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp)
1313

14+
set(LLAMA_BACKEND "AUTO" CACHE STRING "llama.cpp backend: AUTO, CPU, GPU, CUDA")
15+
set_property(CACHE LLAMA_BACKEND PROPERTY STRINGS AUTO CPU GPU CUDA)
16+
1417
# -----------------------------
15-
# FORCE CPU-only static builds
18+
# FORCE static builds
1619
# -----------------------------
1720
# Disable all shared libraries globally
1821
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
@@ -34,16 +37,72 @@ set(GGML_BUILD_SHARED OFF CACHE BOOL "" FORCE)
3437
set(GGML_BUILD_TESTS OFF CACHE BOOL "" FORCE)
3538
set(GGML_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
3639

37-
# CPU-only flags
40+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
41+
42+
# -------------------------------
43+
# Define backend options
44+
# -------------------------------
45+
set(LLAMA_BACKEND "AUTO" CACHE STRING "Select llama.cpp backend: AUTO, CPU, GPU, CUDA")
46+
set_property(CACHE LLAMA_BACKEND PROPERTY STRINGS AUTO CPU GPU CUDA)
47+
48+
#
49+
# sudo apt install nvidia-open cuda-toolkit
50+
#
51+
52+
# -------------------------------
53+
# Disable all accelerators by default
54+
# -------------------------------
3855
set(GGML_OPENMP OFF CACHE BOOL "" FORCE)
3956
set(GGML_CUDA OFF CACHE BOOL "" FORCE)
4057
set(GGML_METAL OFF CACHE BOOL "" FORCE)
4158
set(GGML_OPENCL OFF CACHE BOOL "" FORCE)
4259
set(GGML_KOMPUTE OFF CACHE BOOL "" FORCE)
4360
set(GGML_SYCL OFF CACHE BOOL "" FORCE)
4461
set(GGML_ACCELERATE OFF CACHE BOOL "" FORCE)
45-
set(GGML_NATIVE ON CACHE BOOL "" FORCE)
46-
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
62+
set(GGML_NATIVE OFF CACHE BOOL "" FORCE) # default off
63+
64+
# -------------------------------
65+
# Configure backends based on LLAMA_BACKEND
66+
# -------------------------------
67+
include(CheckLanguage)
68+
69+
if(LLAMA_BACKEND STREQUAL "CPU")
70+
message(STATUS "llama.cpp backend: CPU-only")
71+
set(GGML_NATIVE ON CACHE BOOL "" FORCE) # enable CPU SIMD optimizations
72+
73+
elseif(LLAMA_BACKEND STREQUAL "GPU")
74+
message(STATUS "llama.cpp backend: GPU (non-CUDA)")
75+
set(GGML_OPENMP ON CACHE BOOL "" FORCE) # parallel CPU fallback
76+
# GPU non-CUDA options can be added here in the future
77+
78+
elseif(LLAMA_BACKEND STREQUAL "CUDA")
79+
message(STATUS "llama.cpp backend: CUDA")
80+
81+
check_language(CUDA)
82+
if(CMAKE_CUDA_COMPILER)
83+
enable_language(CUDA)
84+
set(GGML_CUDA ON CACHE BOOL "" FORCE)
85+
else()
86+
message(FATAL_ERROR "CUDA backend requested but nvcc not found")
87+
endif()
88+
89+
elseif(LLAMA_BACKEND STREQUAL "AUTO")
90+
message(STATUS "llama.cpp backend: AUTO")
91+
92+
check_language(CUDA)
93+
if(CMAKE_CUDA_COMPILER)
94+
enable_language(CUDA)
95+
set(GGML_CUDA ON CACHE BOOL "" FORCE)
96+
message(STATUS "CUDA detected – enabling GGML_CUDA")
97+
else()
98+
set(GGML_OPENMP ON CACHE BOOL "" FORCE)
99+
set(GGML_NATIVE ON CACHE BOOL "" FORCE)
100+
message(STATUS "CUDA not found – using CPU/OpenMP")
101+
endif()
102+
103+
else()
104+
message(FATAL_ERROR "Invalid LLAMA_BACKEND value: ${LLAMA_BACKEND}")
105+
endif()
47106

48107
# -----------------------------
49108
# Add llama.cpp subdirectories
@@ -129,7 +188,7 @@ if (ANDROID)
129188
../include/hashmap.cpp
130189
../include/apiexec.cpp
131190
)
132-
191+
133192
# Optional: set the SONAME / versioning if you need it
134193
set_target_properties(llm_android PROPERTIES
135194
OUTPUT_NAME "libllm"

llama/README.md

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,87 @@
1+
1️⃣ Ensure nvidia-open driver is installed and working
2+
3+
Check:
4+
5+
```
6+
nvidia-smi
7+
```
8+
9+
If it works, your driver is fine — no need to install the proprietary driver.
10+
11+
2️⃣ Add NVIDIA CUDA repository
12+
13+
```
14+
wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb
15+
sudo dpkg -i cuda-keyring_1.1-1_all.deb
16+
sudo apt update
17+
```
18+
19+
This repo contains the latest CUDA toolkit for Debian 12.
20+
21+
3️⃣ Install CUDA Toolkit only (no driver replacement)
22+
```
sudo apt install -y cuda-toolkit
```
23+
24+
25+
This installs:
26+
27+
- nvcc compiler
28+
- CUDA headers
29+
- Runtime libraries (libcudart.so, etc.)
30+
31+
4️⃣ Add CUDA to your environment
32+
33+
```
34+
export PATH=/usr/local/cuda/bin:$PATH
35+
export CUDAToolkit_ROOT=/usr/local/cuda
36+
```
37+
38+
Optional: add to ~/.bashrc to make it permanent:
39+
40+
```
41+
echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
42+
echo 'export CUDAToolkit_ROOT=/usr/local/cuda' >> ~/.bashrc
43+
source ~/.bashrc
44+
```
45+
46+
Verify:
47+
48+
```
nvcc --version
```
49+
50+
Should show something like:
51+
52+
```
53+
nvcc: NVIDIA (R) Cuda compiler driver
54+
Cuda compilation tools, release 12.4, V12.4.105
55+
```
56+
57+
5️⃣ Clean llama.cpp build directory
58+
59+
```
60+
rm -rf build
61+
mkdir build
62+
cd build
63+
```
64+
65+
6️⃣ Configure CMake for CUDA backend
66+
67+
```
68+
cmake -DLLAMA_BACKEND=CUDA ..
69+
```
70+
71+
You should now see:
72+
73+
-- CUDA detected – enabling GGML_CUDA
74+
75+
7️⃣ Build
76+
77+
```
78+
make -j$(nproc)
79+
```
80+
81+
The binary will use CUDA acceleration.
82+
83+
Note: fully static builds are not possible for CUDA; some .so libraries will remain dynamically linked (normal).
84+
185
# Generator settings
286

387
## factual answers, tools, summaries

llama/llama-sb.cpp

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,13 @@ void Llama::reset() {
6565
_max_tokens = 150;
6666
}
6767

68-
bool Llama::construct(string model_path, int n_ctx, int n_batch) {
68+
bool Llama::construct(string model_path, int n_ctx, int n_batch, int n_gpu_layers) {
6969
ggml_backend_load_all();
7070

7171
llama_model_params mparams = llama_model_default_params();
72-
mparams.n_gpu_layers = 0;
72+
if (n_gpu_layers >= 0) {
73+
mparams.n_gpu_layers = n_gpu_layers;
74+
}
7375

7476
_model = llama_model_load_from_file(model_path.c_str(), mparams);
7577
if (!_model) {
@@ -196,6 +198,8 @@ bool Llama::make_space_for_tokens(int n_tokens, int keep_min) {
196198
}
197199

198200
bool Llama::generate(LlamaIter &iter, const string &prompt) {
201+
configure_sampler();
202+
199203
vector<llama_token> prompt_tokens = tokenize(prompt);
200204
if (prompt_tokens.size() == 0) {
201205
return false;
@@ -233,8 +237,6 @@ bool Llama::generate(LlamaIter &iter, const string &prompt) {
233237
}
234238
}
235239

236-
configure_sampler();
237-
238240
iter._t_start = std::chrono::high_resolution_clock::now();
239241
iter._llama = this;
240242
iter._has_next = true;
@@ -299,44 +301,50 @@ string Llama::next(LlamaIter &iter) {
299301
return "";
300302
}
301303

302-
// decode the token
303-
llama_batch batch = llama_batch_get_one(&tok, 1);
304-
if (llama_decode(_ctx, batch)) {
305-
_last_error = "Failed to evaluate token during generation";
306-
return "";
307-
}
308-
309-
string out;
310-
311-
if (!llama_vocab_is_control(_vocab, tok)) {
312-
char buf[512];
313-
int n = llama_token_to_piece(_vocab, tok, buf, sizeof(buf), 0, false);
314-
if (n > 0) {
315-
if (iter._last_word == buf) {
316-
if (++iter._repetition_count == MAX_REPEAT) {
317-
iter._has_next = false;
318-
}
319-
} else {
320-
iter._repetition_count = 0;
321-
iter._last_word = buf;
322-
}
323-
out.append(buf, n);
304+
string result;
324305

325-
if (++iter._tokens_generated > _max_tokens && ends_with_sentence_boundary(out)) {
306+
//if (!llama_vocab_is_control(_vocab, tok)) {
307+
char buf[512];
308+
int n = llama_token_to_piece(_vocab, tok, buf, sizeof(buf), 0, false);
309+
if (n > 0) {
310+
// detect repetition
311+
if (iter._last_word == buf) {
312+
if (++iter._repetition_count == MAX_REPEAT) {
326313
iter._has_next = false;
327314
}
315+
} else {
316+
iter._repetition_count = 0;
317+
iter._last_word = buf;
318+
}
319+
320+
result.append(buf, n);
328321

322+
// detect end of max-tokens
323+
if (++iter._tokens_generated > _max_tokens && ends_with_sentence_boundary(result)) {
324+
iter._has_next = false;
325+
}
326+
327+
// detect stop words
328+
if (iter._has_next) {
329329
for (const auto &stop : _stop_sequences) {
330-
size_t pos = out.find(stop);
330+
size_t pos = result.find(stop);
331331
if (pos != std::string::npos) {
332332
// found stop sequence - truncate and signal end
333-
out = out.substr(0, pos);
333+
result = result.substr(0, pos);
334334
iter._has_next = false;
335335
break;
336336
}
337337
}
338338
}
339339
}
340-
return out;
340+
341+
// prepare the next batch with the sampled token
342+
llama_batch batch = llama_batch_get_one(&tok, 1);
343+
if (llama_decode(_ctx, batch)) {
344+
_last_error = "Failed to evaluate token during generation";
345+
return "";
346+
}
347+
348+
return result;
341349
}
342350

llama/llama-sb.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ struct Llama {
3333
~Llama();
3434

3535
// init
36-
bool construct(string model_path, int n_ctx, int n_batch);
36+
bool construct(string model_path, int n_ctx, int n_batch, int n_gpu_layers);
3737

3838
// generation
3939
bool generate(LlamaIter &iter, const string &prompt);

llama/main.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -320,9 +320,10 @@ static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
320320
auto model = expand_path(get_param_str(argc, params, 0, ""));
321321
auto n_ctx = get_param_int(argc, params, 1, 2048);
322322
auto n_batch = get_param_int(argc, params, 2, 1024);
323+
auto n_gpu_layers = get_param_int(argc, params, 3, -1);
323324
int id = ++g_nextId;
324325
Llama &llama = g_llama[id];
325-
if (llama.construct(model, n_ctx, n_batch)) {
326+
if (llama.construct(model, n_ctx, n_batch, n_gpu_layers)) {
326327
map_init_id(retval, id, CLASS_ID_LLAMA);
327328
v_create_callback(retval, "add_stop", cmd_llama_add_stop);
328329
v_create_callback(retval, "generate", cmd_llama_generate);
@@ -344,7 +345,7 @@ static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
344345
}
345346

346347
FUNC_SIG lib_func[] = {
347-
{1, 3, "LLAMA", cmd_create_llama},
348+
{1, 4, "LLAMA", cmd_create_llama},
348349
};
349350

350351
SBLIB_API int sblib_func_count() {

0 commit comments

Comments
 (0)