Skip to content

Commit 066fa4c

Browse files
author
Chris Warren-Smith
committed
LLM: plugin module - update build for GPU
1 parent 67db281 commit 066fa4c

File tree

8 files changed

+353
-116
lines changed

8 files changed

+353
-116
lines changed

llama/CMakeLists.txt

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,11 @@ set(CMAKE_C_STANDARD 11)
1111
# -----------------------------
1212
set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp)
1313

14+
set(LLAMA_BACKEND "AUTO" CACHE STRING "llama.cpp backend: AUTO, CPU, GPU, CUDA")
15+
set_property(CACHE LLAMA_BACKEND PROPERTY STRINGS AUTO CPU GPU CUDA)
16+
1417
# -----------------------------
15-
# FORCE CPU-only static builds
18+
# FORCE static builds
1619
# -----------------------------
1720
# Disable all shared libraries globally
1821
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
@@ -34,16 +37,72 @@ set(GGML_BUILD_SHARED OFF CACHE BOOL "" FORCE)
3437
set(GGML_BUILD_TESTS OFF CACHE BOOL "" FORCE)
3538
set(GGML_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
3639

37-
# CPU-only flags
40+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
41+
42+
# -------------------------------
43+
# Define backend options
44+
# -------------------------------
45+
set(LLAMA_BACKEND "AUTO" CACHE STRING "Select llama.cpp backend: AUTO, CPU, GPU, CUDA")
46+
set_property(CACHE LLAMA_BACKEND PROPERTY STRINGS AUTO CPU GPU CUDA)
47+
48+
#
49+
# sudo apt install nvidia-open cuda-toolkit
50+
#
51+
52+
# -------------------------------
53+
# Disable all accelerators by default
54+
# -------------------------------
3855
set(GGML_OPENMP OFF CACHE BOOL "" FORCE)
3956
set(GGML_CUDA OFF CACHE BOOL "" FORCE)
4057
set(GGML_METAL OFF CACHE BOOL "" FORCE)
4158
set(GGML_OPENCL OFF CACHE BOOL "" FORCE)
4259
set(GGML_KOMPUTE OFF CACHE BOOL "" FORCE)
4360
set(GGML_SYCL OFF CACHE BOOL "" FORCE)
4461
set(GGML_ACCELERATE OFF CACHE BOOL "" FORCE)
45-
set(GGML_NATIVE ON CACHE BOOL "" FORCE)
46-
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
62+
set(GGML_NATIVE OFF CACHE BOOL "" FORCE) # default off
63+
64+
# -------------------------------
65+
# Configure backends based on LLAMA_BACKEND
66+
# -------------------------------
67+
include(CheckLanguage)
68+
69+
if(LLAMA_BACKEND STREQUAL "CPU")
70+
message(STATUS "llama.cpp backend: CPU-only")
71+
set(GGML_NATIVE ON CACHE BOOL "" FORCE) # enable CPU SIMD optimizations
72+
73+
elseif(LLAMA_BACKEND STREQUAL "GPU")
74+
message(STATUS "llama.cpp backend: GPU (non-CUDA)")
75+
set(GGML_OPENMP ON CACHE BOOL "" FORCE) # parallel CPU fallback
76+
# GPU non-CUDA options can be added here in the future
77+
78+
elseif(LLAMA_BACKEND STREQUAL "CUDA")
79+
message(STATUS "llama.cpp backend: CUDA")
80+
81+
check_language(CUDA)
82+
if(CMAKE_CUDA_COMPILER)
83+
enable_language(CUDA)
84+
set(GGML_CUDA ON CACHE BOOL "" FORCE)
85+
else()
86+
message(FATAL_ERROR "CUDA backend requested but nvcc not found")
87+
endif()
88+
89+
elseif(LLAMA_BACKEND STREQUAL "AUTO")
90+
message(STATUS "llama.cpp backend: AUTO")
91+
92+
check_language(CUDA)
93+
if(CMAKE_CUDA_COMPILER)
94+
enable_language(CUDA)
95+
set(GGML_CUDA ON CACHE BOOL "" FORCE)
96+
message(STATUS "CUDA detected – enabling GGML_CUDA")
97+
else()
98+
set(GGML_OPENMP ON CACHE BOOL "" FORCE)
99+
set(GGML_NATIVE ON CACHE BOOL "" FORCE)
100+
message(STATUS "CUDA not found – using CPU/OpenMP")
101+
endif()
102+
103+
else()
104+
message(FATAL_ERROR "Invalid LLAMA_BACKEND value: ${LLAMA_BACKEND}")
105+
endif()
47106

48107
# -----------------------------
49108
# Add llama.cpp subdirectories
@@ -129,7 +188,7 @@ if (ANDROID)
129188
../include/hashmap.cpp
130189
../include/apiexec.cpp
131190
)
132-
191+
133192
# Optional: set the SONAME / versioning if you need it
134193
set_target_properties(llm_android PROPERTIES
135194
OUTPUT_NAME "libllm"

llama/README.md

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,87 @@
1+
1️⃣ Ensure nvidia-open driver is installed and working
2+
3+
Check:
4+
5+
```
6+
nvidia-smi
7+
```
8+
9+
If it works, your driver is fine — no need to install the proprietary driver.
10+
11+
2️⃣ Add NVIDIA CUDA repository
12+
13+
```
14+
wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb
15+
sudo dpkg -i cuda-keyring_1.1-1_all.deb
16+
sudo apt update
17+
```
18+
19+
This repo contains the latest CUDA toolkit for Debian 12.
20+
21+
3️⃣ Install CUDA Toolkit only (no driver replacement)
22+
```
sudo apt install -y cuda-toolkit
```
23+
24+
25+
This installs:
26+
27+
- nvcc compiler
28+
- CUDA headers
29+
- Runtime libraries (libcudart.so, etc.)
30+
31+
4️⃣ Add CUDA to your environment
32+
33+
```
34+
export PATH=/usr/local/cuda/bin:$PATH
35+
export CUDAToolkit_ROOT=/usr/local/cuda
36+
```
37+
38+
Optional: add to ~/.bashrc to make it permanent:
39+
40+
```
41+
echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
42+
echo 'export CUDAToolkit_ROOT=/usr/local/cuda' >> ~/.bashrc
43+
source ~/.bashrc
44+
```
45+
46+
Verify:
47+
48+
```
nvcc --version
```
49+
50+
Should show something like:
51+
52+
```
53+
nvcc: NVIDIA (R) Cuda compiler driver
54+
Cuda compilation tools, release 12.4, V12.4.105
55+
```
56+
57+
5️⃣ Clean llama.cpp build directory
58+
59+
```
60+
rm -rf build
61+
mkdir build
62+
cd build
63+
```
64+
65+
6️⃣ Configure CMake for CUDA backend
66+
67+
```
68+
cmake -DLLAMA_BACKEND=CUDA ..
69+
```
70+
71+
You should now see:
72+
73+
-- CUDA detected – enabling GGML_CUDA
74+
75+
7️⃣ Build
76+
77+
```
78+
make -j$(nproc)
79+
```
80+
81+
The binary will use CUDA acceleration.
82+
83+
Note: fully static builds are not possible for CUDA; some .so libraries will remain dynamically linked (normal).
84+
185
# Generator settings
286

387
## factual answers, tools, summaries

llama/llama-sb.cpp

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,13 @@ void Llama::reset() {
6565
_max_tokens = 150;
6666
}
6767

68-
bool Llama::construct(string model_path, int n_ctx, int n_batch) {
68+
bool Llama::construct(string model_path, int n_ctx, int n_batch, int n_gpu_layers) {
6969
ggml_backend_load_all();
7070

7171
llama_model_params mparams = llama_model_default_params();
72-
mparams.n_gpu_layers = 0;
72+
if (n_gpu_layers >= 0) {
73+
mparams.n_gpu_layers = n_gpu_layers;
74+
}
7375

7476
_model = llama_model_load_from_file(model_path.c_str(), mparams);
7577
if (!_model) {
@@ -196,6 +198,8 @@ bool Llama::make_space_for_tokens(int n_tokens, int keep_min) {
196198
}
197199

198200
bool Llama::generate(LlamaIter &iter, const string &prompt) {
201+
configure_sampler();
202+
199203
vector<llama_token> prompt_tokens = tokenize(prompt);
200204
if (prompt_tokens.size() == 0) {
201205
return false;
@@ -233,8 +237,6 @@ bool Llama::generate(LlamaIter &iter, const string &prompt) {
233237
}
234238
}
235239

236-
configure_sampler();
237-
238240
iter._t_start = std::chrono::high_resolution_clock::now();
239241
iter._llama = this;
240242
iter._has_next = true;
@@ -299,44 +301,50 @@ string Llama::next(LlamaIter &iter) {
299301
return "";
300302
}
301303

302-
// decode the token
303-
llama_batch batch = llama_batch_get_one(&tok, 1);
304-
if (llama_decode(_ctx, batch)) {
305-
_last_error = "Failed to evaluate token during generation";
306-
return "";
307-
}
308-
309-
string out;
310-
311-
if (!llama_vocab_is_control(_vocab, tok)) {
312-
char buf[512];
313-
int n = llama_token_to_piece(_vocab, tok, buf, sizeof(buf), 0, false);
314-
if (n > 0) {
315-
if (iter._last_word == buf) {
316-
if (++iter._repetition_count == MAX_REPEAT) {
317-
iter._has_next = false;
318-
}
319-
} else {
320-
iter._repetition_count = 0;
321-
iter._last_word = buf;
322-
}
323-
out.append(buf, n);
304+
string result;
324305

325-
if (++iter._tokens_generated > _max_tokens && ends_with_sentence_boundary(out)) {
306+
//if (!llama_vocab_is_control(_vocab, tok)) {
307+
char buf[512];
308+
int n = llama_token_to_piece(_vocab, tok, buf, sizeof(buf), 0, false);
309+
if (n > 0) {
310+
// detect repetition
311+
if (iter._last_word == buf) {
312+
if (++iter._repetition_count == MAX_REPEAT) {
326313
iter._has_next = false;
327314
}
315+
} else {
316+
iter._repetition_count = 0;
317+
iter._last_word = buf;
318+
}
319+
320+
result.append(buf, n);
328321

322+
// detect end of max-tokens
323+
if (++iter._tokens_generated > _max_tokens && ends_with_sentence_boundary(result)) {
324+
iter._has_next = false;
325+
}
326+
327+
// detect stop words
328+
if (iter._has_next) {
329329
for (const auto &stop : _stop_sequences) {
330-
size_t pos = out.find(stop);
330+
size_t pos = result.find(stop);
331331
if (pos != std::string::npos) {
332332
// found stop sequence - truncate and signal end
333-
out = out.substr(0, pos);
333+
result = result.substr(0, pos);
334334
iter._has_next = false;
335335
break;
336336
}
337337
}
338338
}
339339
}
340-
return out;
340+
341+
// prepare the next batch with the sampled token
342+
llama_batch batch = llama_batch_get_one(&tok, 1);
343+
if (llama_decode(_ctx, batch)) {
344+
_last_error = "Failed to evaluate token during generation";
345+
return "";
346+
}
347+
348+
return result;
341349
}
342350

llama/llama-sb.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ struct Llama {
3333
~Llama();
3434

3535
// init
36-
bool construct(string model_path, int n_ctx, int n_batch);
36+
bool construct(string model_path, int n_ctx, int n_batch, int n_gpu_layers);
3737

3838
// generation
3939
bool generate(LlamaIter &iter, const string &prompt);

llama/main.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -320,9 +320,10 @@ static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
320320
auto model = expand_path(get_param_str(argc, params, 0, ""));
321321
auto n_ctx = get_param_int(argc, params, 1, 2048);
322322
auto n_batch = get_param_int(argc, params, 2, 1024);
323+
auto n_gpu_layers = get_param_int(argc, params, 3, -1);
323324
int id = ++g_nextId;
324325
Llama &llama = g_llama[id];
325-
if (llama.construct(model, n_ctx, n_batch)) {
326+
if (llama.construct(model, n_ctx, n_batch, n_gpu_layers)) {
326327
map_init_id(retval, id, CLASS_ID_LLAMA);
327328
v_create_callback(retval, "add_stop", cmd_llama_add_stop);
328329
v_create_callback(retval, "generate", cmd_llama_generate);
@@ -344,7 +345,7 @@ static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
344345
}
345346

346347
FUNC_SIG lib_func[] = {
347-
{1, 3, "LLAMA", cmd_create_llama},
348+
{1, 4, "LLAMA", cmd_create_llama},
348349
};
349350

350351
SBLIB_API int sblib_func_count() {

0 commit comments

Comments
 (0)