Commit 98feb73

haasonsaas and claude committed
Add web UI with React frontend and Axum backend server
Full-stack web dashboard for DiffScope with:
- React 19 + TypeScript + Vite + Tailwind CSS v4 frontend
- Axum 0.8 backend with rust-embed for SPA asset serving
- Dashboard with quick-action review buttons, score trends, severity stats
- Diff viewer with inline comments, file sidebar, syntax highlighting
- Review history with search, filtering, and pagination
- Analytics page with score trends, severity breakdown, category charts
- Settings page with model presets (OpenRouter), adapter config, review params
- Doctor page for system diagnostics

Frontend quality improvements:
- Shared constants (SEV_COLORS, STATUS_STYLES, CHART_THEME, REFETCH intervals)
- Shared score utilities (scoreColorClass, scoreRingClass)
- Merged CommentCard + InlineComment into a single variant component
- ErrorBoundary for graceful crash recovery
- Lightweight syntax highlighting for diff lines (keywords, strings, comments)
- Model presets extracted to shared config
- Exported HunkView/LineRow components from DiffViewer

Backend improvements:
- completed_at timestamp now set on all terminal review states
- JSON file persistence (~/.local/share/diffscope/reviews.json)
- 5-minute review timeout
- Feedback action stored on comments
- Timestamps changed to i64
- Pagination on list_reviews endpoint
- OpenRouter API key env var support (OPENROUTER_API_KEY)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 5166a6c commit 98feb73
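One backend change above, pagination on the `list_reviews` endpoint, can be sketched as a plain slicing function. This is a hypothetical simplification (the function name, `page`/`per_page` parameters, and signature are assumptions; the real endpoint is an Axum route whose query extraction is not shown in this diff):

```rust
/// Hypothetical zero-based page slicing; the real list_reviews handler
/// would apply this to the persisted review list before serializing.
fn paginate<T: Clone>(items: &[T], page: usize, per_page: usize) -> Vec<T> {
    items
        .iter()
        // saturating_mul avoids overflow panics on absurd page numbers
        .skip(page.saturating_mul(per_page))
        .take(per_page)
        .cloned()
        .collect()
}

fn main() {
    let reviews: Vec<u32> = (0..10).collect();
    // Full first page, partial last page, and an out-of-range page.
    assert_eq!(paginate(&reviews, 0, 4), vec![0, 1, 2, 3]);
    assert_eq!(paginate(&reviews, 2, 4), vec![8, 9]);
    assert!(paginate(&reviews, 5, 4).is_empty());
    println!("pagination ok");
}
```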

57 files changed (+10055, -91 lines)

Cargo.lock

Lines changed: 448 additions & 5 deletions (generated lockfile; diff not rendered)

Cargo.toml

Lines changed: 6 additions & 1 deletion
```diff
@@ -11,7 +11,7 @@ categories = ["development-tools", "command-line-utilities"]
 
 [dependencies]
 clap = { version = "4.4", features = ["derive"] }
-tokio = { version = "1.35", features = ["macros", "rt-multi-thread", "fs", "time"] }
+tokio = { version = "1.35", features = ["macros", "rt-multi-thread", "fs", "time", "sync"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 serde_yaml = "0.9"
@@ -31,6 +31,11 @@ glob = "0.3"
 ignore = "0.4"
 shell-words = "1.1"
 url = "2.5"
+axum = "0.8"
+tower-http = { version = "0.6", features = ["cors", "fs"] }
+rust-embed = "8"
+uuid = { version = "1", features = ["v4"] }
+mime_guess = "2"
 
 [dev-dependencies]
 tempfile = "3.8"
```

README.md

Lines changed: 79 additions & 3 deletions
````diff
@@ -221,20 +221,96 @@ git diff | diffscope review \
 
 #### Docker Compose (Ollama + DiffScope)
 ```bash
-# Start Ollama and DiffScope together
+# Start Ollama (with GPU) and DiffScope together — model is auto-pulled
 docker compose up diffscope-local
 
-# Pull a model first
-docker compose exec ollama ollama pull codellama
+# CPU-only mode (no NVIDIA GPU required)
+docker compose --profile cpu up ollama-cpu
+
+# Pull a specific model manually
+docker compose exec ollama ollama pull deepseek-coder:6.7b-instruct
+```
+
+#### Docker with GPU
+
+Requires [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):
+
+```bash
+# Install NVIDIA Container Toolkit (Ubuntu/Debian)
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+  sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
+sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker
+
+# Run with GPU
+docker compose up diffscope-local
+```
+
+If you don't have an NVIDIA GPU, use the CPU-only profile or run Ollama directly on the host.
+
+#### Recommended Models
+
+| RAM Available | Recommended Model | Command |
+|---|---|---|
+| 8 GB | Codellama 7B Q4 | `ollama pull codellama:7b-instruct-q4_0` |
+| 16 GB | Deepseek Coder 6.7B | `ollama pull deepseek-coder:6.7b-instruct` |
+| 16 GB | Codellama 13B Q4 | `ollama pull codellama:13b-instruct-q4_0` |
+| 32+ GB | Deepseek Coder 33B Q4 | `ollama pull deepseek-coder:33b-instruct-q4_0` |
+
+For best results, use instruction-tuned (`-instruct`) variants of code-specialized models.
+
+#### Performance Tuning
+
+For local models, adjust these config values based on your model's context window:
+
+```yaml
+# .diffscope.yml for 7B model on 16GB RAM
+context_window: 8192      # Model's actual context limit
+max_tokens: 2048          # Max response length
+max_diff_chars: 12000     # Truncate large diffs
+max_context_chars: 8000   # Limit surrounding code context
+context_max_chunks: 8     # Max context files to include
+temperature: 0.1          # Low temp for consistent reviews
 ```
 
+**Tips:**
+- `diffscope doctor` shows detected context window and tests inference speed
+- Quantized models (Q4, Q5) use ~50-60% less RAM with minimal quality loss
+- GPU inference is 5-10x faster than CPU-only
+- First request after model load is slower (loading into VRAM)
+
 #### Check Your Setup
 ```bash
 # Verify endpoint reachability, models, and recommendations
 diffscope doctor
 diffscope doctor --base-url http://localhost:11434
 ```
 
+#### Troubleshooting
+
+**Model is slow (>30 seconds per review)**
+- Check tokens/sec with `diffscope doctor`
+- Try a quantized model: `ollama pull codellama:7b-instruct-q4_0`
+- Reduce context: set `max_diff_chars: 8000` and `context_window: 4096`
+
+**Out of memory errors**
+- Use a smaller model (7B instead of 13B)
+- Use heavier quantization (Q4 instead of Q8)
+- Set `context_window` lower in config (e.g., 4096)
+- Monitor with `nvidia-smi` (GPU) or `htop` (RAM)
+
+**Empty or garbage reviews**
+- Run `diffscope doctor` to test model responsiveness
+- Try a code-specialized model (deepseek-coder, codellama)
+- Avoid models smaller than 3B for code review
+
+**"Endpoint unreachable" error**
+- Verify server is running: `curl http://localhost:11434/api/tags`
+- Check the port matches your `base_url` config
+- For Docker: ensure services are on the same network
+
 #### Environment Variables
 | Variable | Description |
 |----------|-------------|
````

docker-compose.yml

Lines changed: 35 additions & 2 deletions
```diff
@@ -16,8 +16,8 @@ services:
     build: .
     image: diffscope:latest
     depends_on:
-      ollama:
-        condition: service_healthy
+      ollama-pull:
+        condition: service_completed_successfully
     environment:
       - DIFFSCOPE_BASE_URL=http://ollama:11434
       - DIFFSCOPE_MODEL=${DIFFSCOPE_MODEL:-ollama:codellama}
@@ -32,12 +32,45 @@ services:
       - "11434:11434"
     volumes:
       - ollama_data:/root/.ollama
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    healthcheck:
+      test: ["CMD-SHELL", "curl -sf http://localhost:11434/api/tags || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s
+
+  # CPU-only variant (use instead of 'ollama' when no GPU is available)
+  ollama-cpu:
+    image: ollama/ollama:latest
+    ports:
+      - "11434:11434"
+    volumes:
+      - ollama_data:/root/.ollama
+    profiles:
+      - cpu
     healthcheck:
       test: ["CMD-SHELL", "curl -sf http://localhost:11434/api/tags || exit 1"]
       interval: 10s
       timeout: 5s
       retries: 5
       start_period: 30s
 
+  ollama-pull:
+    image: ollama/ollama:latest
+    depends_on:
+      ollama:
+        condition: service_healthy
+    entrypoint: ["ollama", "pull", "${DIFFSCOPE_OLLAMA_MODEL:-codellama:7b}"]
+    environment:
+      - OLLAMA_HOST=http://ollama:11434
+    restart: "no"
+
 volumes:
   ollama_data:
```
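The `ollama-pull` one-shot service reads `${DIFFSCOPE_OLLAMA_MODEL:-codellama:7b}`, so the pulled model can be swapped without editing the compose file. A minimal sketch of that default/override substitution (the echoed labels are illustrative only):

```shell
# Unset, the compose entrypoint expansion falls back to codellama:7b
unset DIFFSCOPE_OLLAMA_MODEL
echo "default:  ${DIFFSCOPE_OLLAMA_MODEL:-codellama:7b}"

# Export a model from the README recommendation table to override it;
# `docker compose up diffscope-local` then pulls it before diffscope starts.
export DIFFSCOPE_OLLAMA_MODEL=deepseek-coder:6.7b-instruct
echo "override: ${DIFFSCOPE_OLLAMA_MODEL:-codellama:7b}"
```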

src/adapters/llm.rs

Lines changed: 19 additions & 2 deletions
```diff
@@ -17,7 +17,7 @@ pub struct ModelConfig {
 impl Default for ModelConfig {
     fn default() -> Self {
         Self {
-            model_name: "gpt-4o".to_string(),
+            model_name: "anthropic/claude-sonnet-4.6".to_string(),
             api_key: None,
             base_url: None,
             temperature: 0.2,
@@ -64,10 +64,27 @@ pub fn create_adapter(config: &ModelConfig) -> Result<Box<dyn LLMAdapter>> {
         return match adapter.as_str() {
             "anthropic" => Ok(Box::new(crate::adapters::AnthropicAdapter::new(config)?)),
             "ollama" => Ok(Box::new(crate::adapters::OllamaAdapter::new(config)?)),
+            "openrouter" => {
+                // OpenRouter uses OpenAI-compatible API
+                let mut or_config = config.clone();
+                if or_config.base_url.is_none() {
+                    or_config.base_url = Some("https://openrouter.ai/api/v1".to_string());
+                }
+                Ok(Box::new(crate::adapters::OpenAIAdapter::new(or_config)?))
+            }
             _ => Ok(Box::new(crate::adapters::OpenAIAdapter::new(config)?)),
         };
     }
 
+    // OpenRouter-style model IDs (vendor/model)
+    if config.model_name.contains('/') {
+        let mut or_config = config.clone();
+        if or_config.base_url.is_none() {
+            or_config.base_url = Some("https://openrouter.ai/api/v1".to_string());
+        }
+        return Ok(Box::new(crate::adapters::OpenAIAdapter::new(or_config)?));
+    }
+
     // Model-name heuristic
     match config.model_name.as_str() {
         // Anthropic Claude models (all versions)
@@ -236,7 +253,7 @@ mod tests {
     #[test]
     fn test_model_config_default() {
         let config = ModelConfig::default();
-        assert_eq!(config.model_name, "gpt-4o");
+        assert_eq!(config.model_name, "anthropic/claude-sonnet-4.6");
         assert!(config.api_key.is_none());
         assert!(config.base_url.is_none());
         assert!((config.temperature - 0.2).abs() < f32::EPSILON);
```
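The adapter-routing order in `create_adapter` can be sketched as a standalone function. This is a simplified, hypothetical model: the `route` function, `AdapterKind` enum, and the assumption that an explicit adapter prefix is split on `:` are illustrative; the real code returns boxed adapter trait objects.

```rust
// Simplified sketch of the routing precedence: explicit adapter prefix,
// then vendor/model IDs (OpenRouter), then the model-name fallthrough.
#[derive(Debug, PartialEq)]
enum AdapterKind {
    Anthropic,
    Ollama,
    OpenRouter,
    OpenAI,
}

fn route(model_name: &str) -> AdapterKind {
    // Explicit "adapter:model" prefix wins (assumed ':' split).
    if let Some((prefix, _rest)) = model_name.split_once(':') {
        match prefix {
            "anthropic" => return AdapterKind::Anthropic,
            "ollama" => return AdapterKind::Ollama,
            "openrouter" => return AdapterKind::OpenRouter,
            _ => {}
        }
    }
    // "vendor/model" IDs go to OpenRouter's OpenAI-compatible API;
    // this is why the new default model name contains a slash.
    if model_name.contains('/') {
        return AdapterKind::OpenRouter;
    }
    // Everything else falls through to the model-name heuristic.
    AdapterKind::OpenAI
}

fn main() {
    assert_eq!(route("ollama:codellama"), AdapterKind::Ollama);
    assert_eq!(route("anthropic:claude-3-opus"), AdapterKind::Anthropic);
    assert_eq!(route("anthropic/claude-sonnet-4.6"), AdapterKind::OpenRouter);
    assert_eq!(route("gpt-4o"), AdapterKind::OpenAI);
    println!("routing ok");
}
```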
