138 changes: 138 additions & 0 deletions README.md
@@ -15,6 +15,7 @@ Search, analyze, and index Common Crawl data into vector stores for RAG applicat
- **`AWS_SESSION_TOKEN`** - Required when using temporary AWS credentials for Athena/S3 access (i.e., to run Athena queries); not needed with long-lived credentials
- **`OPENAI_API_KEY`** - Required for vector operations (index, query, list)
- `OPENAI_BASE_URL` - Optional custom OpenAI endpoint (e.g., `http://localhost:8321/v1` for Llama Stack)
- `OPENAI_VERIFY_SSL` - Verify SSL certificates (default: `true`). Set to `false` for self-signed certs or local development. ⚠️ Use only with trusted endpoints.
- `OPENAI_EMBEDDING_MODEL` - Embedding model to use (e.g., `text-embedding-3-small`, `nomic-embed-text`)
- `OPENAI_EMBEDDING_DIMENSIONS` - Embedding dimensions (optional, model-specific)
- `AWS_DEFAULT_REGION` - AWS region (defaults to us-west-2)
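Putting the required pieces together, a minimal environment for vector operations might look like this (placeholder values, not real credentials):

```shell
# Placeholder values - substitute your own key, bucket, and region
export OPENAI_API_KEY=sk-your-key
export ATHENA_OUTPUT_BUCKET=s3://your-bucket/athena-results/
export AWS_DEFAULT_REGION=us-west-2
```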
@@ -66,6 +67,22 @@ uv run cc-vec index --url-patterns "%.github.io" --vector-store-name "ml-researc
# Vector store name is optional - will auto-generate if not provided
uv run cc-vec index --url-patterns "%.github.io" --limit 50

# Using with alternative OpenAI-compatible endpoints (Ollama example)
export OPENAI_BASE_URL=http://localhost:11434/v1
export OPENAI_API_KEY=ollama # Ollama doesn't require a real key
export OPENAI_EMBEDDING_MODEL=nomic-embed-text
uv run cc-vec index --url-patterns "%.github.io" --vector-store-name "local-research" --limit 50

# Using with Llama Stack
export OPENAI_BASE_URL=http://localhost:8321/v1
uv run cc-vec index --url-patterns "%.edu" --vector-store-name "education" --limit 100

# With self-signed certificates or local development (disable SSL verification)
export OPENAI_BASE_URL=https://localhost:8443/v1
export OPENAI_VERIFY_SSL=false # ⚠️ Use only in development with trusted endpoints
export OPENAI_API_KEY=your-key
uv run cc-vec index --url-patterns "%.github.io" --vector-store-name "local-dev" --limit 50

# List cc-vec vector stores (default - only shows stores created by cc-vec)
uv run cc-vec list --output json

@@ -80,9 +97,111 @@ uv run cc-vec query "Explain deep learning" --vector-store-name "ml-research" --

```

## 1.5. 🦙 Local Llama Stack Setup (Optional)

To run cc-vec against local models via Llama Stack + Ollama, use the standalone manager script:

**Prerequisites:**
- Ollama installed and running (`ollama serve`)
- Docker (for Docker backend) or uv (for UV backend)

**First-time setup:**

```bash
# Install and start Ollama first
# macOS/Linux: curl -fsSL https://ollama.com/install.sh | sh
ollama serve &

# Run setup (pulls required models, installs dependencies)
uv run llama-stack-helper setup --backend docker

# Or for UV backend:
uv run llama-stack-helper setup --backend uv
```

**Start Llama Stack:**

```bash
# Docker backend (recommended)
uv run llama-stack-helper start --backend docker

# Or UV backend
uv run llama-stack-helper start --backend uv
```

**Check status:**

```bash
uv run llama-stack-helper status
```

**View logs:**

```bash
# Show last 20 lines
uv run llama-stack-helper logs

# Follow logs in real-time
uv run llama-stack-helper logs --follow
```

**Stop Llama Stack:**

```bash
uv run llama-stack-helper stop --backend docker
```

**Use with cc-vec:**

Once Llama Stack is running, set the environment variables:

```bash
# Set Llama Stack environment variables in your current shell
eval "$(uv run llama-stack-helper env)"

# Now use cc-vec normally with your Athena credentials
export ATHENA_OUTPUT_BUCKET=s3://your-bucket/
export AWS_ACCESS_KEY_ID=your-key
export AWS_SECRET_ACCESS_KEY=your-secret

uv run cc-vec index --url-patterns "%.edu" --limit 10
```

The `env` command prints `export` statements for your configured models:
```bash
export OPENAI_BASE_URL=http://localhost:8321/v1
export OPENAI_API_KEY=none
export OPENAI_VERIFY_SSL=false
export OPENAI_EMBEDDING_MODEL=toshk0/nomic-embed-text-v2-moe:Q6_K # or your custom model
export OPENAI_EMBEDDING_DIMENSIONS=768 # or your custom dimensions
```
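To see what `eval` is doing here: the helper simply prints `export` lines that the shell then executes. The same mechanism can be demonstrated with `printf` standing in for `llama-stack-helper env` (illustrative value, not necessarily your endpoint):

```shell
# eval executes the printed export lines in the current shell
eval "$(printf 'export OPENAI_BASE_URL=http://localhost:8321/v1\n')"
echo "$OPENAI_BASE_URL"
```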

**Default models** (automatically pulled during setup):
- `llama3.2:3b` - Inference model
- `toshk0/nomic-embed-text-v2-moe:Q6_K` - Embedding model (768 dimensions)

**Custom models** (optional):

You can customize which models to use by setting environment variables before running setup:

```bash
export LLAMA_STACK_INFERENCE_MODEL=llama3.2:1b
export LLAMA_STACK_EMBEDDING_MODEL=nomic-embed-text
export LLAMA_STACK_EMBEDDING_DIMENSIONS=768

# Now run setup - it will pull your custom models
uv run llama-stack-helper setup
```

These models will be:
1. Downloaded into Ollama during setup
2. Configured in the Llama Stack run.yaml
3. Used automatically by cc-vec when you run `eval "$(uv run llama-stack-helper env)"`

## 2. 📦 Python Library

```python
import os
from cc_vec import (
    search,
    stats,
@@ -95,6 +214,22 @@ from cc_vec import (
    VectorStoreConfig,
)

# For alternative endpoints, set environment variables before importing
# Example: Using Ollama
# os.environ["OPENAI_BASE_URL"] = "http://localhost:11434/v1"
# os.environ["OPENAI_API_KEY"] = "ollama"
# os.environ["OPENAI_EMBEDDING_MODEL"] = "nomic-embed-text"

# Example: Using Llama Stack
# os.environ["OPENAI_BASE_URL"] = "http://localhost:8321/v1"
# os.environ["OPENAI_API_KEY"] = "your-llama-stack-key"

# Example: With self-signed certificates (disable SSL verification)
# ⚠️ Use only in development with trusted endpoints
# os.environ["OPENAI_BASE_URL"] = "https://localhost:8443/v1"
# os.environ["OPENAI_VERIFY_SSL"] = "false"
# os.environ["OPENAI_API_KEY"] = "your-key"

# Basic search and stats (no OpenAI key needed)
filter_config = FilterConfig(url_patterns=["%.github.io"])

@@ -199,6 +334,9 @@ The config uses stdio mode (required by Claude Desktop):
      "env": {
        "ATHENA_OUTPUT_BUCKET": "your-athena-output-bucket",
        "OPENAI_API_KEY": "your-openai-api-key-here"
        // "OPENAI_BASE_URL": "http://localhost:11434/v1" // Optional: Use for Ollama, Llama Stack, or other endpoints
        // "OPENAI_VERIFY_SSL": "false" // Optional: Disable SSL verification for self-signed certs (dev only)
        // "OPENAI_EMBEDDING_MODEL": "nomic-embed-text" // Optional: Specify custom embedding model
      }
    }
  }
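Note that strict JSON does not permit `//` comments; the commented lines above mark optional keys. To enable one, add it as a real key (illustrative values):

```json
"env": {
  "ATHENA_OUTPUT_BUCKET": "your-athena-output-bucket",
  "OPENAI_API_KEY": "your-openai-api-key-here",
  "OPENAI_BASE_URL": "http://localhost:11434/v1"
}
```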
3 changes: 3 additions & 0 deletions claude_desktop_config.json
@@ -14,6 +14,9 @@
      "env": {
        "ATHENA_OUTPUT_BUCKET": "s3://llama-stack-dev-test0/athena-results/",
        "OPENAI_API_KEY": "your-openai-api-key-here"
        // "OPENAI_BASE_URL": "http://localhost:11434/v1" // Optional: Use for Ollama, Llama Stack, or other OpenAI-compatible endpoints
        // "OPENAI_VERIFY_SSL": "false" // Optional: Disable SSL verification for self-signed certs (dev only)
        // "OPENAI_EMBEDDING_MODEL": "nomic-embed-text" // Optional: Specify custom embedding model
      }
    }
  }
5 changes: 5 additions & 0 deletions examples/cc_vec_complete_rag_workflow.py
@@ -10,6 +10,11 @@
6. Cleanup

Run with: uv run python examples/complete_rag_workflow.py

Optional environment variables for alternative endpoints:
- OPENAI_BASE_URL: Custom endpoint (e.g., http://localhost:11434/v1)
- OPENAI_VERIFY_SSL: Set to "false" for self-signed certs (dev only)
- OPENAI_EMBEDDING_MODEL: Custom model (e.g., nomic-embed-text)
"""

import os
8 changes: 8 additions & 0 deletions examples/cc_vec_rag_example.py
@@ -10,6 +10,11 @@
- OPENAI_API_KEY environment variable
- ATHENA_OUTPUT_BUCKET environment variable
- AWS credentials configured

Optional (for alternative endpoints):
- OPENAI_BASE_URL: Custom OpenAI-compatible endpoint (e.g., http://localhost:11434/v1 for Ollama)
- OPENAI_VERIFY_SSL: Set to "false" to disable SSL verification for self-signed certs (dev only)
- OPENAI_EMBEDDING_MODEL: Custom embedding model (e.g., nomic-embed-text for Ollama)
"""

import os
@@ -20,6 +25,9 @@

def main():
    # Initialize OpenAI client
    # Note: If using alternative endpoints with self-signed certs, set:
    #   export OPENAI_BASE_URL=https://localhost:8443/v1
    #   export OPENAI_VERIFY_SSL=false
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Step 1: Index Common Crawl content into a vector store
7 changes: 7 additions & 0 deletions pyproject.toml
@@ -27,11 +27,18 @@ dev = [

[project.scripts]
cc-vec = "cc_vec.cli.main:main"
llama-stack-helper = "llama_stack_helper:main"

[build-system]
requires = ["uv_build>=0.8.13,<0.9.0"]
build-backend = "uv_build"

[tool.uv]
package = true

[tool.setuptools.package-data]
llama_stack_helper = ["conf/*.yaml"]

[dependency-groups]
dev = [
    "pytest>=8.4.2",
8 changes: 8 additions & 0 deletions src/cc_vec/api.py
@@ -3,6 +3,7 @@
import logging
from typing import List, Dict, Any, Optional

import httpx
from openai import OpenAI
from .types import FilterConfig, CrawlRecord, StatsResponse, VectorStoreConfig
from .types.config import load_config
@@ -55,9 +56,16 @@ def _get_openai_client() -> OpenAI:
    global _openai_client
    if _openai_client is None:
        config = load_config()

        # Create custom httpx client if SSL verification is disabled
        http_client = None
        if not config.openai.verify_ssl:
            http_client = httpx.Client(verify=False)

        _openai_client = OpenAI(
            api_key=config.openai.api_key,
            base_url=config.openai.base_url,
            http_client=http_client,
        )
    return _openai_client
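For context on why this warrants the ⚠️ warning: `httpx.Client(verify=False)` is roughly equivalent to the stdlib `ssl` configuration below (a sketch, not part of this diff), which skips both hostname checks and certificate validation:

```python
import ssl

# Roughly what verify=False does under the hood: a context with
# no hostname check and no certificate validation
ctx = ssl.create_default_context()
ctx.check_hostname = False  # must be disabled before verify_mode
ctx.verify_mode = ssl.CERT_NONE
```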

6 changes: 6 additions & 0 deletions src/cc_vec/cli/main.py
@@ -651,11 +651,17 @@ def index(ctx, vector_store_name, limit, chunk_size, overlap, output, **filter_k

    click.echo(f"Auto-generated vector store name: {vector_store_name}")

    # Load config to get embedding model settings from environment
    config = load_config()

    # Construct VectorStoreConfig
    vector_store_config = VectorStoreConfig(
        name=vector_store_name,
        chunk_size=chunk_size,
        overlap=overlap,
        embedding_model=config.openai.embedding_model or "text-embedding-3-small",
        embedding_dimensions=config.openai.embedding_dimensions or 1536,
    )

# Use the simplified API that handles all client initialization
47 changes: 46 additions & 1 deletion src/cc_vec/lib/index.py
@@ -43,7 +43,7 @@ def create_vector_store(self) -> str:
                "created_by": "cc-vec",
                "cc_vec_version": "0.1.0",
                "embedding_model": self.config.embedding_model,
-               "embedding_dimension": str(self.config.embedding_dimensions),
+               "embedding_dimensions": str(self.config.embedding_dimensions),
            },
            "chunking_strategy": {
                "type": "static",
@@ -144,6 +144,34 @@ def upload_to_vector_store(
        logger.info(f"Upload completed with status: {file_batch.status}")
        logger.info(f"File counts: {file_batch.file_counts}")

        # Log detailed failure information if any files failed
        if file_batch.file_counts.failed > 0:
            logger.warning(f"{file_batch.file_counts.failed} files failed to upload")
            # Try to get detailed error information for failed files
            try:
                batch_files = self.client.vector_stores.file_batches.list_files(
                    vector_store_id=vector_store_id,
                    batch_id=file_batch.id,
                    filter="failed"
                )
                for failed_file in batch_files.data[:3]:  # Show first 3 failures
                    # Try multiple ways to get error information
                    error_msg = getattr(failed_file, 'last_error', None)
                    status = getattr(failed_file, 'status', 'unknown')

                    if error_msg:
                        # last_error might be an object with message/code
                        if hasattr(error_msg, 'message'):
                            logger.error(f"File {failed_file.id} ({status}): {error_msg.message}")
                        else:
                            logger.error(f"File {failed_file.id} ({status}): {error_msg}")
                    else:
                        # Dump the entire object to see what's available
                        logger.error(f"File {failed_file.id} status: {status}")
                        logger.error(f"Full file object: {failed_file}")
            except Exception as list_error:
                logger.warning(f"Could not retrieve detailed failure information: {list_error}")

        return {
            "status": file_batch.status,
            "file_counts": file_batch.file_counts,
@@ -233,6 +261,23 @@ def index(

    upload_result = loader.upload_to_vector_store(vector_store_id, successful_fetches)

    # Check if upload failed completely
    file_counts = upload_result["file_counts"]
    if upload_result["status"] == "failed" and file_counts.completed == 0:
        error_msg = (
            f"All {file_counts.total} files failed to upload to vector store. "
            f"Failed: {file_counts.failed}, Cancelled: {file_counts.cancelled}. "
            f"Check the logs above for detailed error messages."
        )
        logger.error(error_msg)
        raise RuntimeError(error_msg)

    # Warn if some files failed but some succeeded
    if file_counts.failed > 0 and file_counts.completed > 0:
        logger.warning(
            f"Partial upload success: {file_counts.completed} succeeded, {file_counts.failed} failed"
        )

    return {
        "vector_store_id": vector_store_id,
        "vector_store_name": vector_store_config.name,
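The failure-handling policy in this hunk can be sketched as a standalone function (hypothetical names, not part of the diff): hard-fail only when nothing uploaded, warn on partial success.

```python
from dataclasses import dataclass


@dataclass
class FileCounts:
    total: int
    completed: int
    failed: int
    cancelled: int


def classify_upload(status: str, counts: FileCounts) -> str:
    """Sketch of the decision logic: error only when nothing succeeded."""
    if status == "failed" and counts.completed == 0:
        return "error"    # index() raises RuntimeError in this case
    if counts.failed > 0 and counts.completed > 0:
        return "partial"  # logged as a warning, indexing continues
    return "ok"
```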
8 changes: 8 additions & 0 deletions src/cc_vec/mcp/handlers/cc_index.py
@@ -46,11 +46,19 @@ async def handle(self, args: Dict[str, Any]) -> List[TextContent]:
        else:
            vector_store_name = f"ccvec_{timestamp}"

        # Load config to get embedding model settings from environment
        from ...types.config import load_config

        config = load_config()

        # Construct VectorStoreConfig
        vector_store_config = VectorStoreConfig(
            name=vector_store_name,
            chunk_size=chunk_size,
            overlap=overlap,
            embedding_model=config.openai.embedding_model or "text-embedding-3-small",
            embedding_dimensions=config.openai.embedding_dimensions or 1536,
        )

        try:
7 changes: 7 additions & 0 deletions src/cc_vec/types/main_config.py
@@ -34,6 +34,8 @@ def from_env(cls) -> "CCVecConfig":
            embedding_dimensions=int(os.getenv("OPENAI_EMBEDDING_DIMENSIONS"))
            if os.getenv("OPENAI_EMBEDDING_DIMENSIONS")
            else None,
            verify_ssl=os.getenv("OPENAI_VERIFY_SSL", "true").lower()
            not in ["false", "0", "no"],
        ),
        logging=LoggingSettings(
            level=os.getenv("LOG_LEVEL", "INFO"),
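The truthiness rule above can be isolated as a sketch (hypothetical helper, not part of the diff): unset, or any value other than `false`/`0`/`no` in any case, keeps verification on.

```python
from typing import Optional


def parse_verify_ssl(value: Optional[str]) -> bool:
    """Mirror of the env parsing: only false/0/no (case-insensitive) disable SSL."""
    return (value if value is not None else "true").lower() not in ["false", "0", "no"]
```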
@@ -62,6 +64,11 @@ def setup_logging(self) -> None:
            logger.info("OpenAI client configured")
            if self.openai.base_url:
                logger.info(f"Using custom OpenAI base URL: {self.openai.base_url}")
            if not self.openai.verify_ssl:
                logger.warning(
                    "⚠️ SSL certificate verification is DISABLED. "
                    "This should only be used in development with trusted endpoints."
                )
        else:
            logger.warning(
                "OpenAI API key not configured - vector operations unavailable"
1 change: 1 addition & 0 deletions src/cc_vec/types/openai_config.py
@@ -12,6 +12,7 @@ class OpenAISettings:
    base_url: Optional[str] = None
    embedding_model: Optional[str] = None
    embedding_dimensions: Optional[int] = None
    verify_ssl: bool = True

    def is_configured(self) -> bool:
        """Check if OpenAI is properly configured."""