rdevaul · garrettekinsman · Mar 19, 2026
diff --git a/README.md b/README.md
@@ -240,6 +240,19 @@ tail -f ~/.tag-context/comparison-log.jsonl | python3 -m json.tool
 curl http://localhost:8300/comparison-log
 ```
 
+## Logger / Client Bridge
+
+The `contextgraph-logger/` package provides standalone tools for ingesting OpenClaw conversations and memory files into the ContextGraph engine, as well as pulling assembled context back for injection into system prompts.
+
+**Rule:** This package only calls ContextGraph's HTTP API. It never modifies the server code.
+
+See [`contextgraph-logger/README.md`](contextgraph-logger/README.md) for setup and usage.
+
+Key components:
+- **harvester.py** — Batch ingest: OpenClaw session DB + memory files → `/ingest` (idempotent, content-hash dedup)
+- **live_ingest.py** — Per-turn shim: POST one exchange → `/ingest`
+- **context_pull.py** — Query ContextGraph `/assemble` → formatted markdown context block
+
 ## Tests
 
 ```bash

diff --git a/api/server.py b/api/server.py
@@ -1,5 +1,6 @@
 import sys
 import time
+import re
 from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).parent.parent))
@@ -25,6 +26,37 @@
 
 app = FastAPI()
 
+# ── Security: Input Sanitization ───────────────────────────────────────────────
+
+def _sanitize_content(text: str) -> str:
+    """
+    Strip known prompt-injection phrases from text before it is stored.
+
+    Deny-list patterns are matched case-insensitively and removed repeatedly
+    until the text stops changing. Falsy input is returned unchanged.
+    """
+    if not text:
+        return text
+
+    injection_patterns = [
+        r"ignore\s+previous\s+instructions",
+        r"ignore\s+all\s+previous\s+instructions",
+        r"disregard\s+your\s+instructions",
+        r"disregard\s+previous\s+instructions",
+        r"system\s+prompt:",
+        r"you\s+are\s+now\s+",
+        r"forget\s+your\s+instructions",
+        r"new\s+instructions:",
+        r"override\s+instructions",
+    ]
+
+    sanitized, previous = text, None
+    while sanitized != previous:  # a removal can splice a new match together
+        previous = sanitized
+        for pattern in injection_patterns:
+            sanitized = re.sub(pattern, "", sanitized, flags=re.IGNORECASE)
+    return sanitized.strip()
+
 class TagRequest(BaseModel):
     user_text: str
     assistant_text: str
@@ -96,17 +128,22 @@ def ingest(request: IngestRequest):
         # Envelope text (message_id, sender_id, timestamps) is noise for
         # tag inference and retrieval — stripping prevents tag pollution.
         clean_user = strip_envelope(request.user_text)
-        features = extract_features(clean_user, request.assistant_text)
-        tags = ensemble.assign(features, clean_user, request.assistant_text).tags
+
+        # Security: Sanitize inputs to prevent prompt injection attacks
+        sanitized_user = _sanitize_content(clean_user)
+        sanitized_assistant = _sanitize_content(request.assistant_text)
+
+        features = extract_features(sanitized_user, sanitized_assistant)
+        tags = ensemble.assign(features, sanitized_user, sanitized_assistant).tags
         message = Message(
             id=message_id,
             session_id=request.session_id,
-            user_text=clean_user,
-            assistant_text=request.assistant_text,
+            user_text=sanitized_user,
+            assistant_text=sanitized_assistant,
             timestamp=request.timestamp,
             user_id=request.user_id or "default",
             tags=tags,
-            token_count=len(clean_user.split()) + len(request.assistant_text.split()),
+            token_count=len(sanitized_user.split()) + len(sanitized_assistant.split()),
             external_id=request.external_id
         )
         store.add_message(message)
@@ -633,4 +670,5 @@ def get_pins():
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8350)
+    # Security: Bind to localhost only. Use reverse proxy for remote access.
+    uvicorn.run(app, host="127.0.0.1", port=8350)
diff --git a/contextgraph-logger/.gitignore b/contextgraph-logger/.gitignore
@@ -0,0 +1,5 @@
+venv/
+__pycache__/
+*.pyc
+data/ingest-state.json
+data/memory-state.json
diff --git a/contextgraph-logger/README.md b/contextgraph-logger/README.md
@@ -0,0 +1,126 @@
+---
+*Prepared by **Agent: Mei (梅)** — PhD candidate, Tsinghua KEG Lab. Specialist in Chinese AI ecosystem, inference optimization, and MoE architectures.*
+*Running: anthropic/claude-sonnet-4-6*
+
+*Human in the Loop: Garrett Kinsman*
+
+---
+
+# contextgraph-logger
+
+Bridge between OpenClaw conversations and Rich DeVaul's ContextGraph engine.
+
+**Rule: this package only calls ContextGraph's HTTP API. It never modifies Rich's code.**
+
+## Files
+
+| File | Purpose |
+|------|---------|
+| `config.py` | Server URL, paths, token budgets |
+| `harvester.py` | Batch ingest: session DB + memory files → `/ingest` |
+| `live_ingest.py` | Per-turn shim: POST one exchange → `/ingest` |
+| `context_pull.py` | Query ContextGraph → formatted markdown context block |
+| `data/ingest-state.json` | Tracks which session DB rows have been ingested |
+| `data/memory-state.json` | Tracks content hashes of memory files |
+
+## Setup
+
+```bash
+cd contextgraph-logger
+pip install -r requirements.txt
+```
+
+ContextGraph server must be running at `http://127.0.0.1:8300`.
+
+## Usage
+
+### Batch harvest (run nightly or on demand)
+
+```bash
+# Dry run — shows what would be ingested
+python3 harvester.py --dry-run --verbose
+
+# Full run
+python3 harvester.py --verbose
+
+# Memory files only
+python3 harvester.py --memory-only
+
+# Session DB only
+python3 harvester.py --sessions-only
+
+# Re-ingest all memory files (ignores hash state)
+python3 harvester.py --memory-only --force
+```
+
+### Live turn logging (call after each OpenClaw turn)
+
+```bash
+# Via JSON on stdin
+echo '{"session_id":"abc123","user_text":"hi","assistant_text":"hello","timestamp":1234567890}' \
+  | python3 live_ingest.py
+
+# Via CLI args
+python3 live_ingest.py \
+  --session-id abc123 \
+  --user-text "what's the maxrisk status?" \
+  --assistant-text "MaxRisk is paused pending risk review..."
+```
+```python
+from live_ingest import ingest_turn  # Python import
+result = ingest_turn(
+    session_id="abc123",
+    user_text="what's the status?",
+    assistant_text="Here's the status...",
+)
+```
+
+### Context pull (query → markdown block for system prompt injection)
+
+```bash
+python3 context_pull.py "memory harvester not working"
+python3 context_pull.py --budget 1500 "maxrisk project status"
+python3 context_pull.py --tags "maxrisk,trading" "portfolio review"
+python3 context_pull.py --json "memory architecture"  # raw JSON
+```
+
+Python import:
+```python
+from context_pull import pull_context
+
+result = pull_context("memory harvester not working")
+if result["ok"] and result["context_block"]:
+    # inject result["context_block"] into system prompt
+    pass
+```
+
+## State files
+
+Both harvest passes (session DB and memory files) are **idempotent** — re-running is safe:
+
+- `data/ingest-state.json` — maps `external_id → timestamp` for session DB rows
+- `data/memory-state.json` — maps `relpath → content_hash` for memory files
+
+Delete these files to force a full re-ingest.
+
+## API reference
+
+Rich's server at `http://127.0.0.1:8300`:
+
+- `POST /ingest` — `{session_id, user_text, assistant_text, timestamp, external_id?}`
+- `POST /assemble` — `{user_text, tags?, token_budget?, session_id?}`
+- `POST /tag` — `{user_text, assistant_text}`
+- `POST /compare` — `{user_text, assistant_text}`
+- `GET /health` — server health check
+
+## Architecture
+
+```
+OpenClaw session DB ──► harvester.py ──► POST /ingest ──► ContextGraph
+memory/ files ────────►              └──►
+OpenClaw turn ────────► live_ingest.py ► POST /ingest ──►
+
+                        context_pull.py ► POST /assemble ◄── ContextGraph
+                              │
+                              └──► Markdown context block → system prompt
+```
diff --git a/contextgraph-logger/config.py b/contextgraph-logger/config.py
@@ -0,0 +1,39 @@
+"""
+config.py — Central configuration for contextgraph-logger.
+
+Single source of truth for server URL, paths, and token budgets.
+"""
+
+from pathlib import Path
+
+# ── Server ────────────────────────────────────────────────────────────────────
+SERVER_URL = "http://127.0.0.1:8300"
+
+# ── Paths ─────────────────────────────────────────────────────────────────────
+HOME = Path.home()
+OPENCLAW_DATA = HOME.joinpath(".openclaw", "data")
+WORKSPACE = HOME.joinpath(".openclaw", "workspace")
+
+# SQLite DB with OpenClaw sessions (can be empty, or have no schema yet)
+MESSAGES_DB = OPENCLAW_DATA.joinpath("messages.db")
+
+# Memory directories to harvest.
+# Each entry is a fixed sub-directory of MEMORY_ROOT; the order below is the
+# order in which the names appear in the resulting list.
+MEMORY_ROOT = WORKSPACE.joinpath("memory")
+MEMORY_DIRS = [
+    MEMORY_ROOT.joinpath(subdir)
+    for subdir in ("daily", "projects", "decisions", "contacts")
+]
+
+# State files live in the data/ directory that sits next to this module
+PKG_DATA = Path(__file__).parent.joinpath("data")
+INGEST_STATE_FILE = PKG_DATA.joinpath("ingest-state.json")  # session harvester state
+MEMORY_STATE_FILE = PKG_DATA.joinpath("memory-state.json")  # memory file hash state
+
+# ── Token budgets ─────────────────────────────────────────────────────────────
+DEFAULT_TOKEN_BUDGET = 2000  # default used by context_pull
+MAX_CONTENT_PER_FILE = 1500  # characters taken per memory file
+
+# ── Request settings ──────────────────────────────────────────────────────────
+REQUEST_TIMEOUT = 10  # seconds allowed for each HTTP request