iPythoning
diff --git a/‎CHANGELOG.md‎
Lines changed: 34 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎whatsapp-old-account-onboarding/docs/README.md‎
Lines changed: 10 additions & 2 deletions b/‎whatsapp-old-account-onboarding/docs/README.md‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎whatsapp-old-account-onboarding/samples/pa-config.example.json‎
Lines changed: 6 additions & 0 deletions b/‎whatsapp-old-account-onboarding/samples/pa-config.example.json‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎whatsapp-old-account-onboarding/scripts/bootstrap.sh‎
Lines changed: 43 additions & 3 deletions b/‎whatsapp-old-account-onboarding/scripts/bootstrap.sh‎
Lines changed: 43 additions & 3 deletions
diff --git a/‎whatsapp-old-account-onboarding/scripts/bulk-embed.py‎
Lines changed: 194 additions & 0 deletions b/‎whatsapp-old-account-onboarding/scripts/bulk-embed.py‎
Lines changed: 194 additions & 0 deletions
@@ -8,6 +8,40 @@ Changes sourced from upstream (openclaw/openclaw) are labeled with the originati
 
 ## [Unreleased]
 
+## 2026-05-21 — WhatsApp Onboarding Spec v0.3 (Layer B + Layer C scripts)
+
+Closes the remaining gap to a true end-to-end delivery: Layer B miner,
+Layer A pusher, and Layer C chunk uploader. `bootstrap.sh` now wires
+them all together so a complete run produces:
+
+  - `profiles/`               → MemOS-ready customer YAMLs
+  - `golden/`                 → Layer B segments awaiting human review
+  - `layer-c-chunks.jsonl`    → conversation history chunks for KB import
+
+### Added
+
+- **scripts/mine-golden-segments.py** — Two-pass Layer B miner:
+  pass 1 sliding-window keyword detection (EN/ZH/ES signals across five
+  tag classes), pass 2 Haiku LLM scoring + retag + tactical-move
+  extraction. Drops segments scoring < 3.
+- **scripts/memos-upsert.py** — Pushes `profiles/*.yaml` to PulseAgent
+  MemOS endpoint. Honors `_auto_onboard` gate; supports `--force`.
+  Config sources: CLI > pa-config.json > env vars.
+- **scripts/bulk-embed.py** — Chunks `parsed/*.jsonl` into KB-ready
+  records with strict `customer_hash` metadata. Two modes:
+  emit JSONL for offline import, or `--upload` to push directly to
+  `/api/kb/upsert`. Embedding stays on the PA backend by design.
+- **samples/pa-config.example.json** — Reference shape for
+  `~/.pa-config.json`.
+- **bootstrap.sh** wires mining + Layer C chunking + optional push step
+  into the standard delivery flow.
+
+### Fixed
+
+- `whatsapp-export-parser.py` MEDIA regex now strips outer `<>` around
+  `<image omitted>` / `<Media omitted>` so chunked text is clean.
+- `bulk-embed.py` removed duplicated media tag in chunk text body.
+
 ## 2026-05-21 — WhatsApp Onboarding Spec v0.2 (customer delivery kit)
 
 Turns the v0.1 spec into something a delivery engineer can actually run on
 
@@ -226,15 +226,23 @@ Any metric 2× baseline → pause expansion, audit prompt + samples.
 ```
 whatsapp-old-account-onboarding/
 ├── scripts/
+│   ├── bootstrap.sh                       ← one-command entry
 │   ├── whatsapp-export-parser.py
-│   └── customer-profile-extractor.py
+│   ├── customer-profile-extractor.py
+│   ├── mine-golden-segments.py            ← Layer B miner
+│   ├── memos-upsert.py                    ← Layer A push
+│   ├── bulk-embed.py                      ← Layer C chunk + push
+│   └── requirements.txt
 ├── docs/
 │   ├── README.md                          ← you are here
 │   ├── README.zh-CN.md
+│   ├── CUSTOMER-DELIVERY-GUIDE.md
+│   ├── CUSTOMER-DELIVERY-GUIDE.zh-CN.md
 │   ├── OpenClaw-knowledge-base-import.md
 │   └── system-prompt-template.md
 └── samples/
-    └── example-customer-profile.yaml
+    ├── example-customer-profile.yaml
+    └── pa-config.example.json             ← copy to ~/.pa-config.json
 ```
 
 ---
 
@@ -0,0 +1,6 @@
+{
+  "_comment": "Copy to ~/.pa-config.json and fill in. Read by memos-upsert.py and bulk-embed.py. Put the raw token in 'token'; bulk-embed.py adds the 'Bearer ' prefix itself.",
+  "endpoint": "https://your-pulseagent-host.example.com",
+  "token": "raw-api-token-from-PA-settings-without-Bearer-prefix",
+  "tenant": "your-tenant-slug"
+}
@@ -214,6 +214,44 @@ else
         --parsed "${PROJECT_DIR}/parsed" \
         --output "${PROJECT_DIR}/profiles" \
         --min-turns 20 2>&1 | tee -a "${LOG_FILE}"
+
+    say ""
+    info "Mining Layer B golden segments (sales playbook)..."
+    python3 "${SCRIPT_DIR}/mine-golden-segments.py" \
+        --parsed "${PROJECT_DIR}/parsed" \
+        --output "${PROJECT_DIR}/golden" \
+        --min-score 3 2>&1 | tee -a "${LOG_FILE}"
+
+    say ""
+    info "Chunking Layer C conversation history..."
+    python3 "${SCRIPT_DIR}/bulk-embed.py" \
+        --parsed "${PROJECT_DIR}/parsed" \
+        --output "${PROJECT_DIR}/layer-c-chunks.jsonl" 2>&1 | tee -a "${LOG_FILE}"
+fi
+
+# ---- Step 5b: optional push to PulseAgent -----------------------------------
+
+if [[ "${DELIVERY_PATH}" != "A" ]]; then
+    say ""
+    ask_choice "Push to PulseAgent now?" PUSH_NOW \
+        "No, I'll review and push later" \
+        "Yes, push profiles to MemOS + chunks to KB"
+
+    if [[ "${PUSH_NOW}" == Yes* ]]; then
+        # Prompt for credentials only when no config source exists at all.
+        # bulk-embed.py reads ./pa-config.json before ~/.pa-config.json, so a
+        # project-local file must count too; otherwise the values exported
+        # below would override it (env vars take precedence over the file).
+        if [[ ! -f "pa-config.json" && ! -f "${HOME}/.pa-config.json" && -z "${PA_ENDPOINT:-}" ]]; then
+            warn "No ./pa-config.json, ~/.pa-config.json or PA_ENDPOINT env var found."
+            ask "PulseAgent endpoint URL (e.g. https://pa.example.com)" PA_ENDPOINT
+            ask "PulseAgent API token (Bearer)" PA_TOKEN
+            ask "Tenant slug" PA_TENANT
+            export PA_ENDPOINT PA_TOKEN PA_TENANT
+        fi
+        say ""
+        info "Upserting profiles to MemOS..."
+        python3 "${SCRIPT_DIR}/memos-upsert.py" --profiles "${PROJECT_DIR}/profiles" 2>&1 | tee -a "${LOG_FILE}"
+        say ""
+        info "Uploading conversation chunks to KB..."
+        python3 "${SCRIPT_DIR}/bulk-embed.py" --parsed "${PROJECT_DIR}/parsed" --upload 2>&1 | tee -a "${LOG_FILE}"
+    fi
 fi
 
 # ---- Step 6: verification report --------------------------------------------
@@ -239,9 +277,11 @@ say "============================================================"
 ok "Bootstrap complete."
 say ""
 say "Next steps:"
-say "  1. Open ${PROJECT_DIR}/profiles/_manual_review.txt and triage."
-say "  2. Push approved profiles to MemOS (see docs/README.md Step 4)."
-say "  3. Run Layer B segment mining (docs/OpenClaw-knowledge-base-import.md)."
+say "  1. Open ${PROJECT_DIR}/profiles/_manual_review.txt and triage gated customers."
+say "  2. Manually audit ${PROJECT_DIR}/golden/*.yaml — set _human_reviewed: true on keepers."
+say "  3. If you skipped the push step, run:"
+say "       python3 scripts/memos-upsert.py --profiles profiles"
+say "       python3 scripts/bulk-embed.py   --parsed parsed --upload"
 say "  4. Configure system prompt (docs/system-prompt-template.md)."
 say "  5. Run the 5 pre-launch verification cases before going live."
 say ""
 
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+"""
+Bulk Embed — Layer C upload
+
+Chunks ./parsed/<customer_hash>.jsonl into KB-ready records and uploads to
+PulseAgent / OpenClaw Knowledge Base `conversation_history` collection.
+
+By design we DO NOT compute embeddings locally — the PulseAgent KB backend
+owns embedding model selection so all customers stay consistent. This script
+just slices conversation turns into chunks + metadata.
+
+Two output modes:
+  --upload   POST each chunk to PA `/api/kb/upsert` endpoint
+  (default)  Emit a single ./layer-c-chunks.jsonl for offline import
+
+Chunk schema (per record):
+    {
+      "collection":    "conversation_history",
+      "customer_hash": "<16-hex>",         <-- isolation key
+      "session_id":    "<hash>-<isoTs>",
+      "chunk_id":      "<hash>-c0042",
+      "ts_start":      "ISO timestamp",
+      "ts_end":        "ISO timestamp",
+      "turn_count":    8,
+      "text":          "[me] ...\n[customer] ...\n..."
+    }
+
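+Config sources mirror memos-upsert.py (pa-config.json / env vars / CLI).
+Later sources override earlier ones: ./pa-config.json (else ~/.pa-config.json)
+< PA_ENDPOINT / PA_TOKEN / PA_TENANT env vars < --endpoint / --token / --tenant.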
+
+Usage:
+    python bulk-embed.py --parsed ./parsed                  # emit JSONL
+    python bulk-embed.py --parsed ./parsed --upload         # push to PA
+    python bulk-embed.py --parsed ./parsed --chunk-size 12  # larger chunks
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Iterable
+from urllib import error, request
+
+
+@dataclass
+class Chunk:
+    # One KB-ready record: a window of consecutive conversation turns plus
+    # its metadata. Serialized with dataclasses.asdict(), so the field names
+    # below are the JSON keys written to the JSONL file / upload payload.
+    collection: str  # chunk_turns() always sets "conversation_history"
+    customer_hash: str  # read from the first turn of the file
+    session_id: str  # session_id of the window's first turn, "" if absent
+    chunk_id: str  # "<customer_hash>-cNNNN" (zero-padded running index)
+    ts_start: str  # "ts" of the first turn in the window
+    ts_end: str  # "ts" of the last turn in the window
+    turn_count: int  # number of turns in the window
+    text: str  # turns rendered as "[sender] text", one per line
+
+
+def load_turns(jsonl_path: Path) -> list[dict]:
+    """Read a JSONL file and return its decoded records in file order.
+
+    Lines that are empty after stripping whitespace are skipped; every other
+    line is decoded with ``json.loads``.
+    """
+    with jsonl_path.open(encoding="utf-8") as handle:
+        stripped = (raw.strip() for raw in handle)
+        return [json.loads(entry) for entry in stripped if entry]
+
+
+def chunk_turns(
+    turns: list[dict], chunk_size: int, overlap: int
+) -> Iterable[Chunk]:
+    """Yield overlapping fixed-size windows of ``turns`` as ``Chunk`` records.
+
+    Consecutive windows start ``chunk_size - overlap`` turns apart, so each
+    chunk repeats the last ``overlap`` turns of its predecessor. The final
+    window may hold fewer than ``chunk_size`` turns. ``customer_hash`` is
+    read from the first turn and applied to every chunk.
+
+    Args:
+        turns: Turn dicts with ``sender``, ``text`` and ``ts`` keys; the
+            first one must also carry ``customer_hash``.
+        chunk_size: Maximum number of turns per chunk.
+        overlap: Number of turns shared by neighbouring chunks; must satisfy
+            ``0 <= overlap < chunk_size``.
+
+    Raises:
+        ValueError: If ``chunk_size``/``overlap`` are invalid. This is a
+            generator, so the error surfaces on first iteration.
+    """
+    if chunk_size <= overlap:
+        raise ValueError("chunk_size must exceed overlap")
+    # A negative overlap makes the stride larger than the window, which
+    # would silently drop the turns that fall between two chunks.
+    if overlap < 0:
+        raise ValueError("overlap must not be negative")
+    if not turns:
+        return
+    customer_hash = turns[0]["customer_hash"]
+    step = chunk_size - overlap
+    for chunk_num, start in enumerate(range(0, len(turns), step)):
+        window = turns[start : start + chunk_size]
+        yield Chunk(
+            collection="conversation_history",
+            customer_hash=customer_hash,
+            session_id=window[0].get("session_id", ""),
+            chunk_id=f"{customer_hash}-c{chunk_num:04d}",
+            ts_start=window[0]["ts"],
+            ts_end=window[-1]["ts"],
+            turn_count=len(window),
+            text="\n".join(f"[{t['sender']}] {t['text']}" for t in window),
+        )
+        # Once a window reaches the last turn, another stride would only
+        # re-emit the overlap tail as a redundant chunk.
+        if start + chunk_size >= len(turns):
+            break
+
+
+def load_config(args: argparse.Namespace) -> dict[str, str]:
+    """Assemble the PulseAgent connection settings.
+
+    Sources, from lowest to highest precedence:
+      1. the first existing file of ``./pa-config.json``, ``~/.pa-config.json``
+      2. ``PA_ENDPOINT`` / ``PA_TOKEN`` / ``PA_TENANT`` environment variables
+      3. the ``endpoint`` / ``token`` / ``tenant`` attributes of ``args``
+
+    Empty or missing env/CLI values leave the lower-precedence value alone.
+    """
+    candidates = (Path.cwd() / "pa-config.json", Path.home() / ".pa-config.json")
+    config_file = next((p for p in candidates if p.is_file()), None)
+    settings: dict[str, str] = {}
+    if config_file is not None:
+        settings.update(json.loads(config_file.read_text(encoding="utf-8")))
+    for name in ("endpoint", "token", "tenant"):
+        # Later entries win, so the CLI value is applied after the env var.
+        overrides = (os.environ.get(f"PA_{name.upper()}"), getattr(args, name, None))
+        for value in overrides:
+            if value:
+                settings[name] = value
+    return settings
+
+
+def upload_chunk(cfg: dict[str, str], chunk: Chunk) -> tuple[int, str]:
+    """POST one chunk as JSON to ``<endpoint>/api/kb/upsert``.
+
+    The payload is the chunk's fields plus a ``tenant`` key (``None`` when
+    the config has no tenant); the token is sent as a Bearer credential.
+
+    Args:
+        cfg: Must contain ``endpoint`` and ``token``; ``tenant`` is optional.
+        chunk: The record to upload.
+
+    Returns:
+        ``(status, body)``. HTTP error responses come back with their real
+        status code. Transport failures (DNS, refused connection, timeout)
+        come back as status ``0`` with the error text, so a single bad
+        request cannot abort a bulk run.
+    """
+    url = f"{cfg['endpoint'].rstrip('/')}/api/kb/upsert"
+    payload = json.dumps({"tenant": cfg.get("tenant"), **asdict(chunk)}).encode("utf-8")
+    req = request.Request(
+        url,
+        data=payload,
+        method="POST",
+        headers={
+            "Authorization": f"Bearer {cfg['token']}",
+            "Content-Type": "application/json",
+        },
+    )
+    try:
+        with request.urlopen(req, timeout=30) as resp:
+            return resp.status, resp.read().decode("utf-8", errors="replace")
+    except error.HTTPError as e:
+        # The server answered with an error status; its body is still useful.
+        return e.code, e.read().decode("utf-8", errors="replace")
+    except OSError as e:
+        # URLError and socket timeouts are OSError subclasses: no response
+        # was received at all, so report a non-2xx pseudo-status instead of
+        # raising out of the caller's upload loop.
+        return 0, f"{type(e).__name__}: {e}"
+
+
+def main() -> None:
+    """Command-line entry point.
+
+    Chunks every ``*.jsonl`` file under ``--parsed`` (in sorted name order)
+    and either writes all chunks to ``--output`` as JSONL (default) or, with
+    ``--upload``, POSTs them one by one via ``upload_chunk``.
+
+    Exits with a message when ``--parsed`` is not a directory or when
+    ``--upload`` lacks an endpoint/token, and with status 1 when any upload
+    failed, so a calling script can detect a partial push.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--parsed", type=Path, required=True)
+    ap.add_argument("--output", type=Path, default=Path("./layer-c-chunks.jsonl"))
+    ap.add_argument("--chunk-size", type=int, default=8)
+    ap.add_argument("--chunk-overlap", type=int, default=2)
+    ap.add_argument("--upload", action="store_true", help="POST chunks to PA KB endpoint")
+    ap.add_argument("--endpoint")
+    ap.add_argument("--token")
+    ap.add_argument("--tenant")
+    args = ap.parse_args()
+
+    # Globbing a missing directory yields nothing, which would otherwise be
+    # reported as a successful run with 0 chunks.
+    if not args.parsed.is_dir():
+        sys.exit(f"--parsed directory not found: {args.parsed}")
+
+    cfg = load_config(args)
+    if args.upload:
+        # Treat empty values (e.g. "" left in the config file) as missing.
+        missing = [k for k in ("endpoint", "token") if not cfg.get(k)]
+        if missing:
+            sys.exit(f"--upload requires config keys: {missing}. See script docstring.")
+
+    total_chunks = 0
+    total_uploads_ok = 0
+    total_uploads_err = 0
+
+    # The output file is only needed in offline (non-upload) mode.
+    out_fp = None if args.upload else args.output.open("w", encoding="utf-8")
+
+    try:
+        for jsonl in sorted(args.parsed.glob("*.jsonl")):
+            turns = load_turns(jsonl)
+            chunks = list(chunk_turns(turns, args.chunk_size, args.chunk_overlap))
+            total_chunks += len(chunks)
+            print(f"[chunk] {jsonl.stem}: {len(turns)} turns -> {len(chunks)} chunks")
+
+            for c in chunks:
+                if out_fp:
+                    out_fp.write(json.dumps(asdict(c), ensure_ascii=False) + "\n")
+                else:
+                    code, body = upload_chunk(cfg, c)
+                    if 200 <= code < 300:
+                        total_uploads_ok += 1
+                    else:
+                        # Keep going: one rejected chunk should not stop the run.
+                        total_uploads_err += 1
+                        snippet = body[:120].replace("\n", " ")
+                        print(f"[err] {c.chunk_id} -> HTTP {code}: {snippet}")
+    finally:
+        if out_fp:
+            out_fp.close()
+
+    if args.upload:
+        print(f"\nUploaded {total_uploads_ok}/{total_chunks} chunks; errors {total_uploads_err}.")
+        if total_uploads_err:
+            # Signal the partial failure through the exit status as well.
+            sys.exit(1)
+    else:
+        print(f"\nWrote {total_chunks} chunks to {args.output}")
+        print("Next: feed to PulseAgent KB importer with collection=conversation_history.")
+
+
+if __name__ == "__main__":
+    main()