Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
e1ef012
feat: Implement documentation ingestion pipeline
pulkitsetu Jan 7, 2026
3da1bff
feat: Add documentation embedding pipeline with AWS Bedrock integration
pulkitsetu Jan 8, 2026
1396640
feat: Implement S3 content upload pipeline with environment variable …
pulkitsetu Jan 9, 2026
1f61910
refactor: improve formatting and readability of MDX files by consolid…
pulkitsetu Jan 20, 2026
cfd9dce
feat: Add MDX normalization script and related tests, update package.…
pulkitsetu Jan 29, 2026
9e68e2d
feat: add API spec normalization script and plain Markdown parser
pulkitsetu Feb 6, 2026
c4bff26
refactor: update embedding pipeline documentation and remove FORCE_EM…
pulkitsetu Feb 6, 2026
703d7cd
feat: fix oversized chunks and add product metadata to ingestion pipe…
pulkitsetu Feb 10, 2026
5d61210
update with docs
pulkitsetu Mar 12, 2026
be2bd14
fix: update .gitignore to remove package-lock.json and yarn.lock; adj…
pulkitsetu Mar 16, 2026
1308c97
fix: restrict pull request branches to main for Docs Ingestion CI
pulkitsetu Mar 16, 2026
254f04e
fix: defer VectorDB construction for dry-run mode and update access m…
pulkitsetu Mar 16, 2026
ca99dc6
fix: use plain Markdown parser for normalized files and update .gitig…
pulkitsetu Mar 18, 2026
8877aa5
chore: migrate from npm to yarn and trigger CI on staging PRs
pulkitsetu Mar 23, 2026
bce9b1b
fix: run normalize-mdx before tests in CI
pulkitsetu Mar 23, 2026
0442bbf
feat: production readiness fixes for ingestion and embedding pipelines
pulkitsetu Mar 25, 2026
edd885b
fix: enable Corepack for Yarn 4 in CI
pulkitsetu Mar 25, 2026
22eaf1a
fix: address code review findings for ingestion and embedding pipelines
pulkitsetu Mar 25, 2026
fb6a0d8
fix: apply review suggestions for ingestion pipeline
pulkitsetu Mar 25, 2026
f9bb103
feat: reduce knowledge gaps with dynamic embedding thresholds
pulkitsetu Mar 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
327 changes: 327 additions & 0 deletions .github/workflows/docs-ingestion-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
name: Docs Ingestion CI

# Triggers: pull requests targeting, and pushes to, main or staging — filtered
# to changes in the two pipelines or in the source material they consume.
on:
  pull_request:
    branches: [main, staging]
    paths:
      - 'docs-ingestion/**'
      - 'docs-embeddings/**'
      - 'api-references/**'
      - 'content/**'
      # Validate edits to this workflow itself before they are merged.
      # Listed for pull requests only, so a workflow-only push to main does
      # not kick off a knowledge-base deployment.
      - '.github/workflows/docs-ingestion-ci.yml'
  push:
    branches: [main, staging]
    paths:
      - 'docs-ingestion/**'
      - 'docs-embeddings/**'
      - 'api-references/**'
      - 'content/**'

# Least-privilege default for GITHUB_TOKEN. A job that needs more (for example
# an OIDC token for a cloud login) declares its own `permissions` block, which
# replaces this default for that job.
permissions:
  contents: read

jobs:
build-and-test:
  # Installs, audits, builds and tests the docs-ingestion package.
  # Every `run` step below executes from docs-ingestion/.
  name: Build & Test
  runs-on: ubuntu-latest
  defaults:
    run:
      working-directory: docs-ingestion

  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

    - name: Enable Corepack
      run: corepack enable

    # Node 20 with the yarn cache keyed on the ingestion lockfile.
    - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
      with:
        node-version: "20"
        cache: yarn
        cache-dependency-path: docs-ingestion/yarn.lock

    # --immutable: fail instead of rewriting yarn.lock.
    - name: Install dependencies
      run: yarn install --immutable

    # Advisory only: findings are reported, but continue-on-error keeps
    # them from failing the job.
    - name: Security audit
      continue-on-error: true
      run: yarn npm audit --severity moderate

    - name: Build
      run: yarn build

    - name: Normalize MDX (required by integration tests)
      run: yarn normalize-mdx

    # ── E. Test suite ──
    - name: Run tests
      run: yarn test

normalize-api-specs:
  # Runs the API spec normaliser and sanity-checks its output: the output
  # directory exists, holds at least 200 Markdown files, is identical across
  # two consecutive runs, passes the token-limit check, and is kept out of git.
  # Every `run` step executes from docs-ingestion/, so the generated
  # directories are addressed as ../<dir>.
  name: API Spec Normalization
  runs-on: ubuntu-latest
  defaults:
    run:
      working-directory: docs-ingestion

  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

    - name: Enable Corepack
      run: corepack enable

    - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
      with:
        node-version: '20'
        cache: 'yarn'
        cache-dependency-path: docs-ingestion/yarn.lock

    - name: Install dependencies
      run: yarn install --immutable

    # ── A. Normalization run ──
    - name: Run API spec normalization
      run: yarn normalize-api-specs

    - name: Verify output directory exists
      run: |
        if [ ! -d "../.api-reference-normalized" ]; then
          echo "FAIL: .api-reference-normalized/ directory does not exist"
          exit 1
        fi
        echo "PASS: Directory exists"

    # Guards against a run that silently drops most of the specs.
    - name: Verify file count
      run: |
        count=$(find ../.api-reference-normalized -name '*.md' | wc -l | tr -d ' ')
        echo "Found $count normalized files"
        if [ "$count" -lt 200 ]; then
          echo "FAIL: Expected at least 200 files, got $count"
          exit 1
        fi
        echo "PASS: File count ($count) >= 200"

    # ── B. Determinism check ──
    # Snapshot the first run, regenerate, then require a byte-identical tree.
    - name: Copy first run output
      run: cp -r ../.api-reference-normalized /tmp/api-ref-norm-run1

    - name: Run normalization again
      run: yarn normalize-api-specs

    # `|| true` keeps diff's non-zero "files differ" status from aborting the
    # script before the explanatory message is printed.
    - name: Verify determinism
      run: |
        diff_output=$(diff -r ../.api-reference-normalized /tmp/api-ref-norm-run1 2>&1) || true
        if [ -n "$diff_output" ]; then
          echo "FAIL: Normalization is not deterministic:"
          echo "$diff_output" | head -20
          exit 1
        fi
        echo "PASS: Output is deterministic"

    # ── C. Token limit compliance ──
    - name: Check token limits
      run: yarn check-token-limits

    # ── F. Git ignored state check ──
    # Two independent conditions are required for each generated directory:
    #   1. nothing under it is tracked (git ls-files --error-unmatch fails), and
    #   2. a .gitignore rule actually matches it (git check-ignore succeeds).
    # Checking only (1) would pass for a directory that is merely untracked,
    # which a later `git add .` could still commit. The check-ignore probe
    # uses a path inside the directory so it works whether or not the
    # directory exists in this job.
    - name: Verify .api-reference-normalized is gitignored
      run: |
        cd ..
        if git ls-files --error-unmatch .api-reference-normalized/ 2>/dev/null; then
          echo "FAIL: .api-reference-normalized/ is tracked by git"
          exit 1
        fi
        if ! git check-ignore -q .api-reference-normalized/probe; then
          echo "FAIL: .api-reference-normalized/ is not matched by any .gitignore rule"
          exit 1
        fi
        echo "PASS: .api-reference-normalized/ is ignored and not tracked"

    - name: Verify .docs-normalized is gitignored
      run: |
        cd ..
        if git ls-files --error-unmatch .docs-normalized/ 2>/dev/null; then
          echo "FAIL: .docs-normalized/ is tracked by git"
          exit 1
        fi
        if ! git check-ignore -q .docs-normalized/probe; then
          echo "FAIL: .docs-normalized/ is not matched by any .gitignore rule"
          exit 1
        fi
        echo "PASS: .docs-normalized/ is ignored and not tracked"

ingestion-smoke-test:
  # Gate: starts only once both the build/test job and the API spec
  # normalisation job have succeeded. `run` steps execute from docs-ingestion/.
  name: Ingestion Smoke Test
  runs-on: ubuntu-latest
  needs:
    - build-and-test
    - normalize-api-specs
  defaults:
    run:
      working-directory: docs-ingestion

  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

    - name: Enable Corepack
      run: corepack enable

    - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
      with:
        node-version: "20"
        cache: yarn
        cache-dependency-path: docs-ingestion/yarn.lock

    - name: Install dependencies
      run: yarn install --immutable

    - name: Build
      run: yarn build

    # Each job starts from a fresh checkout, so the normalized API specs the
    # smoke test reads have to be regenerated here.
    - name: Normalize API specs
      run: yarn normalize-api-specs

    # MDX normalization is attempted only when ../content is present;
    # a failure while normalizing fails the step.
    - name: Normalize MDX (if content exists)
      run: |
        if [ ! -d "../content" ]; then
          echo "No content/ directory — skipping MDX normalization"
        else
          yarn normalize-mdx || exit 1
        fi

    # ── D. Ingestion smoke test ──
    - name: Run ingestion smoke test
      run: yarn smoke-test-ingestion

# Job: rebuilds the ingestion pipeline, runs it, and then (in the later steps
# of this job) builds the embeddings pipeline and runs it in dry-run mode.
# Starts only after ingestion-smoke-test has succeeded.
# This job sets no `defaults.run.working-directory`, so each step names its
# own directory explicitly.
embedding-dry-run:
name: Embedding Dry Run
runs-on: ubuntu-latest
needs: [ingestion-smoke-test]

steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4

- name: Enable Corepack
run: corepack enable

# The yarn cache is keyed on both lockfiles because this job installs
# dependencies for docs-ingestion and docs-embeddings.
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
with:
node-version: '20'
cache: 'yarn'
cache-dependency-path: |
docs-ingestion/yarn.lock
docs-embeddings/yarn.lock

# Build ingestion pipeline and produce chunks.json
- name: Install ingestion dependencies
working-directory: docs-ingestion
run: yarn install --immutable

- name: Build ingestion
working-directory: docs-ingestion
run: yarn build

- name: Normalize API specs
working-directory: docs-ingestion
run: yarn normalize-api-specs

# Runs normalize-mdx only when ../content exists; a normalization failure
# fails the step, while a missing directory is reported and skipped.
- name: Normalize MDX (if content exists)
working-directory: docs-ingestion
run: |
if [ -d "../content" ]; then
yarn normalize-mdx || exit 1
else
echo "No content/ directory — skipping MDX normalization"
fi

# Invoked with no arguments and no extra environment variables.
# NOTE(review): presumably the entry point falls back to built-in default
# input/output locations — confirm against docs-ingestion's index source.
- name: Run ingestion pipeline
working-directory: docs-ingestion
run: node dist/index.js
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No argument is passed here for the `.docs-normalized` path — is that intentional?


# This step sets no working-directory, which is why the path below carries
# the docs-ingestion/ prefix. Fails the job if the ingestion run did not
# write output/chunks.json.
- name: Verify chunks.json exists
run: |
if [ ! -f "docs-ingestion/output/chunks.json" ]; then
echo "FAIL: chunks.json not produced"
exit 1
fi
echo "PASS: chunks.json exists"

# Build embeddings pipeline and run dry-run
- name: Install embedding dependencies
working-directory: docs-embeddings
run: yarn install --immutable

- name: Build embeddings
working-directory: docs-embeddings
run: yarn build

# ── E. Embedding dry run ──
# Dry-run is requested twice: through the DRY_RUN environment variable and
# through the --dry-run flag. No API keys or bucket names are supplied.
# INGESTION_OUTPUT_PATH is an absolute path to the chunks.json checked above.
# NOTE(review): which of DRY_RUN / --dry-run the program actually honours is
# not visible here — confirm in docs-embeddings before removing either.
- name: Run embedding dry run
working-directory: docs-embeddings
env:
DRY_RUN: 'true'
INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json
run: node dist/index.js --dry-run

# ── Deploy: update knowledge base (main only) ──
# Rebuilds both pipelines from a fresh checkout, regenerates chunks.json,
# uploads content to S3 and then runs the embedding sync.
# Runs only for pushes to main, and only after embedding-dry-run succeeded.
deploy-knowledge-base:
  name: Deploy Knowledge Base
  runs-on: ubuntu-latest
  needs: [embedding-dry-run]
  if: github.event_name == 'push' && github.ref == 'refs/heads/main'
  # Serialize deployments: two pushes to main in quick succession must not
  # upload content and sync embeddings at the same time. An in-flight
  # deployment is allowed to finish rather than being cancelled midway.
  concurrency:
    group: docs-knowledge-base-deploy
    cancel-in-progress: false
  permissions:
    id-token: write  # needed to request the OIDC token used for the AWS role
    contents: read

  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

    - name: Enable Corepack
      run: corepack enable

    - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
      with:
        node-version: '20'
        cache: 'yarn'
        cache-dependency-path: |
          docs-ingestion/yarn.lock
          docs-embeddings/yarn.lock

    # ── Build ingestion pipeline ──
    # Everything up to "Configure AWS Credentials" runs without cloud
    # credentials, so dependency install scripts and build tooling never see
    # an AWS session.
    - name: Install ingestion dependencies
      working-directory: docs-ingestion
      run: yarn install --immutable

    - name: Build ingestion
      working-directory: docs-ingestion
      run: yarn build

    - name: Normalize API specs
      working-directory: docs-ingestion
      run: yarn normalize-api-specs

    - name: Normalize MDX
      working-directory: docs-ingestion
      run: yarn normalize-mdx

    - name: Run ingestion pipeline
      working-directory: docs-ingestion
      run: node dist/index.js

    # ── Build embedding pipeline ──
    # Built before anything is uploaded, so a broken embeddings build stops
    # the job before S3 content and the vector index can drift apart.
    - name: Install embedding dependencies
      working-directory: docs-embeddings
      run: yarn install --immutable

    - name: Build embeddings
      working-directory: docs-embeddings
      run: yarn build

    # ── Acquire AWS session (as late as possible) ──
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c  # v4
      with:
        role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
        aws-region: ap-south-1

    # ── Upload content to S3 ──
    - name: Upload content to S3
      working-directory: docs-ingestion
      env:
        CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }}
      run: node dist/upload-content.js

    # ── Run embedding sync ──
    # INGESTION_OUTPUT_PATH is an absolute path to the chunks.json produced
    # by "Run ingestion pipeline" above.
    - name: Run embedding sync
      working-directory: docs-embeddings
      env:
        PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
        PINECONE_INDEX: ${{ secrets.PINECONE_INDEX }}
        CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }}
        INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json
      run: node dist/index.js
14 changes: 14 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Normalized MDX output (generated by docs-ingestion/normalize-mdx)
.docs-normalized/

# Normalized API spec output (generated by docs-ingestion/normalize-api-specs)
.api-reference-normalized/

# Ruflo
.ruflo/

# Claude Code
.claude-flow/
.mcp.json
.claude
.swarm
Loading
Loading