Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions .github/workflows/docs-ingestion-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,11 @@ jobs:
working-directory: docs-embeddings
env:
DRY_RUN: 'true'
AWS_REGION: ${{ vars.AWS_REGION }}
BEDROCK_MODEL_ID: ${{ vars.BEDROCK_MODEL_ID }}
PINECONE_INDEX: ${{ vars.PINECONE_INDEX }}
BATCH_SIZE: ${{ vars.BATCH_SIZE }}
EMBEDDING_CONCURRENCY: ${{ vars.EMBEDDING_CONCURRENCY }}
INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json
run: node dist/index.js --dry-run

Expand Down Expand Up @@ -278,7 +283,7 @@ jobs:
uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c # v4
with:
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
aws-region: ap-south-1
aws-region: ${{ vars.AWS_REGION }}

# ── Build ingestion pipeline ──
- name: Install ingestion dependencies
Expand All @@ -305,7 +310,8 @@ jobs:
- name: Upload content to S3
working-directory: docs-ingestion
env:
CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }}
AWS_REGION: ${{ vars.AWS_REGION }}
CONTENT_BUCKET_NAME: ${{ vars.CONTENT_BUCKET_NAME }}
run: node dist/upload-content.js

# ── Build and run embedding sync ──
Expand All @@ -320,8 +326,12 @@ jobs:
- name: Run embedding sync
working-directory: docs-embeddings
env:
AWS_REGION: ${{ vars.AWS_REGION }}
BEDROCK_MODEL_ID: ${{ vars.BEDROCK_MODEL_ID }}
PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
PINECONE_INDEX: ${{ secrets.PINECONE_INDEX }}
CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }}
PINECONE_INDEX: ${{ vars.PINECONE_INDEX }}
CONTENT_BUCKET_NAME: ${{ vars.CONTENT_BUCKET_NAME }}
BATCH_SIZE: ${{ vars.BATCH_SIZE }}
EMBEDDING_CONCURRENCY: ${{ vars.EMBEDDING_CONCURRENCY }}
INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json
run: node dist/index.js
25 changes: 15 additions & 10 deletions docs-embeddings/src/embed-all.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ function fail(msg: string): never {
process.exit(1);
}

/**
 * Looks up a mandatory environment variable.
 *
 * @param name - Key to read from `process.env`.
 * @returns The variable's value when it is set to a non-empty string.
 *
 * When the variable is unset or empty, control is handed to `fail` with a
 * message naming the variable; `fail` is declared to return `never`.
 */
function requireEnv(name: string): string {
  const raw = process.env[name];
  if (raw) {
    return raw;
  }
  // Unset and empty-string values are both treated as "missing".
  return fail(`${name} environment variable is required`);
}

function runStep(label: string, cmd: string, cwd: string): void {
console.log(`\n── ${label} ──`);
try {
Expand Down Expand Up @@ -144,7 +150,10 @@ async function main(): Promise<void> {
console.log("\n── Pinecone namespace check ──");

const pineconeApiKey = process.env.PINECONE_API_KEY;
const pineconeIndex = process.env.PINECONE_INDEX || "docs-embeddings";
const pineconeIndex = process.env.PINECONE_INDEX;
if (!pineconeIndex) {
fail("PINECONE_INDEX environment variable is required");
}

if (!pineconeApiKey) {
fail("PINECONE_API_KEY environment variable is required");
Expand Down Expand Up @@ -241,18 +250,14 @@ async function main(): Promise<void> {
const config: EmbeddingConfig = {
ingestionOutputPath: chunksPath,
stateFilePath: path.join(process.cwd(), "state", "indexed-hashes.json"),
awsRegion: process.env.AWS_REGION || "ap-south-1",
bedrockModelId:
process.env.BEDROCK_MODEL_ID || "amazon.titan-embed-text-v2:0",
awsRegion: requireEnv("AWS_REGION"),
bedrockModelId: requireEnv("BEDROCK_MODEL_ID"),
pineconeApiKey: pineconeApiKey!,
pineconeIndex,
batchSize: parseInt(process.env.BATCH_SIZE || "25", 10),
pineconeIndex: pineconeIndex!,
batchSize: parseInt(requireEnv("BATCH_SIZE"), 10),
s3ContentBucket: process.env.CONTENT_BUCKET_NAME || undefined,
dryRun: false,
embeddingConcurrency: parseInt(
process.env.EMBEDDING_CONCURRENCY || "5",
10,
),
embeddingConcurrency: parseInt(requireEnv("EMBEDDING_CONCURRENCY"), 10),
};

try {
Expand Down
35 changes: 20 additions & 15 deletions docs-embeddings/src/embedder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
import {
BedrockRuntimeClient,
InvokeModelCommand,
InvokeModelCommandInput
} from '@aws-sdk/client-bedrock-runtime';
InvokeModelCommandInput,
} from "@aws-sdk/client-bedrock-runtime";

export class BedrockEmbedder {
private client: BedrockRuntimeClient;
private modelId: string;
private callCount: number = 0;

constructor(region: string = 'ap-south-1', modelId: string = 'amazon.titan-embed-text-v2:0') {
constructor(region: string, modelId: string) {
this.client = new BedrockRuntimeClient({ region });
this.modelId = modelId;
}
Expand All @@ -33,42 +33,47 @@ export class BedrockEmbedder {
try {
const input: InvokeModelCommandInput = {
modelId: this.modelId,
contentType: 'application/json',
accept: 'application/json',
contentType: "application/json",
accept: "application/json",
body: JSON.stringify({
inputText: content,
normalize: true
})
normalize: true,
}),
};

const command = new InvokeModelCommand(input);
const response = await this.client.send(command);

const responseBody = JSON.parse(
new TextDecoder().decode(response.body)
new TextDecoder().decode(response.body),
);

this.callCount++;
return responseBody.embedding;
} catch (error: any) {
lastError = error;
const isRetryable =
error?.name === 'ThrottlingException' ||
error?.name === 'ServiceUnavailableException' ||
error?.name === 'ModelTimeoutException' ||
error?.name === "ThrottlingException" ||
error?.name === "ServiceUnavailableException" ||
error?.name === "ModelTimeoutException" ||
error?.$metadata?.httpStatusCode === 429 ||
error?.$metadata?.httpStatusCode >= 500;

if (!isRetryable || attempt === BedrockEmbedder.MAX_RETRIES) {
console.error(`Bedrock embedding failed (attempt ${attempt + 1}/${BedrockEmbedder.MAX_RETRIES + 1}):`, error);
console.error(
`Bedrock embedding failed (attempt ${attempt + 1}/${BedrockEmbedder.MAX_RETRIES + 1}):`,
error,
);
throw error;
}

// Full jitter: randomize within [50%, 100%] of exponential delay
// to prevent thundering herd when multiple concurrent calls retry
const maxDelay = Math.min(1000 * Math.pow(2, attempt), 16000);
const delay = Math.floor(maxDelay / 2 + Math.random() * maxDelay / 2);
console.warn(` Bedrock throttled (attempt ${attempt + 1}), retrying in ${delay}ms...`);
const delay = Math.floor(maxDelay / 2 + (Math.random() * maxDelay) / 2);
console.warn(
` Bedrock throttled (attempt ${attempt + 1}), retrying in ${delay}ms...`,
);
await this.sleep(delay);
}
}
Expand Down Expand Up @@ -113,6 +118,6 @@ export class BedrockEmbedder {
}

private sleep(ms: number): Promise<void> {
return new Promise(resolve => setTimeout(resolve, ms));
return new Promise((resolve) => setTimeout(resolve, ms));
}
}
16 changes: 11 additions & 5 deletions docs-embeddings/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ import type { EmbeddingConfig } from './types.js';
dotenv.config({ path: '.env.local' });
dotenv.config({ path: '.env' });

/**
 * Looks up a mandatory environment variable.
 *
 * @param name - Key to read from `process.env`.
 * @returns The variable's value when it is set to a non-empty string.
 * @throws Error naming the variable when it is unset or is the empty string.
 */
function requireEnv(name: string): string {
  const raw = process.env[name];
  if (raw) {
    return raw;
  }
  // Unset and empty-string values are both treated as "missing".
  throw new Error(`${name} environment variable is required`);
}

async function main() {
try {
const dryRun = process.argv.includes('--dry-run') || process.env.DRY_RUN === 'true';
Expand All @@ -21,14 +27,14 @@ async function main() {
path.join(process.cwd(), '..', 'docs-ingestion', 'output', 'chunks.json'),
stateFilePath: process.env.STATE_FILE_PATH ||
path.join(process.cwd(), 'state', 'indexed-hashes.json'),
awsRegion: process.env.AWS_REGION || 'ap-south-1',
bedrockModelId: process.env.BEDROCK_MODEL_ID || 'amazon.titan-embed-text-v2:0',
awsRegion: requireEnv('AWS_REGION'),
bedrockModelId: requireEnv('BEDROCK_MODEL_ID'),
pineconeApiKey: process.env.PINECONE_API_KEY || '',
pineconeIndex: process.env.PINECONE_INDEX || 'docs-embeddings',
batchSize: parseInt(process.env.BATCH_SIZE || '25', 10),
pineconeIndex: requireEnv('PINECONE_INDEX'),
batchSize: parseInt(requireEnv('BATCH_SIZE'), 10),
s3ContentBucket: process.env.CONTENT_BUCKET_NAME || undefined,
dryRun,
embeddingConcurrency: parseInt(process.env.EMBEDDING_CONCURRENCY || '3', 10),
embeddingConcurrency: parseInt(requireEnv('EMBEDDING_CONCURRENCY'), 10),
};

// Validate (Pinecone key not required in dry-run)
Expand Down
7 changes: 5 additions & 2 deletions docs-embeddings/src/verify-embed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,10 @@ async function main(): Promise<void> {

// ── Connect to Pinecone ──────────────────────────────────────
const pineconeApiKey = process.env.PINECONE_API_KEY;
const pineconeIndex = process.env.PINECONE_INDEX || "docs-embeddings";
const pineconeIndex = process.env.PINECONE_INDEX;
if (!pineconeIndex) {
fail("PINECONE_INDEX environment variable is required");
}

if (!pineconeApiKey) {
fail("PINECONE_API_KEY environment variable is required");
Expand Down Expand Up @@ -105,7 +108,7 @@ async function main(): Promise<void> {
console.log(" CONTENT_BUCKET_NAME not set — skipping S3 check\n");
} else {
const s3 = new S3Client({
region: process.env.AWS_REGION || "ap-south-1",
region: (() => { const r = process.env.AWS_REGION; if (!r) fail("AWS_REGION environment variable is required"); return r; })(),
});

// Deterministic sample: pick every Nth chunk instead of random
Expand Down
5 changes: 4 additions & 1 deletion docs-ingestion/src/upload-content.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ async function main() {
try {
// Load config from environment
const bucketName = process.env.CONTENT_BUCKET_NAME;
const awsRegion = process.env.AWS_REGION || "ap-south-1";
const awsRegion = process.env.AWS_REGION;
if (!awsRegion) {
throw new Error("AWS_REGION environment variable is required");
}
const outputPath =
process.env.OUTPUT_PATH ||
path.join(process.cwd(), "output", "chunks.json");
Expand Down
Loading