Skip to content

Commit 11c37ac

Browse files
committed
feat: new build script
1 parent 07d67b6 commit 11c37ac

File tree

3 files changed

+78
-1
lines changed

3 files changed

+78
-1
lines changed

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"start:discord": "pnpm --filter discord-qa -r start",
66
"start:api": "pnpm --filter api -r start",
77
"dev:api": "pnpm --filter api -r dev",
8-
"generate:discord-index": "python3 scripts/generate_discord_index.py > search-index/discord_theads_store.json"
8+
"generate:discord-index": "python3 scripts/generate_discord_index.py > search-index/discord_theads_store.json",
9+
"build:index": "pnpm --filter docs-qa -r build"
910
}
1011
}

packages/qa/package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
"name": "@gitpod/docs-qa",
33
"private": true,
44
"main": "./src/index.ts",
5+
"scripts": {
6+
"build": "tsm src/build.ts"
7+
},
58
"devDependencies": {
69
"@types/marked": "^5.0.0",
710
"@types/node": "^20.2.1",

packages/qa/src/build.ts

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
// import { UnstructuredLoader } from 'langchain/document_loaders/fs/unstructured';
2+
import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
3+
import { MarkdownTextSplitter } from 'langchain/text_splitter';
4+
import { HNSWLib } from 'langchain/vectorstores/hnswlib';
5+
import { TEMP_PATH, SEARCH_PATH } from './paths';
6+
import { Document } from 'langchain/document';
7+
import { simpleGit } from 'simple-git';
8+
import { readFile } from 'fs/promises';
9+
import { totalist } from 'totalist';
10+
import { existsSync } from 'fs';
11+
12+
// Local checkout location of the Gitpod website repository.
const repoPath = `${TEMP_PATH}/gitpod`;

// Reuse an existing checkout; only clone when the directory is absent.
const hasCheckout = existsSync(repoPath);

if (!hasCheckout) {
  console.log('Cloning Gitpod Repo');

  // Depth-1 clone: only the latest snapshot of the repository is fetched.
  const git = simpleGit();
  await git.clone('https://github.com/gitpod-io/website', repoPath, { '--depth': 1 });
}
}
21+
22+
console.log('Finding MD files');

// Each entry pairs a markdown file on disk with the gitpod.io URL derived from its route.
const paths: { path: string; url: string }[] = [];

await totalist(repoPath, (relative, absolute) => {
  // Only markdown files located under src/routes are collected.
  const isRouteMarkdown = relative.startsWith('src/routes') && relative.endsWith('.md');
  if (!isRouteMarkdown) return;

  // Turn the repo-relative file path into a site pathname, step by step.
  const withoutPrefix = relative.replace('src/routes', '');
  const withoutExtension = withoutPrefix.slice(0, -3); // drop the ".md" suffix
  const withoutIndex = withoutExtension.replace(/\/index$/gm, '/'); // ".../index" -> ".../"
  const pathname = withoutIndex.replace(/\/$/gm, ''); // strip the trailing slash

  paths.push({
    url: `https://www.gitpod.io${pathname}`,
    path: absolute,
  });
});
40+
41+
console.log('Getting documents');

// All chunks produced from every markdown file, in the order the files were found.
const documents: Document[] = [];

// The splitter is constructed with default options and takes no per-file
// arguments, so build it once instead of once per file.
const splitter = new MarkdownTextSplitter();

for (const { path, url } of paths) {
  // Alternative loader, kept for reference:
  //
  // const loader = new UnstructuredLoader(path, {
  //   // Switch to this if you are getting ratelimits, needs unstructured running locally
  //   // https://js.langchain.com/docs/modules/indexes/document_loaders/examples/file_loaders/unstructured#setup
  //   apiUrl: 'http://127.0.0.1:8000/general/v0/general',
  // });
  //
  // const docs = await loader.loadAndSplit(new MarkdownTextSplitter());
  //
  // for (const doc of docs) {
  //   doc.metadata.source = url;
  // }

  const content = await readFile(path, 'utf-8');

  // The page URL is passed as `source` metadata for the chunks of this file.
  const docs = await splitter.createDocuments([content], [{ source: url }]);

  documents.push(...docs);
}

console.log(`Found ${documents.length} documents`);
67+
68+
console.log('Creating Store');

// Build the HNSW vector store from all collected chunks using OpenAI embeddings.
const store = await HNSWLib.fromDocuments(documents, new OpenAIEmbeddings());

// save() returns a promise. Await it so a write failure rejects here and
// 'Done' is only logged once the index has actually been written.
await store.save(SEARCH_PATH);

console.log('Done');

0 commit comments

Comments
 (0)