|
| 1 | +// import { UnstructuredLoader } from 'langchain/document_loaders/fs/unstructured'; |
| 2 | +import { OpenAIEmbeddings } from 'langchain/embeddings/openai'; |
| 3 | +import { MarkdownTextSplitter } from 'langchain/text_splitter'; |
| 4 | +import { HNSWLib } from 'langchain/vectorstores/hnswlib'; |
| 5 | +import { TEMP_PATH, SEARCH_PATH } from './paths'; |
| 6 | +import { Document } from 'langchain/document'; |
| 7 | +import { simpleGit } from 'simple-git'; |
| 8 | +import { readFile } from 'fs/promises'; |
| 9 | +import { totalist } from 'totalist'; |
| 10 | +import { existsSync } from 'fs'; |
| 11 | + |
// Local checkout location of the Gitpod website repository, under TEMP_PATH.
const repoPath = `${TEMP_PATH}/gitpod`;

// The clone is skipped whenever the checkout directory already exists; an
// existing checkout is reused as-is and is not updated by this script.
const hasCheckout = existsSync(repoPath);

if (hasCheckout === false) {
  console.log('Cloning Gitpod Repo');

  // `--depth 1` requests a shallow clone containing only the latest commit.
  const git = simpleGit();
  await git.clone('https://github.com/gitpod-io/website', repoPath, {
    '--depth': 1,
  });
}
| 21 | + |
console.log('Finding MD files');

// One entry per markdown page: its location on disk and its public URL.
const paths: { path: string; url: string }[] = [];

// Walk the checkout and keep only markdown files that live under `src/routes`.
await totalist(repoPath, (relative, path) => {
  const isRoutePage =
    relative.endsWith('.md') && relative.startsWith('src/routes');
  if (!isRoutePage) return;

  // Derive the site pathname from the repo-relative file path:
  //   1. drop the `src/routes` prefix,
  //   2. drop the trailing `.md` extension (3 characters),
  //   3. turn a trailing `/index` into `/`,
  //   4. strip a trailing slash.
  const withoutRoot = relative.replace('src/routes', '');
  const withoutExtension = withoutRoot.slice(0, -3);
  const withoutIndex = withoutExtension.replace(/\/index$/gm, '/');
  const pathname = withoutIndex.replace(/\/$/gm, '');

  paths.push({
    url: `https://www.gitpod.io${pathname}`,
    path,
  });
});
| 40 | + |
console.log('Getting documents');

// Every chunk produced from every markdown page, in discovery order.
const documents: Document[] = [];

// The splitter is constructed once and reused for every file instead of being
// re-created on each loop iteration.
const splitter = new MarkdownTextSplitter();

for (const { path, url } of paths) {
  // Alternative loader, kept for reference (currently disabled):
  //
  // const loader = new UnstructuredLoader(path, {
  //   // Switch to this if you are getting ratelimits, needs unstructured running locally
  //   // https://js.langchain.com/docs/modules/indexes/document_loaders/examples/file_loaders/unstructured#setup
  //   apiUrl: 'http://127.0.0.1:8000/general/v0/general',
  // });
  //
  // const docs = await loader.loadAndSplit(new MarkdownTextSplitter());
  //
  // for (const doc of docs) {
  //   doc.metadata.source = url;
  // }

  // Files are read one at a time, so chunks are appended in `paths` order.
  const content = await readFile(path, 'utf-8');

  // Split the raw markdown into chunks; each chunk's metadata records the
  // public URL of the page it came from as `source`.
  const docs = await splitter.createDocuments([content], [{ source: url }]);

  documents.push(...docs);
}
| 65 | + |
console.log(`Found ${documents.length} documents`);

console.log('Creating Store');

// Build the HNSWLib vector store from all chunks, using OpenAIEmbeddings to
// embed them.
const store = await HNSWLib.fromDocuments(documents, new OpenAIEmbeddings());

// `save` is asynchronous. Awaiting it ensures 'Done' is only printed once the
// store has actually been written to SEARCH_PATH, and that a failed write
// surfaces as an error here instead of as an unhandled promise rejection.
await store.save(SEARCH_PATH);

console.log('Done');
0 commit comments