
Commit d216350

Committed by StackMemory Bot (CLI)
feat(wiki): add ingest command — crawl URLs or local paths into wiki
- stackmemory wiki ingest <url> — crawl site, follow internal links (max 20 pages)
- stackmemory wiki ingest <path> — ingest local file or directory

Includes HTML→markdown converter, link extraction, internal link crawling.
Tested: 20 Stripe API docs pages ingested in one command.
1 parent 5c5d01e commit d216350

2 files changed: 338 additions & 0 deletions

src/cli/commands/wiki.ts: 48 additions & 0 deletions

@@ -265,6 +265,54 @@ export function createWikiCommand(): Command {
       console.log(chalk.gray(` Total articles: ${result.totalArticles}`));
     });
 
+  // ── ingest ──
+  cmd
+    .command('ingest <source>')
+    .description('Ingest a URL or local path into the wiki')
+    .option('--wiki-dir <path>', 'Override wiki directory')
+    .option('-n, --max-pages <n>', 'Max pages to crawl for URLs', '20')
+    .option('--json', 'Output as JSON')
+    .action(async (source: string, options) => {
+      const compiler = getCompiler(options.wikiDir);
+      await compiler.initialize();
+
+      const isUrl =
+        source.startsWith('http://') || source.startsWith('https://');
+      let result;
+
+      if (isUrl) {
+        console.log(
+          chalk.cyan(`Crawling ${source} (max ${options.maxPages} pages)...`)
+        );
+        result = await compiler.ingestUrl(source, {
+          maxPages: parseInt(options.maxPages, 10),
+        });
+      } else {
+        console.log(chalk.cyan(`Ingesting ${source}...`));
+        result = await compiler.ingestPath(source);
+      }
+
+      if (options.json) {
+        console.log(JSON.stringify(result, null, 2));
+        return;
+      }
+
+      console.log(chalk.green('\nIngested.'));
+      console.log(chalk.gray(` Articles created: ${result.created.length}`));
+      console.log(chalk.gray(` Total articles: ${result.totalArticles}`));
+      if (result.created.length > 0) {
+        console.log(chalk.gray('\n Created:'));
+        result.created
+          .slice(0, 10)
+          .forEach((p) => console.log(chalk.gray(` - ${p}`)));
+        if (result.created.length > 10) {
+          console.log(
+            chalk.gray(` ...and ${result.created.length - 10} more`)
+          );
+        }
+      }
+    });
+
   // ── lint ──
   cmd
     .command('lint')

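Since `--json` prints the raw result object, the command composes with scripts. A minimal sketch of consuming that output in TypeScript; the `CompileResult` interface below is inferred from the fields the action prints, not the exported type:

```ts
import { execFileSync } from 'node:child_process';

// Inferred shape; the real CompileResult type lives in wiki-compiler.ts.
interface CompileResult {
  created: string[];
  updated: string[];
  totalArticles: number;
  compiledAt: number;
}

const stdout = execFileSync(
  'stackmemory',
  ['wiki', 'ingest', 'https://example.com/docs', '--json'],
  { encoding: 'utf-8' }
);
const result: CompileResult = JSON.parse(stdout);
console.log(`${result.created.length} created, ${result.totalArticles} total`);
```
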
src/core/wiki/wiki-compiler.ts: 290 additions & 0 deletions

@@ -383,6 +383,296 @@ export class WikiCompiler {
     };
   }
 
+  /**
+   * Ingest a URL — fetch page content, convert to markdown, compile into wiki.
+   * Supports single pages and basic site crawling (follows internal links up to maxPages).
+   */
+  async ingestUrl(
+    url: string,
+    opts?: { maxPages?: number; depth?: number }
+  ): Promise<CompileResult> {
+    const maxPages = opts?.maxPages ?? 20;
+    const created: string[] = [];
+    const visited = new Set<string>();
+    const queue = [url];
+    let baseHost: string;
+
+    try {
+      baseHost = new URL(url).hostname;
+    } catch {
+      return {
+        created: [],
+        updated: [],
+        totalArticles: this.countArticles(),
+        compiledAt: Date.now(),
+      };
+    }
+
+    while (queue.length > 0 && visited.size < maxPages) {
+      const pageUrl = queue.shift();
+      if (!pageUrl || visited.has(pageUrl)) continue;
+      visited.add(pageUrl);
+
+      try {
+        const { title, content, links } = await this.fetchPage(pageUrl);
+        if (!content || content.length < 50) continue;
+
+        // Create source article
+        const slug = this.slugify(title || this.urlToSlug(pageUrl));
+        const sourcePath = `sources/${slug}.md`;
+
+        const article = [
+          '---',
+          `title: "${this.escapeYaml(title || pageUrl)}"`,
+          `category: source`,
+          `url: "${pageUrl}"`,
+          `created: ${new Date().toISOString()}`,
+          `updated: ${new Date().toISOString()}`,
+          `tags: [source, web-ingest, ${baseHost}]`,
+          '---',
+          '',
+          `# ${title || pageUrl}`,
+          '',
+          `> Source: ${pageUrl}`,
+          '',
+          content.slice(0, 8000),
+          content.length > 8000 ? '\n\n_...truncated..._' : '',
+          '',
+        ].join('\n');
+
+        this.writeArticle(sourcePath, article);
+        created.push(sourcePath);
+
+        // Queue internal links
+        for (const link of links) {
+          try {
+            const parsed = new URL(link, pageUrl);
+            if (parsed.hostname === baseHost && !visited.has(parsed.href)) {
+              queue.push(parsed.href);
+            }
+          } catch {
+            // skip invalid URLs
+          }
+        }
+      } catch {
+        // skip failed pages
+      }
+    }
+
+    if (created.length > 0) {
+      this.updateIndex();
+      this.appendLog(
+        `## [${new Date().toISOString().slice(0, 10)}] ingest-url | ${baseHost}`,
+        `Crawled ${visited.size} pages from ${url}, ${created.length} articles created`
+      );
+    }
+
+    return {
+      created,
+      updated: [],
+      totalArticles: this.countArticles(),
+      compiledAt: Date.now(),
+    };
+  }
+
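The crawl above is a breadth-first walk over same-host links, capped at `maxPages` (pages that fail to fetch still count toward the cap, since they enter `visited` before the fetch). A standalone sketch of just that policy, with a stubbed `linksOf` standing in for `fetchPage`:

```ts
// Illustrative only: `linksOf` stands in for fetchPage's link extraction.
function crawlOrder(
  start: string,
  linksOf: (page: string) => string[],
  maxPages = 20
): string[] {
  const baseHost = new URL(start).hostname;
  const visited = new Set<string>();
  const queue = [start];

  while (queue.length > 0 && visited.size < maxPages) {
    const page = queue.shift();
    if (!page || visited.has(page)) continue;
    visited.add(page);

    for (const link of linksOf(page)) {
      try {
        const resolved = new URL(link, page); // resolves relative hrefs
        if (resolved.hostname === baseHost && !visited.has(resolved.href)) {
          queue.push(resolved.href);
        }
      } catch {
        // skip invalid URLs, as ingestUrl does
      }
    }
  }
  return [...visited];
}
```
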
+  /**
+   * Ingest a local file or directory into the wiki.
+   */
+  async ingestPath(filePath: string): Promise<CompileResult> {
+    const created: string[] = [];
+    const { statSync } = await import('fs');
+    const stat = statSync(filePath);
+
+    const processFile = (fp: string) => {
+      if (!existsSync(fp)) return;
+      const content = readFileSync(fp, 'utf-8');
+      const basename = fp.split('/').pop() ?? fp;
+      const ext = basename.split('.').pop() ?? '';
+
+      if (
+        ![
+          'md',
+          'txt',
+          'json',
+          'yaml',
+          'yml',
+          'toml',
+          'ts',
+          'js',
+          'py',
+        ].includes(ext)
+      )
+        return;
+
+      const title = basename.replace(/\.[^.]+$/, '');
+      const slug = this.slugify(title);
+      const sourcePath = `sources/${slug}.md`;
+
+      const article = [
+        '---',
+        `title: "${this.escapeYaml(title)}"`,
+        `category: source`,
+        `source_file: "${fp}"`,
+        `created: ${new Date().toISOString()}`,
+        `updated: ${new Date().toISOString()}`,
+        `tags: [source, local-ingest]`,
+        '---',
+        '',
+        `# ${title}`,
+        '',
+        `> Source: \`${fp}\``,
+        '',
+        content.slice(0, 8000),
+        content.length > 8000 ? '\n\n_...truncated..._' : '',
+        '',
+      ].join('\n');
+
+      this.writeArticle(sourcePath, article);
+      created.push(sourcePath);
+    };
+
+    if (stat.isDirectory()) {
+      const entries = readdirSync(filePath, { recursive: true }) as string[];
+      for (const entry of entries) {
+        processFile(join(filePath, entry));
+      }
+    } else {
+      processFile(filePath);
+    }
+
+    if (created.length > 0) {
+      this.updateIndex();
+      this.appendLog(
+        `## [${new Date().toISOString().slice(0, 10)}] ingest-path | ${filePath}`,
+        `${created.length} articles from local path`
+      );
+    }
+
+    return {
+      created,
+      updated: [],
+      totalArticles: this.countArticles(),
+      compiledAt: Date.now(),
+    };
+  }
+
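For a hypothetical local file `notes/auth.md`, the template above would produce `sources/auth.md` roughly as follows (timestamps illustrative):

```markdown
---
title: "auth"
category: source
source_file: "notes/auth.md"
created: 2025-01-01T00:00:00.000Z
updated: 2025-01-01T00:00:00.000Z
tags: [source, local-ingest]
---

# auth

> Source: `notes/auth.md`

...first 8000 characters of the file...
```
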
+  /** Fetch a web page and extract title, markdown content, and links */
+  private async fetchPage(
+    url: string
+  ): Promise<{ title: string; content: string; links: string[] }> {
+    const res = await fetch(url, {
+      headers: { 'User-Agent': 'StackMemory-Wiki/1.0' },
+      signal: AbortSignal.timeout(10000),
+    });
+    if (!res.ok) return { title: '', content: '', links: [] };
+
+    const html = await res.text();
+
+    // Extract title
+    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
+    const title = titleMatch?.[1]?.trim() ?? '';
+
+    // Extract links
+    const links: string[] = [];
+    const linkRe = /href="([^"]+)"/g;
+    let linkMatch;
+    while ((linkMatch = linkRe.exec(html)) !== null) {
+      const href = linkMatch[1] ?? '';
+      if (
+        href &&
+        !href.startsWith('#') &&
+        !href.startsWith('javascript:') &&
+        !href.startsWith('mailto:')
+      ) {
+        links.push(href);
+      }
+    }
+
+    // Convert HTML to markdown (simple extraction)
+    const content = this.htmlToMarkdown(html);
+
+    return { title, content, links };
+  }
+
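Title and link extraction is regex-based rather than a real HTML parse, so only double-quoted `href` attributes are seen. A quick trace of those regexes on a toy document, with results noted in comments:

```ts
const html =
  '<title>API Docs</title><a href="/charges">Charges</a>' +
  '<a href="#top">Top</a><a href="mailto:x@y.z">Mail</a>';

const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim();

const links: string[] = [];
const linkRe = /href="([^"]+)"/g;
let m: RegExpExecArray | null;
while ((m = linkRe.exec(html)) !== null) {
  const href = m[1] ?? '';
  if (
    href &&
    !href.startsWith('#') &&
    !href.startsWith('javascript:') &&
    !href.startsWith('mailto:')
  ) {
    links.push(href);
  }
}

console.log(title, links); // "API Docs" [ '/charges' ]
```
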
+  /** Simple HTML to markdown conversion */
+  private htmlToMarkdown(html: string): string {
+    let text = html;
+
+    // Remove script, style, nav, footer, header
+    text = text.replace(
+      /<(script|style|nav|footer|header|aside)[^>]*>[\s\S]*?<\/\1>/gi,
+      ''
+    );
+
+    // Extract main/article content if present
+    const mainMatch = text.match(
+      /<(?:main|article)[^>]*>([\s\S]*?)<\/(?:main|article)>/i
+    );
+    if (mainMatch) text = mainMatch[1] ?? text;
+
+    // Headings
+    text = text.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n# $1\n');
+    text = text.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n## $1\n');
+    text = text.replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n### $1\n');
+    text = text.replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n#### $1\n');
+
+    // Paragraphs and line breaks
+    text = text.replace(/<p[^>]*>/gi, '\n');
+    text = text.replace(/<\/p>/gi, '\n');
+    text = text.replace(/<br\s*\/?>/gi, '\n');
+
+    // Lists
+    text = text.replace(/<li[^>]*>/gi, '- ');
+    text = text.replace(/<\/li>/gi, '\n');
+
+    // Bold, italic
+    text = text.replace(
+      /<(?:strong|b)[^>]*>([\s\S]*?)<\/(?:strong|b)>/gi,
+      '**$1**'
+    );
+    text = text.replace(/<(?:em|i)[^>]*>([\s\S]*?)<\/(?:em|i)>/gi, '*$1*');
+
+    // Code
+    text = text.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');
+    text = text.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, '\n```\n$1\n```\n');
+
+    // Links
+    text = text.replace(
+      /<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi,
+      '[$2]($1)'
+    );
+
+    // Strip remaining tags
+    text = text.replace(/<[^>]+>/g, '');
+
+    // Decode entities (&amp; last, so "&amp;lt;" is not double-unescaped)
+    text = text.replace(/&lt;/g, '<');
+    text = text.replace(/&gt;/g, '>');
+    text = text.replace(/&quot;/g, '"');
+    text = text.replace(/&#39;/g, "'");
+    text = text.replace(/&nbsp;/g, ' ');
+    text = text.replace(/&amp;/g, '&');
+
+    // Clean up whitespace
+    text = text.replace(/\n{3,}/g, '\n\n');
+    text = text.trim();
+
+    return text;
+  }
+
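A small input/output example for the converter, traced by hand through the replacements above (exact whitespace approximate):

```ts
const sample =
  '<main><h2>Auth</h2><p>Use <code>Bearer</code> tokens &amp; keys.</p></main>';

// htmlToMarkdown(sample) yields roughly:
//
// ## Auth
//
// Use `Bearer` tokens & keys.
```
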
+  /** Convert URL path to a readable slug */
+  private urlToSlug(url: string): string {
+    try {
+      const parsed = new URL(url);
+      const path = parsed.pathname.replace(/^\/|\/$/g, '');
+      return path
+        ? this.slugify(path.replace(/\//g, '-'))
+        : this.slugify(parsed.hostname);
+    } catch {
+      return this.slugify(url);
+    }
+  }
+
   /** Lint the wiki for health issues */
   async lint(): Promise<WikiLintResult> {
     const allArticles = this.listAllArticles();

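For reference, how `urlToSlug` maps typical inputs, assuming `slugify` (defined elsewhere in this class, not shown in the diff) lowercases and hyphenates:

```ts
// Hypothetical outputs; the exact form depends on slugify's rules.
// urlToSlug('https://docs.stripe.com/api/charges/create') → 'api-charges-create'
// urlToSlug('https://docs.stripe.com/')                   → 'docs-stripe-com'
// urlToSlug('not a url')                                  → slugify('not a url')
```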