@@ -383,6 +383,296 @@ export class WikiCompiler {
    };
  }

  /**
   * Ingest a URL: fetch the page content, convert it to markdown, and compile
   * it into the wiki. Supports single pages and basic site crawling (follows
   * internal links on the same host, up to maxPages pages). The depth option
   * is accepted but not yet used.
   */
  async ingestUrl(
    url: string,
    opts?: { maxPages?: number; depth?: number }
  ): Promise<CompileResult> {
    const maxPages = opts?.maxPages ?? 20;
    const created: string[] = [];
    const visited = new Set<string>();
    const queue = [url];
    let baseHost: string;

    try {
      baseHost = new URL(url).hostname;
    } catch {
      return {
        created: [],
        updated: [],
        totalArticles: this.countArticles(),
        compiledAt: Date.now(),
      };
    }

    while (queue.length > 0 && visited.size < maxPages) {
      const pageUrl = queue.shift();
      if (!pageUrl || visited.has(pageUrl)) continue;
      visited.add(pageUrl);

      try {
        const { title, content, links } = await this.fetchPage(pageUrl);
        if (!content || content.length < 50) continue;

        // Create source article
        const slug = this.slugify(title || this.urlToSlug(pageUrl));
        const sourcePath = `sources/${slug}.md`;

        const article = [
          '---',
          `title: "${this.escapeYaml(title || pageUrl)}"`,
          `category: source`,
          `url: "${pageUrl}"`,
          `created: ${new Date().toISOString()}`,
          `updated: ${new Date().toISOString()}`,
          `tags: [source, web-ingest, ${baseHost}]`,
          '---',
          '',
          `# ${title || pageUrl}`,
          '',
          `> Source: ${pageUrl}`,
          '',
          content.slice(0, 8000),
          content.length > 8000 ? '\n\n_...truncated..._' : '',
          '',
        ].join('\n');

        this.writeArticle(sourcePath, article);
        created.push(sourcePath);

        // Queue internal links for crawling
        for (const link of links) {
          try {
            const parsed = new URL(link, pageUrl);
            if (parsed.hostname === baseHost && !visited.has(parsed.href)) {
              queue.push(parsed.href);
            }
          } catch {
            // skip invalid URLs
          }
        }
      } catch {
        // skip pages that fail to fetch or parse
      }
    }

    if (created.length > 0) {
      this.updateIndex();
      this.appendLog(
        `## [${new Date().toISOString().slice(0, 10)}] ingest-url | ${baseHost}`,
        `Crawled ${visited.size} pages from ${url} - ${created.length} articles created`
      );
    }

    return {
      created,
      updated: [],
      totalArticles: this.countArticles(),
      compiledAt: Date.now(),
    };
  }
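
  // Usage sketch (assumes a constructed WikiCompiler instance named `wiki`).
  // Crawling is breadth-first over same-host links and stops once maxPages
  // pages have been visited:
  //
  //   const result = await wiki.ingestUrl('https://example.com/docs', { maxPages: 5 });
  //   result.created        // e.g. ['sources/getting-started.md', ...]
  //   result.totalArticles  // wiki size after the compile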

  /**
   * Ingest a local file or directory into the wiki.
   */
  async ingestPath(filePath: string): Promise<CompileResult> {
    const created: string[] = [];
    const { statSync } = await import('fs');

    // Missing path: return an empty result, matching ingestUrl's behavior
    // on an invalid URL.
    if (!existsSync(filePath)) {
      return {
        created: [],
        updated: [],
        totalArticles: this.countArticles(),
        compiledAt: Date.now(),
      };
    }
    const stat = statSync(filePath);

    const processFile = (fp: string) => {
      // readdirSync with { recursive: true } also yields directory entries,
      // so skip anything that is not a regular file.
      if (!existsSync(fp) || !statSync(fp).isFile()) return;
      const content = readFileSync(fp, 'utf-8');
      const basename = fp.split(/[\\/]/).pop() ?? fp;
      const ext = basename.split('.').pop() ?? '';

      const allowed = ['md', 'txt', 'json', 'yaml', 'yml', 'toml', 'ts', 'js', 'py'];
      if (!allowed.includes(ext)) return;

      const title = basename.replace(/\.[^.]+$/, '');
      const slug = this.slugify(title);
      const sourcePath = `sources/${slug}.md`;

      const article = [
        '---',
        `title: "${this.escapeYaml(title)}"`,
        `category: source`,
        `source_file: "${fp}"`,
        `created: ${new Date().toISOString()}`,
        `updated: ${new Date().toISOString()}`,
        `tags: [source, local-ingest]`,
        '---',
        '',
        `# ${title}`,
        '',
        `> Source: \`${fp}\``,
        '',
        content.slice(0, 8000),
        content.length > 8000 ? '\n\n_...truncated..._' : '',
        '',
      ].join('\n');

      this.writeArticle(sourcePath, article);
      created.push(sourcePath);
    };

    if (stat.isDirectory()) {
      const entries = readdirSync(filePath, { recursive: true }) as string[];
      for (const entry of entries) {
        processFile(join(filePath, entry));
      }
    } else {
      processFile(filePath);
    }

    if (created.length > 0) {
      this.updateIndex();
      this.appendLog(
        `## [${new Date().toISOString().slice(0, 10)}] ingest-path | ${filePath}`,
        `${created.length} articles from local path`
      );
    }

    return {
      created,
      updated: [],
      totalArticles: this.countArticles(),
      compiledAt: Date.now(),
    };
  }
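
  // Usage sketch (hypothetical `wiki` instance). Directories are walked
  // recursively; only the text extensions listed above are ingested:
  //
  //   await wiki.ingestPath('./docs');       // whole directory tree
  //   await wiki.ingestPath('./README.md');  // single file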

  /** Fetch a web page and extract its title, markdown content, and links */
  private async fetchPage(
    url: string
  ): Promise<{ title: string; content: string; links: string[] }> {
    const res = await fetch(url, {
      headers: { 'User-Agent': 'StackMemory-Wiki/1.0' },
      signal: AbortSignal.timeout(10000),
    });
    if (!res.ok) return { title: '', content: '', links: [] };

    const html = await res.text();

    // Extract title
    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    const title = titleMatch?.[1]?.trim() ?? '';

    // Extract links
    const links: string[] = [];
    const linkRe = /href="([^"]+)"/g;
    let linkMatch;
    while ((linkMatch = linkRe.exec(html)) !== null) {
      const href = linkMatch[1] ?? '';
      if (
        href &&
        !href.startsWith('#') &&
        !href.startsWith('javascript:') &&
        !href.startsWith('mailto:')
      ) {
        links.push(href);
      }
    }

    // Convert HTML to markdown (simple extraction)
    const content = this.htmlToMarkdown(html);

    return { title, content, links };
  }
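
  // Shape sketch: fetchPage('https://example.com') resolves to roughly
  //   { title: 'Example Domain', content: '# Example Domain ...', links: ['/about', ...] }
  // On a non-OK response it resolves with empty fields rather than throwing;
  // network errors and timeouts still reject and are caught by ingestUrl.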

  /** Simple HTML to markdown conversion */
  private htmlToMarkdown(html: string): string {
    let text = html;

    // Remove script, style, nav, footer, header, aside blocks
    text = text.replace(
      /<(script|style|nav|footer|header|aside)[^>]*>[\s\S]*?<\/\1>/gi,
      ''
    );

    // Extract main/article content if present
    const mainMatch = text.match(
      /<(?:main|article)[^>]*>([\s\S]*?)<\/(?:main|article)>/i
    );
    if (mainMatch) text = mainMatch[1] ?? text;

    // Headings
    text = text.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n# $1\n');
    text = text.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n## $1\n');
    text = text.replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n### $1\n');
    text = text.replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n#### $1\n');

    // Paragraphs and line breaks
    text = text.replace(/<p[^>]*>/gi, '\n');
    text = text.replace(/<\/p>/gi, '\n');
    text = text.replace(/<br\s*\/?>/gi, '\n');

    // Lists
    text = text.replace(/<li[^>]*>/gi, '- ');
    text = text.replace(/<\/li>/gi, '\n');

    // Bold, italic
    text = text.replace(
      /<(?:strong|b)[^>]*>([\s\S]*?)<\/(?:strong|b)>/gi,
      '**$1**'
    );
    text = text.replace(/<(?:em|i)[^>]*>([\s\S]*?)<\/(?:em|i)>/gi, '*$1*');

    // Code
    text = text.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');
    text = text.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, '\n```\n$1\n```\n');

    // Links
    text = text.replace(
      /<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi,
      '[$2]($1)'
    );

    // Strip remaining tags
    text = text.replace(/<[^>]+>/g, '');

    // Decode entities; &amp; must come last so '&amp;lt;' does not
    // double-decode into '<'
    text = text.replace(/&lt;/g, '<');
    text = text.replace(/&gt;/g, '>');
    text = text.replace(/&quot;/g, '"');
    text = text.replace(/&#39;/g, "'");
    text = text.replace(/&nbsp;/g, ' ');
    text = text.replace(/&amp;/g, '&');

    // Collapse runs of blank lines
    text = text.replace(/\n{3,}/g, '\n\n');
    text = text.trim();

    return text;
  }
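
  // Worked example of the rules above:
  //
  //   htmlToMarkdown('<h2>Setup</h2><p>Run <code>npm i</code> now.</p>')
  //   // => '## Setup\n\nRun `npm i` now.'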

  /** Convert a URL path to a readable slug */
  private urlToSlug(url: string): string {
    try {
      const parsed = new URL(url);
      const path = parsed.pathname.replace(/^\/|\/$/g, '');
      return path
        ? this.slugify(path.replace(/\//g, '-'))
        : this.slugify(parsed.hostname);
    } catch {
      return this.slugify(url);
    }
  }
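
  // Example mappings (assuming slugify, defined elsewhere in this class,
  // lowercases and hyphenates non-word characters):
  //
  //   urlToSlug('https://example.com/docs/Getting-Started')  // => 'docs-getting-started'
  //   urlToSlug('https://example.com/')                      // => 'example-com'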

  /** Lint the wiki for health issues */
  async lint(): Promise<WikiLintResult> {
    const allArticles = this.listAllArticles();