chore: streamlined legacy-json and ast generation

ovflowd · ovflowd · commit 9f42aa5dee92 · 2025-12-09T02:34:41.000+01:00
diff --git a/bin/commands/generate.mjs b/bin/commands/generate.mjs
@@ -6,20 +6,13 @@ import { coerce } from 'semver';
 import { NODE_CHANGELOG_URL, NODE_VERSION } from '../../src/constants.mjs';
 import { publicGenerators } from '../../src/generators/index.mjs';
 import createGenerator from '../../src/generators.mjs';
+import logger from '../../src/logger/index.mjs';
 import { parseChangelog, parseIndex } from '../../src/parsers/markdown.mjs';
 import { DEFAULT_TYPE_MAP } from '../../src/utils/parser/constants.mjs';
 import { loadFromURL } from '../../src/utils/parser.mjs';
-import { loadAndParse } from '../utils.mjs';
 
 const availableGenerators = Object.keys(publicGenerators);
 
-// Half of available logical CPUs guarantees in general all physical CPUs are being used
-// which in most scenarios is the best way to maximize performance
-// When spawning more than a said number of threads, the overhead of context switching
-// and CPU contention starts to degrade performance rather than improve it.
-// Therefore, we set the optimal threads to half the number of CPU cores, with a minimum of 6.
-const optimalThreads = Math.max(cpus().length, 2);
-
 /**
  * @typedef {Object} Options
  * @property {Array<string>|string} input - Specifies the glob/path for input files.
@@ -70,7 +63,7 @@ export default {
       prompt: {
         type: 'text',
         message: 'How many threads to allow',
-        initialValue: String(Math.max(optimalThreads, 2)),
+        initialValue: String(cpus().length),
       },
     },
     chunkSize: {
@@ -146,15 +139,18 @@ export default {
    * @returns {Promise<void>}
    */
   async action(opts) {
-    const docs = await loadAndParse(opts.input, opts.ignore);
+    logger.debug('Starting doc-kit', opts);
+
+    const { runGenerators } = createGenerator();
+
     const releases = await parseChangelog(opts.changelog);
 
     const rawTypeMap = await loadFromURL(opts.typeMap);
     const typeMap = JSON.parse(rawTypeMap);
 
     const index = opts.index && (await parseIndex(opts.index));
 
-    const { runGenerators } = createGenerator(docs);
+    logger.debug(`Starting generation with targets: ${opts.target.join(', ')}`);
 
     await runGenerators({
       generators: opts.target,
diff --git a/src/generators.mjs b/src/generators.mjs
@@ -13,12 +13,11 @@ const generatorsLogger = logger.child('generators');
  * documentation generators in dependency order, with support for parallel
  * processing and streaming results.
  *
- * @param {ParserOutput} input - The API doc AST tree
  * @returns {{ runGenerators: (options: GeneratorOptions) => Promise<unknown[]> }}
  */
-const createGenerator = input => {
+const createGenerator = () => {
   /** @type {{ [key: string]: Promise<unknown> | AsyncGenerator }} */
-  const cachedGenerators = { ast: Promise.resolve(input) };
+  const cachedGenerators = {};
 
   const streamingCache = createStreamingCache();
 
@@ -28,10 +27,14 @@ const createGenerator = input => {
   /**
    * Gets the collected input from a dependency generator.
    *
-   * @param {string} dependsOn - Dependency generator name
+   * @param {string | undefined} dependsOn - Dependency generator name
    * @returns {Promise<unknown>}
    */
   const getDependencyInput = async dependsOn => {
+    if (!dependsOn) {
+      return undefined;
+    }
+
     const result = await cachedGenerators[dependsOn];
 
     if (isAsyncGenerator(result)) {
diff --git a/src/generators/__tests__/index.test.mjs b/src/generators/__tests__/index.test.mjs
@@ -5,7 +5,7 @@ import semver from 'semver';
 
 import { allGenerators } from '../index.mjs';
 
-const validDependencies = [...Object.keys(allGenerators), 'ast'];
+const validDependencies = Object.keys(allGenerators);
 const generatorEntries = Object.entries(allGenerators);
 
 describe('All Generators', () => {
@@ -34,9 +34,18 @@ describe('All Generators', () => {
       if (generator.dependsOn) {
         assert.ok(
           validDependencies.includes(generator.dependsOn),
-          `Generator "${key}" depends on "${generator.dependsOn}" which is not a valid generator or 'ast'`
+          `Generator "${key}" depends on "${generator.dependsOn}" which is not a valid generator`
         );
       }
     });
   });
+
+  it('should have ast generator as a top-level generator with no dependencies', () => {
+    assert.ok(allGenerators.ast, 'ast generator should exist');
+    assert.equal(
+      allGenerators.ast.dependsOn,
+      undefined,
+      'ast generator should have no dependencies'
+    );
+  });
 });
diff --git a/src/generators/api-links/__tests__/fixtures.test.mjs b/src/generators/api-links/__tests__/fixtures.test.mjs
@@ -1,7 +1,7 @@
 import { readdir } from 'node:fs/promises';
 import { cpus } from 'node:os';
 import { basename, extname, join } from 'node:path';
-import { describe, it } from 'node:test';
+import { after, before, describe, it } from 'node:test';
 
 import createWorkerPool from '../../../threading/index.mjs';
 import createParallelWorker from '../../../threading/parallel.mjs';
@@ -16,12 +16,20 @@ const sourceFiles = fixtures
   .map(fixture => join(FIXTURES_DIRECTORY, fixture));
 
 describe('api links', () => {
+  const threads = cpus().length;
+  let pool;
+
+  before(() => {
+    pool = createWorkerPool(threads);
+  });
+
+  after(async () => {
+    await pool.destroy();
+  });
+
   describe('should work correctly for all fixtures', () => {
     sourceFiles.forEach(sourceFile => {
       it(`${basename(sourceFile)}`, async t => {
-        const threads = cpus().length;
-        const pool = createWorkerPool(threads);
-
         const worker = createParallelWorker('ast-js', pool, {
           threads,
           chunkSize: 10,
@@ -46,8 +54,6 @@ describe('api links', () => {
         }
 
         t.assert.snapshot(actualOutput);
-
-        await pool.destroy();
       });
     });
   });
diff --git a/src/generators/ast/index.mjs b/src/generators/ast/index.mjs
@@ -0,0 +1,90 @@
+'use strict';
+
+import { readFile } from 'node:fs/promises';
+import { extname } from 'node:path';
+
+import { globSync } from 'glob';
+import { VFile } from 'vfile';
+
+import createQueries from '../../utils/queries/index.mjs';
+import { getRemark } from '../../utils/remark.mjs';
+
+const remarkProcessor = getRemark();
+
+const { updateStabilityPrefixToLink } = createQueries();
+
+/**
+ * Parses a single markdown file into an AST.
+ *
+ * @param {string} filePath - Path to the markdown file
+ * @returns {Promise<ParserOutput<import('mdast').Root>>}
+ */
+const parseMarkdownFile = async filePath => {
+  const fileContents = await readFile(filePath, 'utf-8');
+  const vfile = new VFile({ path: filePath, value: fileContents });
+
+  // Normalizes every Stability Index prefix into a Markdown link
+  updateStabilityPrefixToLink(vfile);
+
+  // Parses the API doc into an AST tree using `unified` and `remark`
+  const tree = remarkProcessor.parse(vfile);
+
+  return { file: { stem: vfile.stem, basename: vfile.basename }, tree };
+};
+
+/**
+ * This generator parses Markdown API doc files into AST trees.
+ * It parallelizes the parsing across worker threads for better performance.
+ *
+ * @typedef {undefined} Input
+ *
+ * @type {GeneratorMetadata<Input, Array<ParserOutput<import('mdast').Root>>>}
+ */
+export default {
+  name: 'ast',
+
+  version: '1.0.0',
+
+  description: 'Parses Markdown API doc files into AST trees',
+
+  dependsOn: undefined,
+
+  processChunk: Object.assign(
+    /**
+     * Process a chunk of markdown files in a worker thread.
+     * Loads and parses markdown files into AST representations.
+     *
+     * @param {string[]} inputSlice - Sliced input paths for this chunk
+     * @param {number[]} itemIndices - Indices into the sliced array
+     * @returns {Promise<Array<ParserOutput<import('mdast').Root>>>}
+     */
+    async (inputSlice, itemIndices) => {
+      const results = [];
+
+      for (const idx of itemIndices) {
+        const parsed = await parseMarkdownFile(inputSlice[idx]);
+
+        results.push(parsed);
+      }
+
+      return results;
+    },
+    { sliceInput: true }
+  ),
+
+  /**
+   * Generates AST trees from markdown input files.
+   *
+   * @param {Input} _ - Unused (top-level generator)
+   * @param {Partial<GeneratorOptions>} options
+   * @returns {AsyncGenerator<Array<ParserOutput<import('mdast').Root>>>}
+   */
+  async *generate(_, { input = [], worker }) {
+    const files = globSync(input).filter(path => extname(path) === '.md');
+
+    // Parse markdown files in parallel using worker threads
+    for await (const chunkResult of worker.stream(files, files)) {
+      yield chunkResult;
+    }
+  },
+};
diff --git a/src/generators/index.mjs b/src/generators/index.mjs
@@ -2,6 +2,7 @@
 
 import addonVerify from './addon-verify/index.mjs';
 import apiLinks from './api-links/index.mjs';
+import ast from './ast/index.mjs';
 import astJs from './ast-js/index.mjs';
 import jsonSimple from './json-simple/index.mjs';
 import jsxAst from './jsx-ast/index.mjs';
@@ -32,6 +33,7 @@ export const publicGenerators = {
 // These ones are special since they don't produce standard output,
 // and hence, we don't expose them to the CLI.
 const internalGenerators = {
+  ast,
   metadata,
   'jsx-ast': jsxAst,
   'ast-js': astJs,
diff --git a/src/generators/jsx-ast/utils/buildContent.mjs b/src/generators/jsx-ast/utils/buildContent.mjs
@@ -295,10 +295,7 @@ const buildContent = async (metadataEntries, head, sideBarProps, remark) => {
   const ast = await remark.run(root);
 
   // The final MDX content is the expression in the Program's first body node
-  return {
-    ...ast.body[0].expression,
-    data: head,
-  };
+  return { ...ast.body[0].expression, data: head };
 };
 
 export default buildContent;
diff --git a/src/generators/legacy-json/index.mjs b/src/generators/legacy-json/index.mjs
@@ -30,31 +30,31 @@ export default {
 
   dependsOn: 'metadata',
 
-  /**
-   * Process a chunk of items in a worker thread.
-   * Builds JSON sections - FS operations happen in generate().
-   *
-   * @param {Input} fullInput - Full metadata input for context rebuilding
-   * @param {number[]} itemIndices - Indices of head nodes to process
-   * @param {Partial<Omit<GeneratorOptions, 'worker'>>} _options - Serializable options (unused)
-   * @returns {Promise<import('./types.d.ts').Section[]>} JSON sections for each processed module
-   */
-  async processChunk(fullInput, itemIndices) {
-    const groupedModules = groupNodesByModule(fullInput);
-
-    const headNodes = fullInput.filter(node => node.heading.depth === 1);
-
-    const results = [];
-
-    for (const idx of itemIndices) {
-      const head = headNodes[idx];
-      const nodes = groupedModules.get(head.api);
+  processChunk: Object.assign(
+    /**
+     * Process a chunk of items in a worker thread.
+     * Builds JSON sections - FS operations happen in generate().
+     *
+     * With sliceInput, each item is pre-grouped {head, nodes} - no need to
+     * recompute groupNodesByModule for every chunk.
+     *
+     * @param {Array<{head: ApiDocMetadataEntry, nodes: ApiDocMetadataEntry[]}>} slicedInput - Pre-sliced module data
+     * @param {number[]} itemIndices - Indices into the sliced array
+     * @returns {Promise<import('./types.d.ts').Section[]>} JSON sections for each processed module
+     */
+    async (slicedInput, itemIndices) => {
+      const results = [];
+
+      for (const idx of itemIndices) {
+        const { head, nodes } = slicedInput[idx];
+
+        results.push(buildSection(head, nodes));
+      }
 
-      results.push(buildSection(head, nodes));
-    }
-
-    return results;
-  },
+      return results;
+    },
+    { sliceInput: true }
+  ),
 
   /**
    * Generates a legacy JSON file.
@@ -64,11 +64,18 @@ export default {
    * @returns {AsyncGenerator<Array<import('./types.d.ts').Section>>}
    */
   async *generate(input, { output, worker }) {
+    const groupedModules = groupNodesByModule(input);
+
     const headNodes = input.filter(node => node.heading.depth === 1);
 
-    const deps = { output };
+    // Create sliced input: each item contains head + its module's entries
+    // This avoids sending the full metadata list (4900+ entries) to every worker
+    const slicedInput = headNodes.map(head => ({
+      head,
+      nodes: groupedModules.get(head.api),
+    }));
 
-    for await (const chunkResult of worker.stream(headNodes, input, deps)) {
+    for await (const chunkResult of worker.stream(slicedInput, slicedInput)) {
       if (output) {
         for (const section of chunkResult) {
           const out = join(output, `${section.api}.json`);
diff --git a/src/generators/types.d.ts b/src/generators/types.d.ts
@@ -101,14 +101,12 @@ declare global {
      * If you pass `createGenerator` with ['react', 'html'], the 'react' generator will be executed first,
      * as it is a top level generator and then the 'html' generator would be executed after the 'react' generator.
      *
-     * The 'ast' generator is the top-level parser, and if 'ast' is passed to `dependsOn`, then the generator
-     * will be marked as a top-level generator.
+     * The 'ast' generator is the top-level parser for markdown files. It has no dependencies.
      *
      * The `ast-js` generator is the top-level parser for JavaScript files. It
-     * passes the ASTs for any JavaScript files given in the input. Like `ast`,
-     * any generator depending on it is marked as a top-level generator.
+     * passes the ASTs for any JavaScript files given in the input.
      */
-    dependsOn: keyof AllGenerators | 'ast';
+    dependsOn: keyof AllGenerators | undefined;
 
     /**
      * Generators are abstract and the different generators have different sort of inputs and outputs.