refactor(build-infra): DRY Unicode property escape transforms

jdalton · jdalton · commit 490c67da260e · 2025-10-29T11:20:23.000-04:00
- Move shared transforms to @socketsecurity/build-infra package
- Export unicode-property-escape-transform from build-infra
- Update babel-plugin-with-intl-none to import from build-infra
- Update bootstrap esbuild-plugin-smol-transform to import from build-infra
- Remove duplicate transformUnicodePropertyEscapes() implementation
- Add build-infra as workspace dependency to bootstrap package

This eliminates code duplication and provides a single source of truth for
Unicode property escape transformations used across CLI and bootstrap builds.
diff --git a/packages/bootstrap/.config/esbuild-plugin-smol-transform.mjs b/packages/bootstrap/.config/esbuild-plugin-smol-transform.mjs
@@ -5,6 +5,8 @@
  * for smol builds.
  */
 
+import { transformUnicodePropertyEscapes } from '@socketsecurity/build-infra/lib/unicode-property-escape-transform'
+
 /**
  * Create smol transformation plugin.
  * @returns {import('esbuild').Plugin}
@@ -128,94 +130,3 @@ export function smolTransformPlugin() {
     },
   }
 }
-
-/**
- * Transform Unicode property escapes in regex patterns for ICU-free environments.
- * Based on babel-plugin-with-intl-none.mjs transformations.
- *
- * @param {string} content - Source code to transform
- * @returns {string} Transformed source code
- */
-function transformUnicodePropertyEscapes(content) {
-  let transformed = content
-
-  // Map of Unicode property escapes to basic character class alternatives.
-  const unicodePropertyMap = {
-    __proto__: null,
-    // Letter categories.
-    'Letter': 'a-zA-Z',
-    'L': 'a-zA-Z',
-    'Alpha': 'a-zA-Z',
-    'Alphabetic': 'a-zA-Z',
-    // Number categories.
-    'Number': '0-9',
-    'N': '0-9',
-    'Digit': '0-9',
-    'Nd': '0-9',
-    // Whitespace.
-    'Space': '\\s',
-    'White_Space': '\\s',
-    // ASCII range.
-    'ASCII': '\\x00-\\x7F',
-    // Control characters (basic approximation).
-    'Control': '\\x00-\\x1F\\x7F-\\x9F',
-    'Cc': '\\x00-\\x1F\\x7F-\\x9F',
-    // Format characters (approximate with zero-width space).
-    'Format': '\\u200B-\\u200D\\uFEFF',
-    'Cf': '\\u200B-\\u200D\\uFEFF',
-    // Mark categories (combining marks - approximate).
-    'Mark': '\\u0300-\\u036F',
-    'M': '\\u0300-\\u036F',
-    // Default_Ignorable_Code_Point (approximate with common invisibles).
-    'Default_Ignorable_Code_Point': '\\u00AD\\u034F\\u061C\\u115F-\\u1160\\u17B4-\\u17B5\\u180B-\\u180D\\u200B-\\u200F\\u202A-\\u202E\\u2060-\\u206F\\u3164\\uFE00-\\uFE0F\\uFEFF\\uFFA0\\uFFF0-\\uFFF8',
-  }
-
-  // Transform \p{Property} inside character classes [...].
-  // Example: /[\p{Letter}\p{Number}]+/u → /[a-zA-Z0-9]+/
-  transformed = transformed.replace(
-    /\[([^\]]*\\p\{[^}]+\}[^\]]*)\]/g,
-    (_match, charClass) => {
-      let newCharClass = charClass
-
-      // Replace each \p{Property} with its character class equivalent.
-      for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
-        const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
-        newCharClass = newCharClass.replace(
-          new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
-          replacement,
-        )
-      }
-
-      return `[${newCharClass}]`
-    },
-  )
-
-  // Transform standalone \p{Property} (not inside character class).
-  // Example: /\p{Letter}+/u → /[a-zA-Z]+/
-  for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
-    const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
-    // Match \p{Property} that's NOT inside square brackets.
-    // This is a simplified approach - proper parsing would be better.
-    transformed = transformed.replace(
-      new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
-      `[${replacement}]`,
-    )
-  }
-
-  // Remove /u and /v flags from regexes that used Unicode property escapes.
-  // This is safe because we've replaced them with basic character classes.
-  // Match regex literals: /pattern/flags
-  transformed = transformed.replace(
-    /\/([^/\\]|\\.)+\/([gimsuvy]+)/g,
-    (match, _pattern, flags) => {
-      // Only remove u/v flags if the regex originally had Unicode escapes.
-      if (flags.includes('u') || flags.includes('v')) {
-        const newFlags = flags.replace(/[uv]/g, '')
-        return match.slice(0, -flags.length) + newFlags
-      }
-      return match
-    },
-  )
-
-  return transformed
-}
diff --git a/packages/bootstrap/package.json b/packages/bootstrap/package.json
@@ -14,6 +14,7 @@
     "clean": "del-cli dist"
   },
   "devDependencies": {
+    "@socketsecurity/build-infra": "workspace:*",
     "@socketsecurity/lib": "catalog:",
     "del-cli": "catalog:",
     "esbuild": "catalog:",
diff --git a/packages/build-infra/lib/unicode-property-escape-transform.mjs b/packages/build-infra/lib/unicode-property-escape-transform.mjs
@@ -0,0 +1,114 @@
+/**
+ * @fileoverview Shared Unicode property escape transformations for --with-intl=none.
+ *
+ * Transforms Unicode property escapes (\p{...}) into basic, mostly ASCII-only character
+ * class approximations that work without ICU support (e.g. \p{Letter} becomes a-zA-Z).
+ * This enables Node.js builds with --with-intl=none to save ~6-8MB by removing ICU.
+ *
+ * Used by:
+ * - babel-plugin-with-intl-none.mjs (CLI Babel transforms)
+ * - bootstrap esbuild-plugin-smol-transform.mjs (Bootstrap esbuild transforms)
+ *
+ * @example
+ * import { transformUnicodePropertyEscapes } from './unicode-property-escape-transform.mjs'
+ *
+ * const code = 'const regex = /[\\p{Letter}\\p{Number}]+/u'
+ * const transformed = transformUnicodePropertyEscapes(code)
+ * // Result: 'const regex = /[a-zA-Z0-9]+/'
+ */
+
+/**
+ * Unicode property name → replacement text for the matching \p{Name} escape.
+ * Values are character-class contents without brackets; letter and number entries are
+ * ASCII-only approximations. Declared with a null prototype (`__proto__: null`).
+ * @type {Record<string, string>}
+ */
+export const unicodePropertyMap = {
+  __proto__: null,
+  // Letter categories (ASCII letters only).
+  'Letter': 'a-zA-Z',
+  'L': 'a-zA-Z',
+  'Alpha': 'a-zA-Z',
+  'Alphabetic': 'a-zA-Z',
+  // Number categories (ASCII digits only).
+  'Number': '0-9',
+  'N': '0-9',
+  'Digit': '0-9',
+  'Nd': '0-9',
+  // Whitespace (delegates to the regex \s shorthand).
+  'Space': '\\s',
+  'White_Space': '\\s',
+  // ASCII range (U+0000-U+007F).
+  'ASCII': '\\x00-\\x7F',
+  // Control characters: C0 controls, DEL, and C1 controls.
+  'Control': '\\x00-\\x1F\\x7F-\\x9F',
+  'Cc': '\\x00-\\x1F\\x7F-\\x9F',
+  // Format characters (approximated by U+200B-U+200D and U+FEFF only).
+  'Format': '\\u200B-\\u200D\\uFEFF',
+  'Cf': '\\u200B-\\u200D\\uFEFF',
+  // Mark categories (approximated by the Combining Diacritical Marks block, U+0300-U+036F).
+  'Mark': '\\u0300-\\u036F',
+  'M': '\\u0300-\\u036F',
+  // Default_Ignorable_Code_Point (approximate with common invisibles).
+  // Covers most common cases: soft hyphen, zero-width spaces, format controls, etc.
+  'Default_Ignorable_Code_Point': '\\u00AD\\u034F\\u061C\\u115F-\\u1160\\u17B4-\\u17B5\\u180B-\\u180D\\u200B-\\u200F\\u202A-\\u202E\\u2060-\\u206F\\u3164\\uFE00-\\uFE0F\\uFEFF\\uFFA0\\uFFF0-\\uFFF8',
+}
+
+/**
+ * Rewrite the \p{...} escapes listed in unicodePropertyMap into plain character classes, then strip u/v regex flags.
+ * Operates on raw source text with regular expressions (no parser); \p{...} names absent from the map are left as-is.
+ * @param {string} content - Source code to transform
+ * @returns {string} Transformed source code
+ */
+export function transformUnicodePropertyEscapes(content) {
+  let transformed = content
+
+  // Pass 1: \p{Property} inside a character class [...] is replaced in place (no extra brackets).
+  // Example: /[\p{Letter}\p{Number}]+/u → /[a-zA-Z0-9]+/
+  transformed = transformed.replace(
+    /\[([^\]]*\\p\{[^}]+\}[^\]]*)\]/g,
+    (_match, charClass) => {
+      let newCharClass = charClass
+
+      // Substitute every mapped \p{Property} found in this class with its map value.
+      for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
+        const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
+        newCharClass = newCharClass.replace(
+          new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
+          replacement,
+        )
+      }
+
+      return `[${newCharClass}]`
+    },
+  )
+
+  // Pass 2: any mapped \p{Property} still present is wrapped in its own character class.
+  // Example: /\p{Letter}+/u → /[a-zA-Z]+/
+  for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
+    const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
+    // The pattern itself has no bracket check; it relies on pass 1 having already
+    // rewritten occurrences inside [...]. Proper parsing would be more robust.
+    transformed = transformed.replace(
+      new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
+      `[${replacement}]`,
+    )
+  }
+
+  // Pass 3: strip the u/v flags from regex literals, matched textually as /pattern/flags.
+  // NOTE(review): this scans the whole source and does not check that a literal contained
+  // \p{...}; non-regex text such as a/foo/using also matches and would lose its 'u'.
+  transformed = transformed.replace(
+    /\/([^/\\]|\\.)+\/([gimsuvy]+)/g,
+    (match, _pattern, flags) => {
+      // Any match whose flags include u or v is rewritten; all others pass through unchanged.
+      if (flags.includes('u') || flags.includes('v')) {
+        const newFlags = flags.replace(/[uv]/g, '')
+        return match.slice(0, -flags.length) + newFlags
+      }
+      return match
+    },
+  )
+
+  return transformed
+}
diff --git a/packages/build-infra/package.json b/packages/build-infra/package.json
@@ -17,7 +17,8 @@
     "./lib/preflight-checks": "./lib/preflight-checks.mjs",
     "./lib/rust-builder": "./lib/rust-builder.mjs",
     "./lib/script-runner": "./lib/script-runner.mjs",
-    "./lib/tool-installer": "./lib/tool-installer.mjs"
+    "./lib/tool-installer": "./lib/tool-installer.mjs",
+    "./lib/unicode-property-escape-transform": "./lib/unicode-property-escape-transform.mjs"
   },
   "dependencies": {
     "@socketsecurity/lib": "catalog:"
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/scripts/babel/babel-plugin-with-intl-none.mjs b/scripts/babel/babel-plugin-with-intl-none.mjs
@@ -9,7 +9,7 @@
  * Transformations:
  * 1. `.toLocaleString()` → Simple formatting with commas/basic date strings
  * 2. `Intl.*` APIs → Polyfills or basic implementations
- * 3. Unicode regex `\p{...}` → Character class alternatives
+ * 3. Unicode regex `\p{...}` → Character class alternatives (property map shared via build-infra)
  * 4. Unicode regex `/v` flag → Downgrade to `/u` or remove
  * 5. `.localeCompare()` → Basic string comparison
  *
@@ -25,6 +25,8 @@
  * const regex = /[a-zA-Z0-9]+/
  */
 
+import { unicodePropertyMap } from '@socketsecurity/build-infra/lib/unicode-property-escape-transform'
+
 /**
  * Helper Functions (injected at runtime via Babel template.ast):
  *