Skip to content

Commit 490c67d

Browse files
committed
refactor(build-infra): DRY Unicode property escape transforms
- Move shared transforms to @socketsecurity/build-infra package
- Export unicode-property-escape-transform from build-infra
- Update babel-plugin-with-intl-none to import from build-infra
- Update bootstrap esbuild-plugin-smol-transform to import from build-infra
- Remove duplicate transformUnicodePropertyEscapes() implementation
- Add build-infra as workspace dependency to bootstrap package

This eliminates code duplication and provides a single source of truth for Unicode property escape transformations used across CLI and bootstrap builds.
1 parent a701856 commit 490c67d

File tree

6 files changed

+125
-93
lines changed

6 files changed

+125
-93
lines changed

packages/bootstrap/.config/esbuild-plugin-smol-transform.mjs

Lines changed: 2 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
* for smol builds.
66
*/
77

8+
import { transformUnicodePropertyEscapes } from '@socketsecurity/build-infra/lib/unicode-property-escape-transform'
9+
810
/**
911
* Create smol transformation plugin.
1012
* @returns {import('esbuild').Plugin}
@@ -128,94 +130,3 @@ export function smolTransformPlugin() {
128130
},
129131
}
130132
}
131-
132-
/**
133-
* Transform Unicode property escapes in regex patterns for ICU-free environments.
134-
* Based on babel-plugin-with-intl-none.mjs transformations.
* Operates on raw source text with regular expressions (no parser), so matching is approximate.
135-
*
136-
* @param {string} content - Source code to transform
137-
* @returns {string} Transformed source code
138-
*/
139-
function transformUnicodePropertyEscapes(content) {
140-
let transformed = content
141-
142-
// Map of Unicode property escapes to basic character class alternatives.
143-
const unicodePropertyMap = {
144-
__proto__: null,
145-
// Letter categories.
146-
'Letter': 'a-zA-Z',
147-
'L': 'a-zA-Z',
148-
'Alpha': 'a-zA-Z',
149-
'Alphabetic': 'a-zA-Z',
150-
// Number categories.
151-
'Number': '0-9',
152-
'N': '0-9',
153-
'Digit': '0-9',
154-
'Nd': '0-9',
155-
// Whitespace.
156-
'Space': '\\s',
157-
'White_Space': '\\s',
158-
// ASCII range.
159-
'ASCII': '\\x00-\\x7F',
160-
// Control characters (basic approximation).
161-
'Control': '\\x00-\\x1F\\x7F-\\x9F',
162-
'Cc': '\\x00-\\x1F\\x7F-\\x9F',
163-
// Format characters (approximate with zero-width space).
164-
'Format': '\\u200B-\\u200D\\uFEFF',
165-
'Cf': '\\u200B-\\u200D\\uFEFF',
166-
// Mark categories (combining marks - approximate).
167-
'Mark': '\\u0300-\\u036F',
168-
'M': '\\u0300-\\u036F',
169-
// Default_Ignorable_Code_Point (approximate with common invisibles).
170-
'Default_Ignorable_Code_Point': '\\u00AD\\u034F\\u061C\\u115F-\\u1160\\u17B4-\\u17B5\\u180B-\\u180D\\u200B-\\u200F\\u202A-\\u202E\\u2060-\\u206F\\u3164\\uFE00-\\uFE0F\\uFEFF\\uFFA0\\uFFF0-\\uFFF8',
171-
}
172-
173-
// Transform \p{Property} inside character classes [...].
// The pattern accepts any [ ... ] span of the source text that contains a \p{...} escape.
174-
// Example: /[\p{Letter}\p{Number}]+/u → /[a-zA-Z0-9]+/
175-
transformed = transformed.replace(
176-
/\[([^\]]*\\p\{[^}]+\}[^\]]*)\]/g,
177-
(_match, charClass) => {
178-
let newCharClass = charClass
179-
180-
// Replace each mapped \p{Property} with its range text (no extra brackets).
181-
for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
182-
const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
183-
newCharClass = newCharClass.replace(
184-
new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
185-
replacement,
186-
)
187-
}
188-
189-
return `[${newCharClass}]`
190-
},
191-
)
192-
193-
// Transform standalone \p{Property} (not inside character class).
194-
// Example: /\p{Letter}+/u → /[a-zA-Z]+/
195-
for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
196-
const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
197-
// Replace every remaining \p{Property} for a mapped property with a bracketed class.
198-
// Nothing here checks whether the escape sits inside square brackets - proper parsing would be better.
199-
transformed = transformed.replace(
200-
new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
201-
`[${replacement}]`,
202-
)
203-
}
204-
205-
// Remove /u and /v flags from anything shaped like a regex literal with flags.
206-
// NOTE(review): mapped escapes were replaced above, but unmapped \p{...} escapes are left in place and lose the flag too.
207-
// Match regex literals: /pattern/flags
208-
transformed = transformed.replace(
209-
/\/([^/\\]|\\.)+\/([gimsuvy]+)/g,
210-
(match, _pattern, flags) => {
211-
// Strip u/v from any match whose flags include them; the pattern is not inspected for Unicode escapes.
212-
if (flags.includes('u') || flags.includes('v')) {
213-
const newFlags = flags.replace(/[uv]/g, '')
214-
return match.slice(0, -flags.length) + newFlags
215-
}
216-
return match
217-
},
218-
)
219-
220-
return transformed
221-
}

packages/bootstrap/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"clean": "del-cli dist"
1515
},
1616
"devDependencies": {
17+
"@socketsecurity/build-infra": "workspace:*",
1718
"@socketsecurity/lib": "catalog:",
1819
"del-cli": "catalog:",
1920
"esbuild": "catalog:",
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/**
2+
* @fileoverview Shared Unicode property escape transformations for --with-intl=none.
3+
*
4+
* Transforms Unicode property escapes (\p{...}) into basic character class alternatives
5+
* that work without ICU support. This enables Node.js builds with --with-intl=none to
6+
* save ~6-8MB by removing ICU.
7+
*
8+
* Used by:
9+
* - babel-plugin-with-intl-none.mjs (CLI Babel transforms)
10+
* - bootstrap esbuild-plugin-smol-transform.mjs (Bootstrap esbuild transforms)
11+
*
12+
* @example
13+
* import { transformUnicodePropertyEscapes } from './unicode-property-escape-transform.mjs'
14+
*
15+
* const code = 'const regex = /[\\p{Letter}\\p{Number}]+/u'
16+
* const transformed = transformUnicodePropertyEscapes(code)
17+
* // Result: 'const regex = /[a-zA-Z0-9]+/'
18+
*/
19+
20+
/**
 * Map of Unicode property escapes to basic character class alternatives.
 * Approximations are used where exact equivalents don't exist.
 *
 * Every value is written as the inside of a character class (no surrounding
 * brackets), so it can be spliced into an existing `[...]` or wrapped in one.
 *
 * @type {Record<string, string>}
 */
export const unicodePropertyMap = {
  __proto__: null,
  // Letter categories.
  'Letter': 'a-zA-Z',
  'L': 'a-zA-Z',
  'Alpha': 'a-zA-Z',
  'Alphabetic': 'a-zA-Z',
  // Number categories.
  'Number': '0-9',
  'N': '0-9',
  'Digit': '0-9',
  'Nd': '0-9',
  // Whitespace.
  'Space': '\\s',
  'White_Space': '\\s',
  // ASCII range.
  'ASCII': '\\x00-\\x7F',
  // Control characters (basic approximation).
  'Control': '\\x00-\\x1F\\x7F-\\x9F',
  'Cc': '\\x00-\\x1F\\x7F-\\x9F',
  // Format characters (approximate with zero-width space).
  'Format': '\\u200B-\\u200D\\uFEFF',
  'Cf': '\\u200B-\\u200D\\uFEFF',
  // Mark categories (combining marks - approximate).
  'Mark': '\\u0300-\\u036F',
  'M': '\\u0300-\\u036F',
  // Default_Ignorable_Code_Point (approximate with common invisibles).
  // Covers most common cases: soft hyphen, zero-width spaces, format controls, etc.
  'Default_Ignorable_Code_Point': '\\u00AD\\u034F\\u061C\\u115F-\\u1160\\u17B4-\\u17B5\\u180B-\\u180D\\u200B-\\u200F\\u202A-\\u202E\\u2060-\\u206F\\u3164\\uFE00-\\uFE0F\\uFEFF\\uFFA0\\uFFF0-\\uFFF8',
}

/**
 * Candidate regex literal on a single line: `/body/flags`.
 *
 * The body is a run of ordinary characters, escape pairs, and bracketed
 * classes (a class may contain an unescaped `/`). The lookahead keeps `//`
 * and `/*` comment openers from starting a candidate. This is a text-level
 * heuristic, not a parser: division chains can also match, which is why
 * candidates without a property escape are always emitted unchanged.
 */
const regexLiteralCandidate =
  /\/(?![*\/])((?:[^\\\/\[\n]|\\.|\[(?:[^\]\\\n]|\\.)*\])+)\/([dgimsuvy]*)/g

/**
 * Tokens that matter inside a regex body: a property escape (`\p{..}` or
 * `\P{..}`), any other escape pair, or a bare bracket. Consuming escape pairs
 * as a unit keeps `\\p{L}` (escaped backslash + literal text) and `\[` from
 * being misread.
 */
const regexBodyToken = /\\([pP])\{([^}]+)\}|\\[\s\S]|\[|\]/g

/** A `[...]` span of loose text that contains at least one `\p{...}`. */
const looseBracketSpan = /\[([^\]]*\\p\{[^}]+\}[^\]]*)\]/g

/** Any `\p{Name}` occurrence; group 1 is the property name. */
const loosePropertyEscape = /\\p\{([^}]+)\}/g

/**
 * Rewrite one regex literal candidate.
 *
 * Mapped `\p{Name}` escapes become the bare range text when they sit inside a
 * character class and a bracketed class otherwise. The u/v flags are dropped
 * only when the body actually contained a property escape; a candidate with
 * none is returned untouched.
 *
 * @param {string} literal - Full matched text (`/body/flags`)
 * @param {string} pattern - Text between the slashes
 * @param {string} flags - Flag letters after the closing slash (may be empty)
 * @returns {string} Rewritten literal, or `literal` itself when nothing applies
 */
function rewriteRegexLiteral(literal, pattern, flags) {
  // Only the v flag allows a `[` inside a class to open a nested class.
  const nestedClasses = flags.includes('v')
  let classDepth = 0
  let sawPropertyEscape = false

  const rewritten = pattern.replace(regexBodyToken, (token, kind, prop) => {
    if (token === '[') {
      if (classDepth === 0 || nestedClasses) {
        classDepth += 1
      }
      return token
    }
    if (token === ']') {
      if (classDepth > 0) {
        classDepth -= 1
      }
      return token
    }
    if (kind === undefined) {
      // Some other escape pair (\d, \\, \[, \u...) - leave as written.
      return token
    }

    sawPropertyEscape = true
    // Negated \P{...} and unmapped names have no entry; they are left in place.
    const replacement = kind === 'p' ? unicodePropertyMap[prop] : undefined
    if (replacement === undefined) {
      return token
    }
    return classDepth > 0 ? replacement : `[${replacement}]`
  })

  if (!sawPropertyEscape) {
    return literal
  }
  // Best effort, as before: the flags go even if an unmapped escape remains.
  return `/${rewritten}/${flags.replace(/[uv]/g, '')}`
}

/**
 * Apply the text-level substitutions to source that is not part of a detected
 * regex literal (for example pattern strings handed to `new RegExp`).
 *
 * Mapped escapes inside a `[...]` span become bare range text; every other
 * mapped escape becomes a bracketed class. Unmapped names are left alone.
 *
 * @param {string} text - Source text between regex literal candidates
 * @returns {string} Text with mapped property escapes substituted
 */
function rewriteLooseText(text) {
  const insideBrackets = text.replace(looseBracketSpan, (_span, inner) => {
    const replacedInner = inner.replace(loosePropertyEscape, (escape, prop) => {
      const replacement = unicodePropertyMap[prop]
      return replacement === undefined ? escape : replacement
    })
    return `[${replacedInner}]`
  })

  return insideBrackets.replace(loosePropertyEscape, (escape, prop) => {
    const replacement = unicodePropertyMap[prop]
    return replacement === undefined ? escape : `[${replacement}]`
  })
}

/**
 * Transform Unicode property escapes in regex patterns for ICU-free environments.
 *
 * Works on raw source text. Regex literal candidates are rewritten with
 * awareness of character-class brackets, so an array or index expression that
 * merely surrounds a literal (`[/\p{L}+/u]`) is not mistaken for a class.
 * The u/v flags are removed only from literals that contained a property
 * escape; other literals and division chains such as `a/b/u` pass through
 * unchanged. Text outside regex literals still gets the plain `\p{Name}`
 * substitutions.
 *
 * @example
 * transformUnicodePropertyEscapes('const regex = /[\\p{Letter}\\p{Number}]+/u')
 * // Result: 'const regex = /[a-zA-Z0-9]+/'
 *
 * @param {string} content - Source code to transform
 * @returns {string} Transformed source code
 */
export function transformUnicodePropertyEscapes(content) {
  let output = ''
  let cursor = 0

  for (const match of content.matchAll(regexLiteralCandidate)) {
    const [literal, pattern, flags] = match
    output += rewriteLooseText(content.slice(cursor, match.index))
    output += rewriteRegexLiteral(literal, pattern, flags)
    cursor = match.index + literal.length
  }

  return output + rewriteLooseText(content.slice(cursor))
}

packages/build-infra/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
"./lib/preflight-checks": "./lib/preflight-checks.mjs",
1818
"./lib/rust-builder": "./lib/rust-builder.mjs",
1919
"./lib/script-runner": "./lib/script-runner.mjs",
20-
"./lib/tool-installer": "./lib/tool-installer.mjs"
20+
"./lib/tool-installer": "./lib/tool-installer.mjs",
21+
"./lib/unicode-property-escape-transform": "./lib/unicode-property-escape-transform.mjs"
2122
},
2223
"dependencies": {
2324
"@socketsecurity/lib": "catalog:"

pnpm-lock.yaml

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/babel/babel-plugin-with-intl-none.mjs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* Transformations:
1010
* 1. `.toLocaleString()` → Simple formatting with commas/basic date strings
1111
* 2. `Intl.*` APIs → Polyfills or basic implementations
12-
* 3. Unicode regex `\p{...}` → Character class alternatives
12+
* 3. Unicode regex `\p{...}` → Character class alternatives (shared transform)
1313
* 4. Unicode regex `/v` flag → Downgrade to `/u` or remove
1414
* 5. `.localeCompare()` → Basic string comparison
1515
*
@@ -25,6 +25,8 @@
2525
* const regex = /[a-zA-Z0-9]+/
2626
*/
2727

28+
import { unicodePropertyMap } from '@socketsecurity/build-infra/lib/unicode-property-escape-transform'
29+
2830
/**
2931
* Helper Functions (injected at runtime via Babel template.ast):
3032
*

0 commit comments

Comments
 (0)