Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
282 changes: 282 additions & 0 deletions dev/design/regex_preprocessing_fixes.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -1189,31 +1189,54 @@ public static void compileAssignmentOperator(BytecodeCompiler bytecodeCompiler,
// Handle array slice assignment: @array[1, 3, 5] = (20, 30, 40)
if (leftBin.operator.equals("[") && leftBin.left instanceof OperatorNode arrayOp) {

// Must be @array (not $array)
if (arrayOp.operator.equals("@") && arrayOp.operand instanceof IdentifierNode) {
String varName = "@" + ((IdentifierNode) arrayOp.operand).name;

// Must be @array or @$ref (not $array)
if (arrayOp.operator.equals("@")) {
int arrayReg;
if (bytecodeCompiler.currentSubroutineBeginId != 0 && bytecodeCompiler.currentSubroutineClosureVars != null
&& bytecodeCompiler.currentSubroutineClosureVars.contains(varName)) {

if (arrayOp.operand instanceof IdentifierNode) {
String varName = "@" + ((IdentifierNode) arrayOp.operand).name;

if (bytecodeCompiler.currentSubroutineBeginId != 0 && bytecodeCompiler.currentSubroutineClosureVars != null
&& bytecodeCompiler.currentSubroutineClosureVars.contains(varName)) {
arrayReg = bytecodeCompiler.allocateRegister();
int nameIdx = bytecodeCompiler.addToStringPool(varName);
bytecodeCompiler.emitWithToken(Opcodes.RETRIEVE_BEGIN_ARRAY, node.getIndex());
bytecodeCompiler.emitReg(arrayReg);
bytecodeCompiler.emit(nameIdx);
bytecodeCompiler.emit(bytecodeCompiler.currentSubroutineBeginId);
} else if (bytecodeCompiler.hasVariable(varName)) {
arrayReg = bytecodeCompiler.getVariableRegister(varName);
} else {
arrayReg = bytecodeCompiler.allocateRegister();
String globalArrayName = NameNormalizer.normalizeVariableName(
((IdentifierNode) arrayOp.operand).name,
bytecodeCompiler.getCurrentPackage()
);
int nameIdx = bytecodeCompiler.addToStringPool(globalArrayName);
bytecodeCompiler.emit(Opcodes.LOAD_GLOBAL_ARRAY);
bytecodeCompiler.emitReg(arrayReg);
bytecodeCompiler.emit(nameIdx);
}
} else if (arrayOp.operand instanceof OperatorNode || arrayOp.operand instanceof BlockNode) {
// @$ref[@idx] = ... or @{expr}[@idx] = ...
// Compile the scalar reference expression and dereference to array
bytecodeCompiler.compileNode(arrayOp.operand, -1, RuntimeContextType.SCALAR);
int scalarReg = bytecodeCompiler.lastResultReg;
arrayReg = bytecodeCompiler.allocateRegister();
int nameIdx = bytecodeCompiler.addToStringPool(varName);
bytecodeCompiler.emitWithToken(Opcodes.RETRIEVE_BEGIN_ARRAY, node.getIndex());
bytecodeCompiler.emitReg(arrayReg);
bytecodeCompiler.emit(nameIdx);
bytecodeCompiler.emit(bytecodeCompiler.currentSubroutineBeginId);
} else if (bytecodeCompiler.hasVariable(varName)) {
arrayReg = bytecodeCompiler.getVariableRegister(varName);
if (bytecodeCompiler.isStrictRefsEnabled()) {
bytecodeCompiler.emitWithToken(Opcodes.DEREF_ARRAY, node.getIndex());
bytecodeCompiler.emitReg(arrayReg);
bytecodeCompiler.emitReg(scalarReg);
} else {
int pkgIdx = bytecodeCompiler.addToStringPool(bytecodeCompiler.getCurrentPackage());
bytecodeCompiler.emitWithToken(Opcodes.DEREF_ARRAY_NONSTRICT, node.getIndex());
bytecodeCompiler.emitReg(arrayReg);
bytecodeCompiler.emitReg(scalarReg);
bytecodeCompiler.emit(pkgIdx);
}
} else {
arrayReg = bytecodeCompiler.allocateRegister();
String globalArrayName = NameNormalizer.normalizeVariableName(
((IdentifierNode) arrayOp.operand).name,
bytecodeCompiler.getCurrentPackage()
);
int nameIdx = bytecodeCompiler.addToStringPool(globalArrayName);
bytecodeCompiler.emit(Opcodes.LOAD_GLOBAL_ARRAY);
bytecodeCompiler.emitReg(arrayReg);
bytecodeCompiler.emit(nameIdx);
bytecodeCompiler.throwCompilerException("Array slice assignment requires identifier or reference");
return;
}

// Compile indices (right side of [])
Expand Down
19 changes: 19 additions & 0 deletions src/main/java/org/perlonjava/backend/jvm/EmitOperator.java
Original file line number Diff line number Diff line change
Expand Up @@ -534,9 +534,28 @@ static void handleSpliceBuiltin(EmitterVisitor emitterVisitor, OperatorNode node

if (first != null) {
try {
MethodVisitor mv = emitterVisitor.ctx.mv;
first.accept(emitterVisitor.with(RuntimeContextType.LIST));

// Spill the first operand before evaluating remaining args so
// non-local control flow can't jump to returnLabel with an
// extra value on the JVM operand stack.
int firstSlot = emitterVisitor.ctx.javaClassInfo.acquireSpillSlot();
boolean pooled = firstSlot >= 0;
if (!pooled) {
firstSlot = emitterVisitor.ctx.symbolTable.allocateLocalVariable();
}
mv.visitVarInsn(Opcodes.ASTORE, firstSlot);

// Accept the remaining arguments in LIST context.
args.accept(emitterVisitor.with(RuntimeContextType.LIST));

mv.visitVarInsn(Opcodes.ALOAD, firstSlot);
mv.visitInsn(Opcodes.SWAP);

if (pooled) {
emitterVisitor.ctx.javaClassInfo.releaseSpillSlot();
}
} finally {
listArgs.elements.addFirst(first);
}
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/perlonjava/core/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public final class Configuration {
* Automatically populated by Gradle/Maven during build.
* DO NOT EDIT MANUALLY - this value is replaced at build time.
*/
public static final String gitCommitId = "b8043f312";
public static final String gitCommitId = "c6ee04074";

/**
* Git commit date of the build (ISO format: YYYY-MM-DD).
Expand All @@ -48,7 +48,7 @@ public final class Configuration {
* Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at"
* DO NOT EDIT MANUALLY - this value is replaced at build time.
*/
public static final String buildTimestamp = "Apr 10 2026 11:59:23";
public static final String buildTimestamp = "Apr 10 2026 13:40:40";

// Prevent instantiation
private Configuration() {
Expand Down
69 changes: 68 additions & 1 deletion src/main/java/org/perlonjava/frontend/parser/OperatorParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,61 @@ static BinaryOperatorNode parsePrint(Parser parser, LexerToken token, int curren
return new BinaryOperatorNode(token.text, handle, operand, currentIndex);
}

/**
 * Decides whether a bare variable name (without its sigil) denotes one of
 * Perl's forced-global variables, which may not be declared with
 * {@code my} or {@code state}.
 *
 * <p>A name is treated as global-only when any of the following holds:
 * <ul>
 *   <li>it is exactly {@code "_"} (the underscore variables $_, @_, %_);</li>
 *   <li>it consists solely of digits ($0, $1, $2, ...);</li>
 *   <li>it is a single ASCII character that is neither a letter, a digit,
 *       nor an underscore ($!, $/, $@, $;, ...);</li>
 *   <li>its first character is a control character below 32, which is how
 *       caret variables such as $^W are stored internally.</li>
 * </ul>
 *
 * @param name the variable name without sigil; may be {@code null}
 * @return {@code true} if the name is forced-global, {@code false} otherwise
 *         (including for {@code null} or empty input)
 */
private static boolean isGlobalOnlyVariable(String name) {
    if (name == null || name.isEmpty()) {
        return false;
    }

    char lead = name.charAt(0);

    // $_, @_, %_
    boolean underscoreVariable = "_".equals(name);

    // Caret variables ($^W and friends) are stored with a leading control character.
    boolean caretVariable = lead < 32;

    // Punctuation variables: restricted to ASCII on purpose, because characters
    // >= 128 may be valid identifier characters even when
    // Character.isLetterOrDigit() does not recognize them.
    boolean asciiPunctuationVariable = name.length() == 1
            && lead < 128
            && lead != '_'
            && !Character.isLetterOrDigit(lead);

    // $0, $1, $2, ... (evaluated last; only reached when the cheap checks fail)
    return underscoreVariable
            || caretVariable
            || asciiPunctuationVariable
            || name.chars().allMatch(Character::isDigit);
}

/**
 * Renders a variable name for use in error messages.
 *
 * <p>Names whose first character is a control character (code below 32)
 * are shown in caret notation: the control character is replaced by
 * {@code '^'} followed by the character 64 positions higher, and the rest
 * of the name is kept. For example chr(23) becomes {@code "^W"} and
 * chr(8) followed by {@code "MATCH"} becomes {@code "^HMATCH"}.
 * All other names, including {@code null} and the empty string, are
 * returned unchanged.
 *
 * @param name the internal variable name without sigil; may be {@code null}
 * @return the display form of the name
 */
private static String formatVarNameForDisplay(String name) {
    boolean startsWithControlChar = name != null && !name.isEmpty() && name.charAt(0) < 32;
    if (!startsWithControlChar) {
        return name;
    }
    // Shift the control code into the printable range: chr(1) -> 'A', chr(23) -> 'W'.
    char caretLetter = (char) (name.charAt(0) + 'A' - 1);
    String remainder = name.substring(1);
    return "^" + caretLetter + remainder;
}

private static void addVariableToScope(EmitterContext ctx, String operator, OperatorNode node) {
String sigil = node.operator;
if ("$@%".contains(sigil)) {
Expand All @@ -260,7 +315,19 @@ private static void addVariableToScope(EmitterContext ctx, String operator, Oper
if (identifierNode instanceof IdentifierNode) { // my $a
String name = ((IdentifierNode) identifierNode).name;
String var = sigil + name;


// Check for global-only variables in my/state declarations
// Perl: "Can't use global $0 in "my""
if ((operator.equals("my") || operator.equals("state"))
&& isGlobalOnlyVariable(name)) {
throw new PerlCompilerException(
node.getIndex(),
"Can't use global " + sigil + formatVarNameForDisplay(name)
+ " in \"" + operator + "\"",
ctx.errorUtil
);
}

// Check for redeclaration warnings
if (operator.equals("our")) {
// For 'our', only warn if redeclared in the same package (matching Perl behavior)
Expand Down
9 changes: 6 additions & 3 deletions src/main/java/org/perlonjava/runtime/perlmodule/Storable.java
Original file line number Diff line number Diff line change
Expand Up @@ -548,9 +548,12 @@ private static RuntimeScalar deepClone(RuntimeScalar scalar, IdentityHashMap<Obj
RuntimeArray thawArgs = new RuntimeArray();
RuntimeArray.push(thawArgs, newObj);
RuntimeArray.push(thawArgs, new RuntimeScalar(1)); // cloning = true
// Pass serialized data and any extra refs from freeze
for (int i = 0; i < freezeArray.size(); i++) {
RuntimeArray.push(thawArgs, freezeArray.get(i));
// First element is the serialized string — pass as-is
RuntimeArray.push(thawArgs, freezeArray.get(0));
// Remaining elements are extra refs — deep-clone them
// so the thawed object gets independent copies
for (int i = 1; i < freezeArray.size(); i++) {
RuntimeArray.push(thawArgs, deepClone(freezeArray.get(i), cloned));
}
RuntimeCode.apply(thawMethod, thawArgs, RuntimeContextType.VOID);
}
Expand Down
67 changes: 58 additions & 9 deletions src/main/java/org/perlonjava/runtime/regex/CaptureNameEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -161,15 +161,64 @@ public static boolean isCodeBlockCapture(String captureName) {
return captureName != null && captureName.startsWith("cb") && captureName.length() > 5;
}

// FUTURE ENHANCEMENTS:
//
// For underscore support: (?<my_name>)
// Use the same hex encoding pattern: (?<ncHEX>) where HEX encodes "my_name"
// Then %CAPTURE decodes back to show original name to user
// UNDERSCORE ENCODING:
//
// For duplicate names: (?<name>a)|(?<name>b)
// Encode with disambiguation: (?<ncHEX1>a)|(?<ncHEX2>b) where HEX encodes "name"
// Track mapping for proper capture group retrieval
// Java regex doesn't allow underscores in group names (only [a-zA-Z][a-zA-Z0-9]*).
// Perl allows \w+ (letters, digits, underscores) for group names.
//
// The generic hex encoding pattern is reusable for all Java regex limitations!
// Encoding: Replace each underscore with "U95" (ASCII code 95 for '_')
// (?<my_name>) → (?<myU95name>)
// (?<_>) → (?<U95>)
// (?<_foo>) → (?<U95foo>)
//
// Names starting with underscore need a letter prefix for Java, so U95 works
// since it starts with 'U'. To avoid ambiguity, literal "U95" sequences in
// names are escaped as "UU95" (the 'U' itself is escaped).

/**
 * Encodes a Perl capture group name for use in Java regex.
 *
 * <p>Every underscore is replaced by {@code "U95"}. To keep the mapping
 * reversible, a run of {@code 'U'} characters that sits directly in front of
 * an underscore or in front of a literal {@code "95"} is doubled. In the
 * encoded name, a run of {@code 'U'}s followed by {@code "95"} therefore has
 * <em>odd</em> length when it ends in an encoded underscore and <em>even</em>
 * length when the {@code "95"} is literal text:
 * <pre>
 *   my_name   -> myU95name
 *   _foo      -> U95foo
 *   U95       -> UU95        (literal "U95")
 *   U_        -> UUU95       (previously collided with "U95")
 * </pre>
 * {@code 'U'} runs elsewhere in the name are left untouched.
 *
 * @param perlName The original Perl capture group name
 * @return The encoded name safe for Java regex, or the original if no encoding needed
 */
public static String encodeGroupName(String perlName) {
    if (perlName == null || (perlName.indexOf('_') < 0 && !perlName.contains("U95"))) {
        return perlName;
    }
    int length = perlName.length();
    StringBuilder encoded = new StringBuilder(length + 8);
    int i = 0;
    while (i < length) {
        char c = perlName.charAt(i);
        if (c == '_') {
            encoded.append("U95");
            i++;
        } else if (c == 'U') {
            int runEnd = i;
            while (runEnd < length && perlName.charAt(runEnd) == 'U') {
                runEnd++;
            }
            int runLength = runEnd - i;
            // Only runs that would touch an (encoded or literal) "95" are ambiguous.
            boolean ambiguous = perlName.startsWith("_", runEnd) || perlName.startsWith("95", runEnd);
            int copies = ambiguous ? 2 * runLength : runLength;
            for (int k = 0; k < copies; k++) {
                encoded.append('U');
            }
            i = runEnd;
        } else {
            encoded.append(c);
            i++;
        }
    }
    return encoded.toString();
}

/**
 * Decodes a Java regex capture group name back to the original Perl name.
 * Reverses the encoding done by encodeGroupName.
 *
 * <p>For each run of {@code k} {@code 'U'} characters immediately followed by
 * {@code "95"}: an odd {@code k} decodes to {@code (k-1)/2} literal
 * {@code 'U'}s plus an underscore; an even {@code k} decodes to {@code k/2}
 * literal {@code 'U'}s plus the literal text {@code "95"}. Everything else is
 * copied through unchanged.
 *
 * @param javaName The encoded Java group name
 * @return The original Perl capture group name
 */
public static String decodeGroupName(String javaName) {
    if (javaName == null || !javaName.contains("U95")) {
        return javaName;
    }
    int length = javaName.length();
    StringBuilder decoded = new StringBuilder(length);
    int i = 0;
    while (i < length) {
        char c = javaName.charAt(i);
        if (c != 'U') {
            decoded.append(c);
            i++;
            continue;
        }
        int runEnd = i;
        while (runEnd < length && javaName.charAt(runEnd) == 'U') {
            runEnd++;
        }
        int runLength = runEnd - i;
        if (javaName.startsWith("95", runEnd)) {
            for (int k = 0; k < runLength / 2; k++) {
                decoded.append('U');
            }
            // Odd run: the last 'U' belongs to an encoded underscore.
            decoded.append((runLength & 1) == 1 ? "_" : "95");
            i = runEnd + 2;
        } else {
            // Run is not part of an escape sequence; it was never doubled.
            decoded.append(javaName, i, runEnd);
            i = runEnd;
        }
    }
    return decoded.toString();
}

/**
 * Reports whether a capture group name is reserved for internal use and
 * must therefore be hidden from user-visible variables like %+ and %-.
 *
 * <p>A name is internal when {@code isCodeBlockCapture} accepts it or when
 * it is exactly {@code "perlK"} (the \K marker).
 *
 * @param captureName The capture group name to check; may be {@code null}
 * @return true if this is an internal capture (code block or \K marker)
 */
public static boolean isInternalCapture(String captureName) {
    if (isCodeBlockCapture(captureName)) {
        return true;
    }
    return "perlK".equals(captureName);
}
}
Loading
Loading