// Source: PerlOnJava/src/main/java/org/perlonjava/frontend/parser/Parser.java at 082ceee97a30b588e529df1cbb0df6bfcb22f4d6 · fglock/PerlOnJava · GitHub
package org.perlonjava.frontend.parser;

import org.perlonjava.app.cli.CompilerOptions;

import org.perlonjava.backend.jvm.EmitterContext;
import org.perlonjava.frontend.astnode.AbstractNode;
import org.perlonjava.frontend.astnode.FormatNode;
import org.perlonjava.frontend.astnode.Node;
import org.perlonjava.frontend.astnode.OperatorNode;
import org.perlonjava.frontend.lexer.LexerToken;
import org.perlonjava.frontend.lexer.LexerTokenType;
import org.perlonjava.runtime.runtimetypes.ErrorMessageUtil;
import org.perlonjava.runtime.runtimetypes.PerlCompilerException;
import org.perlonjava.runtime.runtimetypes.PerlParserException;

import java.util.ArrayList;
import java.util.List;

import static org.perlonjava.frontend.parser.SpecialBlockParser.setCurrentScope;
import static org.perlonjava.frontend.parser.TokenUtils.peek;

/**
 * The Parser class is responsible for parsing a list of tokens into an abstract syntax tree (AST).
 * It handles operator precedence, associativity, and special token combinations.
 */
public class Parser {

    /**
     * Precedence reported by {@link #getPrecedence(String)} for any token text that has no
     * entry in {@code ParserTables.precedenceMap}.
     */
    private static final int DEFAULT_PRECEDENCE = 24;

    // Context for code emission.
    public final EmitterContext ctx;
    // List of tokens to be parsed.
    public final List<LexerToken> tokens;
    // List to store format nodes encountered during parsing.
    private final List<FormatNode> formatNodes = new ArrayList<>();
    // List to store completed format nodes after template parsing.
    private final List<FormatNode> completedFormatNodes = new ArrayList<>();
    // Current index in the token list.
    public int tokenIndex = 0;
    // Flags to indicate special parsing states.
    public boolean parsingForLoopVariable = false;
    public boolean parsingTakeReference = false;
    // Are we currently parsing a my/our/state declaration's variable list?
    // Used to suppress strict vars checking for the variable being declared.
    public boolean parsingDeclaration = false;
    // Are we parsing a variable used as the class in indirect object syntax?
    // Suppresses the "syntax error" check for $var( in Variable.java
    public boolean parsingIndirectObject = false;
    // Are we parsing the top level script?
    public boolean isTopLevelScript = false;
    // Are we parsing inside a class block?
    public boolean isInClassBlock = false;
    // Are we parsing inside a method?
    public boolean isInMethod = false;
    // Are we parsing inside a braced dereference like %{...} or @{...}?
    // When true, inner {} should default to hash constructor, not block.
    public boolean insideBracedDereference = false;
    // List to store ADJUST blocks for the current class
    public List<Node> classAdjustBlocks = new ArrayList<>();
    // List to store heredoc nodes encountered during parsing.
    // Either owned by this parser or shared with another one (see the three-argument constructor).
    private final List<OperatorNode> heredocNodes;
    // When heredocs are processed before BEGIN blocks, this tracks where to skip to
    // after the NEWLINE (past the heredoc content that was already consumed).
    public int heredocSkipToIndex = -1;
    // The specific NEWLINE token index that should trigger the skip.
    public int heredocNewlineIndex = -1;
    // Base line number for string sub-parsers. When > 0, this parser operates on
    // re-tokenized string content and __LINE__ should use this as the base line,
    // counting newlines from the inner token list to offset from it.
    public int baseLineNumber = 0;

    /**
     * Constructs a Parser with the given context and tokens and its own, initially empty,
     * heredoc node list.
     *
     * @param ctx    The context for code emission.
     * @param tokens The list of tokens to parse.
     */
    public Parser(EmitterContext ctx, List<LexerToken> tokens) {
        this(ctx, tokens, new ArrayList<>());
    }

    /**
     * Constructs a Parser that records heredoc nodes into a caller-supplied list instead of
     * creating a new one, so that several parsers can share the same pending heredocs.
     * <p>
     * When both {@code ctx} and {@code ctx.symbolTable} are non-null, the symbol table is
     * registered through {@code SpecialBlockParser.setCurrentScope}.
     *
     * @param ctx                The context for code emission.
     * @param tokens             The list of tokens to parse.
     * @param sharedHeredocNodes The heredoc node list to share; stored as-is, not copied.
     */
    public Parser(EmitterContext ctx, List<LexerToken> tokens, List<OperatorNode> sharedHeredocNodes) {
        this.ctx = ctx;
        this.tokens = tokens;
        this.heredocNodes = sharedHeredocNodes;
        if (ctx != null && ctx.symbolTable != null) {
            setCurrentScope(ctx.symbolTable);
        }
    }

    /**
     * Tells whether a token ends the expression currently being parsed.
     *
     * @param token The token to inspect.
     * @return {@code true} if the token is EOF or its text is listed in
     * {@code ParserTables.TERMINATORS}.
     */
    public static boolean isExpressionTerminator(LexerToken token) {
        return token.type == LexerTokenType.EOF || ParserTables.TERMINATORS.contains(token.text);
    }

    /**
     * Tells whether {@code text} is the letter {@code x} immediately followed by one or more
     * ASCII digits, i.e. a repetition operator fused with its count (as in {@code "a" x3}).
     */
    private static boolean isRepetitionWithCount(String text) {
        if (text.length() < 2 || text.charAt(0) != 'x') {
            return false;
        }
        for (int i = 1; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the list of heredoc nodes encountered during parsing.
     *
     * @return The list of heredoc nodes (the live list, not a copy).
     */
    public List<OperatorNode> getHeredocNodes() {
        return heredocNodes;
    }

    /**
     * Returns the list of format nodes encountered during parsing.
     *
     * @return The list of format nodes (the live list, not a copy).
     */
    public List<FormatNode> getFormatNodes() {
        return formatNodes;
    }

    /**
     * Returns the list of completed format nodes after template parsing.
     *
     * @return The list of completed format nodes (the live list, not a copy).
     */
    public List<FormatNode> getCompletedFormatNodes() {
        return completedFormatNodes;
    }

    /**
     * Retrieves the precedence of the given operator.
     *
     * @param operator The operator to check.
     * @return The precedence level of the operator, or {@code 24} when the operator has no
     * entry in {@code ParserTables.precedenceMap}.
     */
    public int getPrecedence(String operator) {
        return ParserTables.precedenceMap.getOrDefault(operator, DEFAULT_PRECEDENCE);
    }

    /**
     * Parses the tokens into an abstract syntax tree (AST).
     * <p>
     * If the token at the current index is {@code =}, a NEWLINE token is inserted at the front
     * of the token list first. Unless {@link #isTopLevelScript} is set, the resulting block is
     * annotated with {@code isFileLevelBlock}. Heredocs still pending after the block has been
     * parsed are reported through {@code ParseHeredoc.heredocError}.
     *
     * @return The root node of the parsed AST.
     */
    public Node parse() {
        // Guard the index so an empty token list does not fail here with IndexOutOfBoundsException.
        if (tokenIndex < tokens.size() && "=".equals(tokens.get(tokenIndex).text)) {
            // looks like pod: insert a newline to trigger pod parsing
            tokens.addFirst(new LexerToken(LexerTokenType.NEWLINE, "\n"));
        }
        Node ast = ParseBlock.parseBlock(this);
        // Mark the AST as a top-level file block for proper bare block return value handling
        // This annotation is checked in EmitBlock to handle RUNTIME context bare blocks
        if (!isTopLevelScript && ast instanceof AbstractNode fileBlock) {
            // For do "file" and require, mark the block so bare blocks return their value
            fileBlock.setAnnotation("isFileLevelBlock", true);
        }
        if (!getHeredocNodes().isEmpty()) {
            ParseHeredoc.heredocError(this);
        }
        return ast;
    }

    /**
     * Parses an expression based on operator precedence.
     * <p>
     * Higher precedence means tighter: `*` has higher precedence than `+`
     * <p>
     * Explanation of the  <a href="https://en.wikipedia.org/wiki/Operator-precedence_parser">precedence climbing method</a>
     * can be found in Wikipedia.
     * </p>
     * <p>
     * Two token rewrites happen in operator position: {@code x} directly followed by
     * {@code =} is merged into a single {@code x=} operator token (unless it is really
     * {@code x =>}), and {@code x} fused with digits (e.g. {@code x3}) is split into the
     * {@code x} operator and a NUMBER token. Both rewrites modify {@link #tokens} in place.
     *
     * @param precedence The precedence level of the current expression; only operators with a
     *                   strictly higher precedence are consumed.
     * @return The root node of the parsed expression.
     */
    public Node parseExpression(int precedence) {
        // First, parse the primary expression (like a number or a variable).
        Node left = ParsePrimary.parsePrimary(this);

        // Continuously process tokens until we reach the end of the expression.
        while (true) {
            // Peek at the next token to determine what to do next.
            LexerToken token = peek(this);

            // Check if we have reached the end of the input (EOF) or a terminator (like `;`).
            if (isExpressionTerminator(token)) {
                break;
            }

            // Get the precedence of the current token.
            int tokenPrecedence = getPrecedence(token.text);

            // Special case: an IDENTIFIER that is a quote-like operator and has no precedence
            // entry of its own is not an infix operator - it starts a new expression.
            if (token.type == LexerTokenType.IDENTIFIER
                    && tokenPrecedence == DEFAULT_PRECEDENCE
                    && ParsePrimary.isIsQuoteLikeOperator(token.text)) {
                break;
            }

            // If the token's precedence is less than or equal to the precedence of the current expression, stop parsing.
            if (tokenPrecedence <= precedence) {
                break;
            }

            boolean tokenRewritten = false;
            if (token.text.equals("x") && "=".equals(lookaheadText(1))) {
                // 'x' directly followed by '='. The same token pair also shows up in
                // assignments like '$x=3', but here we are in operator position.
                //
                // Check if this is actually 'x =>' (fat comma autoquoting).
                // In that case, 'x' should be treated as a bareword, not as the repetition operator.
                // This is critical for Moo which uses hash keys like: x => 1
                // Without this check, 'x =>' would be parsed as repetition operator 'x=' followed by '>'
                if (">".equals(lookaheadText(2))) {
                    break; // Stop parsing infix, let 'x' be parsed as a bareword argument
                }
                // Combine 'x' and '=' into a single OPERATOR token 'x=' and drop the '=' token.
                token.text = "x=";
                token.type = LexerTokenType.OPERATOR;
                tokens.remove(tokenIndex + 1);
                tokenRewritten = true;
            } else if (isRepetitionWithCount(token.text)) {
                // 'x3' style token: 'x' followed directly by a number without space.
                // Split it into the 'x' operator and a NUMBER token inserted right after it.
                // A plain digit scan is used (rather than Integer.parseInt) so that counts
                // which do not fit in an int are still split.
                String count = token.text.substring(1);
                token.text = "x";
                token.type = LexerTokenType.OPERATOR;
                tokens.add(tokenIndex + 1, new LexerToken(LexerTokenType.NUMBER, count));
                tokenRewritten = true;
            }

            if (tokenRewritten) {
                // tokenPrecedence was looked up for the text *before* the rewrite ('x' or 'x3').
                // Use the precedence of the operator that will actually be dispatched, so that
                // e.g. `"a" x3` groups exactly like `"a" x 3`. If the table has no entry for
                // the rewritten operator, keep the previously computed value.
                tokenPrecedence = ParserTables.precedenceMap.getOrDefault(token.text, tokenPrecedence);
                if (tokenPrecedence <= precedence) {
                    // The rewritten token stays in the list; an enclosing call will pick it up.
                    break;
                }
            }

            // A right associative operator (like exponentiation) parses its right side with
            // lower precedence; every other operator uses its own precedence.
            boolean rightAssociative = ParserTables.RIGHT_ASSOC_OP.contains(token.text);
            if (CompilerOptions.DEBUG_ENABLED) {
                ctx.logDebug("parseExpression `" + token.text + "` precedence: " + tokenPrecedence
                        + (rightAssociative ? " right assoc" : " left assoc"));
            }
            left = ParseInfix.parseInfixOperation(
                    this, left, rightAssociative ? tokenPrecedence - 1 : tokenPrecedence);
        }

        // Return the root node of the constructed expression tree.
        return left;
    }

    /**
     * Returns the text of the token {@code offset} positions after the current token index,
     * or an empty string when that position lies outside the token list.
     */
    private String lookaheadText(int offset) {
        int index = tokenIndex + offset;
        if (index < 0 || index >= tokens.size()) {
            return "";
        }
        return tokens.get(index).text;
    }

    /**
     * Throws a {@link PerlCompilerException} located at the current token index.
     *
     * @param message The error message.
     * @throws PerlCompilerException always.
     */
    public void throwError(String message) {
        throw new PerlCompilerException(this.tokenIndex, message, this.ctx.errorUtil);
    }

    /**
     * Throws a {@link PerlCompilerException} located at the given token index.
     *
     * @param index   The token index to report the error at.
     * @param message The error message.
     * @throws PerlCompilerException always.
     */
    public void throwError(int index, String message) {
        throw new PerlCompilerException(index, message, this.ctx.errorUtil);
    }

    /**
     * Throws a clean parser error that matches Perl's exact error message format
     * without additional context or stack traces.
     * <p>
     * The message is formatted as {@code <message> at <file> line <line>.} using the source
     * location of the current token index.
     *
     * @param message The error message, without location suffix.
     * @throws PerlParserException always.
     */
    public void throwCleanError(String message) {
        ErrorMessageUtil.SourceLocation loc = this.ctx.errorUtil.getSourceLocationAccurate(this.tokenIndex);
        String cleanMessage = message + " at " + loc.fileName() + " line " + loc.lineNumber() + ".";
        throw new PerlParserException(cleanMessage);
    }

    /**
     * Logs the current token index and the number of pending heredoc nodes.
     * Does nothing unless {@code CompilerOptions.DEBUG_ENABLED} is set.
     *
     * @param location A label identifying the call site in the log output.
     */
    public void debugHeredocState(String location) {
        if (CompilerOptions.DEBUG_ENABLED) {
            this.ctx.logDebug("HEREDOC_STATE [" + location + "] tokenIndex=" + tokenIndex +
                    " heredocCount=" + heredocNodes.size());
        }
    }

}