Commit 10e5ccf

fix: avoid MemoryError in tokenize
1 parent: 4599335

2 files changed: 26 additions & 7 deletions

Lib/test/test_tokenize.py

Lines changed: 10 additions & 0 deletions
@@ -3188,6 +3188,16 @@ def get_tokens(string):
             with self.subTest(case=case):
                 self.assertRaises(tokenize.TokenError, get_tokens, case)
 
+    def test_tstring_multiline_bang_underflow(self):
+        # gh-149183: t-string with '!' across two lines used to raise
+        # MemoryError because last_expr_end > last_expr_size produced a
+        # negative length that was cast to a huge size_t.
+        self.assertRaises(
+            tokenize.TokenError,
+            list,
+            tokenize.tokenize(BytesIO(b't"{!\n!x').readline),
+        )
+
     @support.skip_wasi_stack_overflow()
     def test_max_indent(self):
         MAXINDENT = 100
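
For context, here is a standalone reproduction of the failure mode the new test covers — a minimal sketch assuming an interpreter with t-string support. Before this commit, tokenizing the two-line t-string below aborted with MemoryError; with the fix it raises tokenize.TokenError like other malformed f/t-strings.

    import tokenize
    from io import BytesIO

    # gh-149183 reproducer: the '{' and the '!' of the t-string expression
    # sit on different source lines, so the lexer's expression-length
    # bookkeeping went negative before this commit.
    source = b't"{!\n!x'
    try:
        list(tokenize.tokenize(BytesIO(source).readline))
    except tokenize.TokenError as exc:
        print("TokenError:", exc)  # post-fix behavior; pre-fix: MemoryError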

Parser/lexer/lexer.c

Lines changed: 16 additions & 7 deletions
@@ -121,12 +121,22 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     }
     PyObject *res = NULL;
 
+    Py_ssize_t expr_len = tok_mode->last_expr_size - tok_mode->last_expr_end;
+    if (expr_len < 0) {
+        /* last_expr_end > last_expr_size: happens when '{' and the closing
+           delimiter span different source lines, causing the strlen-based
+           size tracking to underflow. Treat as a tokenizer error rather
+           than passing a negative length (cast to huge size_t) to malloc or
+           PyUnicode_DecodeUTF8. */
+        return -1;
+    }
+
     // Look for a # character outside of string literals
     int hash_detected = 0;
     int in_string = 0;
     char quote_char = 0;
 
-    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
+    for (Py_ssize_t i = 0; i < expr_len; i++) {
         char ch = tok_mode->last_expr_buffer[i];
 
         // Skip escaped characters
@@ -163,7 +173,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     // If we found a # character in the expression, we need to handle comments
     if (hash_detected) {
         // Allocate buffer for processed result
-        char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char));
+        char *result = (char *)PyMem_Malloc((expr_len + 1) * sizeof(char));
         if (!result) {
             return -1;
         }
@@ -174,7 +184,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
         quote_char = 0; // Current string quote char
 
         // Process each character
-        while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
+        while (i < expr_len) {
             char ch = tok_mode->last_expr_buffer[i];
 
             // Handle string quotes
@@ -190,11 +200,10 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
             }
             // Skip comments
             else if (ch == '#' && !in_string) {
-                while (i < tok_mode->last_expr_size - tok_mode->last_expr_end &&
-                       tok_mode->last_expr_buffer[i] != '\n') {
+                while (i < expr_len && tok_mode->last_expr_buffer[i] != '\n') {
                     i++;
                 }
-                if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
+                if (i < expr_len) {
                     result[j++] = '\n';
                 }
             }
@@ -211,7 +220,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     } else {
         res = PyUnicode_DecodeUTF8(
             tok_mode->last_expr_buffer,
-            tok_mode->last_expr_size - tok_mode->last_expr_end,
+            expr_len,
             NULL
         );
     }
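
To illustrate the underflow the new expr_len guard catches: when the signed difference goes negative, its implicit conversion to an unsigned size_t wraps modulo 2**64 on a 64-bit build, which is why the symptom was a MemoryError rather than an outright crash. A quick sketch of that arithmetic (assuming 64-bit sizes):

    # last_expr_size - last_expr_end when last_expr_end is the larger value:
    expr_len = -1

    # Converting the negative signed length to an unsigned 64-bit size_t
    # wraps modulo 2**64, so the allocator was asked for ~1.8e19 bytes and
    # PyMem_Malloc / PyUnicode_DecodeUTF8 failed with MemoryError.
    as_size_t = expr_len % 2**64
    print(as_size_t)  # 18446744073709551615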
