summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2025-10-17 10:42:32 +0200
committerDylan Baker <dylan@pnwbakers.com>2025-12-08 10:08:10 -0800
commit4ef1866dba127a9c12f4c475d3bbb6aaac97a243 (patch)
tree87c930b0b907b5f0c024fb2e4485f3b6c4ddd0f5
parent908e85b326ff161273f079a4f4fa3dfd207ced9c (diff)
downloadmeson-4ef1866dba127a9c12f4c475d3bbb6aaac97a243.tar.gz
mparser: lexer: reduce regular expression usage
Match single-character tokens with a separate dictionary lookup. As pointed out by dcbaker, this is even faster than str.index and gives the syntax error check for free (via KeyError). It also enables splitting the special-case "if" in two parts, one for long tokens and one for short tokens, thus providing further speedup. This shaves about 2/3rds of the time spent in lex(). Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-rw-r--r--mesonbuild/mparser.py136
1 files changed, 72 insertions, 64 deletions
diff --git a/mesonbuild/mparser.py b/mesonbuild/mparser.py
index a42fcdb06..5a9494606 100644
--- a/mesonbuild/mparser.py
+++ b/mesonbuild/mparser.py
@@ -119,36 +119,39 @@ class Lexer:
('id', IDENT_RE),
('number', re.compile(r'0[bB][01]+|0[oO][0-7]+|0[xX][0-9a-fA-F]+|0|[1-9]\d*')),
('eol_cont', re.compile(r'\\[ \t]*(#.*)?\n')),
- ('eol', re.compile(r'\n')),
('multiline_string', re.compile(r"'''(.|\n)*?'''", re.M)),
('comment', re.compile(r'#.*')),
- ('lparen', re.compile(r'\(')),
- ('rparen', re.compile(r'\)')),
- ('lbracket', re.compile(r'\[')),
- ('rbracket', re.compile(r'\]')),
- ('lcurl', re.compile(r'\{')),
- ('rcurl', re.compile(r'\}')),
- ('dblquote', re.compile(r'"')),
('string', re.compile(r"'([^'\\]|(\\.))*'")),
- ('comma', re.compile(r',')),
('plusassign', re.compile(r'\+=')),
- ('dot', re.compile(r'\.')),
- ('plus', re.compile(r'\+')),
- ('dash', re.compile(r'-')),
- ('star', re.compile(r'\*')),
- ('percent', re.compile(r'%')),
- ('fslash', re.compile(r'/')),
- ('colon', re.compile(r':')),
('equal', re.compile(r'==')),
('nequal', re.compile(r'!=')),
- ('assign', re.compile(r'=')),
('le', re.compile(r'<=')),
- ('lt', re.compile(r'<')),
('ge', re.compile(r'>=')),
- ('gt', re.compile(r'>')),
- ('questionmark', re.compile(r'\?')),
]
+ self.single_char_tokens = {
+ '\n': 'eol',
+ '(': 'lparen',
+ ')': 'rparen',
+ '[': 'lbracket',
+ ']': 'rbracket',
+ '{': 'lcurl',
+ '}': 'rcurl',
+ '"': 'dblquote',
+ ',': 'comma',
+ '.': 'dot',
+ '+': 'plus',
+ '-': 'dash',
+ '*': 'star',
+ '%': 'percent',
+ '/': 'fslash',
+ ':': 'colon',
+ '=': 'assign',
+ '<': 'lt',
+ '>': 'gt',
+ '?': 'questionmark',
+ }
+
def getline(self, line_start: int) -> str:
return self.code[line_start:self.code.find('\n', line_start)]
@@ -159,22 +162,25 @@ class Lexer:
par_count = 0
bracket_count = 0
curl_count = 0
- col = 0
- while loc < len(self.code):
- matched = False
- value: str = ''
- for (tid, reg) in self.token_specification:
- mo = reg.match(self.code, loc)
- if mo:
- curline = lineno
- curline_start = line_start
- col = mo.start() - line_start
- matched = True
- span_start = loc
- loc = mo.end()
- span_end = loc
- bytespan = (span_start, span_end)
- value = mo.group()
+ try:
+ while loc < len(self.code):
+ value: str
+ span_start = loc
+ col = loc - line_start
+ curline = lineno
+ curline_start = line_start
+ for (tid, reg) in self.token_specification:
+ mo = reg.match(self.code, loc)
+ if mo:
+ value = mo.group()
+ loc = mo.end()
+ break
+ else:
+ # lex single characters and raise an exception for invalid tokens
+ value = self.code[loc]
+ tid = self.single_char_tokens[value]
+ loc += 1
+
if tid == 'lparen':
par_count += 1
elif tid == 'rparen':
@@ -189,39 +195,41 @@ class Lexer:
curl_count -= 1
elif tid == 'dblquote':
raise ParseException('Double quotes are not supported. Use single quotes.', self.getline(line_start), lineno, col)
- elif tid in {'string', 'fstring'}:
- if value.find("\n") != -1:
- msg = ("Newline character in a string detected, use ''' (three single quotes) "
- "for multiline strings instead.\n"
- "This will become a hard error in a future Meson release.")
- mlog.warning(mlog.code_line(msg, self.getline(line_start), col), location=BaseNode(lineno, col, filename))
- value = value[2 if tid == 'fstring' else 1:-1]
- elif tid in {'multiline_string', 'multiline_fstring'}:
- value = value[4 if tid == 'multiline_fstring' else 3:-3]
- lines = value.split('\n')
- if len(lines) > 1:
- lineno += len(lines) - 1
- line_start = mo.end() - len(lines[-1]) - 3
- elif tid == 'eol_cont':
- lineno += 1
- line_start = loc
- tid = 'whitespace'
elif tid == 'eol':
lineno += 1
line_start = loc
if par_count > 0 or bracket_count > 0 or curl_count > 0:
tid = 'whitespace'
- elif tid == 'id':
- if value in self.keywords:
- tid = value
- else:
- if value in self.future_keywords:
- mlog.warning(f"Identifier '{value}' will become a reserved keyword in a future release. Please rename it.",
- location=BaseNode(lineno, col, filename))
- yield Token(tid, filename, curline_start, curline, col, bytespan, value)
- break
- if not matched:
- raise ParseException(f'lexer: unrecognized token {self.code[loc]!r}', self.getline(line_start), lineno, loc - line_start)
+
+ if tid in {'string', 'fstring'}:
+ if value.find("\n") != -1:
+ msg = ("Newline character in a string detected, use ''' (three single quotes) "
+ "for multiline strings instead.\n"
+ "This will become a hard error in a future Meson release.")
+ mlog.warning(mlog.code_line(msg, self.getline(line_start), col), location=BaseNode(lineno, col, filename))
+ value = value[2 if tid == 'fstring' else 1:-1]
+ elif tid in {'multiline_string', 'multiline_fstring'}:
+ value = value[4 if tid == 'multiline_fstring' else 3:-3]
+ lines = value.split('\n')
+ if len(lines) > 1:
+ lineno += len(lines) - 1
+ line_start = loc - len(lines[-1]) - 3
+ elif tid == 'eol_cont':
+ lineno += 1
+ line_start = loc
+ tid = 'whitespace'
+ elif tid == 'id':
+ if value in self.keywords:
+ tid = value
+ else:
+ if value in self.future_keywords:
+ mlog.warning(f"Identifier '{value}' will become a reserved keyword in a future release. Please rename it.",
+ location=BaseNode(lineno, col, filename))
+ bytespan = (span_start, loc)
+ yield Token(tid, filename, curline_start, curline, col, bytespan, value)
+
+ except KeyError:
+ raise ParseException(f'lexer: unrecognized token {self.code[loc]!r}', self.getline(line_start), lineno, loc - line_start)
@dataclass
class BaseNode: