mparser: lexer: reduce regular expression usage

Match single-character tokens with a separate dictionary lookup. As pointed out by dcbaker, this is even faster than str.index and gives the syntax error check for free (via KeyError). It also enables splitting the special-case "if" in two parts, one for long tokens and one for short tokens, thus providing further speedup. This shaves about 2/3rds of the time spent in lex(). Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
author: Paolo Bonzini <pbonzini@redhat.com> 2025-10-17 10:42:32 +0200
committer: Dylan Baker <dylan@pnwbakers.com> 2025-12-08 10:08:10 -0800
commit: 4ef1866dba127a9c12f4c475d3bbb6aaac97a243 (patch)
tree: 87c930b0b907b5f0c024fb2e4485f3b6c4ddd0f5
parent: 908e85b326ff161273f079a4f4fa3dfd207ced9c (diff)
download: meson-4ef1866dba127a9c12f4c475d3bbb6aaac97a243.tar.gz
1 files changed, 72 insertions, 64 deletions
diff --git a/mesonbuild/mparser.py b/mesonbuild/mparser.py
index a42fcdb06..5a9494606 100644
--- a/mesonbuild/mparser.py
+++ b/mesonbuild/mparser.py
@@ -119,36 +119,39 @@ class Lexer:
             ('id', IDENT_RE),
             ('number', re.compile(r'0[bB][01]+|0[oO][0-7]+|0[xX][0-9a-fA-F]+|0|[1-9]\d*')),
             ('eol_cont', re.compile(r'\\[ \t]*(#.*)?\n')),
-            ('eol', re.compile(r'\n')),
             ('multiline_string', re.compile(r"'''(.|\n)*?'''", re.M)),
             ('comment', re.compile(r'#.*')),
-            ('lparen', re.compile(r'\(')),
-            ('rparen', re.compile(r'\)')),
-            ('lbracket', re.compile(r'\[')),
-            ('rbracket', re.compile(r'\]')),
-            ('lcurl', re.compile(r'\{')),
-            ('rcurl', re.compile(r'\}')),
-            ('dblquote', re.compile(r'"')),
             ('string', re.compile(r"'([^'\\]|(\\.))*'")),
-            ('comma', re.compile(r',')),
             ('plusassign', re.compile(r'\+=')),
-            ('dot', re.compile(r'\.')),
-            ('plus', re.compile(r'\+')),
-            ('dash', re.compile(r'-')),
-            ('star', re.compile(r'\*')),
-            ('percent', re.compile(r'%')),
-            ('fslash', re.compile(r'/')),
-            ('colon', re.compile(r':')),
             ('equal', re.compile(r'==')),
             ('nequal', re.compile(r'!=')),
-            ('assign', re.compile(r'=')),
             ('le', re.compile(r'<=')),
-            ('lt', re.compile(r'<')),
             ('ge', re.compile(r'>=')),
-            ('gt', re.compile(r'>')),
-            ('questionmark', re.compile(r'\?')),
         ]
 
+        self.single_char_tokens = {
+            '\n': 'eol',
+            '(': 'lparen',
+            ')': 'rparen',
+            '[': 'lbracket',
+            ']': 'rbracket',
+            '{': 'lcurl',
+            '}': 'rcurl',
+            '"': 'dblquote',
+            ',': 'comma',
+            '.': 'dot',
+            '+': 'plus',
+            '-': 'dash',
+            '*': 'star',
+            '%': 'percent',
+            '/': 'fslash',
+            ':': 'colon',
+            '=': 'assign',
+            '<': 'lt',
+            '>': 'gt',
+            '?': 'questionmark',
+        }
+
     def getline(self, line_start: int) -> str:
         return self.code[line_start:self.code.find('\n', line_start)]
 
@@ -159,22 +162,25 @@ class Lexer:
         par_count = 0
         bracket_count = 0
         curl_count = 0
-        col = 0
-        while loc < len(self.code):
-            matched = False
-            value: str = ''
-            for (tid, reg) in self.token_specification:
-                mo = reg.match(self.code, loc)
-                if mo:
-                    curline = lineno
-                    curline_start = line_start
-                    col = mo.start() - line_start
-                    matched = True
-                    span_start = loc
-                    loc = mo.end()
-                    span_end = loc
-                    bytespan = (span_start, span_end)
-                    value = mo.group()
+        try:
+            while loc < len(self.code):
+                value: str
+                span_start = loc
+                col = loc - line_start
+                curline = lineno
+                curline_start = line_start
+                for (tid, reg) in self.token_specification:
+                    mo = reg.match(self.code, loc)
+                    if mo:
+                        value = mo.group()
+                        loc = mo.end()
+                        break
+                else:
+                    # lex single characters and raise an exception for invalid tokens
+                    value = self.code[loc]
+                    tid = self.single_char_tokens[value]
+                    loc += 1
+
                     if tid == 'lparen':
                         par_count += 1
                     elif tid == 'rparen':
@@ -189,39 +195,41 @@ class Lexer:
                         curl_count -= 1
                     elif tid == 'dblquote':
                         raise ParseException('Double quotes are not supported. Use single quotes.', self.getline(line_start), lineno, col)
-                    elif tid in {'string', 'fstring'}:
-                        if value.find("\n") != -1:
-                            msg = ("Newline character in a string detected, use ''' (three single quotes) "
-                                   "for multiline strings instead.\n"
-                                   "This will become a hard error in a future Meson release.")
-                            mlog.warning(mlog.code_line(msg, self.getline(line_start), col), location=BaseNode(lineno, col, filename))
-                        value = value[2 if tid == 'fstring' else 1:-1]
-                    elif tid in {'multiline_string', 'multiline_fstring'}:
-                        value = value[4 if tid == 'multiline_fstring' else 3:-3]
-                        lines = value.split('\n')
-                        if len(lines) > 1:
-                            lineno += len(lines) - 1
-                            line_start = mo.end() - len(lines[-1]) - 3
-                    elif tid == 'eol_cont':
-                        lineno += 1
-                        line_start = loc
-                        tid = 'whitespace'
                     elif tid == 'eol':
                         lineno += 1
                         line_start = loc
                         if par_count > 0 or bracket_count > 0 or curl_count > 0:
                             tid = 'whitespace'
-                    elif tid == 'id':
-                        if value in self.keywords:
-                            tid = value
-                        else:
-                            if value in self.future_keywords:
-                                mlog.warning(f"Identifier '{value}' will become a reserved keyword in a future release. Please rename it.",
-                                             location=BaseNode(lineno, col, filename))
-                    yield Token(tid, filename, curline_start, curline, col, bytespan, value)
-                    break
-            if not matched:
-                raise ParseException(f'lexer: unrecognized token {self.code[loc]!r}', self.getline(line_start), lineno, loc - line_start)
+
+                if tid in {'string', 'fstring'}:
+                    if value.find("\n") != -1:
+                        msg = ("Newline character in a string detected, use ''' (three single quotes) "
+                               "for multiline strings instead.\n"
+                               "This will become a hard error in a future Meson release.")
+                        mlog.warning(mlog.code_line(msg, self.getline(line_start), col), location=BaseNode(lineno, col, filename))
+                    value = value[2 if tid == 'fstring' else 1:-1]
+                elif tid in {'multiline_string', 'multiline_fstring'}:
+                    value = value[4 if tid == 'multiline_fstring' else 3:-3]
+                    lines = value.split('\n')
+                    if len(lines) > 1:
+                        lineno += len(lines) - 1
+                        line_start = loc - len(lines[-1]) - 3
+                elif tid == 'eol_cont':
+                    lineno += 1
+                    line_start = loc
+                    tid = 'whitespace'
+                elif tid == 'id':
+                    if value in self.keywords:
+                        tid = value
+                    else:
+                        if value in self.future_keywords:
+                            mlog.warning(f"Identifier '{value}' will become a reserved keyword in a future release. Please rename it.",
+                                         location=BaseNode(lineno, col, filename))
+                bytespan = (span_start, loc)
+                yield Token(tid, filename, curline_start, curline, col, bytespan, value)
+
+        except KeyError:
+            raise ParseException(f'lexer: unrecognized token {self.code[loc]!r}', self.getline(line_start), lineno, loc - line_start)
 
 @dataclass
 class BaseNode:
author	Paolo Bonzini <pbonzini@redhat.com>	2025-10-17 10:42:32 +0200
committer	Dylan Baker <dylan@pnwbakers.com>	2025-12-08 10:08:10 -0800
commit	4ef1866dba127a9c12f4c475d3bbb6aaac97a243 (patch)
tree	87c930b0b907b5f0c024fb2e4485f3b6c4ddd0f5
parent	908e85b326ff161273f079a4f4fa3dfd207ced9c (diff)
download	meson-4ef1866dba127a9c12f4c475d3bbb6aaac97a243.tar.gz