22from functools import lru_cache
33from typing import Any
44
5+ from lark import Lark , Token
6+
57# ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
68# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
79# They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
2022ncname_regex_ns = rf"{ ncname_regex } (?:\:{ ncname_regex } )?"
2123ncname_regex_ns_named = rf"(?P<ncname_ns>{ ncname_regex_ns } )"
2224
# Lexical forms follow https://www.w3.org/TR/xmlschema-2/#dateTime
# Date: optional leading "-", then yyyy-mm-dd.
date_regex = r"-?\d{4}-\d{2}-\d{2}"
# Time: hh:mm:ss, optional fractional seconds, optional timezone ("Z" or +/-hh:mm).
# The fraction is one or more *digits*: the spec's `('.' s+)?` uses "s" for a
# seconds digit, not for whitespace.
time_regex = r"\d{2}:\d{2}:\d{2}(\.\d+)?(((\+|\-)\d{2}:\d{2})|Z)?"
# DateTime is a date and a time joined by a literal "T".
date_time_regex = date_regex + "T" + time_regex
26-
2725# pyxform_ref_outer picks up possible refs, and matches unterminated refs to exclude them.
2826pyxform_ref_outer = r"\$\{(?P<pyxform_ref>[^}]+)\}|\$\{[^}]*$"
2927pyxform_ref_inner = rf"(?P<last_saved>last-saved#)?{ ncname_regex_named } "
3230)
3331pyxform_ref = rf"(?P<pyxform_ref>\$\{{{ pyxform_ref_inner } \}})"
3432
# Token name -> regex for the expression lexer.
# Rule order is significant - match priority runs top to bottom.
LEXER_RULES = {
    # https://www.w3.org/TR/xmlschema-2/#dateTime
    "DATETIME": date_time_regex,
    "DATE": date_regex,
    "TIME": time_regex,
    "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
    # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
    "OPS_MATH": r"[\*\+\-]| mod | div ",
    # Two-character operators are listed first: regex alternation accepts the
    # first branch that matches, so "<" ahead of "<=" would split "<=" in two.
    "OPS_COMP": r"\<=|>=|\!\=|\=|\<|\>",
    "OPS_BOOL": r" and | or ",
    "OPS_UNION": r"\|",
    "OPEN_PAREN": r"\(",
    "CLOSE_PAREN": r"\)",
    # NOTE(review): as written this only matches the literal four-character
    # sequence "[]{}" rather than any single bracket - confirm that is intended.
    "BRACKET": r"\[\]\{\}",
    "PARENT_REF": r"\.\.",
    "SELF_REF": r"\.",
    "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
    "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
    "COMMA": r",",
    "WHITESPACE": r"\s+",
    "PYXFORM_REF": pyxform_ref,
    "FUNC_CALL": ncname_regex_ns_named + r"\(",
    "XPATH_PRED_START": ncname_regex_ns_named + r"\[",
    "XPATH_PRED_END": r"\]",
    "URI_SCHEME": ncname_regex_named + r"://",
    "NAME": ncname_regex_named,  # Must be after rules containing ncname_regex.
    "PYXFORM_REF_START": r"\$\{",
    "PYXFORM_REF_END": r"\}",
    "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
}
66-
# Lark grammar for tokenising expressions.
# - This is an rf-string: literal regex braces are doubled ({{ }}), and single
#   braces interpolate the ncname regex fragments defined above.
# - The ".N" suffix on each terminal is its lexer priority; higher numbers win.
# - TIME allows fractional seconds as digits (the XML Schema "s+" means seconds
#   digits, not whitespace).
# - OPS_COMP lists "<=" and ">=" first because regex alternation accepts the
#   first branch that matches, which would otherwise split them into two tokens.
# NOTE(review): BRACKET (priority 15) also matches "}", so PYXFORM_REF_END
# (priority 1) looks unreachable - confirm that is intended.
lark_grammar = rf"""
// Parser
start: (token | WHITESPACE)*
?token: DATETIME
    | DATE
    | TIME
    | NUMBER
    | OPS_MATH
    | OPS_COMP
    | OPS_BOOL
    | OPS_UNION
    | OPEN_PAREN
    | CLOSE_PAREN
    | BRACKET
    | PARENT_REF
    | SELF_REF
    | PATH_SEP
    | SYSTEM_LITERAL
    | COMMA
    | PYXFORM_REF
    | FUNC_CALL
    | XPATH_PRED_START
    | XPATH_PRED_END
    | URI_SCHEME
    | NAME
    | PYXFORM_REF_START
    | PYXFORM_REF_END
    | OTHER

// Lexer
// https://www.w3.org/TR/xmlschema-2/#dateTime
DATETIME.25: DATE "T" TIME
DATE.24: /-?\d{{4}}-\d{{2}}-\d{{2}}/
TIME.23: /\d{{2}}:\d{{2}}:\d{{2}}(\.\d+)?(((\+|\-)\d{{2}}:\d{{2}})|Z)?/
NUMBER.22: /-?\d+\.\d*|-?\.\d+|-?\d+/
// https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
OPS_MATH.21: /[\*\+\-]| mod | div /
OPS_COMP.20: /\<=|>=|\!\=|\=|\<|\>/
OPS_BOOL.19: / and | or /
OPS_UNION.18: /\|/
OPEN_PAREN.17: /\(/
CLOSE_PAREN.16: /\)/
BRACKET.15: /[\[\{{\}}]/
PARENT_REF.14: /\.\./
SELF_REF.13: /\./\
// # javarosa.xpath says "//" is an "unsupported construct".
PATH_SEP.12: /\//
SYSTEM_LITERAL.11: /"[^"]*"|'[^']*'/
COMMA.10: /,/
WHITESPACE.9: /\s+/
PYXFORM_REF.8: /\$\{{(?:last-saved#)?{ncname_regex}\}}/
FUNC_CALL.7: /{ncname_regex_ns}\(/
XPATH_PRED_START.6: /{ncname_regex_ns}\[/
XPATH_PRED_END.5: /\]/
URI_SCHEME.4: /{ncname_regex}:\/\//
// Must be lower priority than rules containing ncname_regex.
NAME.3: /{ncname_regex_ns}/
PYXFORM_REF_START.2: /\$\{{/
PYXFORM_REF_END.1: /\}}/\
// Catch any other character so that parsing doesn't stop.
OTHER.0: /.+?/\
"""
6795
# Module-level compiled forms of the regex strings defined above.
# Optionally namespaced name, captured in the "ncname_ns" group.
RE_NCNAME_NAMESPACED = re.compile(ncname_regex_ns_named)
# A complete ${...} reference, captured in the "pyxform_ref" group.
RE_PYXFORM_REF = re.compile(pyxform_ref)
# Candidate ${...} references; also matches unterminated ones so they can be excluded.
RE_PYXFORM_REF_OUTER = re.compile(pyxform_ref_outer)
# The content inside a reference, with an optional "last-saved#" prefix.
RE_PYXFORM_REF_INNER = re.compile(pyxform_ref_inner)
72100
73101
def get_expression_lexer() -> re.Scanner:
    """
    Build a scanner from LEXER_RULES.

    Each rule's pattern is paired with a callback that wraps the matched text in
    an ExpLexerToken carrying the rule name and the match start/end offsets.

    :return: A re.Scanner whose rules follow the iteration order of LEXER_RULES.
    """

    def make_action(token_name):
        def action(scanner, text) -> ExpLexerToken | str:
            found = scanner.match
            return ExpLexerToken(token_name, text, found.start(), found.end())

        return action

    rules = []
    for token_name, pattern in LEXER_RULES.items():
        rules.append((pattern, make_action(token_name)))
    # re.Scanner is undocumented but has been around since at least 2003
    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
    return re.Scanner(rules)
86-
87-
class ExpLexerToken:
    """A lexed token: the rule name, the matched text, and its start/end offsets."""

    __slots__ = ("end", "name", "start", "value")

    def __init__(self, name: str, value: str, start: int, end: int) -> None:
        # name: the lexer rule that matched; value: the matched text.
        self.name, self.value = name, value
        # start/end: the match offsets supplied by the caller.
        self.start, self.end = start, end
96-
97-
# Scanner takes a few 100ms to compile, so build it once at import time and
# use this shared instance.
_EXPRESSION_LEXER = get_expression_lexer()
# Module-level Lark instance built from lark_grammar; parse_expression calls
# its lex() method for every expression.
_EXPRESSION_LEXER = Lark(
    lark_grammar, parser="lalr", start="start", propagate_positions=True
)
100105
101106
102107@lru_cache (maxsize = 128 )
103- def parse_expression (text : str ) -> tuple [list [ ExpLexerToken ], str ]:
108+ def parse_expression (text : str ) -> tuple [Token , ... ]:
104109 """
105110 Parse an expression.
106111
@@ -109,8 +114,7 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
109114 :param text: The expression.
110115 :return: A tuple of the tokens lexed from the expression.
111116 """
112- tokens , remainder = _EXPRESSION_LEXER .scan (text )
113- return tokens , remainder
117+ return tuple (_EXPRESSION_LEXER .lex (text ))
114118
115119
116120def is_xml_tag (value : str ) -> bool :
0 commit comments