Skip to content

Commit fa24bf3

Browse files
authored
Merge pull request #789 from lindsay-stevens/pyxform-714
714: Indicate which sheet a ${} replacement error comes from
2 parents c2bf6bc + dddb563 commit fa24bf3

22 files changed

Lines changed: 1339 additions & 245 deletions

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ dependencies = [
1111
"xlrd==2.0.1", # Read XLS files
1212
"openpyxl==3.1.5", # Read XLSX files
1313
"defusedxml==0.7.1", # Parse XML
14+
"lark==1.3.1", # Parse custom grammars
1415
]
1516

1617
[project.optional-dependencies]

pyxform/entities/entities_parsing.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,11 +260,12 @@ def get_validated_repeat_name(entity) -> str | None:
260260
match = parse_pyxform_references(value=value, match_limit=1, match_full=True)
261261
except PyXFormError as e:
262262
e.context.update(sheet="entities", column="repeat", row=2)
263+
raise
263264
else:
264-
if not match or not is_xml_tag(match[0]):
265+
if not match or match[0].last_saved:
265266
raise PyXFormError(ENTITY001.format(value=value))
266267
else:
267-
return match[0]
268+
return match[0].name
268269

269270

270271
def validate_entity_saveto(

pyxform/errors.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,19 @@ class ErrorCode(Enum):
5656
),
5757
)
5858
PYREF_003: Detail = Detail(
59-
name="PyXForm Reference Question Not Found",
59+
name="PyXForm Reference Name Not Found",
6060
msg=(
6161
"[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
62-
"Reference variables must refer to a question name. Could not find '{q}'."
62+
"Reference variables must contain a name from the 'survey' sheet. Could not "
63+
"find the name '{q}'."
64+
),
65+
)
66+
PYREF_004: Detail = Detail(
67+
name="PyXForm Reference Duplicate Name",
68+
msg=(
69+
"[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
70+
"Reference variables names must be unique anywhere in the 'survey'. The name "
71+
"'{q}' appears more than once."
6372
),
6473
)
6574
INTERNAL_001: Detail = Detail(

pyxform/parsing/expression.py

Lines changed: 69 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from functools import lru_cache
33
from typing import Any
44

5+
from lark import Lark, Token
6+
57
# ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
68
# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
79
# They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
@@ -20,10 +22,6 @@
2022
ncname_regex_ns = rf"{ncname_regex}(?:\:{ncname_regex})?"
2123
ncname_regex_ns_named = rf"(?P<ncname_ns>{ncname_regex_ns})"
2224

23-
date_regex = r"-?\d{4}-\d{2}-\d{2}"
24-
time_regex = r"\d{2}:\d{2}:\d{2}(\.\s+)?(((\+|\-)\d{2}:\d{2})|Z)?"
25-
date_time_regex = date_regex + "T" + time_regex
26-
2725
# pyxform_ref_outer picks up possible refs, and matches unterminated refs to exclude them.
2826
pyxform_ref_outer = r"\$\{(?P<pyxform_ref>[^}]+)\}|\$\{[^}]*$"
2927
pyxform_ref_inner = rf"(?P<last_saved>last-saved#)?{ncname_regex_named}"
@@ -32,75 +30,82 @@
3230
)
3331
pyxform_ref = rf"(?P<pyxform_ref>\$\{{{pyxform_ref_inner}\}})"
3432

35-
# Rule order is significant - match priority runs top to bottom.
36-
LEXER_RULES = {
37-
# https://www.w3.org/TR/xmlschema-2/#dateTime
38-
"DATETIME": date_time_regex,
39-
"DATE": date_regex,
40-
"TIME": time_regex,
41-
"NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
42-
# https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
43-
"OPS_MATH": r"[\*\+\-]| mod | div ",
44-
"OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
45-
"OPS_BOOL": r" and | or ",
46-
"OPS_UNION": r"\|",
47-
"OPEN_PAREN": r"\(",
48-
"CLOSE_PAREN": r"\)",
49-
"BRACKET": r"\[\]\{\}",
50-
"PARENT_REF": r"\.\.",
51-
"SELF_REF": r"\.",
52-
"PATH_SEP": r"\/", # javarosa.xpath says "//" is an "unsupported construct".
53-
"SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
54-
"COMMA": r",",
55-
"WHITESPACE": r"\s+",
56-
"PYXFORM_REF": pyxform_ref,
57-
"FUNC_CALL": ncname_regex_ns_named + r"\(",
58-
"XPATH_PRED_START": ncname_regex_ns_named + r"\[",
59-
"XPATH_PRED_END": r"\]",
60-
"URI_SCHEME": ncname_regex_named + r"://",
61-
"NAME": ncname_regex_named, # Must be after rules containing ncname_regex.
62-
"PYXFORM_REF_START": r"\$\{",
63-
"PYXFORM_REF_END": r"\}",
64-
"OTHER": r".+?", # Catch any other character so that parsing doesn't stop.
65-
}
66-
33+
lark_grammar = rf"""
34+
// Parser
35+
start: (token | WHITESPACE)*
36+
?token: DATETIME
37+
| DATE
38+
| TIME
39+
| NUMBER
40+
| OPS_MATH
41+
| OPS_COMP
42+
| OPS_BOOL
43+
| OPS_UNION
44+
| OPEN_PAREN
45+
| CLOSE_PAREN
46+
| BRACKET
47+
| PARENT_REF
48+
| SELF_REF
49+
| PATH_SEP
50+
| SYSTEM_LITERAL
51+
| COMMA
52+
| PYXFORM_REF
53+
| FUNC_CALL
54+
| XPATH_PRED_START
55+
| XPATH_PRED_END
56+
| URI_SCHEME
57+
| NAME
58+
| PYXFORM_REF_START
59+
| PYXFORM_REF_END
60+
| OTHER
61+
62+
// Lexer
63+
// https://www.w3.org/TR/xmlschema-2/#dateTime
64+
DATETIME.25: DATE "T" TIME
65+
DATE.24: /-?\d{{4}}-\d{{2}}-\d{{2}}/
66+
TIME.23: /\d{{2}}:\d{{2}}:\d{{2}}(\.\s+)?(((\+|\-)\d{{2}}:\d{{2}})|Z)?/
67+
NUMBER.22: /-?\d+\.\d*|-?\.\d+|-?\d+/
68+
// https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
69+
OPS_MATH.21: /[\*\+\-]| mod | div /
70+
OPS_COMP.20: /\=|\!\=|\<|\>|\<=|>=/
71+
OPS_BOOL.19: / and | or /
72+
OPS_UNION.18: /\|/
73+
OPEN_PAREN.17: /\(/
74+
CLOSE_PAREN.16: /\)/
75+
BRACKET.15: /[\[\{{\}}]/
76+
PARENT_REF.14: /\.\./
77+
SELF_REF.13: /\./\
78+
// # javarosa.xpath says "//" is an "unsupported construct".
79+
PATH_SEP.12: /\//
80+
SYSTEM_LITERAL.11: /"[^"]*"|'[^']*'/
81+
COMMA.10: /,/
82+
WHITESPACE.9: /\s+/
83+
PYXFORM_REF.8: /\$\{{(?:last-saved#)?{ncname_regex}\}}/
84+
FUNC_CALL.7: /{ncname_regex_ns}\(/
85+
XPATH_PRED_START.6: /{ncname_regex_ns}\[/
86+
XPATH_PRED_END.5: /\]/
87+
URI_SCHEME.4: /{ncname_regex}:\/\//
88+
// Must be lower priority than rules containing ncname_regex.
89+
NAME.3: /{ncname_regex_ns}/
90+
PYXFORM_REF_START.2: /\$\{{/
91+
PYXFORM_REF_END.1: /\}}/\
92+
// Catch any other character so that parsing doesn't stop.
93+
OTHER.0: /.+?/\
94+
"""
6795

6896
RE_NCNAME_NAMESPACED = re.compile(ncname_regex_ns_named)
6997
RE_PYXFORM_REF = re.compile(pyxform_ref)
7098
RE_PYXFORM_REF_OUTER = re.compile(pyxform_ref_outer)
7199
RE_PYXFORM_REF_INNER = re.compile(pyxform_ref_inner)
72100

73101

74-
def get_expression_lexer() -> re.Scanner:
75-
def get_tokenizer(name):
76-
def tokenizer(scan, value) -> ExpLexerToken | str:
77-
match = scan.match
78-
return ExpLexerToken(name, value, match.start(), match.end())
79-
80-
return tokenizer
81-
82-
lexicon = [(v, get_tokenizer(k)) for k, v in LEXER_RULES.items()]
83-
# re.Scanner is undocumented but has been around since at least 2003
84-
# https://mail.python.org/pipermail/python-dev/2003-April/035075.html
85-
return re.Scanner(lexicon)
86-
87-
88-
class ExpLexerToken:
89-
__slots__ = ("end", "name", "start", "value")
90-
91-
def __init__(self, name: str, value: str, start: int, end: int) -> None:
92-
self.name: str = name
93-
self.value: str = value
94-
self.start: int = start
95-
self.end: int = end
96-
97-
98-
# Scanner takes a few 100ms to compile so use the shared instance.
99-
_EXPRESSION_LEXER = get_expression_lexer()
102+
_EXPRESSION_LEXER = Lark(
103+
lark_grammar, parser="lalr", start="start", propagate_positions=True
104+
)
100105

101106

102107
@lru_cache(maxsize=128)
103-
def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
108+
def parse_expression(text: str) -> tuple[Token, ...]:
104109
"""
105110
Parse an expression.
106111
@@ -109,8 +114,7 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
109114
:param text: The expression.
110115
:return: The parsed tokens.
111116
"""
112-
tokens, remainder = _EXPRESSION_LEXER.scan(text)
113-
return tokens, remainder
117+
return tuple(_EXPRESSION_LEXER.lex(text))
114118

115119

116120
def is_xml_tag(value: str) -> bool:

pyxform/parsing/instance_expression.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
2121
:param xml_text: XML text that may contain an instance expression.
2222
:return: Tokens in instance expression, and the string position boundaries.
2323
"""
24-
tokens, _ = parse_expression(xml_text)
24+
tokens = parse_expression(xml_text)
2525
if not tokens:
2626
return []
2727
instance_enter = False
@@ -33,43 +33,43 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
3333
for t in tokens:
3434
emit = False
3535
# If an instance expression had started, note the string position boundary.
36-
if not instance_enter and t.name == "FUNC_CALL" and t.value == "instance(":
36+
if not instance_enter and t.type == "FUNC_CALL" and t.value == "instance(":
3737
instance_enter = True
3838
emit = True
39-
boundaries.append(t.start)
39+
boundaries.append(t.start_pos)
4040
# Tokens that are part of an instance expression.
4141
elif instance_enter:
4242
# Tokens that are part of the instance call.
4343
if (
44-
t.name == "SYSTEM_LITERAL"
45-
and last_token.name == "FUNC_CALL"
44+
t.type == "SYSTEM_LITERAL"
45+
and last_token.type == "FUNC_CALL"
4646
and last_token.value == "instance("
4747
):
4848
emit = True
49-
elif last_token.name == "SYSTEM_LITERAL" and t.name == "CLOSE_PAREN":
49+
elif last_token.type == "SYSTEM_LITERAL" and t.type == "CLOSE_PAREN":
5050
emit = True
51-
elif t.name == "PATH_SEP" and last_token.name == "CLOSE_PAREN":
51+
elif t.type == "PATH_SEP" and last_token.type == "CLOSE_PAREN":
5252
emit = True
5353
path_enter = True
5454
# A XPath path may continue after a predicate.
55-
elif t.name == "PATH_SEP" and last_token.name == "XPATH_PRED_END":
55+
elif t.type == "PATH_SEP" and last_token.type == "XPATH_PRED_END":
5656
emit = True
5757
path_enter = True
5858
# Tokens that are part of a XPath path.
5959
elif path_enter:
60-
if t.name == "WHITESPACE":
60+
if t.type == "WHITESPACE":
6161
path_enter = False
62-
elif t.name != "XPATH_PRED_START":
62+
elif t.type != "XPATH_PRED_START":
6363
emit = True
64-
elif t.name == "XPATH_PRED_START":
64+
elif t.type == "XPATH_PRED_START":
6565
emit = True
6666
path_enter = False
6767
pred_enter = True
6868
# Tokens that are part of a XPath predicate.
6969
elif pred_enter:
70-
if t.name != "XPATH_PRED_END":
70+
if t.type != "XPATH_PRED_END":
7171
emit = True
72-
elif t.name == "XPATH_PRED_END":
72+
elif t.type == "XPATH_PRED_END":
7373
emit = True
7474
pred_enter = False
7575
# Track instance expression tokens, ignore others.
@@ -78,10 +78,10 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
7878
# If an instance expression had ended, note the string position boundary.
7979
elif instance_enter:
8080
instance_enter = False
81-
boundaries.append(last_token.end)
81+
boundaries.append(last_token.end_pos)
8282

8383
if last_token is not None:
84-
boundaries.append(last_token.end)
84+
boundaries.append(last_token.end_pos)
8585

8686
# Pair up the boundaries [1, 2, 3, 4] -> [(1, 2), (3, 4)].
8787
bounds = iter(boundaries)

0 commit comments

Comments
 (0)