diff --git a/bibtexparser/middlewares/interpolate.py b/bibtexparser/middlewares/interpolate.py
index 2e5d832..0842f9d 100644
--- a/bibtexparser/middlewares/interpolate.py
+++ b/bibtexparser/middlewares/interpolate.py
@@ -21,6 +21,86 @@ def _value_is_nonstring_or_enclosed(value: Any) -> bool:
     return False
 
 
+def _split_concatenation(value: str) -> "list[str] | None":
+    """Split a value on top-level ``#`` concatenation operators.
+
+    A ``#`` is top-level when it is outside every ``"..."`` and ``{...}``
+    group. Brace depth is tracked inside quoted strings as well, so a ``"``
+    protected by braces (``"a {"} b"``) does not end the string early.
+
+    Returns the list of stripped tokens if the value contains at least one
+    top-level ``#``, otherwise ``None`` (i.e., the value is not a
+    concatenation expression).
+    """
+    tokens = []
+    current = []
+    depth = 0
+    in_quotes = False
+    found_separator = False
+    for char in value:
+        if char == "{":
+            depth += 1
+        elif char == "}":
+            # Clamp at zero so a stray ``}`` cannot hide later separators.
+            depth = max(0, depth - 1)
+        elif depth == 0:
+            if char == '"':
+                in_quotes = not in_quotes
+            elif char == "#" and not in_quotes:
+                found_separator = True
+                tokens.append("".join(current).strip())
+                current = []
+                continue
+        current.append(char)
+
+    if not found_separator:
+        return None
+
+    tokens.append("".join(current).strip())
+    return tokens
+
+
+def _strip_matching_enclosure(text: str) -> "str | None":
+    """Return ``text`` without its enclosing ``"..."`` or ``{...}`` pair.
+
+    Returns ``None`` if ``text`` is not wrapped in a matching pair, which
+    includes a lone ``"`` and mismatched delimiters such as ``"abc}``.
+    """
+    if len(text) >= 2 and (text[0], text[-1]) in (('"', '"'), ("{", "}")):
+        return text[1:-1]
+    return None
+
+
+def _resolve_concatenation(tokens: "list[str]", string_values: "dict[str, str]") -> "str | None":
+    """Resolve concatenation tokens to their joined string content.
+
+    Each token is a number (kept verbatim), a quoted or braced string (its
+    inner content is used), or a string reference (looked up lower-cased in
+    ``string_values``; a matching enclosure around the stored value is
+    removed). Returns ``None`` if any token is empty or any reference is
+    unknown, so the caller can leave the original expression untouched.
+    """
+    resolved = []
+    for token in tokens:
+        if not token:
+            # ``a # # b`` or a leading/trailing ``#``: not a valid expression.
+            return None
+        inner = _strip_matching_enclosure(token)
+        if inner is not None:
+            resolved.append(inner)
+        elif token.isdigit():
+            resolved.append(token)
+        else:
+            try:
+                referenced = string_values[token.lower()]
+            except KeyError:
+                return None
+            inner = _strip_matching_enclosure(referenced)
+            resolved.append(referenced if inner is None else inner)
+
+    return "".join(resolved)
+
+
 class ResolveStringReferencesMiddleware(LibraryMiddleware):
     """Replace strings references with their values."""
 
@@ -59,6 +139,18 @@ def transform(self, library: Library) -> Library:
 
             field: Field
             for field in entry.fields:
+                if isinstance(field.value, str):
+                    tokens = _split_concatenation(field.value)
+                    if tokens is not None:
+                        joined = _resolve_concatenation(tokens, string_values)
+                        if joined is not None:
+                            # Keep the result enclosed (in braces) so the
+                            # downstream enclosing middlewares treat it as a
+                            # plain string rather than an unenclosed reference.
+                            field.value = "{" + joined + "}"
+                            resolved_fields.append(field.key)
+                            continue
+
                 if _value_is_nonstring_or_enclosed(field.value):
                     continue
                 try:
diff --git a/tests/middleware_tests/test_interpolate.py b/tests/middleware_tests/test_interpolate.py
index 264373c..a223598 100644
--- a/tests/middleware_tests/test_interpolate.py
+++ b/tests/middleware_tests/test_interpolate.py
@@ -1,5 +1,6 @@
 import pytest
 
+import bibtexparser
 from bibtexparser.middlewares.enclosing import RemoveEnclosingMiddleware
 from bibtexparser.middlewares.interpolate import ResolveStringReferencesMiddleware
 from bibtexparser.splitter import Splitter
@@ -66,3 +67,77 @@ def test_warning_is_raised_if_enclosings_are_removed():
 
     assert len(record) == 1
     assert "RemoveEnclosing" in record[0].message.args[0]
+
+
+def test_string_interpolation_resolves_concatenation():
+    bibtex = """
+    @string{jan = "Jan."}
+
+    @inbook{test_inbook,
+        month = 10 # "~" # jan,
+        title = "hello" # " " # "world",
+        mixed = jan # { } # "X",
+        numbers = 1 # 2 # 3
+    }
+    """
+    library = Splitter(bibtex).split()
+
+    m = ResolveStringReferencesMiddleware()
+    library = m.transform(library)
+
+    fields = library.entries_dict["test_inbook"].fields_dict
+    # Concatenations are resolved and kept enclosed in braces.
+    assert fields["month"].value == "{10~Jan.}"
+    assert fields["title"].value == "{hello world}"
+    assert fields["mixed"].value == "{Jan. X}"
+    assert fields["numbers"].value == "{123}"
+
+
+def test_string_interpolation_leaves_unresolvable_concatenation_untouched():
+    bibtex = """
+    @inbook{test_inbook,
+        note = unknown_macro # " suffix"
+    }
+    """
+    library = Splitter(bibtex).split()
+
+    m = ResolveStringReferencesMiddleware()
+    library = m.transform(library)
+
+    # An unknown reference means the expression is left exactly as-is.
+    assert (
+        library.entries_dict["test_inbook"].fields_dict["note"].value == 'unknown_macro # " suffix"'
+    )
+
+
+def test_parse_string_resolves_concatenation_end_to_end():
+    # Exact reproduction of issue #396.
+    bibtex_str = """
+    @STRING{ jan = "Jan." }
+
+    @INBOOK{inbook-full,
+        month = 10 # "~" # jan,
+    }
+    """
+    library = bibtexparser.parse_string(bibtex_str)
+    month = library.entries[0].fields_dict["month"].value
+    assert month == "10~Jan."
+
+    # The resolved value writes as a valid, brace-enclosed string and
+    # round-trips back to the same value.
+    written = bibtexparser.write_string(library)
+    assert "month = {10~Jan.}" in written
+    reparsed = bibtexparser.parse_string(written)
+    assert reparsed.entries[0].fields_dict["month"].value == "10~Jan."
+
+
+def test_concatenation_helpers_handle_brace_protected_and_lone_quotes():
+    from bibtexparser.middlewares.interpolate import _resolve_concatenation, _split_concatenation
+
+    # A ``"`` wrapped in braces does not terminate the quoted string.
+    tokens = _split_concatenation('"say {"}hi{"}" # jan')
+    assert tokens == ['"say {"}hi{"}"', "jan"]
+    assert _resolve_concatenation(tokens, {"jan": '"Jan."'}) == 'say {"}hi{"}Jan.'
+
+    # A lone, unterminated quote is malformed and must not be resolved.
+    assert _resolve_concatenation(["jan", '"'], {"jan": '"Jan."'}) is None