From b5db685df9a9163c93b9d50ac21112137d2d3aef Mon Sep 17 00:00:00 2001 From: "Marko K. S." Date: Mon, 29 Jun 2026 22:55:45 +0200 Subject: [PATCH] Fix: Improve string literal parsing and validation This change enhances the YINI parser's string handling by: - Correctly decoding classic string escapes, including octal (`\o`) and hexadecimal (`\x`, `\u`, `\U`) sequences. - Rejecting literal control characters in single-quoted and double-quoted strings, while still allowing line breaks and tabs in multiline triple-quoted strings, as per YINI's specification. - Ensuring consistent UTF-8 output for the test-suite adapter on Windows. --- CHANGELOG.md | 5 + src/yini_parser/core/value_decoders.py | 204 ++++++++++++++++++++++++- tests/test_adapter.py | 40 +++++ tests/test_values.py | 35 +++++ tools/yini_parser_adapter.py | 15 ++ 5 files changed, 294 insertions(+), 5 deletions(-) create mode 100644 tests/test_adapter.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e88e00..cd49834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # CHANGELOG +## 0.2.0 beta 2 - 2026 June + LATER +- **Fixed:** The test-suite adapter now writes UTF-8 JSON consistently on Windows. +- **Fixed:** Classic strings now decode YINI escape sequences directly and reject invalid octal and Unicode escapes. +- **Fixed:** Literal control characters are rejected in single-line strings, while multiline triple-quoted strings still allow line breaks and tabs. + ## 0.2.0 beta 1 - 2026 June - **Fixed:** Improved error reporting when a YINI file has broken syntax, including unfinished block comments such as `/* comment`. - **Fixed:** `#!` lines outside the first line are now safely ignored as comment-like lines. 
diff --git a/src/yini_parser/core/value_decoders.py b/src/yini_parser/core/value_decoders.py index 0fbb7e1..7fae74c 100644 --- a/src/yini_parser/core/value_decoders.py +++ b/src/yini_parser/core/value_decoders.py @@ -1,4 +1,6 @@ # src/yini_parser/core/value_decoders.py +from typing import NoReturn + from ..api.errors import YiniParseError """ @@ -6,6 +8,19 @@ - Decoders converts tokens into its runtime value. """ +_CLASSIC_SIMPLE_ESCAPES = { + "\\": "\\", + '"': '"', + "'": "'", + "a": "\a", + "b": "\b", + "f": "\f", + "n": "\n", + "r": "\r", + "t": "\t", + "v": "\v", +} + def decode_string_token( token_text: str, @@ -46,10 +61,17 @@ def decode_string_token( if prefix in {"C", "c"}: return _decode_classic_string( inner, + allow_line_breaks=True, line=line, column=column, ) + _validate_string_content( + inner, + allow_line_breaks=True, + line=line, + column=column, + ) return inner # Single-quoted or double-quoted string. @@ -58,12 +80,19 @@ def decode_string_token( # Raw and unprefixed strings: return as-is. if prefix in {"", "R", "r"}: + _validate_string_content( + inner, + allow_line_breaks=False, + line=line, + column=column, + ) return inner # Classic strings: decode escapes. 
if prefix in {"C", "c"}: return _decode_classic_string( inner, + allow_line_breaks=False, line=line, column=column, ) @@ -141,17 +170,182 @@ def parse_number_literal(text, line=None, column=None): def _decode_classic_string( inner: str, *, + allow_line_breaks: bool, line: int | None = None, column: int | None = None, ) -> str: - try: - return bytes(inner, "utf-8").decode("unicode_escape") - except UnicodeDecodeError as exc: + result: list[str] = [] + index = 0 + + while index < len(inner): + char = inner[index] + + if char != "\\": + _validate_string_content( + char, + allow_line_breaks=allow_line_breaks, + line=line, + column=column, + ) + result.append(char) + index += 1 + continue + + if index + 1 >= len(inner): + _raise_invalid_escape( + "Invalid string escape sequence: trailing backslash.", + line=line, + column=column, + ) + + escape = inner[index + 1] + + if escape in _CLASSIC_SIMPLE_ESCAPES: + result.append(_CLASSIC_SIMPLE_ESCAPES[escape]) + index += 2 + continue + + if escape == "o": + result.append( + _decode_digits_escape( + inner, + start=index + 2, + length=3, + base=8, + prefix="\\o", + line=line, + column=column, + ) + ) + index += 5 + continue + + if escape == "x": + result.append( + _decode_digits_escape( + inner, + start=index + 2, + length=2, + base=16, + prefix="\\x", + line=line, + column=column, + ) + ) + index += 4 + continue + + if escape == "u": + result.append( + _decode_digits_escape( + inner, + start=index + 2, + length=4, + base=16, + prefix="\\u", + line=line, + column=column, + ) + ) + index += 6 + continue + + if escape == "U": + result.append( + _decode_digits_escape( + inner, + start=index + 2, + length=8, + base=16, + prefix="\\U", + line=line, + column=column, + ) + ) + index += 10 + continue + + _raise_invalid_escape( + f"Invalid string escape sequence: \\{escape}.", + line=line, + column=column, + ) + + return "".join(result) + + +def _validate_string_content( + text: str, + *, + allow_line_breaks: bool, + line: int | None = 
None, + column: int | None = None, +) -> None: + for char in text: + if ord(char) >= 0x20: + continue + + if allow_line_breaks and char in {"\n", "\r", "\t"}: + continue + raise YiniParseError( - f"Invalid string escape sequence: {exc.reason}.", + "Invalid string literal: literal control characters are not allowed.", line=line, column=column, - ) from None + ) + + +def _decode_digits_escape( + text: str, + *, + start: int, + length: int, + base: int, + prefix: str, + line: int | None = None, + column: int | None = None, +) -> str: + digits = text[start : start + length] + + if len(digits) != length or not _digits_match_base(digits, base): + _raise_invalid_escape( + f"Invalid string escape sequence: {prefix} requires {length} " + f"base-{base} digit(s).", + line=line, + column=column, + ) + + try: + return chr(int(digits, base)) + except ValueError: + _raise_invalid_escape( + f"Invalid string escape sequence: {prefix}{digits}.", + line=line, + column=column, + ) + + +def _digits_match_base(digits: str, base: int) -> bool: + if base == 8: + return all("0" <= digit <= "7" for digit in digits) + + if base == 16: + return all(digit in "0123456789abcdefABCDEF" for digit in digits) + + raise ValueError(f"Unsupported escape base: {base}") + + +def _raise_invalid_escape( + message: str, + *, + line: int | None = None, + column: int | None = None, +) -> NoReturn: + raise YiniParseError( + message, + line=line, + column=column, + ) def _parse_duodecimal( diff --git a/tests/test_adapter.py b/tests/test_adapter.py new file mode 100644 index 0000000..b7fcece --- /dev/null +++ b/tests/test_adapter.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + + +def test_adapter_writes_utf8_json_for_unicode_strings(tmp_path: Path) -> None: + input_path = tmp_path / "unicode.yini" + input_path.write_text( + """ +@yini + +^ Strings +quote = "She said “hello” and left." 
+""".lstrip(), + encoding="utf-8", + ) + + completed = subprocess.run( + [ + sys.executable, + "tools/yini_parser_adapter.py", + "--input", + str(input_path), + "--mode", + "lenient", + ], + check=True, + capture_output=True, + ) + + stdout = completed.stdout.decode("utf-8") + + assert json.loads(stdout) == { + "Strings": { + "quote": "She said “hello” and left.", + }, + } diff --git a/tests/test_values.py b/tests/test_values.py index b4dec96..7caaeb7 100644 --- a/tests/test_values.py +++ b/tests/test_values.py @@ -1,6 +1,9 @@ # tests/test_values.py from __future__ import annotations +import pytest + +from yini_parser.api.errors import YiniParseError from yini_parser.api.load import loads @@ -90,6 +93,38 @@ def test_parses_basic_strings() -> None: } +def test_parses_classic_octal_escape() -> None: + text = r""" +^ App +letter = C"\o141" +""".lstrip() + + result = loads(text) + + assert result == { + "App": { + "letter": "a", + }, + } + + +def test_rejects_invalid_classic_octal_escape() -> None: + text = r""" +^ App +bad = C"\o378" +""".lstrip() + + with pytest.raises(YiniParseError): + loads(text) + + +def test_rejects_literal_control_character_in_string() -> None: + text = '^ App\nbad = "alpha\tbeta"\n' + + with pytest.raises(YiniParseError): + loads(text) + + def test_parses_lists() -> None: text = """ ^ App diff --git a/tools/yini_parser_adapter.py b/tools/yini_parser_adapter.py index 947af95..dae7ffd 100644 --- a/tools/yini_parser_adapter.py +++ b/tools/yini_parser_adapter.py @@ -7,6 +7,20 @@ from typing import NoReturn +def _configure_stdio() -> None: + """ + The yini-test-suite reads adapter output as UTF-8 JSON. On some Windows + setups, Python may encode piped stdout/stderr with the active locale unless + we make the encoding explicit. 
+ """ + + if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8") + + if hasattr(sys.stderr, "reconfigure"): + sys.stderr.reconfigure(encoding="utf-8") + + def _ensure_local_src_on_path() -> None: """ Allows this adapter to run directly from the repository root without @@ -48,6 +62,7 @@ def _parse_args() -> argparse.Namespace: def main() -> int: + _configure_stdio() _ensure_local_src_on_path() from yini_parser.api.errors import YiniParseError