From b5db685df9a9163c93b9d50ac21112137d2d3aef Mon Sep 17 00:00:00 2001
From: "Marko K. S." <marko.seppanen@gmail.com>
Date: Mon, 29 Jun 2026 22:55:45 +0200
Subject: [PATCH] Fix: Improve string literal parsing and validation

This change enhances the YINI parser's string handling by:
- Correctly decoding classic string escapes, including octal (`\o`) and hexadecimal (`\x`, `\u`, `\U`) sequences.
- Rejecting literal control characters in single-quoted and double-quoted strings, while still allowing line breaks and tabs in multiline triple-quoted strings, as per the YINI specification.
- Ensuring consistent UTF-8 output for the test-suite adapter on Windows.
---
 CHANGELOG.md                           |   5 +
 src/yini_parser/core/value_decoders.py | 204 ++++++++++++++++++++++++-
 tests/test_adapter.py                  |  40 +++++
 tests/test_values.py                   |  35 +++++
 tools/yini_parser_adapter.py           |  15 ++
 5 files changed, 294 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_adapter.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1e88e00..cd49834 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # CHANGELOG
 
+## 0.2.0 beta 2 - 2026 June + LATER
+- **Fixed:** The test-suite adapter now writes UTF-8 JSON consistently on Windows.
+- **Fixed:** Classic strings now validate YINI escape sequences directly, including invalid octal and Unicode escapes.
+- **Fixed:** Literal control characters are rejected in single-line strings while multiline triple-quoted strings still preserve valid formatting.
+
 ## 0.2.0 beta 1 - 2026 June
 - **Fixed:** Improved error reporting when a YINI file has broken syntax, including unfinished block comments such as `/* comment`.
 - **Fixed:** `#!` lines outside the first line are now safely ignored as comment-like lines.
diff --git a/src/yini_parser/core/value_decoders.py b/src/yini_parser/core/value_decoders.py
index 0fbb7e1..7fae74c 100644
--- a/src/yini_parser/core/value_decoders.py
+++ b/src/yini_parser/core/value_decoders.py
@@ -1,4 +1,6 @@
 # src/yini_parser/core/value_decoders.py
+from typing import NoReturn
+
 from ..api.errors import YiniParseError
 
 """
@@ -6,6 +8,19 @@
 - Decoders converts tokens into its runtime value.
 """
 
+_CLASSIC_SIMPLE_ESCAPES = {
+    "\\": "\\",
+    '"': '"',
+    "'": "'",
+    "a": "\a",
+    "b": "\b",
+    "f": "\f",
+    "n": "\n",
+    "r": "\r",
+    "t": "\t",
+    "v": "\v",
+}
+
 
 def decode_string_token(
     token_text: str,
@@ -46,10 +61,17 @@ def decode_string_token(
         if prefix in {"C", "c"}:
             return _decode_classic_string(
                 inner,
+                allow_line_breaks=True,
                 line=line,
                 column=column,
             )
 
+        _validate_string_content(
+            inner,
+            allow_line_breaks=True,
+            line=line,
+            column=column,
+        )
         return inner
 
     # Single-quoted or double-quoted string.
@@ -58,12 +80,19 @@ def decode_string_token(
 
         # Raw and unprefixed strings: return as-is.
         if prefix in {"", "R", "r"}:
+            _validate_string_content(
+                inner,
+                allow_line_breaks=False,
+                line=line,
+                column=column,
+            )
             return inner
 
         # Classic strings: decode escapes.
         if prefix in {"C", "c"}:
             return _decode_classic_string(
                 inner,
+                allow_line_breaks=False,
                 line=line,
                 column=column,
             )
@@ -141,17 +170,182 @@ def parse_number_literal(text, line=None, column=None):
 def _decode_classic_string(
     inner: str,
     *,
+    allow_line_breaks: bool,
     line: int | None = None,
     column: int | None = None,
 ) -> str:
-    try:
-        return bytes(inner, "utf-8").decode("unicode_escape")
-    except UnicodeDecodeError as exc:
+    result: list[str] = []
+    index = 0
+
+    while index < len(inner):
+        char = inner[index]
+
+        if char != "\\":
+            _validate_string_content(
+                char,
+                allow_line_breaks=allow_line_breaks,
+                line=line,
+                column=column,
+            )
+            result.append(char)
+            index += 1
+            continue
+
+        if index + 1 >= len(inner):
+            _raise_invalid_escape(
+                "Invalid string escape sequence: trailing backslash.",
+                line=line,
+                column=column,
+            )
+
+        escape = inner[index + 1]
+
+        if escape in _CLASSIC_SIMPLE_ESCAPES:
+            result.append(_CLASSIC_SIMPLE_ESCAPES[escape])
+            index += 2
+            continue
+
+        if escape == "o":
+            result.append(
+                _decode_digits_escape(
+                    inner,
+                    start=index + 2,
+                    length=3,
+                    base=8,
+                    prefix="\\o",
+                    line=line,
+                    column=column,
+                )
+            )
+            index += 5
+            continue
+
+        if escape == "x":
+            result.append(
+                _decode_digits_escape(
+                    inner,
+                    start=index + 2,
+                    length=2,
+                    base=16,
+                    prefix="\\x",
+                    line=line,
+                    column=column,
+                )
+            )
+            index += 4
+            continue
+
+        if escape == "u":
+            result.append(
+                _decode_digits_escape(
+                    inner,
+                    start=index + 2,
+                    length=4,
+                    base=16,
+                    prefix="\\u",
+                    line=line,
+                    column=column,
+                )
+            )
+            index += 6
+            continue
+
+        if escape == "U":
+            result.append(
+                _decode_digits_escape(
+                    inner,
+                    start=index + 2,
+                    length=8,
+                    base=16,
+                    prefix="\\U",
+                    line=line,
+                    column=column,
+                )
+            )
+            index += 10
+            continue
+
+        _raise_invalid_escape(
+            f"Invalid string escape sequence: \\{escape}.",
+            line=line,
+            column=column,
+        )
+
+    return "".join(result)
+
+
+def _validate_string_content(
+    text: str,
+    *,
+    allow_line_breaks: bool,
+    line: int | None = None,
+    column: int | None = None,
+) -> None:
+    for char in text:
+        if ord(char) >= 0x20:
+            continue
+
+        if allow_line_breaks and char in {"\n", "\r", "\t"}:
+            continue
+
         raise YiniParseError(
-            f"Invalid string escape sequence: {exc.reason}.",
+            "Invalid string literal: literal control characters are not allowed.",
             line=line,
             column=column,
-        ) from None
+        )
+
+
+def _decode_digits_escape(
+    text: str,
+    *,
+    start: int,
+    length: int,
+    base: int,
+    prefix: str,
+    line: int | None = None,
+    column: int | None = None,
+) -> str:
+    digits = text[start : start + length]
+
+    if len(digits) != length or not _digits_match_base(digits, base):
+        _raise_invalid_escape(
+            f"Invalid string escape sequence: {prefix} requires {length} "
+            f"base-{base} digit(s).",
+            line=line,
+            column=column,
+        )
+
+    try:
+        return chr(int(digits, base))
+    except ValueError:
+        _raise_invalid_escape(
+            f"Invalid string escape sequence: {prefix}{digits}.",
+            line=line,
+            column=column,
+        )
+
+
+def _digits_match_base(digits: str, base: int) -> bool:
+    if base == 8:
+        return all("0" <= digit <= "7" for digit in digits)
+
+    if base == 16:
+        return all(digit in "0123456789abcdefABCDEF" for digit in digits)
+
+    raise ValueError(f"Unsupported escape base: {base}")
+
+
+def _raise_invalid_escape(
+    message: str,
+    *,
+    line: int | None = None,
+    column: int | None = None,
+) -> NoReturn:
+    raise YiniParseError(
+        message,
+        line=line,
+        column=column,
+    )
 
 
 def _parse_duodecimal(
diff --git a/tests/test_adapter.py b/tests/test_adapter.py
new file mode 100644
index 0000000..b7fcece
--- /dev/null
+++ b/tests/test_adapter.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def test_adapter_writes_utf8_json_for_unicode_strings(tmp_path: Path) -> None:
+    input_path = tmp_path / "unicode.yini"
+    input_path.write_text(
+        """
+@yini
+
+^ Strings
+quote = "She said “hello” and left."
+""".lstrip(),
+        encoding="utf-8",
+    )
+
+    completed = subprocess.run(
+        [
+            sys.executable,
+            "tools/yini_parser_adapter.py",
+            "--input",
+            str(input_path),
+            "--mode",
+            "lenient",
+        ],
+        check=True,
+        capture_output=True,
+    )
+
+    stdout = completed.stdout.decode("utf-8")
+
+    assert json.loads(stdout) == {
+        "Strings": {
+            "quote": "She said “hello” and left.",
+        },
+    }
diff --git a/tests/test_values.py b/tests/test_values.py
index b4dec96..7caaeb7 100644
--- a/tests/test_values.py
+++ b/tests/test_values.py
@@ -1,6 +1,9 @@
 # tests/test_values.py
 from __future__ import annotations
 
+import pytest
+
+from yini_parser.api.errors import YiniParseError
 from yini_parser.api.load import loads
 
 
@@ -90,6 +93,38 @@ def test_parses_basic_strings() -> None:
     }
 
 
+def test_parses_classic_octal_escape() -> None:
+    text = r"""
+^ App
+letter = C"\o141"
+""".lstrip()
+
+    result = loads(text)
+
+    assert result == {
+        "App": {
+            "letter": "a",
+        },
+    }
+
+
+def test_rejects_invalid_classic_octal_escape() -> None:
+    text = r"""
+^ App
+bad = C"\o378"
+""".lstrip()
+
+    with pytest.raises(YiniParseError):
+        loads(text)
+
+
+def test_rejects_literal_control_character_in_string() -> None:
+    text = '^ App\nbad = "alpha\tbeta"\n'
+
+    with pytest.raises(YiniParseError):
+        loads(text)
+
+
 def test_parses_lists() -> None:
     text = """
 ^ App
diff --git a/tools/yini_parser_adapter.py b/tools/yini_parser_adapter.py
index 947af95..dae7ffd 100644
--- a/tools/yini_parser_adapter.py
+++ b/tools/yini_parser_adapter.py
@@ -7,6 +7,20 @@
 from typing import NoReturn
 
 
+def _configure_stdio() -> None:
+    """
+    The yini-test-suite reads adapter output as UTF-8 JSON. On some Windows
+    setups, Python may encode piped stdout/stderr with the active locale unless
+    we make the encoding explicit.
+    """
+
+    if hasattr(sys.stdout, "reconfigure"):
+        sys.stdout.reconfigure(encoding="utf-8")
+
+    if hasattr(sys.stderr, "reconfigure"):
+        sys.stderr.reconfigure(encoding="utf-8")
+
+
 def _ensure_local_src_on_path() -> None:
     """
     Allows this adapter to run directly from the repository root without
@@ -48,6 +62,7 @@ def _parse_args() -> argparse.Namespace:
 
 
 def main() -> int:
+    _configure_stdio()
     _ensure_local_src_on_path()
 
     from yini_parser.api.errors import YiniParseError