From f7e791c24d5b5948561ecc205b89c3954bd3cb88 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Sat, 23 May 2026 15:41:33 +0500 Subject: [PATCH] fix(cli): pin UTF-8 encoding on init-options and .extensionignore I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``Path.read_text`` / ``Path.write_text`` default to the system locale codec, which is cp1252 / gb2312 / cp932 on Windows. Two user-facing file paths in spec-kit were calling them without an explicit ``encoding=`` argument: - ``src/specify_cli/__init__.py:400,412`` — ``save_init_options`` / ``load_init_options`` for ``.specify/init-options.json``. A peer machine with a different default locale (or a UTF-8 Unix CI runner reading a file written on a cp1252 Windows host) cannot decode the file, raising ``UnicodeDecodeError``. ``UnicodeDecodeError`` is a subclass of ``ValueError`` — not ``OSError`` / ``json.JSONDecodeError`` — so the existing fall-back ``except`` tuple in ``load_init_options`` also misses it and the error propagates raw to the CLI. - ``src/specify_cli/extensions.py:764`` — ``.extensionignore`` pattern reader. The very next line already normalises backslashes "so Windows-authored files work", proving the codebase expects Windows authors to write this file. Multibyte UTF-8 patterns (Chinese filenames, accented directory names) silently mojibake when the host locale is not UTF-8, so the patterns fail to match and unintended files are shipped with the extension. The sibling integration-catalog reader at ``src/specify_cli/integrations/catalog.py:150,156,193,202,374`` already pins ``encoding="utf-8"`` everywhere. PR #2280 fixed the symmetric PowerShell-template BOM bug. This change brings the two remaining drifted paths in line with that precedent. Regression tests: - ``tests/test_presets.py::TestInitOptions`` — parametrized non-ASCII round-trip (CJK, Latin-1, Greek, emoji) plus a corrupted-file case that asserts the existing "fall back to {}" contract still holds when a peer file contains bytes invalid as UTF-8. - ``tests/test_extensions.py::TestExtensionIgnore`` — Japanese (``ドキュメント/``) and Latin-1 (``café/``) ignore patterns correctly exclude their directories during install. --- src/specify_cli/__init__.py | 21 +++++++++++++--- src/specify_cli/extensions.py | 8 +++++- tests/test_extensions.py | 46 +++++++++++++++++++++++++++++++++-- tests/test_presets.py | 39 +++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 6 deletions(-) diff --git a/src/specify_cli/__init__.py b/src/specify_cli/__init__.py index c0bdbaabe3..559e6df5f8 100644 --- a/src/specify_cli/__init__.py +++ b/src/specify_cli/__init__.py @@ -397,7 +397,15 @@ def save_init_options(project_path: Path, options: dict[str, Any]) -> None: """ dest = project_path / INIT_OPTIONS_FILE dest.parent.mkdir(parents=True, exist_ok=True) - dest.write_text(json.dumps(options, indent=2, sort_keys=True)) + # Pin UTF-8 explicitly: ``Path.write_text`` defaults to the system + # locale codec, which is cp1252 / gb2312 / cp932 on Windows. A + # locale-encoded write succeeds locally but produces a file a peer + # machine (different locale) or Unix CI cannot decode. The sibling + # integration-catalog writer in ``integrations/catalog.py`` already + # pins ``encoding="utf-8"`` for the same reason. + dest.write_text( + json.dumps(options, indent=2, sort_keys=True), encoding="utf-8" + ) def load_init_options(project_path: Path) -> dict[str, Any]: @@ -409,8 +417,15 @@ def load_init_options(project_path: Path) -> dict[str, Any]: if not path.exists(): return {} try: - return json.loads(path.read_text()) - except (json.JSONDecodeError, OSError): + # Match the explicit UTF-8 used by ``save_init_options``; without + # it ``read_text`` falls back to the system codec on Windows and + # raises ``UnicodeDecodeError`` on any file a peer wrote with + # non-ASCII content. ``UnicodeDecodeError`` is a subclass of + # ``ValueError``, not ``OSError`` / ``json.JSONDecodeError``, so + # it must be listed explicitly here to preserve the existing + # "fall back to empty dict" contract. + return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, UnicodeDecodeError): return {} diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 871503f0ae..ba8020279b 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -761,7 +761,13 @@ def _load_extensionignore(source_dir: Path) -> Optional[Callable[[str, List[str] if not ignore_file.exists(): return None - lines: List[str] = ignore_file.read_text().splitlines() + # Pin UTF-8 explicitly: ``Path.read_text`` defaults to the system + # locale codec on Windows (cp1252 / gb2312 / cp932), which silently + # corrupts multibyte patterns when the file is shared across + # machines with different locales. The next line already + # normalises backslashes "so Windows-authored files work" — the + # codebase already expects Windows authors to write this file. + lines: List[str] = ignore_file.read_text(encoding="utf-8").splitlines() # Normalise backslashes in patterns so Windows-authored files work normalised: List[str] = [] diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 153388a541..9cdf7b3ea2 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -3147,9 +3147,13 @@ def _make_extension(self, temp_dir, valid_manifest_data, extra_files=None, ignor else: p.write_text(content) - # Write .extensionignore + # Write .extensionignore. Pinned to UTF-8 so non-ASCII patterns + # in tests (see ``test_extensionignore_utf8_patterns``) survive + # the round-trip on Windows runners with non-UTF-8 default locales. if ignore_content is not None: - (ext_dir / ".extensionignore").write_text(ignore_content) + (ext_dir / ".extensionignore").write_text( + ignore_content, encoding="utf-8" + ) return ext_dir @@ -3379,6 +3383,44 @@ def test_extensionignore_windows_backslash_patterns(self, temp_dir, valid_manife assert (dest / "docs" / "guide.md").exists() assert not (dest / "docs" / "internal" / "draft.md").exists() + def test_extensionignore_utf8_patterns(self, temp_dir, valid_manifest_data): + """Non-ASCII patterns in .extensionignore work on every locale. + + ``Path.read_text`` defaults to the system locale codec on Windows + (cp1252 / gb2312 / cp932). Without an explicit ``encoding="utf-8"``, + a pattern like ``ドキュメント/`` written by a UTF-8 host becomes + mojibake on a cp1252 host and silently fails to match — leaking + files the author intended to exclude. The existing + ``test_extensionignore_windows_backslash_patterns`` already shows + the codebase treats this as a Windows-author-friendly file; UTF-8 + is part of that same contract. + """ + ext_dir = self._make_extension( + temp_dir, + valid_manifest_data, + extra_files={ + "ドキュメント/private.md": "secret", + "ドキュメント/public.md": "public", + "docs/guide.md": "# Guide", + "café/résumé.txt": "draft", + }, + ignore_content="ドキュメント/\ncafé/\n", + ) + + proj_dir = temp_dir / "project" + proj_dir.mkdir() + (proj_dir / ".specify").mkdir() + + manager = ExtensionManager(proj_dir) + manager.install_from_directory(ext_dir, "0.1.0", register_commands=False) + + dest = proj_dir / ".specify" / "extensions" / "test-ext" + # Multibyte patterns excluded. + assert not (dest / "ドキュメント").exists() + assert not (dest / "café").exists() + # ASCII path with no matching pattern is unaffected. + assert (dest / "docs" / "guide.md").exists() + def test_extensionignore_star_does_not_cross_directories(self, temp_dir, valid_manifest_data): """'*' should NOT match across directory boundaries (gitignore semantics).""" ext_dir = self._make_extension( diff --git a/tests/test_presets.py b/tests/test_presets.py index f1c0e95f4e..d3d61b4c5a 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -2269,6 +2269,45 @@ def test_load_returns_empty_on_invalid_json(self, project_dir): assert load_init_options(project_dir) == {} + @pytest.mark.parametrize( + "value", + ["名前-プロジェクト", "café-résumé", "Ωmega-Δelta", "🚀-launch"], + ) + def test_save_load_round_trip_preserves_non_ascii(self, project_dir, value): + """Non-ASCII values round-trip via explicit UTF-8 encoding. + + ``Path.write_text`` / ``Path.read_text`` default to the system + locale codec on Windows (cp1252 / gb2312 / cp932). Without + ``encoding="utf-8"`` pinned on both ends, a project name like + ``café`` written on a UTF-8 host becomes garbled or unreadable on + a cp1252 host (and vice versa). Pin UTF-8 explicitly so init + options round-trip across machines and CI. + """ + from specify_cli import save_init_options, load_init_options + + save_init_options(project_dir, {"ai": "claude", "project_name": value}) + + loaded = load_init_options(project_dir) + assert loaded["project_name"] == value + + def test_load_returns_empty_on_locale_corrupted_file(self, project_dir): + """A file written in a non-UTF-8 codec falls back to {}, not crash. + + Simulates a file produced by an old client (or by a peer machine + with a different default locale) that contains bytes invalid as + UTF-8. ``load_init_options`` should fall back to ``{}`` per the + existing contract — never propagate a raw ``UnicodeDecodeError`` + to the CLI surface. + """ + from specify_cli import load_init_options + + opts_file = project_dir / ".specify" / "init-options.json" + opts_file.parent.mkdir(parents=True, exist_ok=True) + # 0xE9 is 'é' in cp1252 but an invalid lead byte in UTF-8. + opts_file.write_bytes(b'{"project_name": "caf\xe9"}') + + assert load_init_options(project_dir) == {} + class TestPresetSkills: """Tests for preset skill registration and unregistration.