diff --git a/src/specify_cli/__init__.py b/src/specify_cli/__init__.py index c0bdbaabe3..559e6df5f8 100644 --- a/src/specify_cli/__init__.py +++ b/src/specify_cli/__init__.py @@ -397,7 +397,15 @@ def save_init_options(project_path: Path, options: dict[str, Any]) -> None: """ dest = project_path / INIT_OPTIONS_FILE dest.parent.mkdir(parents=True, exist_ok=True) - dest.write_text(json.dumps(options, indent=2, sort_keys=True)) + # Pin UTF-8 explicitly: ``Path.write_text`` defaults to the system + # locale codec, which is cp1252 / gb2312 / cp932 on Windows. With + # the default ``ensure_ascii=True`` the payload is pure ASCII today; + # the pin keeps it portable if that ever changes. The sibling + # integration-catalog writer in ``integrations/catalog.py`` already + # pins ``encoding="utf-8"`` for the same reason. + dest.write_text( + json.dumps(options, indent=2, sort_keys=True), encoding="utf-8" + ) def load_init_options(project_path: Path) -> dict[str, Any]: @@ -409,8 +417,15 @@ def load_init_options(project_path: Path) -> dict[str, Any]: if not path.exists(): return {} try: - return json.loads(path.read_text()) - except (json.JSONDecodeError, OSError): + # Match the explicit UTF-8 used by ``save_init_options``; without + # it ``read_text`` falls back to the system codec on Windows and + # raises ``UnicodeDecodeError`` on any file a peer wrote with + # non-ASCII content. ``UnicodeDecodeError`` is a subclass of + # ``ValueError``, not ``OSError`` / ``json.JSONDecodeError``, so + # it must be listed explicitly here to preserve the existing + # "fall back to empty dict" contract. 
+ return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, UnicodeDecodeError): return {} diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 871503f0ae..ba8020279b 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -761,7 +761,13 @@ def _load_extensionignore(source_dir: Path) -> Optional[Callable[[str, List[str] if not ignore_file.exists(): return None - lines: List[str] = ignore_file.read_text().splitlines() + # Pin UTF-8 explicitly: ``Path.read_text`` defaults to the system + # locale codec on Windows (cp1252 / gb2312 / cp932), which silently + # corrupts multibyte patterns when the file is shared across + # machines with different locales. The next line already + # normalises backslashes "so Windows-authored files work" — the + # codebase already expects Windows authors to write this file. + lines: List[str] = ignore_file.read_text(encoding="utf-8").splitlines() # Normalise backslashes in patterns so Windows-authored files work normalised: List[str] = [] diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 153388a541..9cdf7b3ea2 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -3147,9 +3147,13 @@ def _make_extension(self, temp_dir, valid_manifest_data, extra_files=None, ignor else: p.write_text(content) - # Write .extensionignore + # Write .extensionignore. Pinned to UTF-8 so non-ASCII patterns + # in tests (see ``test_extensionignore_utf8_patterns``) survive + # the round-trip on Windows runners with non-UTF-8 default locales. 
if ignore_content is not None: - (ext_dir / ".extensionignore").write_text(ignore_content) + (ext_dir / ".extensionignore").write_text( + ignore_content, encoding="utf-8" + ) return ext_dir @@ -3379,6 +3383,44 @@ def test_extensionignore_windows_backslash_patterns(self, temp_dir, valid_manife assert (dest / "docs" / "guide.md").exists() assert not (dest / "docs" / "internal" / "draft.md").exists() + def test_extensionignore_utf8_patterns(self, temp_dir, valid_manifest_data): + """Non-ASCII patterns in .extensionignore work on every locale. + + ``Path.read_text`` defaults to the system locale codec on Windows + (cp1252 / gb2312 / cp932). Without an explicit ``encoding="utf-8"``, + a pattern like ``ドキュメント/`` written by a UTF-8 host becomes + mojibake on a cp1252 host and silently fails to match — leaking + files the author intended to exclude. The existing + ``test_extensionignore_windows_backslash_patterns`` already shows + the codebase treats this as a Windows-author-friendly file; UTF-8 + is part of that same contract. + """ + ext_dir = self._make_extension( + temp_dir, + valid_manifest_data, + extra_files={ + "ドキュメント/private.md": "secret", + "ドキュメント/public.md": "public", + "docs/guide.md": "# Guide", + "café/résumé.txt": "draft", + }, + ignore_content="ドキュメント/\ncafé/\n", + ) + + proj_dir = temp_dir / "project" + proj_dir.mkdir() + (proj_dir / ".specify").mkdir() + + manager = ExtensionManager(proj_dir) + manager.install_from_directory(ext_dir, "0.1.0", register_commands=False) + + dest = proj_dir / ".specify" / "extensions" / "test-ext" + # Multibyte patterns excluded. + assert not (dest / "ドキュメント").exists() + assert not (dest / "café").exists() + # ASCII path with no matching pattern is unaffected. 
+ assert (dest / "docs" / "guide.md").exists() + def test_extensionignore_star_does_not_cross_directories(self, temp_dir, valid_manifest_data): """'*' should NOT match across directory boundaries (gitignore semantics).""" ext_dir = self._make_extension( diff --git a/tests/test_presets.py b/tests/test_presets.py index f1c0e95f4e..d3d61b4c5a 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -2269,6 +2269,45 @@ def test_load_returns_empty_on_invalid_json(self, project_dir): assert load_init_options(project_dir) == {} + @pytest.mark.parametrize( + "value", + ["名前-プロジェクト", "café-résumé", "Ωmega-Δelta", "🚀-launch"], + ) + def test_save_load_round_trip_preserves_non_ascii(self, project_dir, value): + """Non-ASCII values round-trip via explicit UTF-8 encoding. + + ``Path.write_text`` / ``Path.read_text`` default to the system + locale codec on Windows (cp1252 / gb2312 / cp932). Without + ``encoding="utf-8"`` pinned on both ends, a project name like + ``café`` written on a UTF-8 host becomes garbled or unreadable on + a cp1252 host (and vice versa). Pin UTF-8 explicitly so init + options round-trip across machines and CI. + """ + from specify_cli import save_init_options, load_init_options + + save_init_options(project_dir, {"ai": "claude", "project_name": value}) + + loaded = load_init_options(project_dir) + assert loaded["project_name"] == value + + def test_load_returns_empty_on_locale_corrupted_file(self, project_dir): + """A file written in a non-UTF-8 codec falls back to {}, not crash. + + Simulates a file produced by an old client (or by a peer machine + with a different default locale) that contains bytes invalid as + UTF-8. ``load_init_options`` should fall back to ``{}`` per the + existing contract — never propagate a raw ``UnicodeDecodeError`` + to the CLI surface. 
+ """ + from specify_cli import load_init_options + + opts_file = project_dir / ".specify" / "init-options.json" + opts_file.parent.mkdir(parents=True, exist_ok=True) + # 0xE9 is 'é' in cp1252; in UTF-8 it needs two continuation bytes. + opts_file.write_bytes(b'{"project_name": "caf\xe9"}') + + assert load_init_options(project_dir) == {} + class TestPresetSkills: """Tests for preset skill registration and unregistration.