From c94eede797b56108216859f7e0a698b16c0ab7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Laurenz=20Altenm=C3=BCller?= Date: Wed, 14 Jan 2026 21:14:30 +0100 Subject: [PATCH 1/4] Quote all files if original RECORD had all files quoted --- python/private/pypi/repack_whl.py | 8 +++-- tools/wheelmaker.py | 58 ++++++++++++++++--------------- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/python/private/pypi/repack_whl.py b/python/private/pypi/repack_whl.py index 519631f272..59b5a2b8fa 100644 --- a/python/private/pypi/repack_whl.py +++ b/python/private/pypi/repack_whl.py @@ -151,17 +151,21 @@ def main(sys_argv): logging.debug(f"Found dist-info dir: {distinfo_dir}") record_path = distinfo_dir / "RECORD" record_contents = record_path.read_text() if record_path.exists() else "" + quote_files = all(line.startswith('"') for line in record_contents.splitlines()) distribution_prefix = distinfo_dir.with_suffix("").name with _WhlFile( - args.output, mode="w", distribution_prefix=distribution_prefix + args.output, + mode="w", + distribution_prefix=distribution_prefix, + quote_all_filenames=quote_files, ) as out: for p in _files_to_pack(patched_wheel_dir, record_contents): rel_path = p.relative_to(patched_wheel_dir) out.add_file(str(rel_path), p) logging.debug(f"Writing RECORD file") - got_record = out.add_recordfile().decode("utf-8", "surrogateescape") + got_record = out.add_recordfile() if got_record == record_contents: logging.info(f"Created a whl file: {args.output}") diff --git a/tools/wheelmaker.py b/tools/wheelmaker.py index de6b8f48af..546c9893b6 100644 --- a/tools/wheelmaker.py +++ b/tools/wheelmaker.py @@ -132,13 +132,17 @@ def __init__( distribution_prefix: str, strip_path_prefixes=None, compression=zipfile.ZIP_DEFLATED, + quote_all_filenames: bool = False, **kwargs, ): self._distribution_prefix = distribution_prefix self._strip_path_prefixes = strip_path_prefixes or [] - # Entries for the RECORD file as (filename, hash, size) tuples. - self._record = [] + # Entries for the RECORD file as (filename, digest, size) tuples. + self._record: list[tuple[str, str, str]] = [] + # Whether to quote filenames in the RECORD file (for compatibility with + # some wheels like torch that have quoted filenames in their RECORD). + self.quote_all_filenames = quote_all_filenames super().__init__(filename, mode=mode, compression=compression, **kwargs) @@ -192,16 +196,15 @@ def add_string(self, filename, contents): hash.update(contents) self._add_to_record(filename, self._serialize_digest(hash), len(contents)) - def _serialize_digest(self, hash): + def _serialize_digest(self, hash) -> str: # https://www.python.org/dev/peps/pep-0376/#record # "base64.urlsafe_b64encode(digest) with trailing = removed" digest = base64.urlsafe_b64encode(hash.digest()) digest = b"sha256=" + digest.rstrip(b"=") - return digest + return digest.decode("utf-8", "surrogateescape") - def _add_to_record(self, filename, hash, size): - size = str(size).encode("ascii") - self._record.append((filename, hash, size)) + def _add_to_record(self, filename: str, hash: str, size: int) -> None: + self._record.append((filename, hash, str(size))) def _zipinfo(self, filename): """Construct deterministic ZipInfo entry for a file named filename""" @@ -223,29 +226,28 @@ def _zipinfo(self, filename): zinfo.compress_type = self.compression return zinfo - def add_recordfile(self): + def _quote_filename(self, filename: str) -> str: + """Return a possibly quoted filename for RECORD file.""" + # Use csv writer to auto-quote the filename (may contain ",") + with io.StringIO() as buf: + csv.writer(buf).writerow([filename.lstrip("/")]) + filename = buf.getvalue().strip() + # Some RECORDs like torch have *all* filenames quoted and we must minimize diff + if self.quote_all_filenames and not filename.startswith('"'): + filename = f'"{filename}"' + return filename + + def add_recordfile(self) -> str: """Write RECORD file to the distribution.""" record_path = self.distinfo_path("RECORD") - entries = self._record + [(record_path, b"", b"")] - with io.StringIO() as contents_io: - writer = csv.writer(contents_io, lineterminator="\n") - for filename, digest, size in entries: - if isinstance(filename, str): - filename = filename.lstrip("/") - writer.writerow( - ( - ( - c - if isinstance(c, str) - else c.decode("utf-8", "surrogateescape") - ) - for c in (filename, digest, size) - ) - ) - - contents = contents_io.getvalue() - self.add_string(record_path, contents) - return contents.encode("utf-8", "surrogateescape") + entries = self._record + [(record_path, "", "")] + entries = [ + (self._quote_filename(fname), digest, size) + for fname, digest, size in entries + ] + contents = "\n".join(",".join(entry) for entry in entries) + "\n" + self.add_string(record_path, contents) + return contents class WheelMaker(object): From de1987b5b22ff764f11f1c5690b8f79af076c302 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Wed, 14 Jan 2026 21:31:41 +0100 Subject: [PATCH 2/4] Update tools/wheelmaker.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tools/wheelmaker.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tools/wheelmaker.py b/tools/wheelmaker.py index 546c9893b6..4390df3445 100644 --- a/tools/wheelmaker.py +++ b/tools/wheelmaker.py @@ -228,14 +228,13 @@ def _zipinfo(self, filename): def _quote_filename(self, filename: str) -> str: """Return a possibly quoted filename for RECORD file.""" - # Use csv writer to auto-quote the filename (may contain ",") + filename = filename.lstrip("/") + # Some RECORDs like torch have *all* filenames quoted and we must minimize diff. + # Otherwise, we quote only when necessary (e.g. for filenames with commas). + quoting = csv.QUOTE_ALL if self.quote_all_filenames else csv.QUOTE_MINIMAL with io.StringIO() as buf: - csv.writer(buf).writerow([filename.lstrip("/")]) - filename = buf.getvalue().strip() - # Some RECORDs like torch have *all* filenames quoted and we must minimize diff - if self.quote_all_filenames and not filename.startswith('"'): - filename = f'"{filename}"' - return filename + csv.writer(buf, quoting=quoting).writerow([filename]) + return buf.getvalue().strip() def add_recordfile(self) -> str: """Write RECORD file to the distribution.""" From 7628c22d6903fb57a937d4586f084485349e043c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Laurenz=20Altenm=C3=BCller?= Date: Mon, 19 Jan 2026 15:28:58 +0100 Subject: [PATCH 3/4] add tests --- python/private/pypi/BUILD.bazel | 7 +++++ python/private/pypi/repack_whl.py | 12 +++++++- tests/pypi/repack_whl/BUILD.bazel | 8 +++++ tests/pypi/repack_whl/repack_whl_test.py | 37 ++++++++++++++++++++++++ tests/tools/wheelmaker_test.py | 37 ++++++++++++++++++++++++ 5 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 tests/pypi/repack_whl/BUILD.bazel create mode 100644 tests/pypi/repack_whl/repack_whl_test.py diff --git a/python/private/pypi/BUILD.bazel b/python/private/pypi/BUILD.bazel index b46fd58d3c..4fea3684de 100644 --- a/python/private/pypi/BUILD.bazel +++ b/python/private/pypi/BUILD.bazel @@ -13,6 +13,7 @@ # limitations under the License. load("@bazel_skylib//:bzl_library.bzl", "bzl_library") +load("//python:py_library.bzl", "py_library") package(default_visibility = ["//:__subpackages__"]) @@ -377,6 +378,12 @@ bzl_library( ], ) +py_library( + name = "repack_whl", + srcs = ["repack_whl.py"], + deps = ["//tools:wheelmaker"], +) + bzl_library( name = "requirements_files_by_platform_bzl", srcs = ["requirements_files_by_platform.bzl"], diff --git a/python/private/pypi/repack_whl.py b/python/private/pypi/repack_whl.py index 59b5a2b8fa..92d052a81f 100644 --- a/python/private/pypi/repack_whl.py +++ b/python/private/pypi/repack_whl.py @@ -44,6 +44,16 @@ _DISTINFO = "dist-info" +def _has_all_quoted_filenames(record_contents: str) -> bool: + """Check if all filenames in the RECORD are quoted. + + Some wheels (like torch) have all filenames quoted in their RECORD file. + We detect this to preserve the quoting style when repacking. + """ + lines = record_contents.splitlines() + return all(line.startswith('"') for line in lines) + + def _unidiff_output(expected, actual, record): """ Helper function. Returns a string containing the unified diff of two @@ -151,7 +161,7 @@ def main(sys_argv): logging.debug(f"Found dist-info dir: {distinfo_dir}") record_path = distinfo_dir / "RECORD" record_contents = record_path.read_text() if record_path.exists() else "" - quote_files = all(line.startswith('"') for line in record_contents.splitlines()) + quote_files = _has_all_quoted_filenames(record_contents) distribution_prefix = distinfo_dir.with_suffix("").name with _WhlFile( diff --git a/tests/pypi/repack_whl/BUILD.bazel b/tests/pypi/repack_whl/BUILD.bazel new file mode 100644 index 0000000000..3f611a2e4f --- /dev/null +++ b/tests/pypi/repack_whl/BUILD.bazel @@ -0,0 +1,8 @@ +load("//python:py_test.bzl", "py_test") + +py_test( + name = "repack_whl_test", + size = "small", + srcs = ["repack_whl_test.py"], + deps = ["//python/private/pypi:repack_whl"], +) diff --git a/tests/pypi/repack_whl/repack_whl_test.py b/tests/pypi/repack_whl/repack_whl_test.py new file mode 100644 index 0000000000..50781cc0e6 --- /dev/null +++ b/tests/pypi/repack_whl/repack_whl_test.py @@ -0,0 +1,37 @@ +import unittest + +from python.private.pypi import repack_whl + + +class HasAllQuotedFilenamesTest(unittest.TestCase): + """Tests for _has_all_quoted_filenames detection logic.""" + + def test_all_quoted(self) -> None: + """Returns True when all lines start with quotes (torch-style).""" + record = """\ +"torch/__init__.py",sha256=abc,123 +"torch/utils.py",sha256=def,456 +"torch-2.0.0.dist-info/WHEEL",sha256=ghi,789 +""" + self.assertTrue(repack_whl._has_all_quoted_filenames(record)) + + def test_none_quoted(self) -> None: + """Returns False when no lines are quoted (standard style).""" + record = """\ +torch/__init__.py,sha256=abc,123 +torch/utils.py,sha256=def,456 +torch-2.0.0.dist-info/WHEEL,sha256=ghi,789 +""" + self.assertFalse(repack_whl._has_all_quoted_filenames(record)) + + def test_mixed_quoting(self) -> None: + """Returns False when only some lines are quoted.""" + record = """\ +"file,with,commas.py",sha256=abc,123 +normal_file.py,sha256=def,456 +""" + self.assertFalse(repack_whl._has_all_quoted_filenames(record)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/tools/wheelmaker_test.py b/tests/tools/wheelmaker_test.py index 0efe1c9fbc..288dde720a 100644 --- a/tests/tools/wheelmaker_test.py +++ b/tests/tools/wheelmaker_test.py @@ -1,8 +1,45 @@ +import io import unittest import tools.wheelmaker as wheelmaker +class QuoteAllFilenamesTest(unittest.TestCase): + """Tests for quote_all_filenames behavior in _WhlFile. + + Some wheels (like torch) have all filenames quoted in their RECORD file. + When repacking, we preserve this style to minimize diffs. + """ + + def _make_whl_file(self, quote_all: bool) -> wheelmaker._WhlFile: + """Create a _WhlFile instance for testing.""" + buf = io.BytesIO() + return wheelmaker._WhlFile( + buf, + mode="w", + distribution_prefix="test-1.0.0", + quote_all_filenames=quote_all, + ) + + def test_quote_all_quotes_simple_filenames(self) -> None: + """When quote_all_filenames=True, all filenames are quoted.""" + whl = self._make_whl_file(quote_all=True) + self.assertEqual(whl._quote_filename("foo/bar.py"), '"foo/bar.py"') + + def test_quote_all_false_leaves_simple_filenames_unquoted(self) -> None: + """When quote_all_filenames=False, simple filenames stay unquoted.""" + whl = self._make_whl_file(quote_all=False) + self.assertEqual(whl._quote_filename("foo/bar.py"), "foo/bar.py") + + def test_quote_all_quotes_filenames_with_commas(self) -> None: + """Filenames with commas are always quoted, regardless of quote_all_filenames.""" + whl = self._make_whl_file(quote_all=True) + self.assertEqual(whl._quote_filename("foo,bar/baz.py"), '"foo,bar/baz.py"') + + whl = self._make_whl_file(quote_all=False) + self.assertEqual(whl._quote_filename("foo,bar/baz.py"), '"foo,bar/baz.py"') + + class ArcNameFromTest(unittest.TestCase): def test_arcname_from(self) -> None: # (name, distribution_prefix, strip_path_prefixes, want) tuples From df9d217441ec01b1ca03b4aea606ff886169c264 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Tue, 20 Jan 2026 17:18:42 +0900 Subject: [PATCH 4/4] Add Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c9f812f3bd..70811ccf87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,8 @@ END_UNRELEASED_TEMPLATE * (binaries/tests) The `PYTHONBREAKPOINT` environment variable is automatically inherited * (binaries/tests) The {obj}`stamp` attribute now transitions the Bazel builtin {obj}`--stamp` flag. +* (pypi) Now the RECORD file patches will follow the quoted or unquoted filenames convention + in order to make `pytorch` and friends easier to patch. {#v0-0-0-fixed} ### Fixed