From a2dd01386c51e63f3822b540235269bdfa5c23f5 Mon Sep 17 00:00:00 2001 From: "Anna @CyberTailor" Date: Sun, 22 Mar 2026 12:14:37 +0500 Subject: [PATCH 1/3] fix TestRepoDirCheck::test_null_bytes Replace a 'utf_16_be'-decodable byte string with another, which is non-decodable. Signed-off-by: Anna (cybertailor) Vyalkova --- tests/checks/test_repo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/checks/test_repo.py b/tests/checks/test_repo.py index 04f2e8d1e..7a9b39a18 100644 --- a/tests/checks/test_repo.py +++ b/tests/checks/test_repo.py @@ -64,7 +64,7 @@ def test_ignored_root_dirs(self): def test_null_bytes(self): check = self.mk_check() with open(pjoin(self.repo.location, "foo"), "wb") as f: - f.write(b"foo\x00\xffbar") + f.write(b"foo\x00\xff\xffbar") r = self.assertReport(check, []) assert isinstance(r, repo.BinaryFile) assert r.path == "foo" From e2ea1651ee5c08d8a938709a50b125ab915833e4 Mon Sep 17 00:00:00 2001 From: "Anna @CyberTailor" Date: Sun, 22 Mar 2026 12:22:54 +0500 Subject: [PATCH 2/3] fix TestRepoDirCheck Replace a 'Big5'-decodable byte string with another, which is non-decodable. 
Signed-off-by: Anna (cybertailor) Vyalkova --- tests/checks/test_repo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/checks/test_repo.py b/tests/checks/test_repo.py index 7a9b39a18..21e199e8a 100644 --- a/tests/checks/test_repo.py +++ b/tests/checks/test_repo.py @@ -58,7 +58,7 @@ def test_ignored_root_dirs(self): bin_path = pjoin(self.repo.location, d, "foo") os.makedirs(os.path.dirname(bin_path)) with open(bin_path, "wb") as f: - f.write(b"\xd3\xad\xbe\xef") + f.write(b"\xd3\x06\xf8\xef") self.assertNoReport(check, []) def test_null_bytes(self): @@ -74,7 +74,7 @@ def test_root_dir_binary(self): check = self.mk_check() bin_path = pjoin(self.repo.location, "foo") with open(bin_path, "wb") as f: - f.write(b"\xd3\xad\xbe\xef") + f.write(b"\xd3\x06\xf8\xef") r = self.assertReport(check, []) assert isinstance(r, repo.BinaryFile) assert r.path == "foo" @@ -84,7 +84,7 @@ def test_ebuild_filesdir_binary(self): check = self.mk_check() filesdir = self.mk_pkg("dev-util/foo") with open(pjoin(filesdir, "foo"), "wb") as f: - f.write(b"\xd3\xad\xbe\xef") + f.write(b"\xd3\x06\xf8\xef") r = self.assertReport(check, []) assert isinstance(r, repo.BinaryFile) assert r.path == "dev-util/foo/files/foo" @@ -96,7 +96,7 @@ def test_gitignore(self): distfiles = pjoin(self.repo.location, "distfiles") os.mkdir(distfiles) with open(pjoin(distfiles, "foo-0.tar.gz"), "wb") as f: - f.write(b"\xd3\xad\xbe\xef") + f.write(b"\xd3\x06\xf8\xef") r = self.assertReport(check, []) assert isinstance(r, repo.BinaryFile) assert "distfiles/foo-0.tar.gz" in str(r) From 8d0faa7020588bb44f28997a58f7665d47821e14 Mon Sep 17 00:00:00 2001 From: "Anna @CyberTailor" Date: Sun, 22 Mar 2026 12:38:37 +0500 Subject: [PATCH 3/3] utils: replace chardet with charset-normalizer Also replaces our custom heuristics with charset_normalizer.is_binary(). 
Signed-off-by: Anna (cybertailor) Vyalkova --- doc/conf.py | 2 +- pyproject.toml | 4 ++-- src/pkgcheck/utils.py | 48 +++---------------------------------------- 3 files changed, 6 insertions(+), 48 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index b89bd7bdf..03b85723b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -327,7 +327,7 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), - "chardet": ("https://chardet.readthedocs.io/en/latest", None), + "charset-normalizer": ("https://charset-normalizer.readthedocs.io/en/stable/", None), "lazy-object-proxy": ("https://python-lazy-object-proxy.readthedocs.io/en/latest", None), "setuptools": ("https://setuptools.pypa.io/en/latest", None), } diff --git a/pyproject.toml b/pyproject.toml index 2273075e1..c2ef4754f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ requires = [ "flit_core >=3.8,<4", # repeat all normal runtime dependencies here - "chardet", + "charset-normalizer", "lxml", "pathspec", "tree-sitter~=0.25.0", @@ -40,7 +40,7 @@ classifiers = [ dynamic = ["version"] dependencies = [ - "chardet", + "charset-normalizer", "lxml", "pathspec", "tree-sitter~=0.25.0", diff --git a/src/pkgcheck/utils.py b/src/pkgcheck/utils.py index da7165681..555f4ba36 100644 --- a/src/pkgcheck/utils.py +++ b/src/pkgcheck/utils.py @@ -38,14 +38,6 @@ def is_binary(path, blocksize=1024): """Check if a given file is binary or not. - Uses a simplified version of the Perl detection algorithm, based roughly on - Eli Bendersky's translation to Python: - http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/ - - This is biased slightly more in favour of deeming files as text files than - the Perl algorithm, since all ASCII compatible character sets are accepted as - text, not just utf-8. - :param path: Path to a file to check. :param blocksize: Amount of bytes to read for determination. :returns: True if appears to be a binary, otherwise False. 
@@ -60,46 +52,12 @@ def is_binary(path, blocksize=1024): if not byte_str: return False - # Now check for a high percentage of ASCII control characters - # Binary if control chars are > 30% of the string - low_chars = byte_str.translate(None, _printable_ascii) - nontext_ratio1 = len(low_chars) / len(byte_str) - - # and check for a low percentage of high ASCII characters: - # Binary if high ASCII chars are < 5% of the string - # From: https://en.wikipedia.org/wiki/UTF-8 - # If the bytes are random, the chances of a byte with the high bit set - # starting a valid UTF-8 character is only 6.64%. The chances of finding 7 - # of these without finding an invalid sequence is actually lower than the - # chance of the first three bytes randomly being the UTF-8 BOM. - high_chars = byte_str.translate(None, _printable_high_ascii) - nontext_ratio2 = len(high_chars) / len(byte_str) - - is_likely_binary = (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or ( - nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8 - ) - - decodable = False try: byte_str.decode() - decodable = True + return False except UnicodeDecodeError: # Delay import to hide during wheel/sdist builds that iterate over and # import most modules to generate check/keyword/reporter lists. - import chardet + import charset_normalizer - # guess character encoding using chardet - detected_encoding = chardet.detect(byte_str) - if detected_encoding["confidence"] > 0.8: - try: - byte_str.decode(encoding=detected_encoding["encoding"]) - decodable = True - except (UnicodeDecodeError, LookupError): - pass - - # finally use all the checks to decide binary or text - if decodable: - return False - if is_likely_binary or b"\x00" in byte_str: - return True - return False + return charset_normalizer.is_binary(byte_str)