From a2dd01386c51e63f3822b540235269bdfa5c23f5 Mon Sep 17 00:00:00 2001
From: "Anna @CyberTailor" <cyber@sysrq.in>
Date: Sun, 22 Mar 2026 12:14:37 +0500
Subject: [PATCH 1/3] fix TestRepoDirCheck::test_null_bytes

Replace the test byte string, which is decodable as 'utf_16_be', with one
that is not decodable.

Signed-off-by: Anna (cybertailor) Vyalkova <cyber+gentoo@sysrq.in>
---
 tests/checks/test_repo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/checks/test_repo.py b/tests/checks/test_repo.py
index 04f2e8d1e..7a9b39a18 100644
--- a/tests/checks/test_repo.py
+++ b/tests/checks/test_repo.py
@@ -64,7 +64,7 @@ def test_ignored_root_dirs(self):
     def test_null_bytes(self):
         check = self.mk_check()
         with open(pjoin(self.repo.location, "foo"), "wb") as f:
-            f.write(b"foo\x00\xffbar")
+            f.write(b"foo\x00\xff\xffbar")
         r = self.assertReport(check, [])
         assert isinstance(r, repo.BinaryFile)
         assert r.path == "foo"

From e2ea1651ee5c08d8a938709a50b125ab915833e4 Mon Sep 17 00:00:00 2001
From: "Anna @CyberTailor" <cyber@sysrq.in>
Date: Sun, 22 Mar 2026 12:22:54 +0500
Subject: [PATCH 2/3] fix TestRepoDirCheck binary file tests

Replace the test byte string, which is decodable as 'Big5', with one
that is not decodable.

Signed-off-by: Anna (cybertailor) Vyalkova <cyber+gentoo@sysrq.in>
---
 tests/checks/test_repo.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/checks/test_repo.py b/tests/checks/test_repo.py
index 7a9b39a18..21e199e8a 100644
--- a/tests/checks/test_repo.py
+++ b/tests/checks/test_repo.py
@@ -58,7 +58,7 @@ def test_ignored_root_dirs(self):
             bin_path = pjoin(self.repo.location, d, "foo")
             os.makedirs(os.path.dirname(bin_path))
             with open(bin_path, "wb") as f:
-                f.write(b"\xd3\xad\xbe\xef")
+                f.write(b"\xd3\x06\xf8\xef")
             self.assertNoReport(check, [])
 
     def test_null_bytes(self):
@@ -74,7 +74,7 @@ def test_root_dir_binary(self):
         check = self.mk_check()
         bin_path = pjoin(self.repo.location, "foo")
         with open(bin_path, "wb") as f:
-            f.write(b"\xd3\xad\xbe\xef")
+            f.write(b"\xd3\x06\xf8\xef")
         r = self.assertReport(check, [])
         assert isinstance(r, repo.BinaryFile)
         assert r.path == "foo"
@@ -84,7 +84,7 @@ def test_ebuild_filesdir_binary(self):
         check = self.mk_check()
         filesdir = self.mk_pkg("dev-util/foo")
         with open(pjoin(filesdir, "foo"), "wb") as f:
-            f.write(b"\xd3\xad\xbe\xef")
+            f.write(b"\xd3\x06\xf8\xef")
         r = self.assertReport(check, [])
         assert isinstance(r, repo.BinaryFile)
         assert r.path == "dev-util/foo/files/foo"
@@ -96,7 +96,7 @@ def test_gitignore(self):
         distfiles = pjoin(self.repo.location, "distfiles")
         os.mkdir(distfiles)
         with open(pjoin(distfiles, "foo-0.tar.gz"), "wb") as f:
-            f.write(b"\xd3\xad\xbe\xef")
+            f.write(b"\xd3\x06\xf8\xef")
         r = self.assertReport(check, [])
         assert isinstance(r, repo.BinaryFile)
         assert "distfiles/foo-0.tar.gz" in str(r)

From 8d0faa7020588bb44f28997a58f7665d47821e14 Mon Sep 17 00:00:00 2001
From: "Anna @CyberTailor" <cyber@sysrq.in>
Date: Sun, 22 Mar 2026 12:38:37 +0500
Subject: [PATCH 3/3] utils: replace chardet with charset-normalizer

Also replace our custom binary-detection heuristics in is_binary() with
charset_normalizer.is_binary().

Signed-off-by: Anna (cybertailor) Vyalkova <cyber+gentoo@sysrq.in>
---
 doc/conf.py           |  2 +-
 pyproject.toml        |  4 ++--
 src/pkgcheck/utils.py | 48 +++----------------------------------------
 3 files changed, 6 insertions(+), 48 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index b89bd7bdf..03b85723b 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -327,7 +327,7 @@
 
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3", None),
-    "chardet": ("https://chardet.readthedocs.io/en/latest", None),
+    "charset-normalizer": ("https://charset-normalizer.readthedocs.io/en/stable/", None),
     "lazy-object-proxy": ("https://python-lazy-object-proxy.readthedocs.io/en/latest", None),
     "setuptools": ("https://setuptools.pypa.io/en/latest", None),
 }
diff --git a/pyproject.toml b/pyproject.toml
index 2273075e1..c2ef4754f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 requires = [
 	"flit_core >=3.8,<4",
 	# repeat all normal runtime dependencies here
-	"chardet",
+	"charset-normalizer",
 	"lxml",
 	"pathspec",
 	"tree-sitter~=0.25.0",
@@ -40,7 +40,7 @@ classifiers = [
 dynamic = ["version"]
 
 dependencies = [
-	"chardet",
+	"charset-normalizer",
 	"lxml",
 	"pathspec",
 	"tree-sitter~=0.25.0",
diff --git a/src/pkgcheck/utils.py b/src/pkgcheck/utils.py
index da7165681..555f4ba36 100644
--- a/src/pkgcheck/utils.py
+++ b/src/pkgcheck/utils.py
@@ -38,14 +38,6 @@
 def is_binary(path, blocksize=1024):
     """Check if a given file is binary or not.
 
-    Uses a simplified version of the Perl detection algorithm, based roughly on
-    Eli Bendersky's translation to Python:
-    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
-
-    This is biased slightly more in favour of deeming files as text files than
-    the Perl algorithm, since all ASCII compatible character sets are accepted as
-    text, not just utf-8.
-
     :param path: Path to a file to check.
     :param blocksize: Amount of bytes to read for determination.
     :returns: True if appears to be a binary, otherwise False.
@@ -60,46 +52,12 @@ def is_binary(path, blocksize=1024):
     if not byte_str:
         return False
 
-    # Now check for a high percentage of ASCII control characters
-    # Binary if control chars are > 30% of the string
-    low_chars = byte_str.translate(None, _printable_ascii)
-    nontext_ratio1 = len(low_chars) / len(byte_str)
-
-    # and check for a low percentage of high ASCII characters:
-    # Binary if high ASCII chars are < 5% of the string
-    # From: https://en.wikipedia.org/wiki/UTF-8
-    # If the bytes are random, the chances of a byte with the high bit set
-    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
-    # of these without finding an invalid sequence is actually lower than the
-    # chance of the first three bytes randomly being the UTF-8 BOM.
-    high_chars = byte_str.translate(None, _printable_high_ascii)
-    nontext_ratio2 = len(high_chars) / len(byte_str)
-
-    is_likely_binary = (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or (
-        nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8
-    )
-
-    decodable = False
     try:
         byte_str.decode()
-        decodable = True
+        return False
     except UnicodeDecodeError:
         # Delay import to hide during wheel/sdist builds that iterate over and
         # import most modules to generate check/keyword/reporter lists.
-        import chardet
+        import charset_normalizer
 
-        # guess character encoding using chardet
-        detected_encoding = chardet.detect(byte_str)
-        if detected_encoding["confidence"] > 0.8:
-            try:
-                byte_str.decode(encoding=detected_encoding["encoding"])
-                decodable = True
-            except (UnicodeDecodeError, LookupError):
-                pass
-
-    # finally use all the checks to decide binary or text
-    if decodable:
-        return False
-    if is_likely_binary or b"\x00" in byte_str:
-        return True
-    return False
+        return charset_normalizer.is_binary(byte_str)