Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@

intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"chardet": ("https://chardet.readthedocs.io/en/latest", None),
"charset-normalizer": ("https://charset-normalizer.readthedocs.io/en/stable/", None),
"lazy-object-proxy": ("https://python-lazy-object-proxy.readthedocs.io/en/latest", None),
"setuptools": ("https://setuptools.pypa.io/en/latest", None),
}
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
requires = [
"flit_core >=3.8,<4",
# repeat all normal runtime dependencies here
"chardet",
"charset-normalizer",
"lxml",
"pathspec",
"tree-sitter~=0.25.0",
Expand Down Expand Up @@ -40,7 +40,7 @@ classifiers = [
dynamic = ["version"]

dependencies = [
"chardet",
"charset-normalizer",
"lxml",
"pathspec",
"tree-sitter~=0.25.0",
Expand Down
48 changes: 3 additions & 45 deletions src/pkgcheck/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,6 @@
def is_binary(path, blocksize=1024):
"""Check if a given file is binary or not.

Uses a simplified version of the Perl detection algorithm, based roughly on
Eli Bendersky's translation to Python:
http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/

This is biased slightly more in favour of deeming files as text files than
the Perl algorithm, since all ASCII compatible character sets are accepted as
text, not just utf-8.

:param path: Path to a file to check.
:param blocksize: Amount of bytes to read for determination.
:returns: True if appears to be a binary, otherwise False.
Expand All @@ -60,46 +52,12 @@ def is_binary(path, blocksize=1024):
if not byte_str:
return False

# Now check for a high percentage of ASCII control characters
# Binary if control chars are > 30% of the string
low_chars = byte_str.translate(None, _printable_ascii)
nontext_ratio1 = len(low_chars) / len(byte_str)

# and check for a low percentage of high ASCII characters:
# Binary if high ASCII chars are < 5% of the string
# From: https://en.wikipedia.org/wiki/UTF-8
# If the bytes are random, the chances of a byte with the high bit set
# starting a valid UTF-8 character is only 6.64%. The chances of finding 7
# of these without finding an invalid sequence is actually lower than the
# chance of the first three bytes randomly being the UTF-8 BOM.
high_chars = byte_str.translate(None, _printable_high_ascii)
nontext_ratio2 = len(high_chars) / len(byte_str)

is_likely_binary = (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or (
nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8
)

decodable = False
try:
byte_str.decode()
decodable = True
return False
except UnicodeDecodeError:
# Delay import to hide during wheel/sdist builds that iterate over and
# import most modules to generate check/keyword/reporter lists.
import chardet
import charset_normalizer

# guess character encoding using chardet
detected_encoding = chardet.detect(byte_str)
if detected_encoding["confidence"] > 0.8:
try:
byte_str.decode(encoding=detected_encoding["encoding"])
decodable = True
except (UnicodeDecodeError, LookupError):
pass

# finally use all the checks to decide binary or text
if decodable:
return False
if is_likely_binary or b"\x00" in byte_str:
return True
return False
return charset_normalizer.is_binary(byte_str)
10 changes: 5 additions & 5 deletions tests/checks/test_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ def test_ignored_root_dirs(self):
bin_path = pjoin(self.repo.location, d, "foo")
os.makedirs(os.path.dirname(bin_path))
with open(bin_path, "wb") as f:
f.write(b"\xd3\xad\xbe\xef")
f.write(b"\xd3\x06\xf8\xef")
self.assertNoReport(check, [])

def test_null_bytes(self):
check = self.mk_check()
with open(pjoin(self.repo.location, "foo"), "wb") as f:
f.write(b"foo\x00\xffbar")
f.write(b"foo\x00\xff\xffbar")
r = self.assertReport(check, [])
assert isinstance(r, repo.BinaryFile)
assert r.path == "foo"
Expand All @@ -74,7 +74,7 @@ def test_root_dir_binary(self):
check = self.mk_check()
bin_path = pjoin(self.repo.location, "foo")
with open(bin_path, "wb") as f:
f.write(b"\xd3\xad\xbe\xef")
f.write(b"\xd3\x06\xf8\xef")
r = self.assertReport(check, [])
assert isinstance(r, repo.BinaryFile)
assert r.path == "foo"
Expand All @@ -84,7 +84,7 @@ def test_ebuild_filesdir_binary(self):
check = self.mk_check()
filesdir = self.mk_pkg("dev-util/foo")
with open(pjoin(filesdir, "foo"), "wb") as f:
f.write(b"\xd3\xad\xbe\xef")
f.write(b"\xd3\x06\xf8\xef")
r = self.assertReport(check, [])
assert isinstance(r, repo.BinaryFile)
assert r.path == "dev-util/foo/files/foo"
Expand All @@ -96,7 +96,7 @@ def test_gitignore(self):
distfiles = pjoin(self.repo.location, "distfiles")
os.mkdir(distfiles)
with open(pjoin(distfiles, "foo-0.tar.gz"), "wb") as f:
f.write(b"\xd3\xad\xbe\xef")
f.write(b"\xd3\x06\xf8\xef")
r = self.assertReport(check, [])
assert isinstance(r, repo.BinaryFile)
assert "distfiles/foo-0.tar.gz" in str(r)
Expand Down
Loading