Skip to content

Commit c78e566

Browse files
committed
CM-60540: remove binaryornot dep
1 parent 6419aaf commit c78e566

File tree

7 files changed

+128
-42
lines changed

7 files changed

+128
-42
lines changed

cycode/cli/utils/binary_utils.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Whitespace/control bytes that routinely occur in plain text files.
_CONTROL_CHARS = b'\n\r\t\f\b'
# The control bytes above plus the printable ASCII range 0x20-0x7E.
_PRINTABLE_ASCII = _CONTROL_CHARS + bytes(range(0x20, 0x7F))
# Every byte value from 0x7F (DEL) through 0xFF.
_PRINTABLE_HIGH_ASCII = bytes(range(0x7F, 0x100))

# BOM signatures for encodings that legitimately contain null bytes.
# The 4-byte UTF-32 marks come before the 2-byte UTF-16 ones because the
# UTF-32-LE BOM itself starts with the UTF-16-LE BOM.
_BOM_ENCODINGS = (
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
)
12+
13+
14+
def _has_bom_encoding(bytes_to_check: bytes) -> bool:
    """Check if bytes start with a BOM and can be decoded as that encoding.

    Each entry of ``_BOM_ENCODINGS`` whose BOM prefixes the data is tried in
    order; the first encoding that decodes the data wins.

    Decoding is incremental with ``final=False`` so that a buffer cut in the
    middle of a code unit (odd-length UTF-16, a split surrogate pair, a
    partial UTF-32 unit) is still accepted. Genuinely invalid sequences
    before the tail are still rejected.
    NOTE(review): assumes callers may pass a fixed-size prefix of a file
    rather than the whole content -- confirm against call sites.

    Args:
        bytes_to_check: Raw bytes to inspect.

    Returns:
        True if the data starts with a known BOM and is valid in the matching
        encoding (ignoring an incomplete trailing unit), otherwise False.
    """
    import codecs  # function-scope: only this helper needs it

    for bom, encoding in _BOM_ENCODINGS:
        if not bytes_to_check.startswith(bom):
            continue
        try:
            decoder = codecs.getincrementaldecoder(encoding)()
            # final=False buffers an incomplete trailing unit instead of raising.
            decoder.decode(bytes_to_check, final=False)
        except (UnicodeDecodeError, LookupError):
            # Not valid in this encoding; a shorter BOM further down the
            # table may still match, so keep looking.
            continue
        return True
    return False
24+
25+
26+
def _is_decodable_as_utf8(bytes_to_check: bytes) -> bool:
    """Return True if the bytes are valid UTF-8, tolerating a cut-off tail.

    A strict one-shot ``decode('utf-8')`` fails when the buffer ends in the
    middle of a multi-byte character. Decoding incrementally with
    ``final=False`` accepts such an incomplete trailing sequence while still
    rejecting invalid bytes anywhere else.
    NOTE(review): assumes callers may pass a fixed-size prefix of a file
    rather than the whole content -- confirm against call sites.

    Args:
        bytes_to_check: Raw bytes to inspect.

    Returns:
        True if the data is valid UTF-8 (ignoring an incomplete trailing
        sequence), otherwise False.
    """
    import codecs  # function-scope: only this helper needs it

    decoder = codecs.getincrementaldecoder('utf-8')()
    try:
        # final=False buffers an incomplete trailing sequence instead of raising.
        decoder.decode(bytes_to_check, final=False)
    except UnicodeDecodeError:
        return False
    return True
33+
34+
35+
def is_binary_string(bytes_to_check: bytes) -> bool:
    """Heuristically decide whether a chunk of bytes is binary content.

    Uses a simplified version of the Perl detection algorithm, matching
    the structure of binaryornot's is_binary_string.

    Args:
        bytes_to_check: Raw bytes to classify. Falsy input (e.g. ``b''``)
            is reported as not binary.

    Returns:
        True if the data looks binary, False if it looks like text.
    """
    if not bytes_to_check:
        return False

    # BOM-marked UTF-16/32 files legitimately contain null bytes, so they
    # are accepted as text before any other heuristic is consulted.
    if _has_bom_encoding(bytes_to_check):
        return False

    contains_null_or_xff = b'\x00' in bytes_to_check or b'\xff' in bytes_to_check

    total = len(bytes_to_check)
    # Share of bytes left after deleting printable ASCII and common whitespace.
    outside_ascii_share = len(bytes_to_check.translate(None, _PRINTABLE_ASCII)) / total
    # Share of bytes left after deleting every byte in 0x7F-0xFF.
    outside_high_share = len(bytes_to_check.translate(None, _PRINTABLE_HIGH_ASCII)) / total

    looks_binary = (outside_ascii_share > 0.3 and outside_high_share < 0.05) or (
        outside_ascii_share > 0.8 and outside_high_share > 0.8
    )

    if not looks_binary:
        # Null bytes or 0xff in otherwise normal-looking data indicate binary.
        return contains_null_or_xff

    # Only let UTF-8 rescue data that doesn't contain null bytes.
    # Null bytes are valid UTF-8 but almost never appear in real text files,
    # whereas binary formats (e.g. .DS_Store) are full of them.
    if contains_null_or_xff:
        return True
    return not _is_decodable_as_utf8(bytes_to_check)

cycode/cli/utils/path_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
from typing import TYPE_CHECKING, AnyStr, Optional, Union
55

66
import typer
7-
from binaryornot.helpers import is_binary_string
87

98
from cycode.cli.logger import logger
9+
from cycode.cli.utils.binary_utils import is_binary_string
1010

1111
if TYPE_CHECKING:
1212
from os import PathLike

cycode/cli/utils/string_utils.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
import string
66
from sys import getsizeof
77

8-
from binaryornot.check import is_binary_string
9-
108
from cycode.cli.consts import SCA_SHORTCUT_DEPENDENCY_PATHS
9+
from cycode.cli.utils.binary_utils import is_binary_string
1110

1211

1312
def obfuscate_text(text: str) -> str:

cycode/logger.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ def _set_io_encodings() -> None:
3131
logging.getLogger('werkzeug').setLevel(logging.WARNING)
3232
logging.getLogger('schedule').setLevel(logging.WARNING)
3333
logging.getLogger('kubernetes').setLevel(logging.WARNING)
34-
logging.getLogger('binaryornot').setLevel(logging.WARNING)
35-
logging.getLogger('chardet').setLevel(logging.WARNING)
3634
logging.getLogger('git.cmd').setLevel(logging.WARNING)
3735
logging.getLogger('git.util').setLevel(logging.WARNING)
3836

poetry.lock

Lines changed: 12 additions & 36 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ pyyaml = ">=6.0,<7.0"
3939
marshmallow = ">=3.15.0,<4.0.0"
4040
gitpython = ">=3.1.30,<3.2.0"
4141
arrow = ">=1.0.0,<1.4.0"
42-
binaryornot = ">=0.4.4,<0.5.0"
4342
requests = ">=2.32.4,<3.0"
4443
urllib3 = ">=2.4.0,<3.0.0"
4544
pyjwt = ">=2.8.0,<3.0"

tests/utils/test_binary_utils.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import pytest
2+
3+
from cycode.cli.utils.binary_utils import is_binary_string
4+
5+
6+
@pytest.mark.parametrize(
    ('data', 'expected'),
    [
        # Inputs that must be classified as text.
        *(
            (text_sample, False)
            for text_sample in (
                # Empty / None-ish
                b'',
                None,
                # Plain ASCII text
                b'Hello, world!',
                b'print("hello")\nfor i in range(10):\n pass\n',
                # Whitespace-heavy text (tabs, newlines) is not binary
                b'\t\t\n\n\r\n some text\n',
                # UTF-8 multibyte text (accented, CJK, emoji)
                'café résumé naïve'.encode(),
                '日本語テキスト'.encode(),
                '🎉🚀💻'.encode(),
                # BOM-marked UTF-16/32 text is not binary
                '\ufeffHello UTF-16'.encode('utf-16-le'),
                '\ufeffHello UTF-16'.encode('utf-16-be'),
                '\ufeffHello UTF-32'.encode('utf-32-le'),
                '\ufeffHello UTF-32'.encode('utf-32-be'),
            )
        ),
        # Inputs that must be classified as binary.
        *(
            (binary_sample, True)
            for binary_sample in (
                # Null bytes → binary
                b'\x00',
                b'hello\x00world',
                b'\x00\x01\x02\x03',
                # 0xff in otherwise normal data → binary
                b'hello\xffworld',
                # Mostly control chars + invalid UTF-8 → binary
                b'\x01\x02\x03\x04\x05\x06\x07\x0e\x0f\x10' * 10 + b'\x80',
                # Real binary format headers
                b'\x89PNG\r\n\x1a\n' + b'\x00' * 100,
                b'\x7fELF' + b'\x00' * 100,
                # DS_Store-like: null-byte-heavy valid UTF-8 → still binary
                b'\x00\x00\x00\x01Bud1' + b'\x00' * 100,
            )
        ),
    ],
)
def test_is_binary_string(data: bytes, expected: bool) -> None:
    """Each sample must be classified exactly as text (False) or binary (True)."""
    verdict = is_binary_string(data)
    assert verdict is expected

0 commit comments

Comments
 (0)