Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions cycode/cli/utils/binary_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Control bytes that ordinary text files legitimately contain:
# newline, carriage return, tab, form feed and backspace.
_CONTROL_CHARS = b'\n\r\t\f\b'
# Bytes treated as plain text: the control bytes above plus printable ASCII (0x20-0x7E).
_PRINTABLE_ASCII = _CONTROL_CHARS + bytes(range(0x20, 0x7F))
# DEL (0x7F) together with every byte that has the high bit set (0x80-0xFF).
_PRINTABLE_HIGH_ASCII = bytes(range(0x7F, 0x100))

# BOM signatures for encodings that legitimately contain null bytes.
# Order matters: the UTF-32-LE BOM begins with the UTF-16-LE BOM, so the
# longer UTF-32 signatures have to be tried before the UTF-16 ones.
_BOM_ENCODINGS = (
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
)


def _has_bom_encoding(bytes_to_check: bytes) -> bool:
    """Check if bytes start with a BOM and can be decoded as that encoding.

    The input is treated as a chunk that may have been cut at an arbitrary
    byte offset, so an incomplete trailing code unit (or a high surrogate
    whose partner lies beyond the chunk) is tolerated. Byte sequences that
    are invalid in the BOM's encoding still cause that candidate to be
    rejected.

    Args:
        bytes_to_check: Raw bytes to inspect.

    Returns:
        True if a known UTF-16/32 BOM is present and the data is a valid
        (possibly truncated) stream in the matching encoding, else False.
    """
    import codecs  # local import: the helper stays usable on its own

    for bom, encoding in _BOM_ENCODINGS:
        if not bytes_to_check.startswith(bom):
            continue
        try:
            # final=False makes the decoder buffer an incomplete tail
            # instead of raising, which a one-shot decode() would do.
            codecs.getincrementaldecoder(encoding)().decode(bytes_to_check, final=False)
        except (UnicodeDecodeError, LookupError):
            # Not valid in this encoding; a shorter BOM may still match.
            continue
        return True
    return False


def _is_decodable_as_utf8(bytes_to_check: bytes) -> bool:
    """Return True if the bytes are valid UTF-8, tolerating a truncated tail.

    The input is treated as a chunk that may end in the middle of a
    multi-byte character. An incremental decoder called with ``final=False``
    buffers such an incomplete trailing sequence instead of rejecting it,
    while bytes that can never form valid UTF-8 (stray continuation bytes,
    0xff, a lead byte followed by a non-continuation byte, ...) still fail.

    Args:
        bytes_to_check: Raw bytes to inspect.

    Returns:
        True if the data is valid UTF-8 up to a possibly incomplete final
        character, False otherwise.
    """
    import codecs  # local import: the helper stays usable on its own

    decoder = codecs.getincrementaldecoder('utf-8')()
    try:
        decoder.decode(bytes_to_check, final=False)
    except UnicodeDecodeError:
        return False
    return True


def is_binary_string(bytes_to_check: bytes) -> bool:
    """Decide whether a chunk of bytes looks like binary content.

    A simplified take on the Perl-derived detection heuristic, laid out
    after binaryornot's ``is_binary_string``.

    Decision order:
      1. Empty (falsy) input is not binary.
      2. Data that starts with a UTF-16/32 BOM and decodes as that encoding
         is text, even though it contains null bytes.
      3. Any null or 0xff byte in the remaining cases marks the data binary.
      4. Otherwise the byte-class ratios decide whether the data looks
         suspicious; suspicious data is binary unless it decodes as UTF-8.

    Note that suspicious data made only of ASCII control bytes (and no null
    byte) is valid UTF-8 and is therefore reported as text.
    """
    if not bytes_to_check:
        return False

    # BOM-marked UTF-16/32 text legitimately contains null bytes, so it has
    # to be recognised before the null-byte rule below can fire.
    if _has_bom_encoding(bytes_to_check):
        return False

    # Null bytes are valid UTF-8 but almost never occur in real text files,
    # whereas binary formats (e.g. .DS_Store) are full of them; 0xff is
    # handled the same way.
    if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
        return True

    total = len(bytes_to_check)
    # Share of bytes that are neither printable ASCII nor a common text
    # control character (this includes every byte >= 0x7F).
    unprintable_ratio = len(bytes_to_check.translate(None, _PRINTABLE_ASCII)) / total
    # Share of bytes that remain after the 0x7F-0xFF range is stripped,
    # i.e. the share of bytes below 0x7F.
    below_high_ratio = len(bytes_to_check.translate(None, _PRINTABLE_HIGH_ASCII)) / total

    dominated_by_high_bytes = unprintable_ratio > 0.3 and below_high_ratio < 0.05
    dominated_by_control_bytes = unprintable_ratio > 0.8 and below_high_ratio > 0.8
    if not (dominated_by_high_bytes or dominated_by_control_bytes):
        return False

    # Suspicious byte distribution: valid UTF-8 is the only thing that can
    # still rescue the data as text.
    return not _is_decodable_as_utf8(bytes_to_check)
2 changes: 1 addition & 1 deletion cycode/cli/utils/path_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
from typing import TYPE_CHECKING, AnyStr, Optional, Union

import typer
from binaryornot.helpers import is_binary_string

from cycode.cli.logger import logger
from cycode.cli.utils.binary_utils import is_binary_string

if TYPE_CHECKING:
from os import PathLike
Expand Down
3 changes: 1 addition & 2 deletions cycode/cli/utils/string_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
import string
from sys import getsizeof

from binaryornot.check import is_binary_string

from cycode.cli.consts import SCA_SHORTCUT_DEPENDENCY_PATHS
from cycode.cli.utils.binary_utils import is_binary_string


def obfuscate_text(text: str) -> str:
Expand Down
2 changes: 0 additions & 2 deletions cycode/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ def _set_io_encodings() -> None:
logging.getLogger('werkzeug').setLevel(logging.WARNING)
logging.getLogger('schedule').setLevel(logging.WARNING)
logging.getLogger('kubernetes').setLevel(logging.WARNING)
logging.getLogger('binaryornot').setLevel(logging.WARNING)
logging.getLogger('chardet').setLevel(logging.WARNING)
logging.getLogger('git.cmd').setLevel(logging.WARNING)
logging.getLogger('git.util').setLevel(logging.WARNING)

Expand Down
48 changes: 12 additions & 36 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ pyyaml = ">=6.0,<7.0"
marshmallow = ">=3.15.0,<4.0.0"
gitpython = ">=3.1.30,<3.2.0"
arrow = ">=1.0.0,<1.4.0"
binaryornot = ">=0.4.4,<0.5.0"
requests = ">=2.32.4,<3.0"
urllib3 = ">=2.4.0,<3.0.0"
pyjwt = ">=2.8.0,<3.0"
Expand Down
42 changes: 42 additions & 0 deletions tests/utils/test_binary_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pytest

from cycode.cli.utils.binary_utils import is_binary_string


# Inputs that must be classified as text (expected result: False).
_TEXT_SAMPLES = (
    # Empty / None-ish
    b'',
    None,
    # Plain ASCII text
    b'Hello, world!',
    b'print("hello")\nfor i in range(10):\n pass\n',
    # Whitespace-heavy text (tabs, newlines) is not binary
    b'\t\t\n\n\r\n some text\n',
    # UTF-8 multibyte text (accented, CJK, emoji)
    'café résumé naïve'.encode(),
    '日本語テキスト'.encode(),
    '🎉🚀💻'.encode(),
    # BOM-marked UTF-16/32 text is not binary
    '\ufeffHello UTF-16'.encode('utf-16-le'),
    '\ufeffHello UTF-16'.encode('utf-16-be'),
    '\ufeffHello UTF-32'.encode('utf-32-le'),
    '\ufeffHello UTF-32'.encode('utf-32-be'),
)

# Inputs that must be classified as binary (expected result: True).
_BINARY_SAMPLES = (
    # Null bytes → binary
    b'\x00',
    b'hello\x00world',
    b'\x00\x01\x02\x03',
    # 0xff in otherwise normal data → binary
    b'hello\xffworld',
    # Mostly control chars + invalid UTF-8 → binary
    b'\x01\x02\x03\x04\x05\x06\x07\x0e\x0f\x10' * 10 + b'\x80',
    # Real binary format headers
    b'\x89PNG\r\n\x1a\n' + b'\x00' * 100,
    b'\x7fELF' + b'\x00' * 100,
    # DS_Store-like: null-byte-heavy valid UTF-8 → still binary
    b'\x00\x00\x00\x01Bud1' + b'\x00' * 100,
)


@pytest.mark.parametrize(
    ('data', 'expected'),
    [(sample, False) for sample in _TEXT_SAMPLES]
    + [(sample, True) for sample in _BINARY_SAMPLES],
)
def test_is_binary_string(data: bytes, expected: bool) -> None:
    """Each sample must yield exactly the expected boolean verdict."""
    assert is_binary_string(data) is expected