cycodehq · gotbadger · Mar 5, 2026 · Mar 5, 2026
@@ -0,0 +1,72 @@
+_CONTROL_CHARS = b'\n\r\t\f\b'
+_PRINTABLE_ASCII = _CONTROL_CHARS + bytes(range(32, 127))
+_PRINTABLE_HIGH_ASCII = bytes(range(127, 256))
+
+# BOM signatures for encodings that legitimately contain null bytes
+_BOM_ENCODINGS = (
+    (b'\xff\xfe\x00\x00', 'utf-32-le'),
+    (b'\x00\x00\xfe\xff', 'utf-32-be'),
+    (b'\xff\xfe', 'utf-16-le'),
+    (b'\xfe\xff', 'utf-16-be'),
+)
+
+
+def _has_bom_encoding(bytes_to_check: bytes) -> bool:
+    """Check if bytes start with a BOM and can be decoded as that encoding."""
+    for bom, encoding in _BOM_ENCODINGS:
+        if bytes_to_check.startswith(bom):
+            try:
+                bytes_to_check.decode(encoding)
+                return True
+            except (UnicodeDecodeError, LookupError):
+                pass
+    return False
+
+
+def _is_decodable_as_utf8(bytes_to_check: bytes) -> bool:
+    """Try to decode bytes as UTF-8."""
+    try:
+        bytes_to_check.decode('utf-8')
+        return True
+    except UnicodeDecodeError:
+        return False
+
+
+def is_binary_string(bytes_to_check: bytes) -> bool:
+    """Check if a chunk of bytes appears to be binary content.
+
+    Uses a simplified version of the Perl detection algorithm, matching
+    the structure of binaryornot's is_binary_string.
+    """
+    if not bytes_to_check:
+        return False
+
+    # Binary if control chars are > 30% of the string
+    low_chars = bytes_to_check.translate(None, _PRINTABLE_ASCII)
+    nontext_ratio1 = len(low_chars) / len(bytes_to_check)
+
+    # Binary if high ASCII chars are < 5% of the string
+    high_chars = bytes_to_check.translate(None, _PRINTABLE_HIGH_ASCII)
+    nontext_ratio2 = len(high_chars) / len(bytes_to_check)
+
+    is_likely_binary = (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or (
+        nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8
+    )
+
+    # BOM-marked UTF-16/32 files legitimately contain null bytes.
+    # Check this first so they aren't misdetected as binary.
+    if _has_bom_encoding(bytes_to_check):
+        return False
+
+    has_null_or_xff = b'\x00' in bytes_to_check or b'\xff' in bytes_to_check
+
+    if is_likely_binary:
+        # Only let UTF-8 rescue data that doesn't contain null bytes.
+        # Null bytes are valid UTF-8 but almost never appear in real text files,
+        # whereas binary formats (e.g. .DS_Store) are full of them.
+        if has_null_or_xff:
+            return True
+        return not _is_decodable_as_utf8(bytes_to_check)
+
+    # Null bytes or 0xff in otherwise normal-looking data indicate binary
+    return bool(has_null_or_xff)
@@ -4,9 +4,9 @@
 from typing import TYPE_CHECKING, AnyStr, Optional, Union
 
 import typer
-from binaryornot.helpers import is_binary_string
 
 from cycode.cli.logger import logger
+from cycode.cli.utils.binary_utils import is_binary_string
 
 if TYPE_CHECKING:
     from os import PathLike

@@ -5,9 +5,8 @@
 import string
 from sys import getsizeof
 
-from binaryornot.check import is_binary_string
-
 from cycode.cli.consts import SCA_SHORTCUT_DEPENDENCY_PATHS
+from cycode.cli.utils.binary_utils import is_binary_string
 
 
 def obfuscate_text(text: str) -> str:

@@ -31,8 +31,6 @@ def _set_io_encodings() -> None:
 logging.getLogger('werkzeug').setLevel(logging.WARNING)
 logging.getLogger('schedule').setLevel(logging.WARNING)
 logging.getLogger('kubernetes').setLevel(logging.WARNING)
-logging.getLogger('binaryornot').setLevel(logging.WARNING)
-logging.getLogger('chardet').setLevel(logging.WARNING)
 logging.getLogger('git.cmd').setLevel(logging.WARNING)
 logging.getLogger('git.util').setLevel(logging.WARNING)
 

@@ -39,7 +39,6 @@ pyyaml = ">=6.0,<7.0"
 marshmallow = ">=3.15.0,<4.0.0"
 gitpython = ">=3.1.30,<3.2.0"
 arrow = ">=1.0.0,<1.4.0"
-binaryornot = ">=0.4.4,<0.5.0"
 requests = ">=2.32.4,<3.0"
 urllib3 = ">=2.4.0,<3.0.0"
 pyjwt = ">=2.8.0,<3.0"

@@ -0,0 +1,42 @@
+import pytest
+
+from cycode.cli.utils.binary_utils import is_binary_string
+
+
+@pytest.mark.parametrize(
+    ('data', 'expected'),
+    [
+        # Empty / None-ish
+        (b'', False),
+        (None, False),
+        # Plain ASCII text
+        (b'Hello, world!', False),
+        (b'print("hello")\nfor i in range(10):\n    pass\n', False),
+        # Whitespace-heavy text (tabs, newlines) is not binary
+        (b'\t\t\n\n\r\n  some text\n', False),
+        # UTF-8 multibyte text (accented, CJK, emoji)
+        ('café résumé naïve'.encode(), False),
+        ('日本語テキスト'.encode(), False),
+        ('🎉🚀💻'.encode(), False),
+        # BOM-marked UTF-16/32 text is not binary
+        ('\ufeffHello UTF-16'.encode('utf-16-le'), False),
+        ('\ufeffHello UTF-16'.encode('utf-16-be'), False),
+        ('\ufeffHello UTF-32'.encode('utf-32-le'), False),
+        ('\ufeffHello UTF-32'.encode('utf-32-be'), False),
+        # Null bytes → binary
+        (b'\x00', True),
+        (b'hello\x00world', True),
+        (b'\x00\x01\x02\x03', True),
+        # 0xff in otherwise normal data → binary
+        (b'hello\xffworld', True),
+        # Mostly control chars + invalid UTF-8 → binary
+        (b'\x01\x02\x03\x04\x05\x06\x07\x0e\x0f\x10' * 10 + b'\x80', True),
+        # Real binary format headers
+        (b'\x89PNG\r\n\x1a\n' + b'\x00' * 100, True),
+        (b'\x7fELF' + b'\x00' * 100, True),
+        # DS_Store-like: null-byte-heavy valid UTF-8 → still binary
+        (b'\x00\x00\x00\x01Bud1' + b'\x00' * 100, True),
+    ],
+)
+def test_is_binary_string(data: bytes, expected: bool) -> None:
+    assert is_binary_string(data) is expected