NygenAnalytics · parashardhapola · Mar 3, 2026 · Mar 3, 2026
diff --git a/cytetype/__init__.py b/cytetype/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.18.0"
+__version__ = "0.18.1"
 
 import requests
 
@@ -8,7 +8,12 @@
 from .preprocessing.marker_detection import rank_genes_groups_backed
 from .preprocessing.subsampling import subsample_by_group
 
-__all__ = ["CyteType", "marker_dotplot", "rank_genes_groups_backed", "subsample_by_group"]
+__all__ = [
+    "CyteType",
+    "marker_dotplot",
+    "rank_genes_groups_backed",
+    "subsample_by_group",
+]
 
 _PYPI_JSON_URL = "https://pypi.org/pypi/cytetype/json"
 

diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py
@@ -1,4 +1,6 @@
+import random
 import re
+
 import anndata
 
 from ..config import logger
@@ -19,7 +21,7 @@ def _is_gene_id_like(value: str) -> bool:
 
     value = value.strip()
 
-    if re.match(r"^ENS[A-Z]*G\d{11}$", value, re.IGNORECASE):
+    if re.match(r"^ENS[A-Z]*G\d{11}(\.\d+)?$", value, re.IGNORECASE):
         return True
 
     if re.match(r"^[NX][MR]_\d+$", value):
@@ -75,11 +77,15 @@ def clean_gene_names(names: list[str]) -> list[str]:
     return cleaned
 
 
-def _id_like_percentage(values: list[str]) -> float:
+def _id_like_percentage(values: list[str], seed: int = 42) -> float:
     if not values:
         return 100.0
-    n = min(500, len(values))
-    sample = values[:n]
+    n = min(2000, len(values))
+    if n < len(values):
+        rng = random.Random(seed)
+        sample = rng.sample(values, n)
+    else:
+        sample = values
     return sum(1 for v in sample if _is_gene_id_like(v)) / n * 100