Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions cytetype/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.18.0"
__version__ = "0.18.1"

import requests

Expand All @@ -8,7 +8,12 @@
from .preprocessing.marker_detection import rank_genes_groups_backed
from .preprocessing.subsampling import subsample_by_group

__all__ = ["CyteType", "marker_dotplot", "rank_genes_groups_backed", "subsample_by_group"]
# Names exported by ``from cytetype import *``; this list declares the
# package's public API.
__all__ = [
"CyteType",
"marker_dotplot",
"rank_genes_groups_backed",
"subsample_by_group",
]

_PYPI_JSON_URL = "https://pypi.org/pypi/cytetype/json"

Expand Down
14 changes: 10 additions & 4 deletions cytetype/preprocessing/validation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import random
import re

import anndata

from ..config import logger
Expand All @@ -19,7 +21,7 @@ def _is_gene_id_like(value: str) -> bool:

value = value.strip()

if re.match(r"^ENS[A-Z]*G\d{11}$", value, re.IGNORECASE):
if re.match(r"^ENS[A-Z]*G\d{11}(\.\d+)?$", value, re.IGNORECASE):
return True

if re.match(r"^[NX][MR]_\d+$", value):
Expand Down Expand Up @@ -75,11 +77,15 @@ def clean_gene_names(names: list[str]) -> list[str]:
return cleaned


def _id_like_percentage(values: list[str], seed: int = 42) -> float:
    """Estimate the percentage of entries in *values* that look like gene IDs.

    At most 2000 entries are inspected. When *values* holds more than that, a
    random sample of 2000 is drawn using a generator seeded with *seed*, so
    repeated calls with the same input and seed give the same estimate.
    Shorter lists are checked in full.

    Args:
        values: Strings to classify with ``_is_gene_id_like``.
        seed: Seed for the sampling generator; only used when subsampling.

    Returns:
        Percentage (0-100) of the inspected entries that are ID-like.
        An empty list yields ``100.0``.
    """
    if not values:
        return 100.0
    n = min(2000, len(values))
    if n < len(values):
        # Sample across the whole list rather than taking the first n items,
        # so the estimate does not depend on how `values` is ordered. A
        # dedicated Random instance keeps the module-level RNG state untouched.
        rng = random.Random(seed)
        sample = rng.sample(values, n)
    else:
        sample = values
    return sum(1 for v in sample if _is_gene_id_like(v)) / n * 100


Expand Down