Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
bec9e00
Update version to 0.18.0 and enhance raw counts handling in save_feat…
parashardhapola Feb 28, 2026
098df32
Add artifact paths for vars.h5 and obs.duckdb, enhance artifact build…
parashardhapola Mar 2, 2026
6348c71
Refactor artifact cleanup in CyteType and update tests
parashardhapola Mar 2, 2026
d1e3f22
Add rank_genes_groups_backed function and update exports
parashardhapola Mar 2, 2026
fbb2c4d
Enhance gene symbol handling in CyteType
parashardhapola Mar 2, 2026
0608455
Update batch size for expression percentage calculations and refactor…
parashardhapola Mar 2, 2026
10d69a2
Refactor logging and enhance progress reporting in CyteType
parashardhapola Mar 2, 2026
fd133a2
Add WRITE_MEM_BUDGET constant and enhance logging in CyteType
parashardhapola Mar 2, 2026
f953a28
Enhance file upload functionality and error handling in CyteType
parashardhapola Mar 3, 2026
bcdbd08
Add subsampling functionality to preprocessing module
parashardhapola Mar 3, 2026
7112e16
Refactor subsampling functionality and improve logging in preprocessi…
parashardhapola Mar 3, 2026
aa57457
formatted
parashardhapola Mar 3, 2026
bd92cae
Update subsampling logic to merge subsets by taking the first occurre…
parashardhapola Mar 3, 2026
bed8784
Enhance gene name processing in preprocessing module
parashardhapola Mar 3, 2026
2bc6628
Optimize group statistics accumulation for sparse matrices in marker …
parashardhapola Mar 3, 2026
9a5b89d
Increase default timeout for file uploads in CyteType
parashardhapola Mar 3, 2026
1a6a347
formatted
parashardhapola Mar 3, 2026
c24c9a4
Refactor subsampling logic in `_is_integer_valued` function to improv…
parashardhapola Mar 3, 2026
ef4e7bf
Update public API in `__init__.py` to include new plotting and subsam…
parashardhapola Mar 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions cytetype/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
__version__ = "0.17.0"
__version__ = "0.18.0"

import requests

from .config import logger
from .main import CyteType
from .plotting import marker_dotplot
from .preprocessing.marker_detection import rank_genes_groups_backed
from .preprocessing.subsampling import subsample_by_group

__all__ = ["CyteType"]
__all__ = ["CyteType", "marker_dotplot", "rank_genes_groups_backed", "subsample_by_group"]

_PYPI_JSON_URL = "https://pypi.org/pypi/cytetype/json"

Expand Down
70 changes: 48 additions & 22 deletions cytetype/api/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,32 @@

# Hard per-kind upload size caps (bytes), enforced client-side so oversized
# files fail fast before any bytes are sent.
MAX_UPLOAD_BYTES: dict[UploadFileKind, int] = {
    "obs_duckdb": 100 * 1024 * 1024,  # 100MB
    "vars_h5": 50 * 1024 * 1024 * 1024,  # 50GB
}

# Backoff schedule in seconds between successive retries of a failed chunk
# upload; length of the tuple bounds the number of retries.
_CHUNK_RETRY_DELAYS = (1, 5, 20)
# API error codes treated as transient and therefore retryable.
# NOTE(review): the consuming `except` branch is not fully visible in this
# view — confirm these codes gate the retry path for APIError responses.
_RETRYABLE_API_ERROR_CODES = frozenset({"INTERNAL_ERROR", "HTTP_ERROR"})


def _try_import_tqdm() -> type | None:
try:
import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore")
from tqdm.auto import tqdm

return tqdm # type: ignore[no-any-return]
except ImportError:
return None


def _upload_file(
base_url: str,
auth_token: str | None,
file_kind: UploadFileKind,
file_path: str,
timeout: float | tuple[float, float] = (30.0, 3600.0),
timeout: float | tuple[float, float] = (60.0, 3600.0),
max_workers: int = 4,
) -> UploadResponse:
path_obj = Path(file_path)
Expand Down Expand Up @@ -62,6 +75,12 @@ def _upload_file(
# Memory is bounded to ~max_workers Γ— chunk_size because each thread
# reads its chunk on demand via seek+read.
_tls = threading.local()
tqdm_cls = _try_import_tqdm()
pbar = (
tqdm_cls(total=n_chunks, desc="Uploading", unit="chunk")
if tqdm_cls is not None and n_chunks > 0
else None
)
_progress_lock = threading.Lock()
_chunks_done = [0]

Expand All @@ -82,15 +101,18 @@ def _upload_chunk(chunk_idx: int) -> None:
data=chunk_data,
timeout=timeout,
)
with _progress_lock:
_chunks_done[0] += 1
done = _chunks_done[0]
pct = 100 * done / n_chunks
print(
f"\r Uploading: {done}/{n_chunks} chunks ({pct:.0f}%)",
end="",
flush=True,
)
if pbar is not None:
pbar.update(1)
else:
with _progress_lock:
_chunks_done[0] += 1
done = _chunks_done[0]
pct = 100 * done / n_chunks
print(
f"\r Uploading: {done}/{n_chunks} chunks ({pct:.0f}%)",
end="",
flush=True,
)
return
except (NetworkError, TimeoutError) as exc:
last_exc = exc
Expand All @@ -103,13 +125,9 @@ def _upload_chunk(chunk_idx: int) -> None:
if attempt < len(_CHUNK_RETRY_DELAYS):
delay = _CHUNK_RETRY_DELAYS[attempt]
logger.warning(
"Chunk %d/%d upload failed (attempt %d/%d), retrying in %ds: %s",
chunk_idx + 1,
n_chunks,
attempt + 1,
1 + len(_CHUNK_RETRY_DELAYS),
delay,
last_exc,
f"Chunk {chunk_idx + 1}/{n_chunks} upload failed "
f"(attempt {attempt + 1}/{1 + len(_CHUNK_RETRY_DELAYS)}), "
f"retrying in {delay}s: {last_exc}"
)
time.sleep(delay)

Expand All @@ -120,9 +138,17 @@ def _upload_chunk(chunk_idx: int) -> None:
try:
with ThreadPoolExecutor(max_workers=effective_workers) as pool:
list(pool.map(_upload_chunk, range(n_chunks)))
print(f"\r \033[92mβœ“\033[0m Uploaded {n_chunks}/{n_chunks} chunks (100%)")
if pbar is not None:
pbar.close()
else:
print(
f"\r \033[92mβœ“\033[0m Uploaded {n_chunks}/{n_chunks} chunks (100%)"
)
except BaseException:
print() # ensure newline on failure
if pbar is not None:
pbar.close()
else:
print()
raise

# Step 3 – Complete upload (returns same UploadResponse shape as before)
Expand All @@ -136,7 +162,7 @@ def upload_obs_duckdb(
base_url: str,
auth_token: str | None,
file_path: str,
timeout: float | tuple[float, float] = (30.0, 3600.0),
timeout: float | tuple[float, float] = (60.0, 3600.0),
max_workers: int = 4,
) -> UploadResponse:
return _upload_file(
Expand All @@ -153,7 +179,7 @@ def upload_vars_h5(
base_url: str,
auth_token: str | None,
file_path: str,
timeout: float | tuple[float, float] = (30.0, 3600.0),
timeout: float | tuple[float, float] = (60.0, 3600.0),
max_workers: int = 4,
) -> UploadResponse:
return _upload_file(
Expand Down
7 changes: 7 additions & 0 deletions cytetype/api/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ class LLMValidationError(APIError):
pass


class ClientDisconnectedError(APIError):
    """Server detected client disconnection mid-request - CLIENT_DISCONNECTED (HTTP 499).

    Raised when the API reports the ``CLIENT_DISCONNECTED`` error code;
    mapped via the code-to-exception table defined later in this module.
    """

    pass


# Client-side errors with default messages
class TimeoutError(CyteTypeError):
"""Client-side timeout waiting for results."""
Expand Down Expand Up @@ -87,6 +93,7 @@ def __init__(
"JOB_NOT_FOUND": JobNotFoundError,
"JOB_FAILED": JobFailedError,
"LLM_VALIDATION_FAILED": LLMValidationError,
"CLIENT_DISCONNECTED": ClientDisconnectedError,
"JOB_PROCESSING": APIError, # Generic - expected during polling
"JOB_NOT_COMPLETED": APIError, # Generic
"HTTP_ERROR": APIError, # Generic
Expand Down
2 changes: 2 additions & 0 deletions cytetype/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@ def _log_format(record: Record) -> str:
level="INFO",
format=_log_format,
)

# Memory budget (bytes) for write operations.
# NOTE(review): consumers are not visible in this view — presumably bounds
# in-memory buffering when writing artifacts; confirm against call sites.
WRITE_MEM_BUDGET: int = 4 * 1024 * 1024 * 1024  # 4 GB
Loading