From d610a6d95e4ed469a2dcb343bfa0737d21b1dbb8 Mon Sep 17 00:00:00 2001 From: parashardhapola Date: Sun, 8 Mar 2026 10:11:34 +0100 Subject: [PATCH] Update version to 0.19.2 and enhance metadata handling in CyteType - Bump package version to 0.19.2. - Introduce max_metadata_categories parameter to limit unique values in categorical obs columns during cluster metadata aggregation, improving memory efficiency. - Increase maximum upload size for obs_duckdb from 100MB to 2GB, accommodating larger datasets. - Refactor save_obs_duckdb function to ensure proper cleanup of temporary columns after processing. --- cytetype/__init__.py | 2 +- cytetype/api/client.py | 2 +- cytetype/core/artifacts.py | 19 +++++++++++++------ cytetype/main.py | 6 ++++++ cytetype/preprocessing/aggregation.py | 17 +++++++++++++++-- 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/cytetype/__init__.py b/cytetype/__init__.py index 6ed29cc..37dfad9 100644 --- a/cytetype/__init__.py +++ b/cytetype/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.19.1" +__version__ = "0.19.2" import requests diff --git a/cytetype/api/client.py b/cytetype/api/client.py index 6cc178c..46d85e9 100644 --- a/cytetype/api/client.py +++ b/cytetype/api/client.py @@ -13,7 +13,7 @@ MAX_UPLOAD_BYTES: dict[UploadFileKind, int] = { - "obs_duckdb": 100 * 1024 * 1024, # 100MB + "obs_duckdb": 2 * 1024 * 1024 * 1024, # 2GB "vars_h5": 50 * 1024 * 1024 * 1024, # 10GB } diff --git a/cytetype/core/artifacts.py b/cytetype/core/artifacts.py index fd08aab..c589fca 100644 --- a/cytetype/core/artifacts.py +++ b/cytetype/core/artifacts.py @@ -491,16 +491,23 @@ def save_obs_duckdb( "Invalid table_name. Use letters, numbers, and underscores only." ) + added_cols: list[str] = [] if obsm_coordinates is not None and coordinates_key is not None: - obs_df = obs_df.copy() - obs_df[f"__vis_coordinates_{coordinates_key}_1"] = obsm_coordinates[:, 0] - obs_df[f"__vis_coordinates_{coordinates_key}_2"] = obsm_coordinates[:, 1] + col1 = f"__vis_coordinates_{coordinates_key}_1" + col2 = f"__vis_coordinates_{coordinates_key}_2" + obs_df[col1] = obsm_coordinates[:, 0] + obs_df[col2] = obsm_coordinates[:, 1] + added_cols = [col1, col2] dd_config: dict[str, Any] = { "threads": threads, "memory_limit": memory_limit, "temp_directory": temp_directory, } - with duckdb.connect(out_file, config=dd_config) as con: - con.register("obs_df", obs_df) - con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM obs_df") + try: + with duckdb.connect(out_file, config=dd_config) as con: + con.register("obs_df", obs_df) + con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM obs_df") + finally: + for col in added_cols: + obs_df.drop(columns=col, inplace=True, errors="ignore") diff --git a/cytetype/main.py b/cytetype/main.py index 8c69ec8..be2ea27 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -84,6 +84,7 @@ def __init__( max_cells_per_group: int = 1000, vars_h5_path: str = "vars.h5", obs_duckdb_path: str = "obs.duckdb", + max_metadata_categories: int = 500, api_url: str = "https://prod.cytetype.nygen.io", auth_token: str | None = None, ) -> None: @@ -116,6 +117,10 @@ def __init__( max_cells_per_group (int, optional): Maximum number of cells to sample per group for visualization. If a group has more cells than this limit, a random sample will be taken. Defaults to 1000. + max_metadata_categories (int, optional): Maximum number of unique values a categorical + obs column may have to be included in cluster metadata aggregation. Columns with + more unique values (e.g. cell barcodes, per-cell IDs) are skipped to avoid + excessive memory usage. Defaults to 500. api_url (str, optional): URL for the CyteType API endpoint. Only change if using a custom deployment. Defaults to "https://prod.cytetype.nygen.io". auth_token (str | None, optional): Bearer token for API authentication. If provided, @@ -186,6 +191,7 @@ def __init__( adata=self.adata, group_key=self.group_key, min_percentage=min_percentage, + max_categories=max_metadata_categories, ) # Replace keys in group_metadata using cluster_map self.group_metadata = { diff --git a/cytetype/preprocessing/aggregation.py b/cytetype/preprocessing/aggregation.py index c279725..a977ed1 100644 --- a/cytetype/preprocessing/aggregation.py +++ b/cytetype/preprocessing/aggregation.py @@ -1,6 +1,7 @@ import anndata import numpy as np +from ..config import logger from .marker_detection import _accumulate_group_stats @@ -55,6 +56,7 @@ def aggregate_cluster_metadata( adata: anndata.AnnData, group_key: str, min_percentage: int = 10, + max_categories: int = 500, ) -> dict[str, dict[str, dict[str, int]]]: """Aggregate categorical metadata per cluster. @@ -66,6 +68,9 @@ def aggregate_cluster_metadata( adata: AnnData object containing single-cell data group_key: Column name in adata.obs to group cells by min_percentage: Minimum percentage of cells in a group to include + max_categories: Maximum number of unique values a column may have to be + included. Columns exceeding this threshold are skipped to avoid + memory-expensive intermediate DataFrames. Returns: Nested dictionary structure: @@ -76,14 +81,22 @@ def aggregate_cluster_metadata( grouped_data = adata.obs.groupby(group_key, observed=False) column_distributions: dict[str, dict[str, dict[str, int]]] = {} - # Process each column in adata.obs for column_name in adata.obs.columns: if column_name == group_key: continue column_dtype = adata.obs[column_name].dtype if column_dtype in ["object", "category", "string"]: - # Calculate value counts for each group + n_unique = adata.obs[column_name].nunique() + if n_unique > max_categories: + logger.debug( + "Skipping column '{}' ({} unique values > max_categories={}).", + column_name, + n_unique, + max_categories, + ) + continue + value_counts_df = grouped_data[column_name].value_counts().unstack().T # Convert to percentages and filter for values >min_percentage