From d610a6d95e4ed469a2dcb343bfa0737d21b1dbb8 Mon Sep 17 00:00:00 2001
From: parashardhapola <parashar.dhapola@gmail.com>
Date: Sun, 8 Mar 2026 10:11:34 +0100
Subject: [PATCH] Update version to 0.19.2 and enhance metadata handling in
 CyteType

- Bump package version to 0.19.2.
- Introduce a max_metadata_categories parameter (default 500): categorical obs columns with more unique values than this threshold (e.g. cell barcodes, per-cell IDs) are skipped during cluster metadata aggregation, reducing memory usage.
- Increase maximum upload size for obs_duckdb from 100MB to 2GB, accommodating larger datasets.
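- Refactor save_obs_duckdb to add the temporary visualization coordinate columns to obs_df in place instead of copying the DataFrame, and to drop them again in a finally block so they are removed even if the DuckDB write fails.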
---
 cytetype/__init__.py                  |  2 +-
 cytetype/api/client.py                |  2 +-
 cytetype/core/artifacts.py            | 19 +++++++++++++------
 cytetype/main.py                      |  6 ++++++
 cytetype/preprocessing/aggregation.py | 17 +++++++++++++++--
 5 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/cytetype/__init__.py b/cytetype/__init__.py
index 6ed29cc..37dfad9 100644
--- a/cytetype/__init__.py
+++ b/cytetype/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.19.1"
+__version__ = "0.19.2"
 
 import requests
 
diff --git a/cytetype/api/client.py b/cytetype/api/client.py
index 6cc178c..46d85e9 100644
--- a/cytetype/api/client.py
+++ b/cytetype/api/client.py
@@ -13,7 +13,7 @@
 
 
 MAX_UPLOAD_BYTES: dict[UploadFileKind, int] = {
-    "obs_duckdb": 100 * 1024 * 1024,  # 100MB
+    "obs_duckdb": 2 * 1024 * 1024 * 1024,  # 2GB
     "vars_h5": 50 * 1024 * 1024 * 1024,  # 10GB
 }
 
diff --git a/cytetype/core/artifacts.py b/cytetype/core/artifacts.py
index fd08aab..c589fca 100644
--- a/cytetype/core/artifacts.py
+++ b/cytetype/core/artifacts.py
@@ -491,16 +491,23 @@ def save_obs_duckdb(
             "Invalid table_name. Use letters, numbers, and underscores only."
         )
 
+    added_cols: list[str] = []
     if obsm_coordinates is not None and coordinates_key is not None:
-        obs_df = obs_df.copy()
-        obs_df[f"__vis_coordinates_{coordinates_key}_1"] = obsm_coordinates[:, 0]
-        obs_df[f"__vis_coordinates_{coordinates_key}_2"] = obsm_coordinates[:, 1]
+        col1 = f"__vis_coordinates_{coordinates_key}_1"
+        col2 = f"__vis_coordinates_{coordinates_key}_2"
+        obs_df[col1] = obsm_coordinates[:, 0]
+        obs_df[col2] = obsm_coordinates[:, 1]
+        added_cols = [col1, col2]
 
     dd_config: dict[str, Any] = {
         "threads": threads,
         "memory_limit": memory_limit,
         "temp_directory": temp_directory,
     }
-    with duckdb.connect(out_file, config=dd_config) as con:
-        con.register("obs_df", obs_df)
-        con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM obs_df")
+    try:
+        with duckdb.connect(out_file, config=dd_config) as con:
+            con.register("obs_df", obs_df)
+            con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM obs_df")
+    finally:
+        for col in added_cols:
+            obs_df.drop(columns=col, inplace=True, errors="ignore")
diff --git a/cytetype/main.py b/cytetype/main.py
index 8c69ec8..be2ea27 100644
--- a/cytetype/main.py
+++ b/cytetype/main.py
@@ -84,6 +84,7 @@ def __init__(
         max_cells_per_group: int = 1000,
         vars_h5_path: str = "vars.h5",
         obs_duckdb_path: str = "obs.duckdb",
+        max_metadata_categories: int = 500,
         api_url: str = "https://prod.cytetype.nygen.io",
         auth_token: str | None = None,
     ) -> None:
@@ -116,6 +117,10 @@ def __init__(
             max_cells_per_group (int, optional): Maximum number of cells to sample per group
                 for visualization. If a group has more cells than this limit, a random sample
                 will be taken. Defaults to 1000.
+            max_metadata_categories (int, optional): Maximum number of unique values a categorical
+                obs column may have to be included in cluster metadata aggregation. Columns with
+                more unique values (e.g. cell barcodes, per-cell IDs) are skipped to avoid
+                excessive memory usage. Defaults to 500.
             api_url (str, optional): URL for the CyteType API endpoint. Only change if using a custom
                 deployment. Defaults to "https://prod.cytetype.nygen.io".
             auth_token (str | None, optional): Bearer token for API authentication. If provided,
@@ -186,6 +191,7 @@ def __init__(
                 adata=self.adata,
                 group_key=self.group_key,
                 min_percentage=min_percentage,
+                max_categories=max_metadata_categories,
             )
             # Replace keys in group_metadata using cluster_map
             self.group_metadata = {
diff --git a/cytetype/preprocessing/aggregation.py b/cytetype/preprocessing/aggregation.py
index c279725..a977ed1 100644
--- a/cytetype/preprocessing/aggregation.py
+++ b/cytetype/preprocessing/aggregation.py
@@ -1,6 +1,7 @@
 import anndata
 import numpy as np
 
+from ..config import logger
 from .marker_detection import _accumulate_group_stats
 
 
@@ -55,6 +56,7 @@ def aggregate_cluster_metadata(
     adata: anndata.AnnData,
     group_key: str,
     min_percentage: int = 10,
+    max_categories: int = 500,
 ) -> dict[str, dict[str, dict[str, int]]]:
     """Aggregate categorical metadata per cluster.
 
@@ -66,6 +68,9 @@ def aggregate_cluster_metadata(
         adata: AnnData object containing single-cell data
         group_key: Column name in adata.obs to group cells by
         min_percentage: Minimum percentage of cells in a group to include
+        max_categories: Maximum number of unique values a column may have to be
+            included. Columns exceeding this threshold are skipped to avoid
+            memory-expensive intermediate DataFrames.
 
     Returns:
         Nested dictionary structure:
@@ -76,14 +81,22 @@ def aggregate_cluster_metadata(
     grouped_data = adata.obs.groupby(group_key, observed=False)
     column_distributions: dict[str, dict[str, dict[str, int]]] = {}
 
-    # Process each column in adata.obs
     for column_name in adata.obs.columns:
         if column_name == group_key:
             continue
 
         column_dtype = adata.obs[column_name].dtype
         if column_dtype in ["object", "category", "string"]:
-            # Calculate value counts for each group
+            n_unique = adata.obs[column_name].nunique()
+            if n_unique > max_categories:
+                logger.debug(
+                    "Skipping column '{}' ({} unique values > max_categories={}).",
+                    column_name,
+                    n_unique,
+                    max_categories,
+                )
+                continue
+
             value_counts_df = grouped_data[column_name].value_counts().unstack().T
 
             # Convert to percentages and filter for values >min_percentage