From e4843f3c14e700ae12bc3f330585d15ef0cd0c79 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:26:15 -0500 Subject: [PATCH 01/32] First stab at adding clique leaders. --- node_normalizer/model/input.py | 5 +++++ node_normalizer/normalizer.py | 30 +++++++++++++++++++++++++----- node_normalizer/server.py | 4 +++- node_normalizer/set_id.py | 2 +- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/node_normalizer/model/input.py b/node_normalizer/model/input.py index ea7820e..b6bf757 100644 --- a/node_normalizer/model/input.py +++ b/node_normalizer/model/input.py @@ -41,6 +41,11 @@ class CurieList(BaseModel): title="Whether to return taxa for equivalent identifiers" ) + include_clique_leaders: bool = Field( + default=False, + title="Whether to return clique leaders for conflated identifiers" + ) + class Config: schema_extra = { "example": { diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 32d9126..713c877 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -532,6 +532,7 @@ async def get_normalized_nodes( include_descriptions: bool = False, include_individual_types: bool = True, include_taxa: bool = True, + include_clique_leaders: bool = False, ) -> Dict[str, Optional[str]]: """ Get value(s) for key(s) using redis MGET @@ -555,6 +556,7 @@ async def get_normalized_nodes( canonical_ids = await app.state.eq_id_to_id_db.mget(*upper_curies, encoding='utf-8') canonical_nonan = [canonical_id for canonical_id in canonical_ids if canonical_id is not None] info_contents = {} + clique_leaders = {} # did we get some canonical ids if canonical_nonan: @@ -569,14 +571,18 @@ async def get_normalized_nodes( other_ids = [] if conflate_gene_protein: - other_ids.extend(await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8')) + gene_protein_clique_leaders = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') + other_ids.extend(gene_protein_clique_leaders) + clique_leaders.update(zip(canonical_nonan, gene_protein_clique_leaders)) # logger.error(f"After conflate_gene_protein: {other_ids}") if conflate_chemical_drug: - other_ids.extend(await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8')) + drug_chemical_clique_leaders = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') + other_ids.extend(drug_chemical_clique_leaders) + clique_leaders.update(zip(canonical_nonan, drug_chemical_clique_leaders)) - # logger.error(f"After conflate_chemical_drug: {other_ids}") + # logger.error(f"After conflate_chemical_drug: {other_ids}") # if there are other ids, then we want to rebuild eqids and types. That's because even though we have them, # they're not necessarily first. For instance if what came in and got canonicalized was a protein id @@ -635,9 +641,13 @@ async def get_normalized_nodes( dereference_ids = dict() dereference_types = dict() + # Don't write out clique leaders unless its requested. + if not include_clique_leaders: + clique_leaders = None + # output the final result normal_nodes = { - input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, + input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, clique_leaders, include_descriptions=include_descriptions, include_individual_types=include_individual_types, include_taxa=include_taxa, @@ -680,7 +690,7 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict: return new_attrib -async def create_node(app, canonical_id, equivalent_ids, types, info_contents, include_descriptions=True, +async def create_node(app, canonical_id, equivalent_ids, types, info_contents, clique_leaders, include_descriptions=True, include_individual_types=False, include_taxa=False, conflations=None): """Construct the output format given the compressed redis data""" # It's possible that we didn't find a canonical_id @@ -828,6 +838,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i if include_taxa and node_taxa: node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix) + # Add clique leaders if available. + if clique_leaders: + clique_leaders_for_node = clique_leaders.get(canonical_id, []) + clique_leaders_with_labels_and_types = [{ + 'identifier': cl, + 'labels': [eid['l'] for eid in eids if eid['i'] == cl], + 'types': [eid['t'] for eid in eids if eid['i'] == cl], + } for cl in clique_leaders_for_node] + node["clique_leaders"] = clique_leaders_with_labels_and_types + # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/TranslatorSRI/NodeNormalization/issues/173) if 'biolink:Entity' in types[canonical_id]: diff --git a/node_normalizer/server.py b/node_normalizer/server.py index 18ca7ca..2fc1430 100644 --- a/node_normalizer/server.py +++ b/node_normalizer/server.py @@ -265,6 +265,7 @@ async def get_normalized_node_handler( description: bool = fastapi.Query(False, description="Whether to return curie descriptions when possible"), individual_types: bool = fastapi.Query(False, description="Whether to return individual types for equivalent identifiers"), include_taxa: bool = fastapi.Query(True, description="Whether to return taxa for equivalent identifiers"), + include_clique_leaders: bool = fastapi.Query(False, description="Whether to return clique leaders for conflated identifiers"), ): """ Get value(s) for key(s) using redis MGET @@ -274,6 +275,7 @@ async def get_normalized_node_handler( include_descriptions=description, include_individual_types=individual_types, include_taxa=include_taxa, + include_clique_leaders=include_clique_leaders, ) # If curie contains at least one entry, then the only way normalized_nodes could be blank @@ -295,7 +297,7 @@ async def get_normalized_node_handler_post(curies: CurieList): """ normalized_nodes = await get_normalized_nodes(app, curies.curies, curies.conflate, curies.drug_chemical_conflate, curies.description, include_individual_types=curies.individual_types, - include_taxa=curies.include_taxa, + include_taxa=curies.include_taxa, include_clique_leaders=curies.include_clique_leaders, ) # If curies.curies contains at least one entry, then the only way normalized_nodes could be blank diff --git a/node_normalizer/set_id.py b/node_normalizer/set_id.py index 3c3dc30..37a2745 100644 --- a/node_normalizer/set_id.py +++ b/node_normalizer/set_id.py @@ -41,7 +41,7 @@ async def generate_setid(app, curies, conflations) -> SetIDResponse: # We use get_normalized_nodes() to normalize all the CURIEs for us. normalization_results = await get_normalized_nodes( - app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=False + app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=False, include_clique_leaders=False ) # We prepare a set of sorted, deduplicated curies. From db9f2a32b932994bd32bc5176a53b64c91bcddfb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:26:41 -0500 Subject: [PATCH 02/32] Added on:push trigger for testing. --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] From 5351dc67f6c1eb85377dcedc808e8f1e73da4f1c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:36:15 -0500 Subject: [PATCH 03/32] Attempt at fix. --- node_normalizer/normalizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 713c877..430a083 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -573,14 +573,14 @@ async def get_normalized_nodes( if conflate_gene_protein: gene_protein_clique_leaders = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') other_ids.extend(gene_protein_clique_leaders) - clique_leaders.update(zip(canonical_nonan, gene_protein_clique_leaders)) + clique_leaders.update(zip(*canonical_nonan, gene_protein_clique_leaders)) # logger.error(f"After conflate_gene_protein: {other_ids}") if conflate_chemical_drug: drug_chemical_clique_leaders = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') other_ids.extend(drug_chemical_clique_leaders) - clique_leaders.update(zip(canonical_nonan, drug_chemical_clique_leaders)) + clique_leaders.update(zip(*canonical_nonan, drug_chemical_clique_leaders)) # logger.error(f"After conflate_chemical_drug: {other_ids}") @@ -661,7 +661,7 @@ async def get_normalized_nodes( end_time = time.time_ns() logger.info(f"Normalized {len(curies)} nodes in {(end_time - start_time)/1_000_000:.2f} ms with arguments " + f"(curies={curies}, conflate_gene_protein={conflate_gene_protein}, conflate_chemical_drug={conflate_chemical_drug}, " + - f"include_descriptions={include_descriptions}, include_individual_types={include_individual_types})") + f"include_descriptions={include_descriptions}, include_individual_types={include_individual_types}, include_clique_leaders={include_clique_leaders})") return normal_nodes From 55753fb85834ee4331b779328de37cff082dcc57 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:46:18 -0500 Subject: [PATCH 04/32] Added logging for debugging. --- node_normalizer/normalizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 430a083..b876f46 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -840,6 +840,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # Add clique leaders if available. if clique_leaders: + logger.info(f"Getting clique_leaders from {clique_leaders} for canonical ID {canonical_id}") clique_leaders_for_node = clique_leaders.get(canonical_id, []) clique_leaders_with_labels_and_types = [{ 'identifier': cl, From dda3646b789e7f1f79c755b6865333545c21943a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:47:15 -0500 Subject: [PATCH 05/32] Replaced logging with logger. --- node_normalizer/normalizer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index b876f46..a921bb2 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -511,7 +511,7 @@ async def get_eqids_and_types( types_with_ancestors = [] for index, typ in enumerate(types): if not typ: - logging.error(f"No type information found for '{canonical_nonan[index]}' with eqids: {eqids[index]}, " + logger.error(f"No type information found for '{canonical_nonan[index]}' with eqids: {eqids[index]}, " f"replacing with {BIOLINK_NAMED_THING}") types_with_ancestors.append([BIOLINK_NAMED_THING]) else: @@ -625,7 +625,7 @@ async def get_normalized_nodes( t = [] for other in dereference_others[canonical_id]: - # logging.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}") + # logger.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}") e += deref_others_eqs[other] t += deref_others_typ[other] @@ -703,16 +703,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # If we have 'None' in the equivalent IDs, skip it so we don't confuse things further down the line. if None in equivalent_ids[canonical_id]: - logging.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}") + logger.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}") equivalent_ids[canonical_id] = [x for x in equivalent_ids[canonical_id] if x is not None] if not equivalent_ids[canonical_id]: - logging.warning(f"No non-None values found for ID {canonical_id} among filtered eqids: {equivalent_ids}") + logger.warning(f"No non-None values found for ID {canonical_id} among filtered eqids: {equivalent_ids}") return None # If we have 'None' in the canonical types, something went horribly wrong (specifically: we couldn't # find the type information for all the eqids for this clique). Return None. if None in types[canonical_id]: - logging.error(f"No types found for canonical ID {canonical_id} among types: {types}") + logger.error(f"No types found for canonical ID {canonical_id} among types: {types}") return None # OK, now we should have id's in the format [ {"i": "MONDO:12312", "l": "Scrofula"}, {},...] From de096d2599a6cbbd549e529d3a233cd9764ece71 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:59:12 -0500 Subject: [PATCH 06/32] Attempt to fix clique leader querying. --- node_normalizer/normalizer.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index a921bb2..4aaad64 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -573,14 +573,16 @@ async def get_normalized_nodes( if conflate_gene_protein: gene_protein_clique_leaders = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') other_ids.extend(gene_protein_clique_leaders) - clique_leaders.update(zip(*canonical_nonan, gene_protein_clique_leaders)) + if include_clique_leaders: + clique_leaders.update(zip(canonical_nonan, json.loads(gene_protein_clique_leaders))) # logger.error(f"After conflate_gene_protein: {other_ids}") if conflate_chemical_drug: drug_chemical_clique_leaders = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') other_ids.extend(drug_chemical_clique_leaders) - clique_leaders.update(zip(*canonical_nonan, drug_chemical_clique_leaders)) + if include_clique_leaders: + clique_leaders.update(zip(canonical_nonan, json.loads(drug_chemical_clique_leaders))) # logger.error(f"After conflate_chemical_drug: {other_ids}") @@ -641,10 +643,6 @@ async def get_normalized_nodes( dereference_ids = dict() dereference_types = dict() - # Don't write out clique leaders unless its requested. - if not include_clique_leaders: - clique_leaders = None - # output the final result normal_nodes = { input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, clique_leaders, From 53a82cb673383c5f439dc696538d154d83c209ea Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 15:54:49 -0500 Subject: [PATCH 07/32] Get the clique leaders translated again. --- node_normalizer/normalizer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 4aaad64..aa1dcae 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -571,25 +571,27 @@ async def get_normalized_nodes( other_ids = [] if conflate_gene_protein: - gene_protein_clique_leaders = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') + gene_protein_clique_leaders_strings = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') + gene_protein_clique_leaders = [json.loads(oids) if oids else [] for oids in gene_protein_clique_leaders_strings] other_ids.extend(gene_protein_clique_leaders) if include_clique_leaders: - clique_leaders.update(zip(canonical_nonan, json.loads(gene_protein_clique_leaders))) + clique_leaders.update(zip(canonical_nonan, gene_protein_clique_leaders)) # logger.error(f"After conflate_gene_protein: {other_ids}") if conflate_chemical_drug: - drug_chemical_clique_leaders = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') + drug_chemical_clique_leaders_strings = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') + drug_chemical_clique_leaders = [json.loads(oids) if oids else [] for oids in drug_chemical_clique_leaders_strings] other_ids.extend(drug_chemical_clique_leaders) if include_clique_leaders: - clique_leaders.update(zip(canonical_nonan, json.loads(drug_chemical_clique_leaders))) + clique_leaders.update(zip(canonical_nonan, drug_chemical_clique_leaders)) - # logger.error(f"After conflate_chemical_drug: {other_ids}") + # logger.error(f"After conflate_chemical_drug: {other_ids}") # if there are other ids, then we want to rebuild eqids and types. That's because even though we have them, # they're not necessarily first. For instance if what came in and got canonicalized was a protein id # and we want gene first, then we're relying on the order of the other_ids to put it back in the right place. - other_ids = [json.loads(oids) if oids else [] for oids in other_ids] + # other_ids = [json.loads(oids) if oids else [] for oids in other_ids] # Until we added conflate_chemical_drug, canonical_nonan and other_ids would always have the same # length, so we could figure out mappings from one to the other just by doing: From b304088549209b30e9c0d0fb919acab61001a4e3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 16:38:22 -0500 Subject: [PATCH 08/32] Fixed up output. --- node_normalizer/normalizer.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index aa1dcae..ec6c1ee 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -731,8 +731,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c identifiers_with_labels = eids else: # We have a conflation going on! To replicate Babel's behavior, we need to run the algorithem - # on the list of labels corresponding to the first - # So we need to run the algorithm on the first set of identifiers that have any + # on the list of labels corresponding to the first set of identifiers that have any # label whatsoever. identifiers_with_labels = [] curies_already_checked = set() @@ -810,12 +809,19 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # now need to reformat the identifier keys. It could be cleaner but we have to worry about if there is a label descriptions = [] + clique_leaders_output = {} node_taxa = set() node["equivalent_identifiers"] = [] for eqid in eids: eq_item = {"identifier": eqid["i"]} if "l" in eqid and eqid["l"]: eq_item["label"] = eqid["l"] + if clique_leaders and eqid["i"] in clique_leaders: + clique_leaders_output[eqid["i"]] = { + "identifier": eqid["i"], + "label": eqid["l"], + "biolink_type": types.get(eqid["i"], ["UNKNOWN"])[0], + } # if descriptions is enabled, add it to descriptions. if include_descriptions and "d" in eqid and len(eqid["d"]) > 0: desc = eqid["d"][0] @@ -840,14 +846,12 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # Add clique leaders if available. if clique_leaders: - logger.info(f"Getting clique_leaders from {clique_leaders} for canonical ID {canonical_id}") - clique_leaders_for_node = clique_leaders.get(canonical_id, []) - clique_leaders_with_labels_and_types = [{ - 'identifier': cl, - 'labels': [eid['l'] for eid in eids if eid['i'] == cl], - 'types': [eid['t'] for eid in eids if eid['i'] == cl], - } for cl in clique_leaders_for_node] - node["clique_leaders"] = clique_leaders_with_labels_and_types + # If there are any clique leader IDs we haven't included in clique_leaders_output, + # insert it anyway at this point. This shouldn't happen, but let's be careful. + missing_clique_leaders = (clique_leaders_output.keys() - clique_leaders) + for cl_id in missing_clique_leaders: + clique_leaders_output[cl_id] = {"identifier": cl_id, "biolink_type": types.get(cl_id, ["UNKNOWN"])[0]} + node["clique_leaders"] = clique_leaders_output # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/TranslatorSRI/NodeNormalization/issues/173) From 6abcd84d103aac2c1aa3017075d220d460e1ac97 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 16:52:31 -0500 Subject: [PATCH 09/32] Bugfixes. --- node_normalizer/normalizer.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index ec6c1ee..e6a7486 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -816,12 +816,6 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c eq_item = {"identifier": eqid["i"]} if "l" in eqid and eqid["l"]: eq_item["label"] = eqid["l"] - if clique_leaders and eqid["i"] in clique_leaders: - clique_leaders_output[eqid["i"]] = { - "identifier": eqid["i"], - "label": eqid["l"], - "biolink_type": types.get(eqid["i"], ["UNKNOWN"])[0], - } # if descriptions is enabled, add it to descriptions. if include_descriptions and "d" in eqid and len(eqid["d"]) > 0: desc = eqid["d"][0] @@ -837,6 +831,15 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) + if clique_leaders and eqid["i"] in clique_leaders: + clique_leaders_output[eqid["i"]] = { + "identifier": eqid["i"], + "label": eq_item.get("label", ""), + "description": eq_item.get("description", ""), + "taxa": eq_item.get("taxa", []), + "type": eq_item.get("type", "UNKNOWN") + } + if include_descriptions and descriptions: node["descriptions"] = descriptions node["id"]["description"] = descriptions[0] @@ -848,7 +851,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c if clique_leaders: # If there are any clique leader IDs we haven't included in clique_leaders_output, # insert it anyway at this point. This shouldn't happen, but let's be careful. - missing_clique_leaders = (clique_leaders_output.keys() - clique_leaders) + missing_clique_leaders = (clique_leaders - clique_leaders_output.keys()) for cl_id in missing_clique_leaders: clique_leaders_output[cl_id] = {"identifier": cl_id, "biolink_type": types.get(cl_id, ["UNKNOWN"])[0]} node["clique_leaders"] = clique_leaders_output From 321fb85233ad4126145417934840857dfce6c57e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 17:15:07 -0500 Subject: [PATCH 10/32] Fix clique leader output. --- node_normalizer/normalizer.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index e6a7486..b3361e6 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -831,14 +831,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - if clique_leaders and eqid["i"] in clique_leaders: - clique_leaders_output[eqid["i"]] = { - "identifier": eqid["i"], - "label": eq_item.get("label", ""), - "description": eq_item.get("description", ""), - "taxa": eq_item.get("taxa", []), - "type": eq_item.get("type", "UNKNOWN") - } + if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: + clique_leaders_output[eqid["i"]] = { "identifier": eqid["i"] } + if "label" in eq_item: + clique_leaders_output[eqid["i"]]["label"] = eq_item["label"] + if "description" in eq_item: + clique_leaders_output[eqid["i"]]["description"] = eq_item["description"] + if "taxa" in eq_item: + clique_leaders_output[eqid["i"]]["taxa"] = eqid["taxa"] + if "type" in eq_item: + clique_leaders_output[eqid["i"]]["type"] = eqid["type"] if include_descriptions and descriptions: node["descriptions"] = descriptions @@ -849,12 +851,14 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # Add clique leaders if available. if clique_leaders: - # If there are any clique leader IDs we haven't included in clique_leaders_output, - # insert it anyway at this point. This shouldn't happen, but let's be careful. - missing_clique_leaders = (clique_leaders - clique_leaders_output.keys()) - for cl_id in missing_clique_leaders: - clique_leaders_output[cl_id] = {"identifier": cl_id, "biolink_type": types.get(cl_id, ["UNKNOWN"])[0]} - node["clique_leaders"] = clique_leaders_output + node["clique_leaders"] = [] + for cl_id in clique_leaders: + if cl_id in clique_leaders_output: + node["clique_leaders"].append(clique_leaders_output[cl_id]) + else: + node["clique_leaders"].append({ + "identifier": cl_id, + }) # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/TranslatorSRI/NodeNormalization/issues/173) From 1ad6572b498cc98a2fa05a418e4ff6903c320717 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 17:22:54 -0500 Subject: [PATCH 11/32] Bugfix. --- node_normalizer/normalizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index b3361e6..4b9c007 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -816,6 +816,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c eq_item = {"identifier": eqid["i"]} if "l" in eqid and eqid["l"]: eq_item["label"] = eqid["l"] + # if descriptions is enabled, add it to descriptions. if include_descriptions and "d" in eqid and len(eqid["d"]) > 0: desc = eqid["d"][0] @@ -838,9 +839,9 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c if "description" in eq_item: clique_leaders_output[eqid["i"]]["description"] = eq_item["description"] if "taxa" in eq_item: - clique_leaders_output[eqid["i"]]["taxa"] = eqid["taxa"] + clique_leaders_output[eqid["i"]]["taxa"] = eq_item["taxa"] if "type" in eq_item: - clique_leaders_output[eqid["i"]]["type"] = eqid["type"] + clique_leaders_output[eqid["i"]]["type"] = eq_item["type"] if include_descriptions and descriptions: node["descriptions"] = descriptions From ff4e4d42f9ea1a4ec4b1ef3f6771d5b8966fe830 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Feb 2026 19:12:35 -0500 Subject: [PATCH 12/32] First stab at including direct types in response (see #359). --- node_normalizer/normalizer.py | 65 ++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 5e83ed7..d36c14b 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -499,9 +499,29 @@ async def get_info_content( async def get_eqids_and_types( app: FastAPI, - canonical_nonan: List) -> Tuple[List, List]: + canonical_nonan: List) -> Tuple[List, List, List]: + """ + Retrieve equivalent IDs and their corresponding types, along with ancestor types, by querying databases. + + This function processes a given list of canonical identifiers in batches and fetches their equivalent + IDs (`eqids`) from a database. For each identifier, it also retrieves its type information and computes + the ancestor types using a function. If no type information is found for a given identifier, a default + type is assigned, and an error is logged. The resulting data structures consolidate this information + to provide detailed insights into IDs and their associated types. + + :param app: An instance of the FastAPI application containing the database connections. + :type app: FastAPI + :param canonical_nonan: A list of canonical identifiers for which `eqids` and types need to be fetched. + :type canonical_nonan: List + :return: A tuple containing three lists: + 1. A list of equivalent IDs (`eqids`) for each input identifier. + 2. A list of Biolink types for each input identifier. + 3. A list of lists containing ancestor types for each input identifier. + :rtype: Tuple[List, List, List] + """ + if len(canonical_nonan) == 0: - return [], [] + return [], [], [] batch_size = int(os.environ.get("EQ_BATCH_SIZE", 2500)) eqids = [] for i in range(0, len(canonical_nonan), batch_size): @@ -521,7 +541,7 @@ async def get_eqids_and_types( for eqid in eqids[index]: eqid.update({'types': [typ]}) - return eqids, types_with_ancestors + return eqids, types, types_with_ancestors async def get_normalized_nodes( @@ -564,7 +584,7 @@ async def get_normalized_nodes( info_contents = await get_info_content(app, canonical_nonan) # Get the equivalent_ids and types - eqids, types = await get_eqids_and_types(app, canonical_nonan) + eqids, direct_types, types_with_ancestors = await get_eqids_and_types(app, canonical_nonan) # are we looking for conflated values if conflate_gene_protein or conflate_chemical_drug: @@ -608,46 +628,54 @@ async def get_normalized_nodes( dereference_others[canon].extend(oids) all_other_ids = sum(other_ids, []) - eqids2, types2 = await get_eqids_and_types(app, all_other_ids) + eqids2, direct_types2, types_with_ancestors2 = await get_eqids_and_types(app, all_other_ids) # logger.error(f"other_ids = {other_ids}") # logger.error(f"dereference_others = {dereference_others}") # logger.error(f"all_other_ids = {all_other_ids}") final_eqids = [] + final_direct_types = [] final_types = [] deref_others_eqs = dict(zip(all_other_ids, eqids2)) - deref_others_typ = dict(zip(all_other_ids, types2)) + deref_others_direct_types = dict(zip(all_other_ids, direct_types2)) + deref_others_typ = dict(zip(all_other_ids, types_with_ancestors2)) - zipped = zip(canonical_nonan, eqids, types) + zipped = zip(canonical_nonan, eqids, direct_types, types_with_ancestors) - for canonical_id, e, t in zipped: + for canonical_id, e, dt, t in zipped: # here's where we replace the eqids, types if len(dereference_others[canonical_id]) > 0: e = [] + dt = [] t = [] for other in dereference_others[canonical_id]: # logger.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}") e += deref_others_eqs[other] + dt += deref_others_direct_types[other] t += deref_others_typ[other] final_eqids.append(e) + final_direct_types.append(dt) final_types.append(uniquify_list(t)) dereference_ids = dict(zip(canonical_nonan, final_eqids)) + dereference_direct_types = dict(zip(canonical_nonan, final_direct_types)) dereference_types = dict(zip(canonical_nonan, final_types)) else: dereference_ids = dict(zip(canonical_nonan, eqids)) - dereference_types = dict(zip(canonical_nonan, types)) + dereference_direct_types = dict(zip(canonical_nonan, direct_types)) + dereference_types = dict(zip(canonical_nonan, types_with_ancestors)) else: dereference_ids = dict() + dereference_direct_types = dict() dereference_types = dict() # output the final result normal_nodes = { - input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, clique_leaders, + input_curie: await create_node(app, canonical_id, dereference_ids, dereference_direct_types, dereference_types, info_contents, clique_leaders, include_descriptions=include_descriptions, include_individual_types=include_individual_types, include_taxa=include_taxa, @@ -690,7 +718,7 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict: return new_attrib -async def create_node(app, canonical_id, equivalent_ids, types, info_contents, clique_leaders, include_descriptions=True, +async def create_node(app, canonical_id, equivalent_ids, direct_types, types_with_ancestors, info_contents, clique_leaders, include_descriptions=True, include_individual_types=False, include_taxa=False, conflations=None): """Construct the output format given the compressed redis data""" # It's possible that we didn't find a canonical_id @@ -711,8 +739,8 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # If we have 'None' in the canonical types, something went horribly wrong (specifically: we couldn't # find the type information for all the eqids for this clique). Return None. - if None in types[canonical_id]: - logger.error(f"No types found for canonical ID {canonical_id} among types: {types}") + if None in types_with_ancestors[canonical_id]: + logger.error(f"No types found for canonical ID {canonical_id} among types: {types_with_ancestors}") return None # OK, now we should have id's in the format [ {"i": "MONDO:12312", "l": "Scrofula"}, {},...] @@ -761,7 +789,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # need to reverse it in order to apply preferred_name_boost_prefixes for the most # specific type. possible_labels = [] - for typ in types[canonical_id][::-1]: + for typ in types_with_ancestors[canonical_id][::-1]: if typ in config['preferred_name_boost_prefixes']: # This is the most specific matching type, so we use this and then break. possible_labels = list(map(lambda ident: ident.get('l', ''), @@ -832,6 +860,9 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) + if direct_types and eqid["i"] in direct_types: + eq_item["biolink_type"] = direct_types[canonical_id] + if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: clique_leaders_output[eqid["i"]] = { "identifier": eqid["i"] } if "label" in eq_item: @@ -863,10 +894,10 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/NCATSTranslator/NodeNormalization/issues/173) - if 'biolink:Entity' in types[canonical_id]: - types[canonical_id].remove('biolink:Entity') + if 'biolink:Entity' in types_with_ancestors[canonical_id]: + types_with_ancestors[canonical_id].remove('biolink:Entity') - node['type'] = types[canonical_id] + node['type'] = types_with_ancestors[canonical_id] # add the info content to the node if we got one if info_contents[canonical_id] is not None: From 4874b46951b7537f7c7dc7025fcf3c4817de692d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Feb 2026 19:26:10 -0500 Subject: [PATCH 13/32] Fixed return value in get_eqids_and_types() call. --- node_normalizer/normalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index d36c14b..92e4529 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -767,7 +767,7 @@ async def create_node(app, canonical_id, equivalent_ids, direct_types, types_wit curie = identifier.get('i', '') if curie in curies_already_checked: continue - results, _ = await get_eqids_and_types(app, [curie]) + results, _, _ = await get_eqids_and_types(app, [curie]) identifiers_with_labels = results[0] labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels) From 4b9a1f75c5adbea0b7a78dc8c4d4acfd084ce9d2 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Feb 2026 19:35:19 -0500 Subject: [PATCH 14/32] Attempt to fix string-as-list issue. --- node_normalizer/normalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 92e4529..23a6057 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -654,7 +654,7 @@ async def get_normalized_nodes( for other in dereference_others[canonical_id]: # logger.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}") e += deref_others_eqs[other] - dt += deref_others_direct_types[other] + dt += [deref_others_direct_types[other]] t += deref_others_typ[other] final_eqids.append(e) From e1657baa92533a43c90b52a3ccfc82920112009f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Feb 2026 19:48:05 -0500 Subject: [PATCH 15/32] What about this eh --- node_normalizer/normalizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 23a6057..f312fc2 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -662,11 +662,11 @@ async def get_normalized_nodes( final_types.append(uniquify_list(t)) dereference_ids = dict(zip(canonical_nonan, final_eqids)) - dereference_direct_types = dict(zip(canonical_nonan, final_direct_types)) + dereference_direct_types = dict(zip(final_eqids, final_direct_types)) dereference_types = dict(zip(canonical_nonan, final_types)) else: dereference_ids = dict(zip(canonical_nonan, eqids)) - dereference_direct_types = dict(zip(canonical_nonan, direct_types)) + dereference_direct_types = dict(zip(eqids, direct_types)) dereference_types = dict(zip(canonical_nonan, types_with_ancestors)) else: dereference_ids = dict() From e0f812977d2da918266948721ce35b5f4f9d71ed Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Feb 2026 20:03:58 -0500 Subject: [PATCH 16/32] Reverted a bunch of stuff because apparently I already implemented this. --- node_normalizer/normalizer.py | 38 +++++++++++++---------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index f312fc2..e898c24 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -499,7 +499,7 @@ async def get_info_content( async def get_eqids_and_types( app: FastAPI, - canonical_nonan: List) -> Tuple[List, List, List]: + canonical_nonan: List) -> Tuple[List, List]: """ Retrieve equivalent IDs and their corresponding types, along with ancestor types, by querying databases. @@ -515,13 +515,12 @@ async def get_eqids_and_types( :type canonical_nonan: List :return: A tuple containing three lists: 1. A list of equivalent IDs (`eqids`) for each input identifier. - 2. A list of Biolink types for each input identifier. - 3. A list of lists containing ancestor types for each input identifier. - :rtype: Tuple[List, List, List] + 2. A list of lists containing ancestor types for each input identifier, starting with the most specific type. + :rtype: Tuple[List, List] """ if len(canonical_nonan) == 0: - return [], [], [] + return [], [] batch_size = int(os.environ.get("EQ_BATCH_SIZE", 2500)) eqids = [] for i in range(0, len(canonical_nonan), batch_size): @@ -541,7 +540,7 @@ async def get_eqids_and_types( for eqid in eqids[index]: eqid.update({'types': [typ]}) - return eqids, types, types_with_ancestors + return eqids, types_with_ancestors async def get_normalized_nodes( @@ -584,7 +583,7 @@ async def get_normalized_nodes( info_contents = await get_info_content(app, canonical_nonan) # Get the equivalent_ids and types - eqids, direct_types, types_with_ancestors = await get_eqids_and_types(app, canonical_nonan) + eqids, types_with_ancestors = await get_eqids_and_types(app, canonical_nonan) # are we looking for conflated values if conflate_gene_protein or conflate_chemical_drug: @@ -628,54 +627,48 @@ async def get_normalized_nodes( dereference_others[canon].extend(oids) all_other_ids = sum(other_ids, []) - eqids2, direct_types2, types_with_ancestors2 = await get_eqids_and_types(app, all_other_ids) + # We don't care about direct types for conflated identifiers here -- if you want it, you can get it + # in clique_leaders. + eqids2, types_with_ancestors2 = await get_eqids_and_types(app, all_other_ids) # logger.error(f"other_ids = {other_ids}") # logger.error(f"dereference_others = {dereference_others}") # logger.error(f"all_other_ids = {all_other_ids}") final_eqids = [] - final_direct_types = [] final_types = [] deref_others_eqs = dict(zip(all_other_ids, eqids2)) - deref_others_direct_types = dict(zip(all_other_ids, direct_types2)) deref_others_typ = dict(zip(all_other_ids, types_with_ancestors2)) - zipped = zip(canonical_nonan, eqids, direct_types, types_with_ancestors) + zipped = zip(canonical_nonan, eqids, types_with_ancestors) - for canonical_id, e, dt, t in zipped: + for canonical_id, e, t in zipped: # here's where we replace the eqids, types if len(dereference_others[canonical_id]) > 0: e = [] - dt = [] t = [] for other in dereference_others[canonical_id]: # logger.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}") e += deref_others_eqs[other] - dt += [deref_others_direct_types[other]] t += deref_others_typ[other] final_eqids.append(e) - final_direct_types.append(dt) final_types.append(uniquify_list(t)) dereference_ids = dict(zip(canonical_nonan, final_eqids)) - dereference_direct_types = dict(zip(final_eqids, final_direct_types)) dereference_types = dict(zip(canonical_nonan, final_types)) else: dereference_ids = dict(zip(canonical_nonan, eqids)) - dereference_direct_types = dict(zip(eqids, direct_types)) dereference_types = dict(zip(canonical_nonan, types_with_ancestors)) else: dereference_ids = dict() - dereference_direct_types = dict() dereference_types = dict() # output the final result normal_nodes = { - input_curie: await create_node(app, canonical_id, dereference_ids, dereference_direct_types, dereference_types, info_contents, clique_leaders, + input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, clique_leaders, include_descriptions=include_descriptions, include_individual_types=include_individual_types, include_taxa=include_taxa, @@ -718,7 +711,7 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict: return new_attrib -async def create_node(app, canonical_id, equivalent_ids, direct_types, types_with_ancestors, info_contents, clique_leaders, include_descriptions=True, +async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, info_contents, clique_leaders, include_descriptions=True, include_individual_types=False, include_taxa=False, conflations=None): """Construct the output format given the compressed redis data""" # It's possible that we didn't find a canonical_id @@ -767,7 +760,7 @@ async def create_node(app, canonical_id, equivalent_ids, direct_types, types_wit curie = identifier.get('i', '') if curie in curies_already_checked: continue - results, _, _ = await get_eqids_and_types(app, [curie]) + results, _ = await get_eqids_and_types(app, [curie]) identifiers_with_labels = results[0] labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels) @@ -860,9 +853,6 @@ async def create_node(app, canonical_id, equivalent_ids, direct_types, types_wit eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - if direct_types and eqid["i"] in direct_types: - eq_item["biolink_type"] = direct_types[canonical_id] - if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: clique_leaders_output[eqid["i"]] = { "identifier": eqid["i"] } if "label" in eq_item: From 3121c86fc705aa9c639d1277d532c62eaae07d71 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 23 Feb 2026 18:54:43 -0500 Subject: [PATCH 17/32] Cleaned up code a bit. --- node_normalizer/normalizer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index e898c24..e93af15 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -854,15 +854,16 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i node["equivalent_identifiers"].append(eq_item) if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: - clique_leaders_output[eqid["i"]] = { "identifier": eqid["i"] } + clique_leader_output = { "identifier": eqid["i"] } if "label" in eq_item: - clique_leaders_output[eqid["i"]]["label"] = eq_item["label"] + clique_leader_output["label"] = eq_item["label"] if "description" in eq_item: - clique_leaders_output[eqid["i"]]["description"] = eq_item["description"] + clique_leader_output["description"] = eq_item["description"] if "taxa" in eq_item: - clique_leaders_output[eqid["i"]]["taxa"] = eq_item["taxa"] + clique_leader_output["taxa"] = eq_item["taxa"] if "type" in eq_item: - clique_leaders_output[eqid["i"]]["type"] = eq_item["type"] + clique_leader_output["type"] = eq_item["type"] + clique_leaders_output[eqid["i"]] = clique_leader_output if include_descriptions and descriptions: node["descriptions"] = descriptions From 004d966690e2c041e2bc816ef95be4c0cf647fd8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 23 Feb 2026 18:59:18 -0500 Subject: [PATCH 18/32] Trying to improve output/get this to work. --- node_normalizer/normalizer.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index e93af15..e71bf29 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -854,7 +854,10 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i node["equivalent_identifiers"].append(eq_item) if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: - clique_leader_output = { "identifier": eqid["i"] } + clique_leader_output = { + "identifier": eqid["i"], + "contains": clique_leaders[canonical_id][eqid["i"]], + } if "label" in eq_item: clique_leader_output["label"] = eq_item["label"] if "description" in eq_item: @@ -874,14 +877,17 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i # Add clique leaders if available. if clique_leaders: - node["clique_leaders"] = [] + node["clique_leaders"] = {} for cl_id in clique_leaders: + if cl_id in node["clique_leaders"]: + raise RuntimeError(f"Duplicate clique leader {cl_id} in clique leaders {clique_leaders}") + if cl_id in clique_leaders_output: - node["clique_leaders"].append(clique_leaders_output[cl_id]) + node["clique_leaders"][cl_id] = clique_leaders_output[cl_id] else: - node["clique_leaders"].append({ + node["clique_leaders"][cl_id] = { "identifier": cl_id, - }) + } # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/NCATSTranslator/NodeNormalization/issues/173) From 19be4d449eb760fe3967b2474e19e41d4646f3bf Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 23 Feb 2026 19:09:50 -0500 Subject: [PATCH 19/32] Fixed an issue. --- node_normalizer/normalizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index e71bf29..9bb9970 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -856,7 +856,6 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: clique_leader_output = { "identifier": eqid["i"], - "contains": clique_leaders[canonical_id][eqid["i"]], } if "label" in eq_item: clique_leader_output["label"] = eq_item["label"] From 92e10f81421fc4e92243fb17d3e43141d6fe9054 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 23 Feb 2026 19:15:56 -0500 Subject: [PATCH 20/32] Try to fix. --- node_normalizer/normalizer.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 9bb9970..9cc74da 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -830,7 +830,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i # now need to reformat the identifier keys. It could be cleaner but we have to worry about if there is a label descriptions = [] - clique_leaders_output = {} + clique_leaders_output = [] node_taxa = set() node["equivalent_identifiers"] = [] for eqid in eids: @@ -865,7 +865,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i clique_leader_output["taxa"] = eq_item["taxa"] if "type" in eq_item: clique_leader_output["type"] = eq_item["type"] - clique_leaders_output[eqid["i"]] = clique_leader_output + clique_leaders_output.append(clique_leader_output) if include_descriptions and descriptions: node["descriptions"] = descriptions @@ -875,18 +875,11 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix) # Add clique leaders if available. - if clique_leaders: + if clique_leaders and canonical_id in clique_leaders: node["clique_leaders"] = {} - for cl_id in clique_leaders: - if cl_id in node["clique_leaders"]: - raise RuntimeError(f"Duplicate clique leader {cl_id} in clique leaders {clique_leaders}") - - if cl_id in clique_leaders_output: - node["clique_leaders"][cl_id] = clique_leaders_output[cl_id] - else: - node["clique_leaders"][cl_id] = { - "identifier": cl_id, - } + for clique_leader_output in clique_leaders_output: + cl_id = clique_leader_output["identifier"] + node["clique_leaders"][cl_id] = clique_leaders_output[cl_id] # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/NCATSTranslator/NodeNormalization/issues/173) From 7fdea7c1699b1e7428a85a6129aa86ae6e175953 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 23 Feb 2026 19:22:08 -0500 Subject: [PATCH 21/32] Fixed bug. --- node_normalizer/normalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 9cc74da..c66c1bb 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -879,7 +879,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i node["clique_leaders"] = {} for clique_leader_output in clique_leaders_output: cl_id = clique_leader_output["identifier"] - node["clique_leaders"][cl_id] = clique_leaders_output[cl_id] + node["clique_leaders"][cl_id] = clique_leader_output # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/NCATSTranslator/NodeNormalization/issues/173) From 8d24ca51936039d438cf96dbf86e28e132d6fafa Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 17:54:31 -0500 Subject: [PATCH 22/32] Improved output. --- node_normalizer/normalizer.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index c66c1bb..fc43007 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -853,18 +853,22 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: + if clique_leaders and canonical_id in clique_leaders and eqid["i"].upper() in clique_leaders[canonical_id]: clique_leader_output = { "identifier": eqid["i"], } if "label" in eq_item: clique_leader_output["label"] = eq_item["label"] - if "description" in eq_item: - clique_leader_output["description"] = eq_item["description"] - if "taxa" in eq_item: - clique_leader_output["taxa"] = eq_item["taxa"] - if "type" in eq_item: - clique_leader_output["type"] = eq_item["type"] + + # For description, taxa and type, we could read them from eq_item, but that + # is only set if the appropriate flag was turned on. For completeness, let's + # try picking them up if they've been passed to us at all. + if "d" in eqid and len(eqid["d"]) > 0: + clique_leader_output["description"] = eqid["d"] + if "t" in eqid and eqid["t"]: + clique_leader_output["taxa"] = eqid["t"] + if 'types' in eqid: + clique_leader_output["type"] = eqid['types'][-1] clique_leaders_output.append(clique_leader_output) if include_descriptions and descriptions: From dcf72daeea9a2118ec6e75c76c00db4eb9768231 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 18:03:59 -0500 Subject: [PATCH 23/32] Maybe this will fix it. --- node_normalizer/normalizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index fc43007..ba17c85 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -853,7 +853,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - if clique_leaders and canonical_id in clique_leaders and eqid["i"].upper() in clique_leaders[canonical_id]: + if clique_leaders and canonical_id.strip().upper() in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: clique_leader_output = { "identifier": eqid["i"], } @@ -883,6 +883,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i node["clique_leaders"] = {} for clique_leader_output in clique_leaders_output: cl_id = clique_leader_output["identifier"] + # We could also leave this as a list. node["clique_leaders"][cl_id] = clique_leader_output # We need to remove `biolink:Entity` from the types returned. From 9449823013933eda8a7f7a92bef77aeff42b45ea Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 18:16:44 -0500 Subject: [PATCH 24/32] Some debugging please. --- node_normalizer/normalizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index ba17c85..4b834ce 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -853,7 +853,9 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - if clique_leaders and canonical_id.strip().upper() in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: + print(f"Checking if {canonical_id} is in clique_leaders: {json.dumps(clique_leaders, indent=2)}" + ") + if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: clique_leader_output = { "identifier": eqid["i"], } From acf820f65e439512cbfeaf57ddd444249209d9b8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 18:17:28 -0500 Subject: [PATCH 25/32] Fixed syntax error. --- node_normalizer/normalizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 4b834ce..f6cca90 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -853,8 +853,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - print(f"Checking if {canonical_id} is in clique_leaders: {json.dumps(clique_leaders, indent=2)}" - ") + print(f"Checking if {canonical_id} is in clique_leaders: {json.dumps(clique_leaders, indent=2)}") if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: clique_leader_output = { "identifier": eqid["i"], From e2194dbec1f8b38b6dc1038fe7d0bd564b9de13b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 18:30:46 -0500 Subject: [PATCH 26/32] Fix bug in debugging. --- node_normalizer/normalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index f6cca90..2163b58 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -853,7 +853,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - print(f"Checking if {canonical_id} is in clique_leaders: {json.dumps(clique_leaders, indent=2)}") + print(f"Checking if {canonical_id} is in clique_leaders: {builtin_json.dumps(clique_leaders, indent=2)}") if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: clique_leader_output = { "identifier": eqid["i"], From f17619f7dfeba59d3780d8fff6dbe20939f53d91 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 18:50:51 -0500 Subject: [PATCH 27/32] Fix bug, try to improve. --- node_normalizer/normalizer.py | 74 ++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 2163b58..3b62328 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -575,7 +575,8 @@ async def get_normalized_nodes( canonical_ids = await app.state.eq_id_to_id_db.mget(*upper_curies, encoding='utf-8') canonical_nonan = [canonical_id for canonical_id in canonical_ids if canonical_id is not None] info_contents = {} - clique_leaders = {} + clique_leaders_gene_protein = {} + clique_leaders_drug_chemical = {} # did we get some canonical ids if canonical_nonan: @@ -594,7 +595,7 @@ async def get_normalized_nodes( gene_protein_clique_leaders = [json.loads(oids) if oids else [] for oids in gene_protein_clique_leaders_strings] other_ids.extend(gene_protein_clique_leaders) if include_clique_leaders: - clique_leaders.update(zip(canonical_nonan, gene_protein_clique_leaders)) + clique_leaders_gene_protein.update(zip(canonical_nonan, gene_protein_clique_leaders)) # logger.error(f"After conflate_gene_protein: {other_ids}") @@ -603,7 +604,7 @@ async def get_normalized_nodes( drug_chemical_clique_leaders = [json.loads(oids) if oids else [] for oids in drug_chemical_clique_leaders_strings] other_ids.extend(drug_chemical_clique_leaders) if include_clique_leaders: - clique_leaders.update(zip(canonical_nonan, drug_chemical_clique_leaders)) + clique_leaders_drug_chemical.update(zip(canonical_nonan, drug_chemical_clique_leaders)) # logger.error(f"After conflate_chemical_drug: {other_ids}") @@ -628,7 +629,7 @@ async def get_normalized_nodes( all_other_ids = sum(other_ids, []) # We don't care about direct types for conflated identifiers here -- if you want it, you can get it - # in clique_leaders. + # in clique_leaders_gene_protein. eqids2, types_with_ancestors2 = await get_eqids_and_types(app, all_other_ids) # logger.error(f"other_ids = {other_ids}") @@ -668,14 +669,18 @@ async def get_normalized_nodes( # output the final result normal_nodes = { - input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, clique_leaders, - include_descriptions=include_descriptions, - include_individual_types=include_individual_types, - include_taxa=include_taxa, - conflations={ - 'GeneProtein': conflate_gene_protein, - 'DrugChemical': conflate_chemical_drug, - }) + input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, + clique_leaders = { + 'GeneProtein': clique_leaders_gene_protein, + 'DrugChemical': clique_leaders_drug_chemical, + }, + include_descriptions=include_descriptions, + include_individual_types=include_individual_types, + include_taxa=include_taxa, + conflations={ + 'GeneProtein': conflate_gene_protein, + 'DrugChemical': conflate_chemical_drug, + }) for input_curie, canonical_id in zip(curies, canonical_ids) } @@ -853,24 +858,27 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - print(f"Checking if {canonical_id} is in clique_leaders: {builtin_json.dumps(clique_leaders, indent=2)}") - if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: - clique_leader_output = { - "identifier": eqid["i"], - } - if "label" in eq_item: - clique_leader_output["label"] = eq_item["label"] - - # For description, taxa and type, we could read them from eq_item, but that - # is only set if the appropriate flag was turned on. For completeness, let's - # try picking them up if they've been passed to us at all. - if "d" in eqid and len(eqid["d"]) > 0: - clique_leader_output["description"] = eqid["d"] - if "t" in eqid and eqid["t"]: - clique_leader_output["taxa"] = eqid["t"] - if 'types' in eqid: - clique_leader_output["type"] = eqid['types'][-1] - clique_leaders_output.append(clique_leader_output) + # print(f"Checking if {canonical_id} is in clique_leaders: {builtin_json.dumps(clique_leaders, indent=2)}") + if clique_leaders: + for conflation_type in clique_leaders: + if canonical_id in clique_leaders[conflation_type] and eqid["i"] in clique_leaders[conflation_type][canonical_id]: + clique_leader_output = { + "identifier": eqid["i"], + "conflation": conflation_type, + } + if "label" in eq_item: + clique_leader_output["label"] = eq_item["label"] + + # For description, taxa and type, we could read them from eq_item, but that + # is only set if the appropriate flag was turned on. For completeness, let's + # try picking them up if they've been passed to us at all. + if "d" in eqid and len(eqid["d"]) > 0: + clique_leader_output["description"] = eqid["d"] + if "t" in eqid and eqid["t"]: + clique_leader_output["taxa"] = eqid["t"] + if 'types' in eqid: + clique_leader_output["type"] = eqid['types'][-1] + clique_leaders_output.append(clique_leader_output) if include_descriptions and descriptions: node["descriptions"] = descriptions @@ -881,11 +889,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i # Add clique leaders if available. if clique_leaders and canonical_id in clique_leaders: - node["clique_leaders"] = {} - for clique_leader_output in clique_leaders_output: - cl_id = clique_leader_output["identifier"] - # We could also leave this as a list. - node["clique_leaders"][cl_id] = clique_leader_output + node["clique_leaders"] = clique_leaders_output # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/NCATSTranslator/NodeNormalization/issues/173) From 00878cd893cfbd2b7adf89eab1ee81ea8e63695e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 19:46:07 -0500 Subject: [PATCH 28/32] Turned off on:push trigger after testing. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e9d8f8f..800d57b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,6 @@ name: 'Publish to GitHub Packages' on: - push: release: types: [published] From 08671333d2d5ce5ff7aea5edd628b5f6910b6ce5 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 19:58:46 -0500 Subject: [PATCH 29/32] Reinstated on:push trigger. --- .github/workflows/release.yml | 1 + node_normalizer/normalizer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 3b62328..d223de9 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -858,7 +858,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - # print(f"Checking if {canonical_id} is in clique_leaders: {builtin_json.dumps(clique_leaders, indent=2)}") + print(f"Checking if {canonical_id} is in clique_leaders: {builtin_json.dumps(clique_leaders, indent=2)}") if clique_leaders: for conflation_type in clique_leaders: if canonical_id in clique_leaders[conflation_type] and eqid["i"] in clique_leaders[conflation_type][canonical_id]: From 8a97c9b028d802e72cbb9f7c8ec142bfc179d875 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 19:59:47 -0500 Subject: [PATCH 30/32] Fix bug maybe. --- node_normalizer/normalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index d223de9..3c3c940 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -888,7 +888,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix) # Add clique leaders if available. - if clique_leaders and canonical_id in clique_leaders: + if clique_leaders and clique_leaders_output: node["clique_leaders"] = clique_leaders_output # We need to remove `biolink:Entity` from the types returned. From 6397dbedaad5abc807a6f8ab363745228a23486a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 20:39:15 -0500 Subject: [PATCH 31/32] Removed on:push after testing. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e9d8f8f..800d57b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,6 @@ name: 'Publish to GitHub Packages' on: - push: release: types: [published] From e5306045c8b6b7b5146314e3dc694fa753a4b175 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 24 Feb 2026 20:47:36 -0500 Subject: [PATCH 32/32] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- node_normalizer/normalizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 3c3c940..0c02cdf 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -513,7 +513,7 @@ async def get_eqids_and_types( :type app: FastAPI :param canonical_nonan: A list of canonical identifiers for which `eqids` and types need to be fetched. :type canonical_nonan: List - :return: A tuple containing three lists: + :return: A tuple containing two lists: 1. A list of equivalent IDs (`eqids`) for each input identifier. 2. A list of lists containing ancestor types for each input identifier, starting with the most specific type. :rtype: Tuple[List, List] @@ -858,7 +858,7 @@ async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, i eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - print(f"Checking if {canonical_id} is in clique_leaders: {builtin_json.dumps(clique_leaders, indent=2)}") + # print(f"Checking if {canonical_id} is in clique_leaders: {builtin_json.dumps(clique_leaders, indent=2)}") if clique_leaders: for conflation_type in clique_leaders: if canonical_id in clique_leaders[conflation_type] and eqid["i"] in clique_leaders[conflation_type][canonical_id]: