Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
e4843f3
First stab at adding clique leaders.
gaurav Dec 15, 2025
db9f2a3
Added on:push trigger for testing.
gaurav Dec 15, 2025
5351dc6
Attempt at fix.
gaurav Dec 15, 2025
55753fb
Added logging for debugging.
gaurav Dec 15, 2025
dda3646
Replaced logging with logger.
gaurav Dec 15, 2025
de096d2
Attempt to fix clique leader querying.
gaurav Dec 15, 2025
53a82cb
Get the clique leaders translated again.
gaurav Dec 15, 2025
b304088
Fixed up output.
gaurav Dec 15, 2025
6abcd84
Bugfixes.
gaurav Dec 15, 2025
321fb85
Fix clique leader output.
gaurav Dec 15, 2025
1ad6572
Bugfix.
gaurav Dec 15, 2025
3c8ac2e
Merge branch 'master' into add-clique-leaders-option
gaurav Dec 18, 2025
4ef9fe0
Merge branch 'master' into add-clique-leaders-option
gaurav Feb 19, 2026
ff4e4d4
First stab at including direct types in response (see #359).
gaurav Feb 20, 2026
061306c
Merge branch 'add-nodenorm-version-to-status' into add-clique-leaders…
gaurav Feb 20, 2026
4874b46
Fixed return value in get_eqids_and_types() call.
gaurav Feb 20, 2026
4b9a1f7
Attempt to fix string-as-list issue.
gaurav Feb 20, 2026
e1657ba
What about this eh
gaurav Feb 20, 2026
e0f8129
Reverted a bunch of stuff because apparently I already implemented this.
gaurav Feb 20, 2026
86e9ed3
Merge branch 'master' into add-clique-leaders-option
gaurav Feb 23, 2026
3121c86
Cleaned up code a bit.
gaurav Feb 23, 2026
004d966
Trying to improve output/get this to work.
gaurav Feb 23, 2026
19be4d4
Fixed an issue.
gaurav Feb 24, 2026
92e10f8
Try to fix.
gaurav Feb 24, 2026
7fdea7c
Fixed bug.
gaurav Feb 24, 2026
4ca7664
Merge branch 'master' into add-clique-leaders-option
gaurav Feb 24, 2026
8d24ca5
Improved output.
gaurav Feb 24, 2026
dcf72da
Maybe this will fix it.
gaurav Feb 24, 2026
9449823
Some debugging please.
gaurav Feb 24, 2026
acf820f
Fixed syntax error.
gaurav Feb 24, 2026
e2194db
Fix bug in debugging.
gaurav Feb 24, 2026
f17619f
Fix bug, try to improve.
gaurav Feb 24, 2026
00878cd
Turned off on:push trigger after testing.
gaurav Feb 25, 2026
0867133
Reinstated on:push trigger.
gaurav Feb 25, 2026
8a97c9b
Fix bug maybe.
gaurav Feb 25, 2026
6397dbe
Removed on:push after testing.
gaurav Feb 25, 2026
e530604
Apply suggestions from code review
gaurav Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions node_normalizer/model/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ class CurieList(BaseModel):
title="Whether to return taxa for equivalent identifiers"
)

include_clique_leaders: bool = Field(
default=False,
title="Whether to return clique leaders for conflated identifiers"
)

class Config:
schema_extra = {
"example": {
Expand Down
121 changes: 92 additions & 29 deletions node_normalizer/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,25 @@ async def get_info_content(
async def get_eqids_and_types(
app: FastAPI,
canonical_nonan: List) -> Tuple[List, List]:
"""
Retrieve equivalent IDs and their corresponding types, along with ancestor types, by querying databases.

This function processes a given list of canonical identifiers in batches and fetches their equivalent
IDs (`eqids`) from a database. For each identifier, it also retrieves its type information and computes
the ancestor types using a function. If no type information is found for a given identifier, a default
type is assigned, and an error is logged. The resulting data structures consolidate this information
to provide detailed insights into IDs and their associated types.

:param app: An instance of the FastAPI application containing the database connections.
:type app: FastAPI
:param canonical_nonan: A list of canonical identifiers for which `eqids` and types need to be fetched.
:type canonical_nonan: List
:return: A tuple containing two lists:
1. A list of equivalent IDs (`eqids`) for each input identifier.
2. A list of lists containing ancestor types for each input identifier, starting with the most specific type.
:rtype: Tuple[List, List]
"""

if len(canonical_nonan) == 0:
return [], []
batch_size = int(os.environ.get("EQ_BATCH_SIZE", 2500))
Expand All @@ -511,7 +530,7 @@ async def get_eqids_and_types(
types_with_ancestors = []
for index, typ in enumerate(types):
if not typ:
logging.error(f"No type information found for '{canonical_nonan[index]}' with eqids: {eqids[index]}, "
logger.error(f"No type information found for '{canonical_nonan[index]}' with eqids: {eqids[index]}, "
f"replacing with {BIOLINK_NAMED_THING}")
types_with_ancestors.append([BIOLINK_NAMED_THING])
else:
Expand All @@ -532,6 +551,7 @@ async def get_normalized_nodes(
include_descriptions: bool = False,
include_individual_types: bool = True,
include_taxa: bool = True,
include_clique_leaders: bool = False,
) -> Dict[str, Optional[str]]:
"""
Get value(s) for key(s) using redis MGET
Expand All @@ -555,33 +575,43 @@ async def get_normalized_nodes(
canonical_ids = await app.state.eq_id_to_id_db.mget(*upper_curies, encoding='utf-8')
canonical_nonan = [canonical_id for canonical_id in canonical_ids if canonical_id is not None]
info_contents = {}
clique_leaders_gene_protein = {}
clique_leaders_drug_chemical = {}

# did we get some canonical ids
if canonical_nonan:
# get the information content values
info_contents = await get_info_content(app, canonical_nonan)

# Get the equivalent_ids and types
eqids, types = await get_eqids_and_types(app, canonical_nonan)
eqids, types_with_ancestors = await get_eqids_and_types(app, canonical_nonan)

# are we looking for conflated values
if conflate_gene_protein or conflate_chemical_drug:
other_ids = []

if conflate_gene_protein:
other_ids.extend(await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8'))
gene_protein_clique_leaders_strings = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8')
gene_protein_clique_leaders = [json.loads(oids) if oids else [] for oids in gene_protein_clique_leaders_strings]
other_ids.extend(gene_protein_clique_leaders)
if include_clique_leaders:
clique_leaders_gene_protein.update(zip(canonical_nonan, gene_protein_clique_leaders))

# logger.error(f"After conflate_gene_protein: {other_ids}")

if conflate_chemical_drug:
other_ids.extend(await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8'))
drug_chemical_clique_leaders_strings = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8')
drug_chemical_clique_leaders = [json.loads(oids) if oids else [] for oids in drug_chemical_clique_leaders_strings]
other_ids.extend(drug_chemical_clique_leaders)
if include_clique_leaders:
clique_leaders_drug_chemical.update(zip(canonical_nonan, drug_chemical_clique_leaders))

# logger.error(f"After conflate_chemical_drug: {other_ids}")

# if there are other ids, then we want to rebuild eqids and types. That's because even though we have them,
# they're not necessarily first. For instance if what came in and got canonicalized was a protein id
# and we want gene first, then we're relying on the order of the other_ids to put it back in the right place.
other_ids = [json.loads(oids) if oids else [] for oids in other_ids]
# other_ids = [json.loads(oids) if oids else [] for oids in other_ids]

# Until we added conflate_chemical_drug, canonical_nonan and other_ids would always have the same
# length, so we could figure out mappings from one to the other just by doing:
Expand All @@ -598,7 +628,9 @@ async def get_normalized_nodes(
dereference_others[canon].extend(oids)

all_other_ids = sum(other_ids, [])
eqids2, types2 = await get_eqids_and_types(app, all_other_ids)
# We don't care about direct types for conflated identifiers here -- if you want it, you can get it
# in clique_leaders_gene_protein.
eqids2, types_with_ancestors2 = await get_eqids_and_types(app, all_other_ids)

# logger.error(f"other_ids = {other_ids}")
# logger.error(f"dereference_others = {dereference_others}")
Expand All @@ -608,9 +640,9 @@ async def get_normalized_nodes(
final_types = []

deref_others_eqs = dict(zip(all_other_ids, eqids2))
deref_others_typ = dict(zip(all_other_ids, types2))
deref_others_typ = dict(zip(all_other_ids, types_with_ancestors2))

zipped = zip(canonical_nonan, eqids, types)
zipped = zip(canonical_nonan, eqids, types_with_ancestors)

for canonical_id, e, t in zipped:
# here's where we replace the eqids, types
Expand All @@ -619,7 +651,7 @@ async def get_normalized_nodes(
t = []

for other in dereference_others[canonical_id]:
# logging.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}")
# logger.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}")
e += deref_others_eqs[other]
t += deref_others_typ[other]

Expand All @@ -630,28 +662,32 @@ async def get_normalized_nodes(
dereference_types = dict(zip(canonical_nonan, final_types))
else:
dereference_ids = dict(zip(canonical_nonan, eqids))
dereference_types = dict(zip(canonical_nonan, types))
dereference_types = dict(zip(canonical_nonan, types_with_ancestors))
else:
dereference_ids = dict()
dereference_types = dict()

# output the final result
normal_nodes = {
input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents,
include_descriptions=include_descriptions,
include_individual_types=include_individual_types,
include_taxa=include_taxa,
conflations={
'GeneProtein': conflate_gene_protein,
'DrugChemical': conflate_chemical_drug,
})
clique_leaders = {
'GeneProtein': clique_leaders_gene_protein,
'DrugChemical': clique_leaders_drug_chemical,
},
include_descriptions=include_descriptions,
include_individual_types=include_individual_types,
include_taxa=include_taxa,
conflations={
'GeneProtein': conflate_gene_protein,
'DrugChemical': conflate_chemical_drug,
})
for input_curie, canonical_id in zip(curies, canonical_ids)
}

end_time = time.time_ns()
logger.info(f"Normalized {len(curies)} nodes in {(end_time - start_time)/1_000_000:.2f} ms with arguments " +
f"(curies={curies}, conflate_gene_protein={conflate_gene_protein}, conflate_chemical_drug={conflate_chemical_drug}, " +
f"include_descriptions={include_descriptions}, include_individual_types={include_individual_types})")
f"include_descriptions={include_descriptions}, include_individual_types={include_individual_types}, include_clique_leaders={include_clique_leaders})")

return normal_nodes

Expand Down Expand Up @@ -680,7 +716,7 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict:
return new_attrib


async def create_node(app, canonical_id, equivalent_ids, types, info_contents, include_descriptions=True,
async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, info_contents, clique_leaders, include_descriptions=True,
include_individual_types=False, include_taxa=False, conflations=None):
"""Construct the output format given the compressed redis data"""
# It's possible that we didn't find a canonical_id
Expand All @@ -693,16 +729,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i

# If we have 'None' in the equivalent IDs, skip it so we don't confuse things further down the line.
if None in equivalent_ids[canonical_id]:
logging.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}")
logger.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}")
equivalent_ids[canonical_id] = [x for x in equivalent_ids[canonical_id] if x is not None]
if not equivalent_ids[canonical_id]:
logging.warning(f"No non-None values found for ID {canonical_id} among filtered eqids: {equivalent_ids}")
logger.warning(f"No non-None values found for ID {canonical_id} among filtered eqids: {equivalent_ids}")
return None

# If we have 'None' in the canonical types, something went horribly wrong (specifically: we couldn't
# find the type information for all the eqids for this clique). Return None.
if None in types[canonical_id]:
logging.error(f"No types found for canonical ID {canonical_id} among types: {types}")
if None in types_with_ancestors[canonical_id]:
logger.error(f"No types found for canonical ID {canonical_id} among types: {types_with_ancestors}")
return None

# OK, now we should have id's in the format [ {"i": "MONDO:12312", "l": "Scrofula"}, {},...]
Expand All @@ -721,8 +757,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
identifiers_with_labels = eids
else:
# We have a conflation going on! To replicate Babel's behavior, we need to run the algorithm
# on the list of labels corresponding to the first
# So we need to run the algorithm on the first set of identifiers that have any
# on the list of labels corresponding to the first set of identifiers that have any
# label whatsoever.
identifiers_with_labels = []
curies_already_checked = set()
Expand Down Expand Up @@ -752,7 +787,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
# need to reverse it in order to apply preferred_name_boost_prefixes for the most
# specific type.
possible_labels = []
for typ in types[canonical_id][::-1]:
for typ in types_with_ancestors[canonical_id][::-1]:
if typ in config['preferred_name_boost_prefixes']:
# This is the most specific matching type, so we use this and then break.
possible_labels = list(map(lambda ident: ident.get('l', ''),
Expand Down Expand Up @@ -800,12 +835,14 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i

# now need to reformat the identifier keys. It could be cleaner but we have to worry about if there is a label
descriptions = []
clique_leaders_output = []
node_taxa = set()
node["equivalent_identifiers"] = []
for eqid in eids:
eq_item = {"identifier": eqid["i"]}
if "l" in eqid and eqid["l"]:
eq_item["label"] = eqid["l"]

# if descriptions is enabled, add it to descriptions.
if include_descriptions and "d" in eqid and len(eqid["d"]) > 0:
desc = eqid["d"][0]
Expand All @@ -821,19 +858,45 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
eq_item["type"] = eqid['types'][-1]
node["equivalent_identifiers"].append(eq_item)

# print(f"Checking if {canonical_id} is in clique_leaders: {builtin_json.dumps(clique_leaders, indent=2)}")
if clique_leaders:
for conflation_type in clique_leaders:
if canonical_id in clique_leaders[conflation_type] and eqid["i"] in clique_leaders[conflation_type][canonical_id]:
clique_leader_output = {
"identifier": eqid["i"],
"conflation": conflation_type,
}
if "label" in eq_item:
clique_leader_output["label"] = eq_item["label"]

# For description, taxa and type, we could read them from eq_item, but that
# is only set if the appropriate flag was turned on. For completeness, let's
# try picking them up if they've been passed to us at all.
if "d" in eqid and len(eqid["d"]) > 0:
clique_leader_output["description"] = eqid["d"]
if "t" in eqid and eqid["t"]:
clique_leader_output["taxa"] = eqid["t"]
if 'types' in eqid:
clique_leader_output["type"] = eqid['types'][-1]
clique_leaders_output.append(clique_leader_output)
Comment on lines +862 to +881
Copy link

Copilot AI Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The loop structure here could be optimized. Currently, for every equivalent identifier, the code checks all conflation types to see if it's a clique leader. This could be improved by pre-computing a set of clique leaders for faster lookup, especially since the print statement on line 861 will execute for every single equivalent identifier in the response, which could be hundreds or thousands of times for large queries. Consider moving the clique leader check logic outside the main loop or optimizing it with a set-based lookup.

Copilot uses AI. Check for mistakes.

if include_descriptions and descriptions:
node["descriptions"] = descriptions
node["id"]["description"] = descriptions[0]

if include_taxa and node_taxa:
node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix)

# Add clique leaders if available.
if clique_leaders and clique_leaders_output:
node["clique_leaders"] = clique_leaders_output

# We need to remove `biolink:Entity` from the types returned.
# (See explanation at https://github.com/NCATSTranslator/NodeNormalization/issues/173)
if 'biolink:Entity' in types[canonical_id]:
types[canonical_id].remove('biolink:Entity')
if 'biolink:Entity' in types_with_ancestors[canonical_id]:
types_with_ancestors[canonical_id].remove('biolink:Entity')

node['type'] = types[canonical_id]
node['type'] = types_with_ancestors[canonical_id]

# add the info content to the node if we got one
if info_contents[canonical_id] is not None:
Expand Down
4 changes: 3 additions & 1 deletion node_normalizer/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ async def get_normalized_node_handler(
description: bool = fastapi.Query(False, description="Whether to return curie descriptions when possible"),
individual_types: bool = fastapi.Query(False, description="Whether to return individual types for equivalent identifiers"),
include_taxa: bool = fastapi.Query(True, description="Whether to return taxa for equivalent identifiers"),
include_clique_leaders: bool = fastapi.Query(False, description="Whether to return clique leaders for conflated identifiers"),
):
"""
Get value(s) for key(s) using redis MGET
Expand All @@ -291,6 +292,7 @@ async def get_normalized_node_handler(
include_descriptions=description,
include_individual_types=individual_types,
include_taxa=include_taxa,
include_clique_leaders=include_clique_leaders,
)

# If curie contains at least one entry, then the only way normalized_nodes could be blank
Expand All @@ -312,7 +314,7 @@ async def get_normalized_node_handler_post(curies: CurieList):
"""
normalized_nodes = await get_normalized_nodes(app, curies.curies, curies.conflate, curies.drug_chemical_conflate,
curies.description, include_individual_types=curies.individual_types,
include_taxa=curies.include_taxa,
include_taxa=curies.include_taxa, include_clique_leaders=curies.include_clique_leaders,
)

# If curies.curies contains at least one entry, then the only way normalized_nodes could be blank
Expand Down
2 changes: 1 addition & 1 deletion node_normalizer/set_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ async def generate_setid(app, curies, conflations) -> SetIDResponse:

# We use get_normalized_nodes() to normalize all the CURIEs for us.
normalization_results = await get_normalized_nodes(
app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=False
app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=False, include_clique_leaders=False
)

# We prepare a set of sorted, deduplicated curies.
Expand Down