-
Notifications
You must be signed in to change notification settings - Fork 8
Add clique leaders option #350
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
e4843f3
db9f2a3
5351dc6
55753fb
dda3646
de096d2
53a82cb
b304088
6abcd84
321fb85
1ad6572
3c8ac2e
4ef9fe0
ff4e4d4
061306c
4874b46
4b9a1f7
e1657ba
e0f8129
86e9ed3
3121c86
004d966
19be4d4
92e10f8
7fdea7c
4ca7664
8d24ca5
dcf72da
9449823
acf820f
e2194db
f17619f
00878cd
0867133
8a97c9b
6397dbe
e530604
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -500,6 +500,25 @@ async def get_info_content( | |
| async def get_eqids_and_types( | ||
| app: FastAPI, | ||
| canonical_nonan: List) -> Tuple[List, List]: | ||
| """ | ||
| Retrieve equivalent IDs and their corresponding types, along with ancestor types, by querying databases. | ||
|
|
||
| This function processes a given list of canonical identifiers in batches and fetches their equivalent | ||
| IDs (`eqids`) from a database. For each identifier, it also retrieves its type information and computes | ||
| the ancestor types using a function. If no type information is found for a given identifier, a default | ||
| type is assigned, and an error is logged. The resulting data structures consolidate this information | ||
| to provide detailed insights into IDs and their associated types. | ||
|
|
||
| :param app: An instance of the FastAPI application containing the database connections. | ||
| :type app: FastAPI | ||
| :param canonical_nonan: A list of canonical identifiers for which `eqids` and types need to be fetched. | ||
| :type canonical_nonan: List | ||
| :return: A tuple containing two lists: | ||
| 1. A list of equivalent IDs (`eqids`) for each input identifier. | ||
| 2. A list of lists containing ancestor types for each input identifier, starting with the most specific type. | ||
| :rtype: Tuple[List, List] | ||
| """ | ||
|
|
||
| if len(canonical_nonan) == 0: | ||
| return [], [] | ||
| batch_size = int(os.environ.get("EQ_BATCH_SIZE", 2500)) | ||
|
|
@@ -511,7 +530,7 @@ async def get_eqids_and_types( | |
| types_with_ancestors = [] | ||
| for index, typ in enumerate(types): | ||
| if not typ: | ||
| logging.error(f"No type information found for '{canonical_nonan[index]}' with eqids: {eqids[index]}, " | ||
| logger.error(f"No type information found for '{canonical_nonan[index]}' with eqids: {eqids[index]}, " | ||
| f"replacing with {BIOLINK_NAMED_THING}") | ||
| types_with_ancestors.append([BIOLINK_NAMED_THING]) | ||
| else: | ||
|
|
@@ -532,6 +551,7 @@ async def get_normalized_nodes( | |
| include_descriptions: bool = False, | ||
| include_individual_types: bool = True, | ||
| include_taxa: bool = True, | ||
| include_clique_leaders: bool = False, | ||
| ) -> Dict[str, Optional[str]]: | ||
| """ | ||
| Get value(s) for key(s) using redis MGET | ||
|
|
@@ -555,33 +575,43 @@ async def get_normalized_nodes( | |
| canonical_ids = await app.state.eq_id_to_id_db.mget(*upper_curies, encoding='utf-8') | ||
| canonical_nonan = [canonical_id for canonical_id in canonical_ids if canonical_id is not None] | ||
| info_contents = {} | ||
| clique_leaders_gene_protein = {} | ||
| clique_leaders_drug_chemical = {} | ||
|
|
||
| # did we get some canonical ids | ||
| if canonical_nonan: | ||
| # get the information content values | ||
| info_contents = await get_info_content(app, canonical_nonan) | ||
|
|
||
| # Get the equivalent_ids and types | ||
| eqids, types = await get_eqids_and_types(app, canonical_nonan) | ||
| eqids, types_with_ancestors = await get_eqids_and_types(app, canonical_nonan) | ||
|
|
||
| # are we looking for conflated values | ||
| if conflate_gene_protein or conflate_chemical_drug: | ||
| other_ids = [] | ||
|
|
||
| if conflate_gene_protein: | ||
| other_ids.extend(await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8')) | ||
| gene_protein_clique_leaders_strings = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') | ||
| gene_protein_clique_leaders = [json.loads(oids) if oids else [] for oids in gene_protein_clique_leaders_strings] | ||
| other_ids.extend(gene_protein_clique_leaders) | ||
| if include_clique_leaders: | ||
| clique_leaders_gene_protein.update(zip(canonical_nonan, gene_protein_clique_leaders)) | ||
|
|
||
| # logger.error(f"After conflate_gene_protein: {other_ids}") | ||
|
|
||
| if conflate_chemical_drug: | ||
| other_ids.extend(await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8')) | ||
| drug_chemical_clique_leaders_strings = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') | ||
| drug_chemical_clique_leaders = [json.loads(oids) if oids else [] for oids in drug_chemical_clique_leaders_strings] | ||
| other_ids.extend(drug_chemical_clique_leaders) | ||
| if include_clique_leaders: | ||
| clique_leaders_drug_chemical.update(zip(canonical_nonan, drug_chemical_clique_leaders)) | ||
|
|
||
| # logger.error(f"After conflate_chemical_drug: {other_ids}") | ||
|
|
||
| # if there are other ids, then we want to rebuild eqids and types. That's because even though we have them, | ||
| # they're not necessarily first. For instance if what came in and got canonicalized was a protein id | ||
| # and we want gene first, then we're relying on the order of the other_ids to put it back in the right place. | ||
| other_ids = [json.loads(oids) if oids else [] for oids in other_ids] | ||
| # other_ids = [json.loads(oids) if oids else [] for oids in other_ids] | ||
|
|
||
| # Until we added conflate_chemical_drug, canonical_nonan and other_ids would always have the same | ||
| # length, so we could figure out mappings from one to the other just by doing: | ||
|
|
@@ -598,7 +628,9 @@ async def get_normalized_nodes( | |
| dereference_others[canon].extend(oids) | ||
|
|
||
| all_other_ids = sum(other_ids, []) | ||
| eqids2, types2 = await get_eqids_and_types(app, all_other_ids) | ||
| # We don't care about direct types for conflated identifiers here -- if you want it, you can get it | ||
| # in clique_leaders_gene_protein. | ||
| eqids2, types_with_ancestors2 = await get_eqids_and_types(app, all_other_ids) | ||
|
|
||
| # logger.error(f"other_ids = {other_ids}") | ||
| # logger.error(f"dereference_others = {dereference_others}") | ||
|
|
@@ -608,9 +640,9 @@ async def get_normalized_nodes( | |
| final_types = [] | ||
|
|
||
| deref_others_eqs = dict(zip(all_other_ids, eqids2)) | ||
| deref_others_typ = dict(zip(all_other_ids, types2)) | ||
| deref_others_typ = dict(zip(all_other_ids, types_with_ancestors2)) | ||
|
|
||
| zipped = zip(canonical_nonan, eqids, types) | ||
| zipped = zip(canonical_nonan, eqids, types_with_ancestors) | ||
|
|
||
| for canonical_id, e, t in zipped: | ||
| # here's where we replace the eqids, types | ||
|
|
@@ -619,7 +651,7 @@ async def get_normalized_nodes( | |
| t = [] | ||
|
|
||
| for other in dereference_others[canonical_id]: | ||
| # logging.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}") | ||
| # logger.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}") | ||
| e += deref_others_eqs[other] | ||
| t += deref_others_typ[other] | ||
|
|
||
|
|
@@ -630,28 +662,32 @@ async def get_normalized_nodes( | |
| dereference_types = dict(zip(canonical_nonan, final_types)) | ||
| else: | ||
| dereference_ids = dict(zip(canonical_nonan, eqids)) | ||
| dereference_types = dict(zip(canonical_nonan, types)) | ||
| dereference_types = dict(zip(canonical_nonan, types_with_ancestors)) | ||
| else: | ||
| dereference_ids = dict() | ||
| dereference_types = dict() | ||
|
|
||
| # output the final result | ||
| normal_nodes = { | ||
| input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, | ||
| include_descriptions=include_descriptions, | ||
| include_individual_types=include_individual_types, | ||
| include_taxa=include_taxa, | ||
| conflations={ | ||
| 'GeneProtein': conflate_gene_protein, | ||
| 'DrugChemical': conflate_chemical_drug, | ||
| }) | ||
| clique_leaders = { | ||
| 'GeneProtein': clique_leaders_gene_protein, | ||
| 'DrugChemical': clique_leaders_drug_chemical, | ||
| }, | ||
| include_descriptions=include_descriptions, | ||
| include_individual_types=include_individual_types, | ||
| include_taxa=include_taxa, | ||
| conflations={ | ||
| 'GeneProtein': conflate_gene_protein, | ||
| 'DrugChemical': conflate_chemical_drug, | ||
| }) | ||
| for input_curie, canonical_id in zip(curies, canonical_ids) | ||
| } | ||
|
|
||
| end_time = time.time_ns() | ||
| logger.info(f"Normalized {len(curies)} nodes in {(end_time - start_time)/1_000_000:.2f} ms with arguments " + | ||
| f"(curies={curies}, conflate_gene_protein={conflate_gene_protein}, conflate_chemical_drug={conflate_chemical_drug}, " + | ||
| f"include_descriptions={include_descriptions}, include_individual_types={include_individual_types})") | ||
| f"include_descriptions={include_descriptions}, include_individual_types={include_individual_types}, include_clique_leaders={include_clique_leaders})") | ||
|
|
||
| return normal_nodes | ||
|
|
||
|
|
@@ -680,7 +716,7 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict: | |
| return new_attrib | ||
|
|
||
|
|
||
| async def create_node(app, canonical_id, equivalent_ids, types, info_contents, include_descriptions=True, | ||
| async def create_node(app, canonical_id, equivalent_ids, types_with_ancestors, info_contents, clique_leaders, include_descriptions=True, | ||
| include_individual_types=False, include_taxa=False, conflations=None): | ||
| """Construct the output format given the compressed redis data""" | ||
| # It's possible that we didn't find a canonical_id | ||
|
|
@@ -693,16 +729,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i | |
|
|
||
| # If we have 'None' in the equivalent IDs, skip it so we don't confuse things further down the line. | ||
| if None in equivalent_ids[canonical_id]: | ||
| logging.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}") | ||
| logger.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}") | ||
| equivalent_ids[canonical_id] = [x for x in equivalent_ids[canonical_id] if x is not None] | ||
| if not equivalent_ids[canonical_id]: | ||
| logging.warning(f"No non-None values found for ID {canonical_id} among filtered eqids: {equivalent_ids}") | ||
| logger.warning(f"No non-None values found for ID {canonical_id} among filtered eqids: {equivalent_ids}") | ||
| return None | ||
|
|
||
| # If we have 'None' in the canonical types, something went horribly wrong (specifically: we couldn't | ||
| # find the type information for all the eqids for this clique). Return None. | ||
| if None in types[canonical_id]: | ||
| logging.error(f"No types found for canonical ID {canonical_id} among types: {types}") | ||
| if None in types_with_ancestors[canonical_id]: | ||
| logger.error(f"No types found for canonical ID {canonical_id} among types: {types_with_ancestors}") | ||
| return None | ||
|
|
||
| # OK, now we should have IDs in the format [ {"i": "MONDO:12312", "l": "Scrofula"}, {},...] | ||
|
|
@@ -721,8 +757,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i | |
| identifiers_with_labels = eids | ||
| else: | ||
| # We have a conflation going on! To replicate Babel's behavior, we need to run the algorithm | ||
| # on the list of labels corresponding to the first | ||
| # So we need to run the algorithm on the first set of identifiers that have any | ||
| # on the list of labels corresponding to the first set of identifiers that have any | ||
| # label whatsoever. | ||
| identifiers_with_labels = [] | ||
| curies_already_checked = set() | ||
|
|
@@ -752,7 +787,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i | |
| # need to reverse it in order to apply preferred_name_boost_prefixes for the most | ||
| # specific type. | ||
| possible_labels = [] | ||
| for typ in types[canonical_id][::-1]: | ||
| for typ in types_with_ancestors[canonical_id][::-1]: | ||
| if typ in config['preferred_name_boost_prefixes']: | ||
| # This is the most specific matching type, so we use this and then break. | ||
| possible_labels = list(map(lambda ident: ident.get('l', ''), | ||
|
|
@@ -800,12 +835,14 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i | |
|
|
||
| # now need to reformat the identifier keys. It could be cleaner but we have to worry about if there is a label | ||
| descriptions = [] | ||
| clique_leaders_output = [] | ||
| node_taxa = set() | ||
| node["equivalent_identifiers"] = [] | ||
| for eqid in eids: | ||
| eq_item = {"identifier": eqid["i"]} | ||
| if "l" in eqid and eqid["l"]: | ||
| eq_item["label"] = eqid["l"] | ||
|
|
||
| # if descriptions is enabled, add it to descriptions. | ||
| if include_descriptions and "d" in eqid and len(eqid["d"]) > 0: | ||
| desc = eqid["d"][0] | ||
|
|
@@ -821,19 +858,45 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i | |
| eq_item["type"] = eqid['types'][-1] | ||
| node["equivalent_identifiers"].append(eq_item) | ||
|
|
||
| # print(f"Checking if {canonical_id} is in clique_leaders: {builtin_json.dumps(clique_leaders, indent=2)}") | ||
| if clique_leaders: | ||
| for conflation_type in clique_leaders: | ||
| if canonical_id in clique_leaders[conflation_type] and eqid["i"] in clique_leaders[conflation_type][canonical_id]: | ||
| clique_leader_output = { | ||
| "identifier": eqid["i"], | ||
| "conflation": conflation_type, | ||
| } | ||
| if "label" in eq_item: | ||
| clique_leader_output["label"] = eq_item["label"] | ||
|
|
||
| # For description, taxa and type, we could read them from eq_item, but that | ||
| # is only set if the appropriate flag was turned on. For completeness, let's | ||
| # try picking them up if they've been passed to us at all. | ||
| if "d" in eqid and len(eqid["d"]) > 0: | ||
| clique_leader_output["description"] = eqid["d"] | ||
| if "t" in eqid and eqid["t"]: | ||
| clique_leader_output["taxa"] = eqid["t"] | ||
| if 'types' in eqid: | ||
| clique_leader_output["type"] = eqid['types'][-1] | ||
| clique_leaders_output.append(clique_leader_output) | ||
|
Comment on lines
+862
to
+881
|
||
|
|
||
| if include_descriptions and descriptions: | ||
| node["descriptions"] = descriptions | ||
| node["id"]["description"] = descriptions[0] | ||
|
|
||
| if include_taxa and node_taxa: | ||
| node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix) | ||
|
|
||
| # Add clique leaders if available. | ||
| if clique_leaders and clique_leaders_output: | ||
| node["clique_leaders"] = clique_leaders_output | ||
|
|
||
| # We need to remove `biolink:Entity` from the types returned. | ||
| # (See explanation at https://github.com/NCATSTranslator/NodeNormalization/issues/173) | ||
| if 'biolink:Entity' in types[canonical_id]: | ||
| types[canonical_id].remove('biolink:Entity') | ||
| if 'biolink:Entity' in types_with_ancestors[canonical_id]: | ||
| types_with_ancestors[canonical_id].remove('biolink:Entity') | ||
|
|
||
| node['type'] = types[canonical_id] | ||
| node['type'] = types_with_ancestors[canonical_id] | ||
|
|
||
| # add the info content to the node if we got one | ||
| if info_contents[canonical_id] is not None: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.