Skip to content

Commit

Permalink
still attempting to speed up orthology code. Fallback for AnnDataShad…
Browse files Browse the repository at this point in the history
…ows var/obs bug
  • Loading branch information
adkinsrs committed Oct 10, 2024
1 parent f411520 commit 96fe448
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 48 deletions.
72 changes: 40 additions & 32 deletions lib/gear/orthology.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import pandas as pd

from pathlib import Path
from collections import defaultdict

# append parent directory to path
sys.path.append(str(Path(__file__).resolve().parents[1]))
Expand Down Expand Up @@ -58,10 +57,10 @@ def get_ortholog_file(gene_organism_id: str, dataset_organism_id: str, annotatio
FileNotFoundError: If the orthologous mapping file is not found.
"""
orthomap_file_base = format_orthomap_file_base(gene_organism_id, dataset_organism_id, annotation_source)
orthomap_file = ORTHOLOG_BASE_DIR.joinpath(orthomap_file_base)
orthomap_file = ORTHOLOG_BASE_DIR / orthomap_file_base

if not orthomap_file.is_file():
raise FileNotFoundError("Orthologous mapping file not found: {0}".format(orthomap_file))
raise FileNotFoundError(f"Orthologous mapping file not found: {orthomap_file}")
return orthomap_file

def get_ortholog_files_from_dataset(dataset_organism_id: str, annotation_source: str="ensembl"):
Expand All @@ -85,7 +84,8 @@ def get_ortholog_files_from_dataset(dataset_organism_id: str, annotation_source:
orthomap_files.sort(key=lambda x: int(x.name.split("__")[0].split(".")[1]))

if not orthomap_files:
raise FileNotFoundError("Orthologous mapping files not found for dataset organism {0}".format(get_organism_name_by_id(dataset_organism_id)))
dataset_organism_name = get_organism_name_by_id(dataset_organism_id)
raise FileNotFoundError(f"Orthologous mapping files not found for dataset organism {dataset_organism_name}")

return list(orthomap_files)

Expand Down Expand Up @@ -127,14 +127,15 @@ def create_orthology_df(orthomap_file: Path):
pd.DataFrame: The DataFrame containing the orthologous mapping data.
"""
try:
orthomap_df = pd.read_hdf(str(orthomap_file)) # do not need to pass in key, as it is the default (only) key
orthology_df = pd.read_hdf(str(orthomap_file)) # do not need to pass in key, as it is the default (only) key
except Exception as e:
print("Error reading orthologous mapping file {1}: {0}".format(e, orthomap_file), file=sys.stderr)
error_message = f"Error reading orthologous mapping file {orthomap_file}: {e}"
print(error_message, file=sys.stderr)
# print stack trace
import traceback
traceback.print_exc()
raise Exception("Error reading orthologous mapping file: {0}".format(e))
return orthomap_df
raise Exception(error_message)
return orthology_df

def map_dataframe_genes(orig_df: pd.DataFrame, orthomap_file: Path):
"""
Expand All @@ -148,27 +149,33 @@ def map_dataframe_genes(orig_df: pd.DataFrame, orthomap_file: Path):
pd.DataFrame: The DataFrame with the genes mapped to their orthologous genes.
"""

orthomap_df = create_orthology_df(orthomap_file)
orthology_df = create_orthology_df(orthomap_file)

# NOTE: Not all genes can be mapped. Unmappable genes do not change in the original DataFrame.

def get_best_match(id1):
# Get the best match for the id2 gene symbol
best_match_for_id = orthomap_df[orthomap_df["id1"] == id1]
sorted_by_best_match = best_match_for_id.sort_values(by="algorithms_match_count", ascending=False)
# If no match, return the original id1
if sorted_by_best_match.empty:
return id1
best_match = sorted_by_best_match.iloc[0]
return best_match["id2"]
def get_best_match(gene_id: str) -> str:
"""
Get the best match for the given gene ID based on the orthology mapping.
Args:
gene_id (str): The gene ID to find the best match for.
Returns:
str: The best matching gene ID or the original gene ID if no match is found.
"""
matches = orthology_df[orthology_df["id1"] == gene_id]
if matches.empty:
return gene_id

best_match = matches.sort_values(by="algorithms_match_count", ascending=False).iloc[0]
return best_match["id2"]

# Rename orig_df index (id1) using orthologous id (id2). If id1 maps to multiple id2, then use the id2 with the highest algorithms_match_count
orig_df.index = orig_df.index.map(get_best_match)

return orig_df

def map_single_gene(gene_symbol:str, orthomap_file: Path):
def map_single_gene(gene_symbol: str, orthomap_file: Path) -> list:
"""
Maps a single gene symbol to its corresponding orthologous gene symbol(s) using an orthology mapping file.
Expand All @@ -179,17 +186,18 @@ def map_single_gene(gene_symbol:str, orthomap_file: Path):
Returns:
list or None: The list of orthologous gene symbol(s) corresponding to the input gene symbol. Returns an empty list if the gene symbol cannot be mapped.
"""
orthomap_df = create_orthology_df(orthomap_file)
orthology_df = create_orthology_df(orthomap_file)

# Create lowercase gs1 and gs2 columns
orthomap_df["lc_gs1"] = orthomap_df["gs1"].str.lower()
orthomap_df["lc_gs2"] = orthomap_df["gs2"].str.lower()
orthology_df["lc_gs1"] = orthology_df["gs1"].str.lower()
orthology_df["lc_gs2"] = orthology_df["gs2"].str.lower()

# Check if case-insensitive gene symbol is in dictionary
lc_gene_symbol = gene_symbol.lower()

# returns the list of all orthologs
return orthomap_df[orthomap_df["lc_gs1"] == lc_gene_symbol]["gs2"].tolist()
# Return the list of all orthologous gene symbols
orthologous_genes = orthology_df[orthology_df["lc_gs1"] == lc_gene_symbol]["gs2"].tolist()
return orthologous_genes

def map_multiple_genes(gene_symbols: list, orthomap_file: Path) -> dict:
"""
Expand All @@ -203,17 +211,17 @@ def map_multiple_genes(gene_symbols: list, orthomap_file: Path) -> dict:
dict: A dictionary where the keys are the input gene symbols and the values are lists of orthology gene symbols.
If a gene symbol does not have any orthology gene symbols, its value is set to an empty list.
"""
orthomap_df = create_orthology_df(orthomap_file)
orthology_df = create_orthology_df(orthomap_file)

# Create lowercase gs1 and gs2 columns
orthomap_df["lc_gs1"] = orthomap_df["gs1"].str.lower()
orthomap_df["lc_gs2"] = orthomap_df["gs2"].str.lower()
orthology_df["lc_gs1"] = orthology_df["gs1"].str.lower()
orthology_df["lc_gs2"] = orthology_df["gs2"].str.lower()

# Check if case-insensitive gene symbols are in orthology df.
# Create a dictionary of gene symbol to orthologous gene symbols
gene_symbol_dict = defaultdict(list)
for gene_symbol in gene_symbols:
lc_gene_symbol = gene_symbol.lower()
gene_symbol_dict[gene_symbol] = orthomap_df[orthomap_df["lc_gs1"] == lc_gene_symbol]["gs2"].tolist()
# This checks if case-insensitive gene symbols are in orthology df.
gene_symbol_dict = {
gene_symbol: orthology_df[orthology_df["lc_gs1"] == gene_symbol.lower()]["gs2"].tolist()
for gene_symbol in gene_symbols
}

return gene_symbol_dict
15 changes: 13 additions & 2 deletions www/api/resources/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
abs_path_www = Path(__file__).resolve().parents[TWO_LEVELS_UP] # web-root dir
PROJECTIONS_BASE_DIR = abs_path_www.joinpath('projections')

def get_adata_from_analysis(analysis_id, dataset_id, session_id):
user = get_user_from_session_id(session_id)
ana = Analysis(id=analysis_id, dataset_id=dataset_id, session_id=session_id, user_id=user.id)
ana.discover_type()
return ana.get_adata()

def get_adata_shadow_from_analysis(analysis_id, dataset_id, session_id):
user = get_user_from_session_id(session_id)
ana = Analysis(id=analysis_id, dataset_id=dataset_id, session_id=session_id, user_id=user.id)
Expand All @@ -31,9 +37,14 @@ def get_adata_shadow_from_primary(h5_path):

def get_adata_shadow(analysis_id, dataset_id, session_id, h5_path):
if analysis_id:
return get_adata_shadow_from_analysis(analysis_id, dataset_id, session_id)
adata = get_adata_shadow_from_analysis(analysis_id, dataset_id, session_id)
else:
return get_adata_shadow_from_primary(h5_path)
adata = get_adata_shadow_from_primary(h5_path)

if adata.var.gene_symbol.dtype == "int16":
print("Using AnnData instead of AnnDataShadow because var and obs are not correct for dataset {}".format(dataset_id), file=sys.stderr)
adata = get_adata_from_analysis(analysis_id, dataset_id, session_id)
return adata

def create_projection_adata(dataset_adata, dataset_id, projection_id):
# Create AnnData object out of readable CSV file
Expand Down
36 changes: 22 additions & 14 deletions www/api/resources/orthologs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os, sys
import geardb
from gear.orthology import get_ortholog_file, get_ortholog_files_from_dataset, map_single_gene, map_multiple_genes
from .common import get_adata_shadow

def normalize_searched_gene(gene_set, chosen_gene):
"""Convert to case-insensitive version of gene. Returns None if gene not found in dataset."""
Expand Down Expand Up @@ -30,19 +31,19 @@ def get_mapped_gene_symbol(gene_symbol, gene_organism_id, dataset_organism_id, e
Exception: If no orthologous mapping is found for the given gene symbol.
"""

# Determine if we need to get a single ortholog file or multiple
is_single_ortholog_file_needed = gene_organism_id and gene_organism_id != dataset_organism_id

try:
if is_single_ortholog_file_needed:
def fetch_ortholog_files():
# Determine if we need to get a single ortholog file or multiple
if gene_organism_id and gene_organism_id != dataset_organism_id:
# Get a single ortholog file
ortholog_files = [get_ortholog_file(gene_organism_id, dataset_organism_id, "ensembl")]
if not exclusive_org:
ortholog_files += get_ortholog_files_from_dataset(dataset_organism_id, "ensembl")

else:
# Get multiple ortholog files from the dataset
ortholog_files = get_ortholog_files_from_dataset(dataset_organism_id, "ensembl")
return ortholog_files

try:
ortholog_files = fetch_ortholog_files()
except FileNotFoundError as e:
# We want this to fail gracefully, so return an empty list. The original will be mapped back to itself downstream.
return []
Expand All @@ -55,7 +56,7 @@ def get_mapped_gene_symbol(gene_symbol, gene_organism_id, dataset_organism_id, e
# At this point, we have an empty list or dict, so we should continue to the next ortholog file
if gene_organism_id and exclusive_org:
raise Exception(f"No orthologous mapping found for the given gene symbols {gene_symbol}.")
continue

return []


Expand Down Expand Up @@ -256,21 +257,28 @@ def post(self, dataset_id):
if gene_organism_id:
gene_organism_id = int(gene_organism_id)

analysis_id = None
if analysis:
analysis_id = analysis.get('id')

# Get the dataset and organism ID
dataset = geardb.get_dataset_by_id(dataset_id)
dataset = geardb.Dataset(id=dataset_id, has_h5ad=1)
if not dataset:
return {"error": "The dataset was not found."}, 400
h5_path = dataset.get_file_path()
dataset_organism_id = dataset.organism_id

# Get the right AnnData object depending on if the analysis is provided
try:
ana = geardb.get_analysis(analysis, dataset_id, session_id)
except Exception as e:
return {"error": str(e)}, 400
adata = get_adata_shadow(analysis_id, dataset_id, session_id, h5_path)
except FileNotFoundError:
return {
"success": -1,
'message': "No h5 file found for this dataset"
}

adata = ana.get_adata(backed=False)
dataset_genes = set(adata.var['gene_symbol'].unique())


def normalize_gene(gene):
return normalize_searched_gene(dataset_genes, gene)

Expand Down

0 comments on commit 96fe448

Please sign in to comment.