Skip to content

Commit

Permalink
make type hints Python 3.8 compatible and switch to Docment docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Nov 14, 2024
1 parent e4d45a3 commit e6721a1
Show file tree
Hide file tree
Showing 18 changed files with 1,941 additions and 3,383 deletions.
22 changes: 9 additions & 13 deletions glycowork/glycan_data/data_entry.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
import pandas as pd
from typing import Optional, Union
from glycowork.motif.processing import check_nomenclature
from glycowork.motif.graph import glycan_to_nxGraph, compare_glycans


def check_presence(glycan: str, df: pd.DataFrame, colname: str = 'glycan',
name: str | None = None, rank: str = 'Species', fast: bool = False) -> None:
"""checks whether glycan (of that species) is already present in dataset\n
| Arguments:
| :-
| glycan (string): IUPAC-condensed glycan sequence
| df (dataframe): glycan dataframe where glycans are under colname and ideally taxonomic labels are columns
| name (string): name of the species (etc.) of interest
| rank (string): column name for filtering; default: species
| fast (bool): True uses precomputed glycan graphs, only use if df has column 'graph' with glycan graphs\n
| Returns:
| :-
| Returns text output regarding whether the glycan is already in df"""
def check_presence(glycan: str, # IUPAC-condensed glycan sequence
df: pd.DataFrame, # glycan dataframe where glycans are under colname
colname: str = 'glycan', # column name containing glycans
name: Optional[str] = None, # name of species of interest
rank: str = 'Species', # column name for filtering
fast: bool = False # True uses precomputed glycan graphs
) -> None:
"checks whether glycan (of that species) is already present in dataset"
if any([p in glycan for p in ['RES', '=']]) or not isinstance(glycan, str):
check_nomenclature(glycan)
return
Expand Down
143 changes: 55 additions & 88 deletions glycowork/glycan_data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from os import path
from itertools import chain
from importlib import resources
from typing import Any, Dict
from typing import Any, Dict, List, Union, Optional

with resources.open_text("glycowork.glycan_data", "glycan_motifs.csv") as f:
motif_list = pd.read_csv(f)
Expand Down Expand Up @@ -85,29 +85,30 @@ def __dir__(self):
'OS': {'GlcNAc', 'Gal', 'GalNAc'}}


def unwrap(nested_list: list) -> list:
"""converts a nested list into a flat list"""
def unwrap(nested_list: List[Any]  # list to be flattened
           ) -> List[Any]:  # flattened list
  "converts a nested list into a flat list"
  # flatten exactly one level of nesting; inner elements are kept as-is
  return [item for sublist in nested_list for item in sublist]


def find_nth(haystack: str, needle: str, n: int) -> int:
"""finds n-th instance of motif\n
| Arguments:
| :-
| haystack (string): string to search for motif
| needle (string): motif
| n (int): n-th occurrence in string (not zero-indexed)\n
| Returns:
| :-
| Returns starting index of n-th occurrence in string"""
def find_nth(haystack: str,  # string to search for motif
             needle: str,  # motif
             n: int  # n-th occurrence in string (not zero-indexed)
             ) -> int:  # starting index of n-th occurrence, or -1 if absent
  "finds n-th instance of motif"
  # walk forward through successive non-overlapping matches until the n-th is reached
  position = haystack.find(needle)
  while n > 1 and position != -1:
    position = haystack.find(needle, position + len(needle))
    n -= 1
  return position


def find_nth_reverse(string: str, substring: str, n: int, ignore_branches: bool = False) -> int:
def find_nth_reverse(string: str, # string to search
substring: str, # substring to find
n: int, # n-th occurrence from end
ignore_branches: bool = False # whether to ignore branches when counting
) -> int: # position of n-th occurrence from end
"finds n-th instance of motif from end of string"
# Reverse the string and the substring
reversed_string = string[::-1]
reversed_substring = substring[::-1]
Expand Down Expand Up @@ -141,14 +142,9 @@ def find_nth_reverse(string: str, substring: str, n: int, ignore_branches: bool
return original_start_index


def remove_unmatched_brackets(s: str) -> str:
"""Removes all unmatched brackets from the string s.\n
| Arguments:
| :-
| s (string): glycan string in IUPAC-condensed\n
| Returns:
| :-
| Returns glycan without unmatched brackets"""
def remove_unmatched_brackets(s: str # glycan string in IUPAC-condensed
) -> str: # glycan without unmatched brackets
"Removes all unmatched brackets from the string s"
while True:
# Keep track of the indexes of the brackets
stack = []
Expand All @@ -171,45 +167,30 @@ def remove_unmatched_brackets(s: str) -> str:
return s


def reindex(df_new: pd.DataFrame, df_old: pd.DataFrame, out_col: str, ind_col: str, inp_col: str) -> list:
"""Returns columns values in order of new dataframe rows\n
| Arguments:
| :-
| df_new (pandas dataframe): dataframe with the new row order
| df_old (pandas dataframe): dataframe with the old row order
| out_col (string): column name of column in df_old that you want to reindex
| ind_col (string): column name of column in df_old that will give the index
| inp_col (string): column name of column in df_new that indicates the new order; ind_col and inp_col should match\n
| Returns:
| :-
| Returns out_col from df_old in the same order of inp_col in df_new"""
def reindex(df_new: pd.DataFrame,  # dataframe with new row order
            df_old: pd.DataFrame,  # dataframe with old row order
            out_col: str,  # column name in df_old to reindex
            ind_col: str,  # column name in df_old for index
            inp_col: str  # column name in df_new for new order
            ) -> list:  # out_col from df_old reordered to match inp_col in df_new
  "Returns columns values in order of new dataframe rows"
  if ind_col != inp_col:
    print("Mismatching column names for ind_col and inp_col. Doesn't mean it's wrong but pay attention.")
  # Build a first-occurrence lookup once (O(n)) instead of calling list.index per
  # row (O(n*m)); setdefault preserves the first match, like .index did for duplicates.
  lookup = {}
  for key, val in zip(df_old[ind_col].values.tolist(), df_old[out_col].values.tolist()):
    lookup.setdefault(key, val)
  return [lookup[k] for k in df_new[inp_col].values.tolist()]


def stringify_dict(dicty: dict) -> str:
"""Converts dictionary into a string\n
| Arguments:
| :-
| dicty (dictionary): dictionary\n
| Returns:
| :-
| Returns string of type key:value for sorted items"""
def stringify_dict(dicty: Dict[Any, Any]  # dictionary to convert
                   ) -> str:  # string of type key:value for sorted items
  "Converts dictionary into a string"
  # concatenate each key directly followed by its value, in sorted key order
  pieces = [f"{key}{value}" for key, value in sorted(dicty.items())]
  return ''.join(pieces)


def replace_every_second(string: str, old_char: str, new_char: str) -> str:
"""function to replace every second occurrence of old_char in string with new_char\n
| Arguments:
| :-
| string (string): a string
| old_char (string): a string character to be replaced (every second occurrence)
| new_char (string): the string character to replace old_char with\n
| Returns:
| :-
| Returns string with replaced characters"""
def replace_every_second(string: str, # input string
old_char: str, # character to replace
new_char: str # character to replace with
) -> str: # modified string
"function to replace every second occurrence of old_char in string with new_char"
count = 0
result = []
for char in string:
Expand All @@ -221,34 +202,25 @@ def replace_every_second(string: str, old_char: str, new_char: str) -> str:
return ''.join(result)


def multireplace(string: str, remove_dic: dict[str, str]) -> str:
"""Replaces all occurences of items in a set with a given string\n
| Arguments:
| :-
| string (str): string to perform replacements on
| remove_dic (set): dict of form to_replace:replace_with\n
| Returns:
| :-
| (str) modified string"""
def multireplace(string: str,  # string to perform replacements on
                 remove_dic: Dict[str, str]  # dict of form to_replace:replace_with
                 ) -> str:  # modified string
  "Replaces all occurrences of the dict's keys in string with their mapped values"
  # apply the replacements sequentially, in the dict's iteration order
  result = string
  for target, replacement in remove_dic.items():
    result = result.replace(target, replacement)
  return result


def strip_suffixes(columns: list) -> list[str]:
"""Strip numerical suffixes like .1, .2, etc., from column names."""
def strip_suffixes(columns: List[Any]  # column names
                   ) -> List[str]:  # column names without numerical suffixes
  "Strip numerical suffixes like .1, .2, etc., from column names"
  # hoist the compiled pattern; only a trailing dot-digits run is removed
  suffix_pattern = re.compile(r"\.\d+$")
  return [suffix_pattern.sub("", str(col)) for col in columns]


def build_custom_df(df: pd.DataFrame, kind: str = 'df_species') -> pd.DataFrame:
"""creates custom df from df_glycan\n
| Arguments:
| :-
| df (dataframe): df_glycan / sugarbase
| kind (string): whether to create 'df_species', 'df_tissue', or 'df_disease' from df_glycan; default:df_species\n
| Returns:
| :-
| Returns custom df in the form of one glycan - species/tissue/disease association per row"""
def build_custom_df(df: pd.DataFrame, # df_glycan / sugarbase
kind: str = 'df_species' # whether to create 'df_species', 'df_tissue', or 'df_disease'
) -> pd.DataFrame: # custom df with one glycan - species/tissue/disease association per row
"creates custom df from df_glycan"
kind_to_cols = {
'df_species': ['glycan', 'Species', 'Genus', 'Family', 'Order', 'Class',
'Phylum', 'Kingdom', 'Domain', 'ref'],
Expand All @@ -267,8 +239,10 @@ def build_custom_df(df: pd.DataFrame, kind: str = 'df_species') -> pd.DataFrame:
return df


def download_model(file_id: str, local_path: str = 'model_weights.pt') -> None:
"""Download the model weights file from Google Drive."""
def download_model(file_id: str,  # Google Drive file ID
                   local_path: str = 'model_weights.pt'  # where to save model file
                   ) -> None:
  "Download the model weights file from Google Drive"
  # extract the bare file ID from a full share URL of the form .../d/<id>/view...
  # NOTE(review): raises IndexError if file_id lacks '/d/' (i.e. is already a bare ID) — confirm callers always pass share URLs
  file_id = file_id.split('/d/')[1].split('/view')[0]
  url = f'https://drive.google.com/uc?id={file_id}'
  # fetch via gdown with a visible progress bar (quiet = False)
  gdown.download(url, local_path, quiet = False)
Expand Down Expand Up @@ -325,12 +299,10 @@ def _deserialize_cell(cell_data: Dict[str, Any]) -> Any:
return cell_data['value']

@classmethod
def serialize(cls, df: pd.DataFrame, path: str) -> None:
"""Serialize a DataFrame to JSON with type information.
Args:
df: pandas DataFrame to serialize
path: file path to save the serialized data"""
def serialize(cls, df: pd.DataFrame, # DataFrame to serialize
path: str # file path to save serialized data
) -> None:
"Serialize a DataFrame to JSON with type information"
data = {
'columns': list(df.columns),
'index': list(df.index),
Expand All @@ -345,14 +317,9 @@ def serialize(cls, df: pd.DataFrame, path: str) -> None:
json.dump(data, f)

@classmethod
def deserialize(cls, path: str) -> pd.DataFrame:
"""Deserialize a DataFrame from JSON.
Args:
path: file path to load the serialized data from
Returns:
pandas DataFrame with restored data types"""
def deserialize(cls, path: str # file path to load serialized data
) -> pd.DataFrame: # DataFrame with restored data types
"Deserialize a DataFrame from JSON"
with open(path, 'r') as f:
data = json.load(f)

Expand Down
Loading

0 comments on commit e6721a1

Please sign in to comment.