Skip to content

Commit

Permalink
adducts & processing
Browse files Browse the repository at this point in the history
- add adduct functionality to mz_to_composition
- fine-tune ? handling in choose_correct_isoform
- fix glycoproteomics formatting in process_for_glycoshift
  • Loading branch information
Bribak committed Oct 24, 2024
1 parent 145877d commit 40e9827
Show file tree
Hide file tree
Showing 6 changed files with 521 additions and 537 deletions.
2 changes: 1 addition & 1 deletion bin/glycoworkGUI.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ def openLectinArrayAnalysisDialog():


def show_about_info():
about_message = "glycowork v1.3\n\n" \
about_message = "glycowork v1.4\n\n" \
"For more information and citation, please refer to:\n" \
"Thomès, L., et al. (2021). Glycowork: A Python package for glycan data science and machine learning. Glycobiology, 31(10), 1240-1244.\n" \
"DOI: 10.1093/glycob/cwab067\n" \
Expand Down
958 changes: 479 additions & 479 deletions glycowork/glycan_data/glycoproteomics_human_milk_N_PMID34087070.csv

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions glycowork/glycan_data/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -1043,8 +1043,9 @@ def correct_multiple_testing(pvals, alpha, correction_method = "two-stage"):
| Returns:
| :-
| (i) list of corrected p-values
| (ii) list of True/False of whether corrected p-value reaches statistical significance
"""
| (ii) list of True/False of whether corrected p-value reaches statistical significance"""
if not isinstance(pvals, list):
pvals = pvals.tolist()
corrpvals = multipletests(pvals, method = 'fdr_tsbh' if correction_method == "two-stage" else 'fdr_bh')[1]
corrpvals = [p if p >= pvals[i] else pvals[i] for i, p in enumerate(corrpvals)]
significance = [p < alpha for p in corrpvals]
Expand Down
74 changes: 25 additions & 49 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,7 @@ def preprocess_data(df, group1, group2, experiment = "diff", motifs = False, fea
| (i) transformed and processed dataset
| (ii) untransformed but processed dataset
| (iii) labels for group1
| (iv) labels for group2
"""
| (iv) labels for group2"""
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
if not isinstance(group1[0], str) and experiment == "diff":
Expand Down Expand Up @@ -134,8 +133,7 @@ def get_pvals_motifs(df, glycan_col_name = 'glycan', label_col_name = 'target',
| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n
| Returns:
| :-
| Returns dataframe with p-values, corrected p-values, and Cohen's d as effect size for every glycan motif
"""
| Returns dataframe with p-values, corrected p-values, and Cohen's d as effect size for every glycan motif"""
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
# Reformat to allow for proper annotation in all samples
Expand Down Expand Up @@ -189,8 +187,7 @@ def get_representative_substructures(enrichment_df):
| enrichment_df (dataframe): output from get_pvals_motifs\n
| Returns:
| :-
| Returns up to 10 minimal glycans in a list
"""
| Returns up to 10 minimal glycans in a list"""
glycans = list(set(df_species.glycan))
# Only consider motifs that are significantly enriched
filtered_df = enrichment_df[enrichment_df.corr_pval < 0.05].reset_index(drop = True)
Expand Down Expand Up @@ -238,8 +235,7 @@ def get_heatmap(df, motifs = False, feature_set = ['known'], transform = '',
| **kwargs: keyword arguments that are directly passed on to seaborn clustermap\n
| Returns:
| :-
| Prints clustermap
"""
| Prints clustermap"""
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
if index_col in df.columns:
Expand Down Expand Up @@ -303,8 +299,7 @@ def plot_embeddings(glycans, emb = None, label_list = None,
| filepath (string): absolute path including full filename allows for saving the plot
| alpha (float): transparency of points in plot; default:0.8
| palette (string): color palette to color different classes; default:'colorblind'
| **kwargs: keyword arguments that are directly passed on to matplotlib\n
"""
| **kwargs: keyword arguments that are directly passed on to matplotlib\n"""
idx = [i for i, g in enumerate(glycans) if '{' not in g]
glycans = [glycans[i] for i in idx]
label_list = [label_list[i] for i in idx]
Expand Down Expand Up @@ -355,8 +350,7 @@ def characterize_monosaccharide(sugar, df = None, mode = 'sugar', glycan_col_nam
| thresh (int): threshold count of when to include motifs in plot; default:10 occurrences\n
| Returns:
| :-
| Plots modification distribution and typical neighboring bond/monosaccharide
"""
| Plots modification distribution and typical neighboring bond/monosaccharide"""
if df is None:
df = df_species
if rank is not None and focus is not None:
Expand Down Expand Up @@ -451,8 +445,7 @@ def get_coverage(df, filepath = ''):
| filepath (string): absolute path including full filename allows for saving the plot\n
| Returns:
| :-
| Prints the heatmap
"""
| Prints the heatmap"""
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
d = df.iloc[:,1:]
Expand Down Expand Up @@ -492,8 +485,7 @@ def get_pca(df, groups = None, motifs = False, feature_set = ['known', 'exhausti
| rarity_filter (float): proportion of samples that need to have a non-zero value for a variable to be included; default:0.05\n
| Returns:
| :-
| Prints PCA plot
"""
| Prints PCA plot"""
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
if transform == "ALR":
Expand Down Expand Up @@ -544,8 +536,7 @@ def select_grouping(cohort_b, cohort_a, glycans, p_values, paired = False, group
| grouped_BH (bool): whether to perform two-stage adaptive Benjamini-Hochberg as a grouped multiple testing correction; will SIGNIFICANTLY increase runtime; default:False\n
| Returns:
| :-
| Returns dictionaries of group : glycans and group : p-values (just one big group if no grouping can be found)
"""
| Returns dictionaries of group : glycans and group : p-values (just one big group if no grouping can be found)"""
if not grouped_BH:
return {"group1": glycans}, {"group1": p_values}
funcs = {"by_Sia/Fuc": group_glycans_sia_fuc}
Expand Down Expand Up @@ -616,8 +607,7 @@ def get_differential_expression(df, group1, group2,
| (vii) Corrected p-values (Levene's test for equality of variances with Benjamini-Hochberg correction) for difference in variance
| (viii) Effect size as Cohen's d (sets=False) or Mahalanobis distance (sets=True)
| (xi) Corrected p-values of equivalence test to test whether means are significantly equivalent; only done for p-values > 0.05 from (iv)
| (x) [only if effect_size_variance=True] Effect size variance
"""
| (x) [only if effect_size_variance=True] Effect size variance"""
df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = motifs, impute = impute,
min_samples = min_samples, transform = transform, feature_set = feature_set,
paired = paired, gamma = gamma, custom_scale = custom_scale, custom_motifs = custom_motifs,
Expand Down Expand Up @@ -719,8 +709,7 @@ def get_pval_distribution(df_res, filepath = ''):
| filepath (string): absolute path including full filename allows for saving the plot\n
| Returns:
| :-
| prints p-value distribution plot
"""
| prints p-value distribution plot"""
if isinstance(df_res, str):
df_res = pd.read_csv(df_res) if df_res.endswith(".csv") else pd.read_excel(df_res)
# make plot
Expand All @@ -742,8 +731,7 @@ def get_ma(df_res, log2fc_thresh = 1, sig_thresh = 0.05, filepath = ''):
| filepath (string): absolute path including full filename allows for saving the plot\n
| Returns:
| :-
| prints MA plot
"""
| prints MA plot"""
if isinstance(df_res, str):
df_res = pd.read_csv(df_res) if df_res.endswith(".csv") else pd.read_excel(df_res)
# Create masks for significant and non-significant points
Expand Down Expand Up @@ -777,8 +765,7 @@ def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed =
| **kwargs: keyword arguments that are directly passed on to seaborn scatterplot\n
| Returns:
| :-
| Prints volcano plot
"""
| Prints volcano plot"""
if isinstance(df_res, str):
df_res = pd.read_csv(df_res) if df_res.endswith(".csv") else pd.read_excel(df_res)
df_res['log_p'] = -np.log10(df_res['corr p-val'].values)
Expand Down Expand Up @@ -835,8 +822,7 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh
| Returns:
| :-
| (i) a pandas DataFrame with an F statistic, corrected p-value, indication of its significance, and effect size (Omega squared) for each glycan.
| (ii) a dictionary of type glycan : pandas DataFrame, with post-hoc results for each glycan with a significant ANOVA.
"""
| (ii) a dictionary of type glycan : pandas DataFrame, with post-hoc results for each glycan with a significant ANOVA."""
df, _, groups, _ = preprocess_data(df, groups, [], experiment = "anova", motifs = motifs, impute = impute,
min_samples = min_samples, transform = transform, feature_set = feature_set,
gamma = gamma, custom_scale = custom_scale, custom_motifs = custom_motifs)
Expand Down Expand Up @@ -888,8 +874,7 @@ def get_meta_analysis(effect_sizes, variances, model = 'fixed', filepath = '',
| Returns:
| :-
| (1) The combined effect size
| (2) The p-value for the combined effect size
"""
| (2) The p-value for the combined effect size"""
if model not in ['fixed', 'random']:
raise ValueError("Model must be 'fixed' or 'random'")
weights = 1 / np.array(variances)
Expand Down Expand Up @@ -951,8 +936,7 @@ def get_glycan_change_over_time(data, degree = 1):
| Returns:
| :-
| (i) slope -- the slope of the regression line (i.e., the rate of change of glycan expression over time)
| (ii) p_value -- the p-value of the t-test for the slope
"""
| (ii) p_value -- the p-value of the t-test for the slope"""
# Extract arrays for time and glycan abundance from the 2D input array
time, glycan_abundance = data[:, 0], data[:, 1]
if degree == 1:
Expand Down Expand Up @@ -997,8 +981,7 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
| (ii) The slope of their expression curve over time
| (iii) Uncorrected p-values (t-test) for testing whether slope is significantly different from zero
| (iv) Corrected p-values (t-test with two-stage Benjamini-Hochberg correction) for testing whether slope is significantly different from zero
| (v) Significance: True/False whether the corrected p-value lies below the sample size-appropriate significance threshold
"""
| (v) Significance: True/False whether the corrected p-value lies below the sample size-appropriate significance threshold"""
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
df = df.fillna(0)
Expand Down Expand Up @@ -1061,8 +1044,7 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
| Returns:
| :-
| Returns a pandas dataframe containing the adjusted p-values, and most important waveform parameters for each
| molecule in the analysis.
"""
| molecule in the analysis."""
if isinstance(df_in, str):
df = pd.read_csv(df_in) if df_in.endswith(".csv") else pd.read_excel(df_in)
else:
Expand Down Expand Up @@ -1128,8 +1110,7 @@ def get_biodiversity(df, group1, group2, metrics = ['alpha', 'beta'], motifs = F
| (iv) Uncorrected p-values (Welch's t-test) for difference in mean
| (v) Corrected p-values (Welch's t-test with two-stage Benjamini-Hochberg correction) for difference in mean
| (vi) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold
| (vii) Effect size as Cohen's d (ANOSIM R for beta; F statistics for PERMANOVA and Shannon/Simpson (ANOVA))
"""
| (vii) Effect size as Cohen's d (ANOSIM R for beta; F statistics for PERMANOVA and Shannon/Simpson (ANOVA))"""
experiment = "diff" if group2 else "anova"
df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = experiment, motifs = motifs, impute = False,
transform = transform, feature_set = feature_set, paired = paired, gamma = gamma,
Expand Down Expand Up @@ -1207,8 +1188,7 @@ def get_SparCC(df1, df2, motifs = False, feature_set = ["known", "exhaustive"],
| Returns:
| :-
| Returns (i) a dataframe of pairwise correlations (Spearman's rho)
| and (ii) a dataframe with corrected p-values (two-stage Benjamini-Hochberg)
"""
| and (ii) a dataframe with corrected p-values (two-stage Benjamini-Hochberg)"""
if isinstance(df1, str):
df1 = pd.read_csv(df1) if df1.endswith(".csv") else pd.read_excel(df1)
df2 = pd.read_csv(df2) if df2.endswith(".csv") else pd.read_excel(df2)
Expand Down Expand Up @@ -1288,8 +1268,7 @@ def multi_feature_scoring(df, group1, group2, filepath = ''):
| Prints the identified features and returns:
| (i) the trained model to distinguish conditions
| (ii) the ROC AUC score of the trained model
| (iii) (optionally, if filepath) a saved plot of the ROC curve for the model
"""
| (iii) (optionally, if filepath) a saved plot of the ROC curve for the model"""
if group2:
y = [0] * len(group1) + [1] * len(group2)
else:
Expand Down Expand Up @@ -1345,8 +1324,7 @@ def get_roc(df, group1, group2, motifs = False, feature_set = ["known", "exhaust
| multi_score (bool): whether to find the best glycan risk score, containing multiple glycan features; default:False\n
| Returns:
| :-
| Returns a sorted list of tuples of type (glycan, AUC score) and, optionally, ROC curve for best feature
"""
| Returns a sorted list of tuples of type (glycan, AUC score) and, optionally, ROC curve for best feature"""
experiment = "diff" if group2 else "anova"
df, _, group1, group2 = preprocess_data(df, group1, group2, experiment = experiment, motifs = motifs, impute = impute,
transform = transform, feature_set = feature_set, paired = paired, gamma = gamma,
Expand Down Expand Up @@ -1435,8 +1413,7 @@ def get_lectin_array(df, group1, group2, paired = False, transform = ''):
| (iii) Lectins supporting the change in (i)
| (iv) Direction of the change (e.g., "up" means higher in group2)
| (v) Score/Magnitude of the change (remember, if you have more than two groups this reports on any pairwise combination, like an ANOVA)
| (vi) Clustering of the scores into highly/moderate/low significance findings
"""
| (vi) Clustering of the scores into highly/moderate/low significance findings"""
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
df = df.set_index(df.columns[0])
Expand Down Expand Up @@ -1507,11 +1484,10 @@ def get_glycoshift_per_site(df, group1, group2, paired = False, impute = True,
| (for each condition/interaction feature)
| (i) Regression coefficient from the GLM (indicating direction of change in the treatment condition)
| (ii) Corrected p-values (two-tailed t-test with two-stage Benjamini-Hochberg correction) for testing the coefficient against zero
| (iii) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold
"""
| (iii) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold"""
df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = False, impute = impute,
min_samples = min_samples, transform = "Nothing", paired = paired)
alpha = get_alphaN(len(group1+group2))
alpha = get_alphaN(len(group1 + group2))
df, glycan_features = process_for_glycoshift(df)
necessary_columns = ['Glycoform'] + glycan_features
preserved_data = df[necessary_columns]
Expand Down
5 changes: 4 additions & 1 deletion glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,9 @@ def choose_correct_isoform(glycans, reverse = False):
| Returns the correct isomer as a string (if reverse=False; otherwise it returns a list of strings)
"""
glycans = list(set(glycans))
if '?' in ''.join(glycans) and not reverse:
min_questions = min(glycan.count('?') for glycan in glycans)
glycans = [glycan for glycan in glycans if glycan.count('?') == min_questions]
if len(glycans) == 1:
return glycans[0]
if not any(['[' in g for g in glycans]):
Expand Down Expand Up @@ -1065,7 +1068,7 @@ def process_for_glycoshift(df):
| (i) glycoproteomics dataset with new columns for protein_site, composition, and composition counts
| (ii) list of identified glycan features, such as different monosaccharides
"""
df['Glycosite'] = [k.split('_')[0] + '_' + k.split('_')[2] for i, k in enumerate(df.index)]
df['Glycosite'] = [k.split('_')[0] + '_' + k.split('_')[1] for i, k in enumerate(df.index)]
if '[' in df.index[0]:
comps = ['['+k.split('[')[1] for k in df.index]
comps = [list(map(int, re.findall(r'\d+', s))) for s in comps]
Expand Down
14 changes: 9 additions & 5 deletions glycowork/motif/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def stemify_dataset(df, stem_lib = None, libr = None,

def mz_to_composition(mz_value, mode = 'negative', mass_value = 'monoisotopic', reduced = False,
sample_prep = 'underivatized', mass_tolerance = 0.5, kingdom = 'Animalia',
glycan_class = 'all', df_use = None, filter_out = None, extras = ["doubly_charged"]):
glycan_class = 'all', df_use = None, filter_out = None, extras = ["doubly_charged"], adduct = None):
"""Mapping a m/z value to a matching monosaccharide composition within SugarBase\n
| Arguments:
| :-
Expand All @@ -268,7 +268,8 @@ def mz_to_composition(mz_value, mode = 'negative', mass_value = 'monoisotopic',
| glycan_class (string): which glycan class does the m/z value stem from, 'N', 'O', 'lipid' linked glycans, or 'free' glycans; default:'all'
| df_use (dataframe): species-specific glycan dataframe to use for mapping; default: df_glycan
| filter_out (set): set of monosaccharide types to ignore during composition finding; default:None
| extras (list): additional operations to perform if regular m/z matching does not yield a result; options include "adduct" and "doubly_charged"\n
| extras (list): additional operations to perform if regular m/z matching does not yield a result; options include "adduct" and "doubly_charged"
| adduct (string): chemical formula of adduct that contributes to m/z, e.g., "C2H4O2"; default:None\n
| Returns:
| :-
| Returns a list of matching compositions in dict form
Expand All @@ -280,7 +281,9 @@ def mz_to_composition(mz_value, mode = 'negative', mass_value = 'monoisotopic',
df_use = df_glycan[(df_glycan.glycan_type == glycan_class) & (df_glycan.Kingdom.apply(lambda x: kingdom in x))]
if filter_out is None:
filter_out = set()
adduct = mass_dict['Acetate'] if mode == 'negative' else mass_dict['Na+']
if adduct:
mz -= calculate_adduct_mass(adduct, mass_value)
adduct_mass = mass_dict['Acetate'] if mode == 'negative' else mass_dict['Na+']
if reduced:
mz_value -= 1.0078
multiplier = 1 if mode == 'negative' else -1
Expand All @@ -297,7 +300,7 @@ def mz_to_composition(mz_value, mode = 'negative', mass_value = 'monoisotopic',
if "adduct" in extras:
# Check for matches including the adduct mass
for mass, comp in cache.items():
if abs(mass + adduct - mz_value) < mass_tolerance:
if abs(mass + adduct_mass - mz_value) < mass_tolerance:
if not filter_out.intersection(comp.keys()):
return [comp]
if "doubly_charged" in extras:
Expand Down Expand Up @@ -625,7 +628,8 @@ def composition_to_mass(dict_comp, mass_value = 'monoisotopic', sample_prep = 'u
| :-
| dict_comp (dict): composition in form monosaccharide:count
| mass_value (string): whether the expected mass is 'monoisotopic' or 'average'; default:'monoisotopic'
| sample_prep (string): whether the glycans has been 'underivatized', 'permethylated', or 'peracetylated'; default:'underivatized'\n
| sample_prep (string): whether the glycans has been 'underivatized', 'permethylated', or 'peracetylated'; default:'underivatized'
| adduct (string): chemical formula of adduct to be added, e.g., "C2H4O2"; default:None\n
| Returns:
| :-
| Returns the theoretical mass of input composition
Expand Down

0 comments on commit 40e9827

Please sign in to comment.