Skip to content

Commit

Permalink
azidylated monosaccharides, better universal input, .analysis fixes
Browse files Browse the repository at this point in the history
- support azide-containing monosaccharides
- more IUPAC-condensed dialect support in canonicalize_iupac
- various fixes of keyword argument combination mismatches in get_differential_expression
- ensure output stability in correct_multiple_testing
  • Loading branch information
Bribak committed Dec 11, 2024
1 parent 8c69c2c commit ef3da9c
Show file tree
Hide file tree
Showing 5 changed files with 205 additions and 78 deletions.
2 changes: 2 additions & 0 deletions glycowork/glycan_data/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,8 @@ def correct_multiple_testing(pvals: Union[List[float], np.ndarray], # list of ra
"Corrects p-values for multiple testing, by default with the two-stage Benjamini-Hochberg procedure"
if not isinstance(pvals, list):
pvals = pvals.tolist()
if not pvals:
return [], []
corrpvals = multipletests(pvals, method = 'fdr_tsbh' if correction_method == "two-stage" else 'fdr_bh')[1]
corrpvals = [p if p >= pvals[i] else pvals[i] for i, p in enumerate(corrpvals)]
significance = [bool(p < alpha) for p in corrpvals]
Expand Down
52 changes: 27 additions & 25 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,9 +543,9 @@ def get_differential_expression(
# Variance-based filtering of features
if not monte_carlo:
df, df_prison = variance_based_filtering(df)
df_org, df_org_prison = df_org.loc[df.index], df_org.loc[df_prison.index]
else:
df_prison = []
df_org, df_org_prison = df_org.loc[df.index], df_org.loc[df_prison.index]
df_prison, df_org_prison = pd.DataFrame(), pd.DataFrame()
glycans = df.index.tolist()
mean_abundance = df_org.mean(axis = 1)
df_a, df_b = df[group1], df[group2]
Expand All @@ -556,19 +556,18 @@ def get_differential_expression(
# Testing differential expression of each set/cluster
for cluster in clusters:
if len(cluster) > 1:
glycans.append(cluster)
cluster = list(cluster)
glycans.append(cluster)
gp1, gp2 = df_a.loc[cluster, :], df_b.loc[cluster, :]
mean_abundance_c.append(mean_abundance.loc[cluster].mean())
log2fc.append(((gp2.values - gp1.values).mean(axis = 1)).mean() if paired else (gp2.mean(axis = 1) - gp1.mean(axis = 1)).mean())
gp1, gp2 = df.loc[cluster, group1], df.loc[cluster, group2]
# Hotelling's T^2 test for multivariate comparisons
pvals.append(hotellings_t2(gp1.T.values, gp2.T.values, paired = paired)[1])
levene_pvals.append(np.mean([levene(gp1.loc[variable, :], gp2.loc[variable, :])[1] for variable in cluster]))
# Calculate Mahalanobis distance as measure of effect size for multivariate comparisons
effect_sizes.append(mahalanobis_distance(gp1, gp2, paired = paired))
if effect_size_variance:
variances.append(mahalanobis_variance(gp1, gp2, paired = paired))
# Hotelling's T^2 test for multivariate comparisons
pvals.append(hotellings_t2(gp1.T.values, gp2.T.values, paired = paired)[1])
levene_pvals.append(np.mean([levene(gp1.loc[variable, :], gp2.loc[variable, :])[1] for variable in cluster]))
# Calculate Mahalanobis distance as measure of effect size for multivariate comparisons
effect_sizes.append(mahalanobis_distance(gp1, gp2, paired = paired))
if effect_size_variance:
variances.append(mahalanobis_variance(gp1, gp2, paired = paired))
mean_abundance = mean_abundance_c
else:
log2fc = (df_b.values - df_a.values).mean(axis = 1) if paired else (df_b.mean(axis = 1) - df_a.mean(axis = 1))
Expand Down Expand Up @@ -606,21 +605,23 @@ def get_differential_expression(
corrpvals, significance = [1]*len(glycans), [False]*len(glycans)
df_out = pd.DataFrame(list(zip(glycans, mean_abundance, log2fc, pvals, corrpvals, significance, levene_pvals, effect_sizes, equivalence_pvals)),
columns = ['Glycan', 'Mean abundance', 'Log2FC', 'p-val', 'corr p-val', 'significant', 'corr Levene p-val', 'Effect size', 'Equivalence p-val'])
prison_rows = pd.DataFrame({
'Glycan': df_prison.index,
'Mean abundance': df_org_prison.mean(axis = 1),
'Log2FC': (df_prison[group2].values - df_prison[group1].values).mean(axis = 1) if paired else (df_prison[group2].mean(axis = 1) - df_prison[group1].mean(axis = 1)),
'p-val': [1.0] * len(df_prison),
'corr p-val': [1.0] * len(df_prison),
'significant': [False] * len(df_prison),
'corr Levene p-val': [1.0] * len(df_prison),
'Effect size': [0] * len(df_prison),
'Equivalence p-val': [1.0] * len(df_prison)})
prison_rows = prison_rows.astype({'significant': 'bool'})
df_out = pd.concat([df_out, prison_rows], ignore_index = True)
if not monte_carlo:
prison_rows = pd.DataFrame({
'Glycan': df_prison.index,
'Mean abundance': df_org_prison.mean(axis = 1),
'Log2FC': (df_prison[group2].values - df_prison[group1].values).mean(axis = 1) if paired else (df_prison[group2].mean(axis = 1) - df_prison[group1].mean(axis = 1)),
'p-val': [1.0] * len(df_prison),
'corr p-val': [1.0] * len(df_prison),
'significant': [False] * len(df_prison),
'corr Levene p-val': [1.0] * len(df_prison),
'Effect size': [0] * len(df_prison),
'Equivalence p-val': [1.0] * len(df_prison)})
prison_rows = prison_rows.astype({'significant': 'bool'})
if len(prison_rows) > 0:
df_out = pd.concat([df_out, prison_rows], ignore_index = True)
df_out['significant'] = df_out['significant'].astype('bool')
if effect_size_variance:
df_out['Effect size variance'] = variances + [0]*len(df_prison)
df_out['Effect size variance'] = list(variances) + [0]*len(df_prison)
if glycoproteomics:
return get_glycoform_diff(df_out, alpha = alpha, level = level)
else:
Expand Down Expand Up @@ -761,7 +762,8 @@ def get_glycanova(
'corr p-val': [1.0] * len(df_prison),
'significant': [False] * len(df_prison)})
prison_rows = prison_rows.astype({'significant': 'bool'})
df_out = pd.concat([df_out, prison_rows], ignore_index = True)
if len(prison_rows) > 0:
df_out = pd.concat([df_out, prison_rows], ignore_index = True)
df_out['significant'] = df_out['significant'].astype('bool')
df_out['Effect size'] = effect_sizes.values
return df_out.sort_values(by = 'corr p-val'), posthoc_results
Expand Down
14 changes: 10 additions & 4 deletions glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,8 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
if bool(re.search(r'\([A-Zd3-9]', glycan)):
glycan = glycan.replace('(', '[').replace(')', ']')
# Canonicalize linkage uncertainty
# Open linkages with anomeric config specified (e.g., "Mana-")
glycan = re.sub(r'([A-Z][a-z]*)([a-b])\-([A-Z])', r'\1\g<2>1-?\3', glycan)
# Open linkages (e.g., "c-")
glycan = re.sub(r'([a-z])\-([A-Z])', r'\1?1-?\2', glycan)
# Open linkages2 (e.g., "1-")
Expand All @@ -886,12 +888,12 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
# Missing starting carbon (e.g., "b-4")
glycan = re.sub(r'(a|b|\?)-(\d)', r'\g<1>1-\2', glycan)
# If still no '-' in glycan, assume 'a3' type of linkage denomination
if '-' not in glycan:
if '-' not in glycan or bool(re.search(r'[ab][123456](?!\-)', glycan)):
# Check whether linkages are recorded as b1 or as a3
if bool(re.search(r"^[^2-6]*1?[^2-6]*$", glycan)):
glycan = re.sub(r'(a|b)(\d)', r'\g<1>\g<2>-?', glycan)
glycan = re.sub(r'(a|b)(\d)(?!\-)', r'\g<1>\g<2>-?', glycan)
else:
glycan = re.sub(r'(a|b)(\d)', r'\g<1>1-\g<2>', glycan)
glycan = re.sub(r'(a|b)(\d)(?!\-)', r'\g<1>1-\g<2>', glycan)
# Smudge uncertainty
while '/' in glycan:
glycan = f'{glycan[:glycan.index("/")-1]}?{glycan[glycan.index("/")+2:]}'
Expand Down Expand Up @@ -929,13 +931,17 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
glycan = re.sub(r'([\?2-9])([\[\]])', r'\1)\2', glycan)
# Floating bits
if '+' in glycan:
if '-' not in glycan[:glycan.index('+')]:
prefix = glycan[:glycan.index('+')]
if prefix == 'S':
glycan = glycan.replace('S+', 'OS+')
elif '-' not in prefix:
glycan = glycan.replace('+', '(?1-?)+')
glycan = '{'+glycan.replace('+', '}')
post_process = {'5Ac(?1': '5Ac(a2', '5Gc(?1': '5Gc(a2', '5Ac(a1': '5Ac(a2', '5Gc(a1': '5Gc(a2', 'Fuc(?': 'Fuc(a',
'GalS': 'GalOS', 'GlcNAcS': 'GlcNAcOS', 'GalNAcS': 'GalNAcOS', 'SGal': 'GalOS', 'Kdn(?1': 'Kdn(a2',
'Kdn(a1': 'Kdn(a2'}
glycan = multireplace(glycan, post_process)
glycan = re.sub(r'[ab]-$', '', glycan) # Remove endings like Glcb-
# Canonicalize branch ordering
if '[' in glycan:
glycan = choose_correct_isoform(glycan)
Expand Down
4 changes: 3 additions & 1 deletion glycowork/motif/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,11 +457,13 @@ def glycan_to_composition(glycan: str, # Glycan in IUPAC-condensed format
stem_libr = stem_lib
SPECIAL_MODS = {
'1,7lactone': {'replacement': '', 'diff_moiety': '-H2O'},
'Az': {'replacement': 'AcH2O', 'diff_moiety': '+N3'},
'AcH2O': {'replacement': 'Ac', 'diff_moiety': '-OH'}
# Add other special modifications here in the format:
# 'modification': {'replacement': 'what to replace with', 'diff_moiety': chemical formula, sign indicating loss/gain}
}
VALID_COMPONENTS = {'Hex', 'dHex', 'HexNAc', 'HexN', 'HexA', 'Neu5Ac', 'Neu5Gc', 'Kdn',
'Pen', 'Me', 'S', 'P', 'PCho', 'PEtN', 'Ac', '-H2O'}
'Pen', 'Me', 'S', 'P', 'PCho', 'PEtN', 'Ac', '-H2O', '+N3', '-OH'}
glycan = glycan.replace('{', '').replace('}', '') if '{' in glycan else glycan
diff_moieties = Counter()
for mod, info in SPECIAL_MODS.items():
Expand Down
Loading

0 comments on commit ef3da9c

Please sign in to comment.