azidylated monosaccharides, better universal input, .analysis fixes

- support azide-containing monosaccharides - more IUPAC-condensed dialect support in canonicalize_iupac - various fixes of keyword argument combination mismatches in get_differential_expression - ensure output stability in correct_multiple_testing
BojarLab · Dec 11, 2024 · ef3da9c · ef3da9c
1 parent 8c69c2c
commit ef3da9c
Show file tree

Hide file tree

Showing 5 changed files with 205 additions and 78 deletions.
diff --git a/glycowork/glycan_data/stats.py b/glycowork/glycan_data/stats.py
@@ -836,6 +836,8 @@ def correct_multiple_testing(pvals: Union[List[float], np.ndarray], # list of ra
   "Corrects p-values for multiple testing, by default with the two-stage Benjamini-Hochberg procedure"
   if not isinstance(pvals, list):
     pvals = pvals.tolist()
+  if not pvals:
+    return [], []
   corrpvals = multipletests(pvals, method = 'fdr_tsbh' if correction_method == "two-stage" else 'fdr_bh')[1]
   corrpvals = [p if p >= pvals[i] else pvals[i] for i, p in enumerate(corrpvals)]
   significance = [bool(p < alpha) for p in corrpvals]

diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -543,9 +543,9 @@ def get_differential_expression(
   # Variance-based filtering of features
   if not monte_carlo:
     df, df_prison = variance_based_filtering(df)
+    df_org, df_org_prison = df_org.loc[df.index], df_org.loc[df_prison.index]
   else:
-    df_prison = []
-  df_org, df_org_prison = df_org.loc[df.index], df_org.loc[df_prison.index]
+    df_prison, df_org_prison = pd.DataFrame(), pd.DataFrame()
   glycans = df.index.tolist()
   mean_abundance = df_org.mean(axis = 1)
   df_a, df_b = df[group1], df[group2]
@@ -556,19 +556,18 @@ def get_differential_expression(
     # Testing differential expression of each set/cluster
     for cluster in clusters:
       if len(cluster) > 1:
-        glycans.append(cluster)
         cluster = list(cluster)
+        glycans.append(cluster)
         gp1, gp2 = df_a.loc[cluster, :], df_b.loc[cluster, :]
         mean_abundance_c.append(mean_abundance.loc[cluster].mean())
         log2fc.append(((gp2.values - gp1.values).mean(axis = 1)).mean() if paired else (gp2.mean(axis = 1) - gp1.mean(axis = 1)).mean())
-      gp1, gp2 = df.loc[cluster, group1], df.loc[cluster, group2]
-      # Hotelling's T^2 test for multivariate comparisons
-      pvals.append(hotellings_t2(gp1.T.values, gp2.T.values, paired = paired)[1])
-      levene_pvals.append(np.mean([levene(gp1.loc[variable, :], gp2.loc[variable, :])[1] for variable in cluster]))
-      # Calculate Mahalanobis distance as measure of effect size for multivariate comparisons
-      effect_sizes.append(mahalanobis_distance(gp1, gp2, paired = paired))
-      if effect_size_variance:
-        variances.append(mahalanobis_variance(gp1, gp2, paired = paired))
+        # Hotelling's T^2 test for multivariate comparisons
+        pvals.append(hotellings_t2(gp1.T.values, gp2.T.values, paired = paired)[1])
+        levene_pvals.append(np.mean([levene(gp1.loc[variable, :], gp2.loc[variable, :])[1] for variable in cluster]))
+        # Calculate Mahalanobis distance as measure of effect size for multivariate comparisons
+        effect_sizes.append(mahalanobis_distance(gp1, gp2, paired = paired))
+        if effect_size_variance:
+          variances.append(mahalanobis_variance(gp1, gp2, paired = paired))
     mean_abundance = mean_abundance_c
   else:
     log2fc = (df_b.values - df_a.values).mean(axis = 1) if paired else (df_b.mean(axis = 1) - df_a.mean(axis = 1))
@@ -606,21 +605,23 @@ def get_differential_expression(
     corrpvals, significance = [1]*len(glycans), [False]*len(glycans)
   df_out = pd.DataFrame(list(zip(glycans, mean_abundance, log2fc, pvals, corrpvals, significance, levene_pvals, effect_sizes, equivalence_pvals)),
                      columns = ['Glycan', 'Mean abundance', 'Log2FC', 'p-val', 'corr p-val', 'significant', 'corr Levene p-val', 'Effect size', 'Equivalence p-val'])
-  prison_rows = pd.DataFrame({
-      'Glycan': df_prison.index,
-      'Mean abundance': df_org_prison.mean(axis = 1),
-      'Log2FC': (df_prison[group2].values - df_prison[group1].values).mean(axis = 1) if paired else (df_prison[group2].mean(axis = 1) - df_prison[group1].mean(axis = 1)),
-      'p-val': [1.0] * len(df_prison),
-      'corr p-val': [1.0] * len(df_prison),
-      'significant': [False] * len(df_prison),
-      'corr Levene p-val': [1.0] * len(df_prison),
-      'Effect size': [0] * len(df_prison),
-      'Equivalence p-val': [1.0] * len(df_prison)})
-  prison_rows = prison_rows.astype({'significant': 'bool'})
-  df_out = pd.concat([df_out, prison_rows], ignore_index = True)
+  if not monte_carlo:
+    prison_rows = pd.DataFrame({
+        'Glycan': df_prison.index,
+        'Mean abundance': df_org_prison.mean(axis = 1),
+        'Log2FC': (df_prison[group2].values - df_prison[group1].values).mean(axis = 1) if paired else (df_prison[group2].mean(axis = 1) - df_prison[group1].mean(axis = 1)),
+        'p-val': [1.0] * len(df_prison),
+        'corr p-val': [1.0] * len(df_prison),
+        'significant': [False] * len(df_prison),
+        'corr Levene p-val': [1.0] * len(df_prison),
+        'Effect size': [0] * len(df_prison),
+        'Equivalence p-val': [1.0] * len(df_prison)})
+    prison_rows = prison_rows.astype({'significant': 'bool'})
+    if len(prison_rows) > 0:
+      df_out = pd.concat([df_out, prison_rows], ignore_index = True)
   df_out['significant'] = df_out['significant'].astype('bool')
   if effect_size_variance:
-    df_out['Effect size variance'] = variances + [0]*len(df_prison)
+    df_out['Effect size variance'] = list(variances) + [0]*len(df_prison)
   if glycoproteomics:
     return get_glycoform_diff(df_out, alpha = alpha, level = level)
   else:
@@ -761,7 +762,8 @@ def get_glycanova(
       'corr p-val': [1.0] * len(df_prison),
       'significant': [False] * len(df_prison)})
     prison_rows = prison_rows.astype({'significant': 'bool'})
-    df_out = pd.concat([df_out, prison_rows], ignore_index = True)
+    if len(prison_rows) > 0:
+      df_out = pd.concat([df_out, prison_rows], ignore_index = True)
     df_out['significant'] = df_out['significant'].astype('bool')
     df_out['Effect size'] = effect_sizes.values
     return df_out.sort_values(by = 'corr p-val'), posthoc_results

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -867,6 +867,8 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
   if bool(re.search(r'\([A-Zd3-9]', glycan)):
     glycan = glycan.replace('(', '[').replace(')', ']')
   # Canonicalize linkage uncertainty
+  # Open linkages with anomeric config specified (e.g., "Mana-")
+  glycan = re.sub(r'([A-Z][a-z]*)([a-b])\-([A-Z])', r'\1\g<2>1-?\3', glycan)
   # Open linkages (e.g., "c-")
   glycan = re.sub(r'([a-z])\-([A-Z])', r'\1?1-?\2', glycan)
   # Open linkages2 (e.g., "1-")
@@ -886,12 +888,12 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
   # Missing starting carbon (e.g., "b-4")
   glycan = re.sub(r'(a|b|\?)-(\d)', r'\g<1>1-\2', glycan)
   # If still no '-' in glycan, assume 'a3' type of linkage denomination
-  if '-' not in glycan:
+  if '-' not in glycan or bool(re.search(r'[ab][123456](?!\-)', glycan)):
     # Check whether linkages are recorded as b1 or as a3
     if bool(re.search(r"^[^2-6]*1?[^2-6]*$", glycan)):
-      glycan = re.sub(r'(a|b)(\d)', r'\g<1>\g<2>-?', glycan)
+      glycan = re.sub(r'(a|b)(\d)(?!\-)', r'\g<1>\g<2>-?', glycan)
     else:
-      glycan = re.sub(r'(a|b)(\d)', r'\g<1>1-\g<2>', glycan)
+      glycan = re.sub(r'(a|b)(\d)(?!\-)', r'\g<1>1-\g<2>', glycan)
   # Smudge uncertainty
   while '/' in glycan:
     glycan = f'{glycan[:glycan.index("/")-1]}?{glycan[glycan.index("/")+2:]}'
@@ -929,13 +931,17 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
   glycan = re.sub(r'([\?2-9])([\[\]])', r'\1)\2', glycan)
   # Floating bits
   if '+' in glycan:
-    if '-' not in glycan[:glycan.index('+')]:
+    prefix = glycan[:glycan.index('+')]
+    if prefix == 'S':
+      glycan = glycan.replace('S+', 'OS+')
+    elif '-' not in prefix:
       glycan = glycan.replace('+', '(?1-?)+')
     glycan = '{'+glycan.replace('+', '}')
   post_process = {'5Ac(?1': '5Ac(a2', '5Gc(?1': '5Gc(a2', '5Ac(a1': '5Ac(a2', '5Gc(a1': '5Gc(a2', 'Fuc(?': 'Fuc(a',
                   'GalS': 'GalOS', 'GlcNAcS': 'GlcNAcOS', 'GalNAcS': 'GalNAcOS', 'SGal': 'GalOS', 'Kdn(?1': 'Kdn(a2',
                   'Kdn(a1': 'Kdn(a2'}
   glycan = multireplace(glycan, post_process)
+  glycan = re.sub(r'[ab]-$', '', glycan)  # Remove endings like Glcb-
   # Canonicalize branch ordering
   if '[' in glycan:
     glycan = choose_correct_isoform(glycan)

diff --git a/glycowork/motif/tokenization.py b/glycowork/motif/tokenization.py
@@ -457,11 +457,13 @@ def glycan_to_composition(glycan: str, # Glycan in IUPAC-condensed format
     stem_libr = stem_lib
   SPECIAL_MODS = {
     '1,7lactone': {'replacement': '', 'diff_moiety': '-H2O'},
+    'Az': {'replacement': 'AcH2O', 'diff_moiety': '+N3'},
+    'AcH2O': {'replacement': 'Ac', 'diff_moiety': '-OH'}
     # Add other special modifications here in the format:
     # 'modification': {'replacement': 'what to replace with', 'diff_moiety': chemical formula, sign indicating loss/gain}
   }
   VALID_COMPONENTS = {'Hex', 'dHex', 'HexNAc', 'HexN', 'HexA', 'Neu5Ac', 'Neu5Gc', 'Kdn',
-                     'Pen', 'Me', 'S', 'P', 'PCho', 'PEtN', 'Ac', '-H2O'}
+                     'Pen', 'Me', 'S', 'P', 'PCho', 'PEtN', 'Ac', '-H2O', '+N3', '-OH'}
   glycan = glycan.replace('{', '').replace('}', '') if '{' in glycan else glycan
   diff_moieties = Counter()
   for mod, info in SPECIAL_MODS.items():