add more tests, CodeCov shield, some minor fixes

- updated resources.open_text to resources.files due to DeprecationWarning
- edge case in get_unique_topologies for empty list indexing
- always return dataframe in compositions_to_structures
- make component inference in parse_glycoform more robust
- quick-return string-identical repeats in equal_repeats
BojarLab · Nov 15, 2024 · 0c94995 · 0c94995
1 parent 25d4672
commit 0c94995
Show file tree

Hide file tree

Showing 10 changed files with 913 additions and 290 deletions.
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -35,10 +35,16 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           test -f setup.py && pip install -e ".[all]"
-          pip install pytest
+          pip install pytest pytest-cov
 
       - name: Run tests
         shell: bash -l {0}
         run: |
           cd tests
-          pytest
+          pytest --cov=../ --cov-report=xml
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        with:
+          file: ./coverage.xml
+          fail_ci_if_error: true
diff --git a/README.md b/README.md
diff --git a/glycowork/glycan_data/loader.py b/glycowork/glycan_data/loader.py
@@ -8,7 +8,7 @@
 from importlib import resources
 from typing import Any, Dict, List, Optional
 
-with resources.open_text("glycowork.glycan_data", "glycan_motifs.csv") as f:
+with resources.files("glycowork.glycan_data").joinpath("glycan_motifs.csv").open(encoding = 'utf-8-sig') as f:
   motif_list = pd.read_csv(f)
 this_dir, this_filename = path.split(__file__)  # Get path of data.pkl
 data_path = path.join(this_dir, 'lib_v11.pkl')

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -878,6 +878,8 @@ def equal_repeats(r1: str, # First glycan sequence
                  r2: str # Second glycan sequence
                 ) -> bool: # True if repeats are shifted versions
   "Check whether two repeat units could stem from the same repeating structure"
+  if r1 == r2:
+    return True
   r1_long = r1[:r1.rindex(')')+1] * 2
   return any(r1_long[i:i + len(r2)] == r2 for i in range(len(r1)))
 
@@ -907,6 +909,9 @@ def parse_glycoform(glycoform: Union[str, Dict[str, int]], # Composition in H5N4
                   ) -> Dict[str, int]: # Dictionary of feature counts
   "Convert composition like H5N4F1A2 into monosaccharide counts"
   if isinstance(glycoform, dict):
+    if not any(f in glycoform.keys() for f in glycan_features):
+      mapping = {'Hex': 'H', 'HexNAc': 'N', 'dHex': 'F', 'Neu5Ac': 'A', 'Neu5Gc': 'G'}
+      glycoform = {mapping.get(k, k): v for k, v in glycoform.items()}
     components = {k: glycoform.get(k, 0) for k in glycan_features}
     return components | infer_features_from_composition(components)
   components = {c: 0 for c in glycan_features}

diff --git a/glycowork/motif/regex.py b/glycowork/motif/regex.py
@@ -2,7 +2,7 @@
 import copy
 import networkx as nx
 from itertools import product, combinations, chain
-from typing import Dict, List, Union, Optional, Tuple, Callable, Any
+from typing import Dict, List, Union, Optional, Tuple, Any
 from glycowork.glycan_data.loader import replace_every_second, unwrap
 from glycowork.motif.processing import min_process_glycans, bracket_removal, canonicalize_iupac
 from glycowork.motif.graph import graph_to_string, subgraph_isomorphism, compare_glycans, glycan_to_nxGraph

diff --git a/glycowork/motif/tokenization.py b/glycowork/motif/tokenization.py
@@ -6,7 +6,8 @@
 from importlib import resources
 from collections import Counter
 from sklearn.cluster import DBSCAN
-from typing import Dict, List, Set, Union, Optional, Tuple
+from functools import reduce
+from typing import Dict, List, Set, Union, Optional
 
 from glycowork.glycan_data.loader import lib, unwrap, df_glycan, Hex, dHex, HexA, HexN, HexNAc, Pen, linkages
 from glycowork.motif.processing import min_process_glycans, rescue_glycans, rescue_compositions
@@ -16,7 +17,7 @@
          'L':12, 'M':13, 'N':14, 'P':15, 'Q':16, 'R':17, 'S':18, 'T':19,
          'V':20, 'W':21, 'Y':22, 'X':23, 'Z':24, 'z':25}
 
-with resources.open_text("glycowork.motif", "mz_to_composition.csv") as f:
+with resources.files("glycowork.motif").joinpath("mz_to_composition.csv").open(encoding = 'utf-8-sig') as f:
   mapping_file = pd.read_csv(f)
 mass_dict = dict(zip(mapping_file.composition, mapping_file["underivatized_monoisotopic"]))
 
@@ -330,7 +331,7 @@ def compositions_to_structures(composition_list: List[Dict[str, int]], # List of
   print(f"{not_matched_count} compositions could not be matched. Run with verbose = True to see which compositions.")
   if verbose:
     print(not_matched_list)
-  return df_out
+  return df_out if isinstance(df_out, pd.DataFrame) else pd.DataFrame()
 
 
 def mz_to_structures(mz_list: List[float], # List of precursor masses
@@ -365,10 +366,7 @@ def mz_to_structures(mz_list: List[float], # List of precursor masses
   for m, comp in enumerate(compositions):
     out_structures.append(compositions_to_structures(comp, glycan_class = glycan_class,
                                               abundances = abundances.iloc[[m]], kingdom = kingdom, df_use = df_use, verbose = verbose))
-  if out_structures:
-    return pd.concat(out_structures, axis = 0)
-  else:
-    return []
+  return pd.concat(out_structures, axis = 0).reset_index(drop = True) if out_structures else []
 
 
 def mask_rare_glycoletters(glycans: List[str], # List of IUPAC-condensed glycans
@@ -553,4 +551,4 @@ def get_unique_topologies(composition: Dict[str, int], # Composition dictionary
   df_use = df_use[df_use.glycan_type == glycan_type]
   df_use = df_use[df_use[taxonomy_rank].apply(lambda x: taxonomy_value in x)].glycan.values
   df_use = list(set([structure_to_basic(k) for k in df_use]))
-  return [[g.replace(k, v) for k,v in universal_replacers.items()][0] for g in df_use if '{' not in g]
+  return [reduce(lambda x, kv: x.replace(*kv), universal_replacers.items(), g) for g in df_use if '{' not in g]
diff --git a/glycowork/network/biosynthesis.py b/glycowork/network/biosynthesis.py
@@ -10,7 +10,7 @@
 from scipy.stats import ttest_rel, ttest_ind
 from statsmodels.formula.api import ols
 from statsmodels.stats.multitest import multipletests
-from typing import Dict, List, Set, Union, Optional, Tuple, Any, FrozenSet
+from typing import Dict, List, Set, Union, Optional, Tuple, FrozenSet
 import statsmodels.api as sm
 import networkx as nx
 import numpy as np