Add more tests, add setup.cfg, minor stability fixes

BojarLab · Nov 16, 2024 · f76535e
1 parent 5a99d6b
commit f76535e
Show file tree

Hide file tree

Showing 5 changed files with 1,003 additions and 17 deletions.
diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -723,6 +723,8 @@ def get_glycanova(
     custom_scale: float = 0 # Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)
     ) -> Tuple[pd.DataFrame, Dict[str, pd.DataFrame]]: # (ANOVA results with F-stats and omega-squared effect sizes, post-hoc results)
     "Performs one-way ANOVA with omega-squared effect size calculation and optional Tukey's HSD post-hoc testing on glycomics data across multiple groups"
+    if len(set(groups)) < 3:
+      raise ValueError("You have fewer than three groups. We suggest get_differential_expression for those cases. ANOVA is for >= three groups.")
     df, _, groups, _ = preprocess_data(df, groups, [], experiment = "anova", motifs = motifs, impute = impute,
                                       min_samples = min_samples, transform = transform, feature_set = feature_set,
                                       gamma = gamma, custom_scale = custom_scale, custom_motifs = custom_motifs)
@@ -982,10 +984,17 @@ def get_biodiversity(
       mean_a, mean_b = [np.mean(row_a) for row_a in df_a.values], [np.mean(row_b) for row_b in df_b.values]
       if paired:
         assert len(df_a) == len(df_b), "For paired samples, the size of group1 and group2 should be the same"
-      pvals = [ttest_rel(row_b, row_a)[1] if paired else ttest_ind(row_b, row_a, equal_var = False)[1] for
-                     row_a, row_b in zip(df_a.values, df_b.values)]
-      pvals = [p if p > 0 and p < 1 else 1.0 for p in pvals]
-      effect_sizes, _ = zip(*[cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)])
+      pvals = []
+      effect_sizes = []
+      for row_a, row_b in zip(df_a.values, df_b.values):
+        if np.allclose(row_a, row_b, rtol = 1e-5, atol = 1e-8):
+          pvals.append(1.0)
+          effect_sizes.append(0.0)
+        else:
+          pval = ttest_rel(row_b, row_a)[1] if paired else ttest_ind(row_b, row_a, equal_var = False)[1]
+          pvals.append(pval if (pval > 0 and pval < 1) else 1.0)
+          effect, _ = cohen_d(row_b, row_a, paired = paired)
+          effect_sizes.append(effect)
       a_df_stats = pd.DataFrame(list(zip(a_df.index.tolist(), mean_a, mean_b, pvals, effect_sizes)),
                                columns = ["Metric", "Group1 mean", "Group2 mean", "p-val", "Effect size"])
       shopping_cart.append(a_df_stats)
@@ -1250,9 +1259,9 @@ def get_lectin_array(
   lectin_lib = load_lectin_lib()
   useable_lectin_mapping, motif_mapping = create_lectin_and_motif_mappings(lectin_list, lectin_lib)
   if group2:
-    mean_scores_per_condition = df[group1 + group2].groupby([0] * len(group1) + [1] * len(group2), axis = 1).mean()
+    mean_scores_per_condition = df[group1 + group2].T.groupby([0] * len(group1) + [1] * len(group2)).mean().T
   else:
-    mean_scores_per_condition = df.groupby(group1, axis = 1).mean()
+    mean_scores_per_condition = df.T.groupby(group1).mean().T
   lectin_variance = mean_scores_per_condition.var(axis = 1)
   idf = np.sqrt(lectin_variance)
   if group2:

diff --git a/glycowork/motif/draw.py b/glycowork/motif/draw.py
@@ -29,7 +29,7 @@ def matches(
     ) -> Generator[Tuple[int, int, int], None, None]: # Yields (start pos, end pos, nesting depth)
   "Finds matching pairs of delimiters in a string, handling nested pairs and returning positions and depth;ref: https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex"""
   stack = []
-  for m in re.finditer(r'[{}{}]'.format(opendelim, closedelim), line):
+  for m in re.finditer(r'[\[\]]', line) if opendelim == '[' else re.finditer(r'[{}{}]'.format(opendelim, closedelim), line):
       pos = m.start()
       if line[pos-1] == '\\':
           # Skip escape sequence
@@ -2356,7 +2356,7 @@ def scale_in_range(
   "Normalizes list of numbers to specified range"
   min_val = min(listy)
   max_val = max(listy)
-  range_val = max_val - min_val
+  range_val = max(max_val - min_val, 1e-6)
   return [(b - a) * ((x - min_val) / range_val) + a for x in listy]
 
 

diff --git a/glycowork/network/biosynthesis.py b/glycowork/network/biosynthesis.py
@@ -945,8 +945,8 @@ def get_maximum_flow(network: nx.Graph, # Biosynthetic network
   # Dictionary to store flow values and paths for each sink
   flow_results = {}
   for sink in sinks:
-    path_length = nx.shortest_path_length(network, source = source, target = sink)
     try:
+      path_length = nx.shortest_path_length(network, source = source, target = sink)
       try:
         flow_value, flow_dict = nx.maximum_flow(network, source, sink)
       except:
@@ -955,7 +955,7 @@ def get_maximum_flow(network: nx.Graph, # Biosynthetic network
           'flow_value': flow_value * path_length,
           'flow_dict': flow_dict
           }
-    except nx.NetworkXError:
+    except (nx.NetworkXError, nx.NetworkXNoPath):
       print(f"{sink} cannot be reached.")
   return flow_results
 

diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,6 @@
+[bdist_wheel]
+universal=0
+
+[build_system]
+requires = ["setuptools>=64.0"]
+build-backend = "setuptools.build_meta"