Merge pull request #305 from MontgomeryLab/issue-303

Version 1.4 release preparation
MontgomeryLab · May 2, 2023 · 6855509
2 parents 5aac71b + e2704c0
commit 6855509
Show file tree

Hide file tree

Showing 11 changed files with 29 additions and 41 deletions.
diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml
@@ -202,10 +202,6 @@ shared_memory: False
 ##-- Suppress all alignments if > <int> exist (default: no limit) (option -m) --##
 #suppress_aln: 10
 
-##-- Trim <int> bases from 5' (left) or 3' (right) end of reads (options --trim5 and --trim3) --##
-#trim5: 0
-#trim3: 0
-
 ##-- Input quals are from GA Pipeline ver. < 1.3 (option --solexa-quals) --##
 #solexa: false
 
@@ -221,9 +217,6 @@ shared_memory: False
 ######-------------------------------------------------------------------------------######
 
 
-##-- If True: show all parsed features in the counts csv, regardless of count/identity --##
-counter_all_features: False
-
 ##-- If True: counts are normalized by genomic hits (number of multi-alignments) --##
 counter_normalize_by_genomic_hits: True
 

diff --git a/images/features_sheet_header.png b/images/features_sheet_header.png
diff --git a/images/tiny-count_selection.png b/images/tiny-count_selection.png
diff --git a/tests/testdata/config_files/run_config_template.yml b/tests/testdata/config_files/run_config_template.yml
@@ -202,10 +202,6 @@ shared_memory: False
 ##-- Suppress all alignments if > <int> exist (default: no limit) (option -m) --##
 #suppress_aln: 10
 
-##-- Trim <int> bases from 5' (left) or 3' (right) end of reads (options --trim5 and --trim3) --##
-#trim5: 0
-#trim3: 0
-
 ##-- Input quals are from GA Pipeline ver. < 1.3 (option --solexa-quals) --##
 #solexa: false
 
@@ -221,9 +217,6 @@ shared_memory: False
 ######-------------------------------------------------------------------------------######
 
 
-##-- If True: show all parsed features in the counts csv, regardless of count/identity --##
-counter_all_features: False
-
 ##-- If True: counts are normalized by genomic hits (number of multi-alignments) --##
 counter_normalize_by_genomic_hits: True
 

diff --git a/tiny/cwl/tools/tiny-count.cwl b/tiny/cwl/tools/tiny-count.cwl
@@ -50,11 +50,6 @@ inputs:
     inputBinding:
       prefix: --stepvector
 
-  all_features:
-    type: boolean?
-    inputBinding:
-      prefix: --all-features
-
   in_pipeline:
     type: boolean?
     inputBinding:

diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl
@@ -86,7 +86,6 @@ inputs:
   counter_diags: boolean?
   counter_decollapse: boolean?
   counter_stepvector: string?
-  counter_all_features: boolean?
   counter_normalize_by_feature_hits: boolean?
   counter_normalize_by_genomic_hits: boolean?
 
@@ -214,7 +213,6 @@ steps:
       aligned_seqs: bowtie/sam_out
       gff_files: gff_files
       out_prefix: run_name
-      all_features: counter_all_features
       normalize_by_feature_hits:
         source: counter_normalize_by_feature_hits
         valueFrom: $(String(self))  # convert boolean -> string

diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py
@@ -64,9 +64,7 @@ def get_args():
     optional_args.add_argument('-sv', '--stepvector', choices=['Cython', 'HTSeq'], default='Cython',
                                help='Select which StepVector implementation is used to find '
                                     'features overlapping an interval.')
-    optional_args.add_argument('-a', '--all-features', action='store_true', help=argparse.SUPPRESS)
-                               #help='Represent all features in output counts table, '
-                               #     'even if they did not match in Stage 1 selection.')
+    optional_args.add_argument('-a', '--all-features', action='store_true', help=argparse.SUPPRESS)  # deprecated
     optional_args.add_argument('-p', '--in-pipeline', action='store_true',
                                help='Indicates that tiny-count was invoked as part of a pipeline run '
                                     'and that input files should be sourced as such.')

diff --git a/tiny/rna/plotter.py b/tiny/rna/plotter.py
@@ -362,9 +362,9 @@ def load_dge_tables(comparisons: list, class_fillna: str) -> pd.DataFrame:
         if not comparison:
             raise ValueError("Could not find condition names in DGE filename: " + dgefile)
         if len(comparison) > 1:
-            print("Warning: multiple conditions matched in DGE filename. Using first match.")
+            print("Warning: multiple conditions matched in DGE filename. Using last match.")
 
-        comparison_name = "_vs_".join(comparison[0])
+        comparison_name = "_vs_".join(comparison[-1])
         table = set_counts_table_multiindex(pd.read_csv(dgefile), class_fillna)
 
         de_table[comparison_name] = table['padj']

diff --git a/tiny/rna/plotterlib.py b/tiny/rna/plotterlib.py
@@ -241,8 +241,8 @@ def scatter_grouped(self, count_x: pd.DataFrame, count_y: pd.DataFrame, *groups,
         has_outgroup = all(co.replace(0, pd.NA).dropna().any()
                            for co in (count_x_out, count_y_out))
 
-        # Determine which groups we are able to plot on log scale
-        plottable_groups = self.get_nonzero_group_indexes(count_x, count_y, groups)
+        # Make all counts log-compatible, or drop the group if it is zero in both conditions
+        plottable_groups = self.nonzero_group_indexes(count_x, count_y, groups, view_lims)
         plot_labels = [labels[i] for i in plottable_groups]
         plot_groups = [groups[i] for i in plottable_groups]
         group_it = iter(plot_groups)
@@ -271,20 +271,34 @@ def scatter_grouped(self, count_x: pd.DataFrame, count_y: pd.DataFrame, *groups,
         return gscat
 
     @staticmethod
-    def get_nonzero_group_indexes(count_x, count_y, groups):
-        """When scatter plotting groups for two conditions on a log scale, if one
-        of the conditions has all zero counts for the group, then none of the group's
-        points are actually plotted due to the singularity at 0. We want to skip
-        plotting these groups and omit them from the legend."""
+    def nonzero_group_indexes(count_x, count_y, groups, view_lims):
+        """When scatter plotting features on a log scale, if the feature has a count of
+        zero in either condition then it is omitted from the plot by default due to the
+        singularity at 0, but we want to represent them nonetheless. So, we set their count
+        in the zero condition to the lower plot limit so that they are plotted on the very
+        edge of the plot space. Otherwise, approximating zero for these features would
+        shrink the plot.
+
+        We still want to omit features that are zero in both conditions, and if an entire
+        group consists of zero counts, its label should be omitted from the legend. This
+        is accomplished by omitting its index from the returned list."""
 
         non_zero_groups = []
+        minpos = min(view_lims)
         for i, group in enumerate(groups):
             x, y = count_x.loc[group], count_y.loc[group]
             x_is_zeros = x.replace(0, pd.NA).dropna().empty
             y_is_zeros = y.replace(0, pd.NA).dropna().empty
-            if not (x_is_zeros or y_is_zeros):
+
+            if not (x_is_zeros and y_is_zeros):
                 non_zero_groups.append(i)
 
+                # Replace counts that are zero in only one condition
+                x.loc[(x == 0) & (y != 0)] = minpos
+                y.loc[(y == 0) & (x != 0)] = minpos
+                count_x.loc[group] = x
+                count_y.loc[group] = y
+
         return non_zero_groups
 
     @staticmethod

diff --git a/tiny/templates/compatibility/run_config_compatibility.yml b/tiny/templates/compatibility/run_config_compatibility.yml
@@ -7,6 +7,10 @@
 
 
 1.4.0:
+  remove:
+    - counter_all_features
+    - trim5
+    - trim3
   rename:
     - counter_normalize_by_hits: counter_normalize_by_feature_hits
   add:

diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml
@@ -202,10 +202,6 @@ shared_memory: False
 ##-- Suppress all alignments if > <int> exist (default: no limit) (option -m) --##
 #suppress_aln: 10
 
-##-- Trim <int> bases from 5' (left) or 3' (right) end of reads (options --trim5 and --trim3) --##
-#trim5: 0
-#trim3: 0
-
 ##-- Input quals are from GA Pipeline ver. < 1.3 (option --solexa-quals) --##
 #solexa: false
 
@@ -221,9 +217,6 @@ shared_memory: False
 ######-------------------------------------------------------------------------------######
 
 
-##-- If True: show all parsed features in the counts csv, regardless of count/identity --##
-counter_all_features: False
-
 ##-- If True: counts are normalized by genomic hits (number of multi-alignments) --##
 counter_normalize_by_genomic_hits: True
-Original file line number
+Diff line change
@@ -7,6 +7,10 @@
 1.4.0:
+      remove:
+        - counter_all_features
+        - trim5
+        - trim3
       rename:
         - counter_normalize_by_hits: counter_normalize_by_feature_hits
       add:
@@ Expand Down @@