Merge pull request #21 from JCSzamosi/new_tests

New read depth function
JCSzamosi · Aug 3, 2023 · dc7f287 · dc7f287
2 parents 03c89da + 0ff1c74
commit dc7f287
Show file tree

Hide file tree

Showing 21 changed files with 1,007 additions and 55 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -5,3 +5,4 @@
 ^CHANGELOG\.md$
 ^data-raw$
 ^benchmark/*
+^ignore_me.R$
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@
 test_data/*
 ring_dir
 private
+ignore_me.R
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,20 @@
+* 2023-08-03 v0.0.1.9002 (development update)
+	* **BREAKING CHANGES**
+		* Completely re-writes `plot_read_depth()`. 
+			* allows users to plot read depth with a variable on the X axis and
+			a colour parameter
+			* users can access the old function with `plt_read_depth()`
+			temporarily, but this will be removed before the next full release.
+		* `rank_abund()` is broken and is no longer exported. Please file a bug
+		report if you were using this function.
+	* exports `order_taxa()`, by request 
+	* makes `prop_tax_down()` slightly more efficient by checking up front if
+	there is nothing to do.
+	* deprecates `order_levs()` because it isn't used anywhere. Its intended
+	function is performed by `order_taxa()`.
+	* introduces visual and automatic testing of the new `plot_read_depth()`
+	function.
+
 * 2023-04-25 v0.0.1.9001 (development update)
 	* in `plot_tax_bar()` 
 		* the `legloc` argument is now passed directly to

diff --git a/NAMESPACE b/NAMESPACE
@@ -4,11 +4,12 @@ export(long_distance_df)
 export(make_ord_df)
 export(make_phy_df)
 export(order_levs)
+export(order_taxa)
 export(plot_read_depth)
 export(plot_tax_bar)
 export(plt_ord)
+export(plt_read_depth)
 export(prop_tax_down)
-export(rank_abund)
 export(rotate_ticks)
 export(tax_colours)
 import(ggplot2)

diff --git a/R/make_phy_df.R b/R/make_phy_df.R
@@ -25,7 +25,7 @@
 #' @param indic a flag to indicate if the taxon names have level indicators. If
 #'   FALSE, they are added.
 #' @param prop Specifies whether taxa need to be propogated down the taxonomy
-#'   table (default, `TRUE`) or if this has already been done.
+#'   table (default is `TRUE`) or if this has already been done.
 #' @param count If `FALSE` (default) the function will expect a relative
 #'   abundance table and create an 'Other' category for taxa below the cutoff
 #'   (and will raise an error if the table is not relative abundance). If TRUE,
@@ -41,7 +41,7 @@ make_phy_df = function(physeq, rank = 'Genus', cutoff = 0.001, indic = FALSE,
         stop('physeq must be a relative abundance table. You have counts > 1.')
     }
     ranks = colnames(phyloseq::tax_table(physeq))
-    if (rank == 'OTU'){
+    if ((rank == 'OTU') & !('OTU' %in% ranks)) {
         ranks = c(ranks, rank)
     }
 
@@ -131,7 +131,7 @@ remain = function(x, tot = 1){
 
 #' Order Taxon Name Factors
 #'
-#' `order_taxa` reorders the taxon names in a taxon column (e.g. 'Class' or
+#' `order_taxa()` reorders the taxon names in a taxon column (e.g. 'Class' or
 #' 'Phylum') by the taxon's mean abundance (but always makes sure to put Other
 #' first).
 #'
@@ -144,14 +144,15 @@ remain = function(x, tot = 1){
 #' @param abund The name of the abundances column. Defaults to 'Abundance'
 #' @param decreasing Specifies whether the taxon order should be based on
 #'   decreasing or increasing abundance. Defaults to FALSE.
+#' @export
 order_taxa = function(phy_df, rank, abund = 'Abundance', decreasing = FALSE){
 
     phy_df[,rank] = factor(phy_df[,rank])
-	phy_df %>%
-        dplyr::filter(UQ(sym(rank)) != 'Other') %>%
-		dplyr::group_by(UQ(sym(rank))) %>%
-		dplyr::summarize(Tot = sum(UQ(sym(abund)))) %>%
-		data.frame() -> total_abunds
+    total_abunds = (phy_df
+                    %>% dplyr::filter(.data[[rank]] != 'Other')
+                    %>% dplyr::group_by(.data[[rank]])
+                    %>%	dplyr::summarize(Tot = sum(.data[[abund]]))
+                    %>%	data.frame())
 
 	lev_ord = levels(droplevels(total_abunds[,rank]))
 	if (decreasing){

diff --git a/R/plot_read_depth.R b/R/plot_read_depth.R
@@ -1,19 +1,196 @@
-#### plot_read_depth -----------------------------------------------------------
+#### plot_read_depth ------------------------------------------------------------
+
+#' View read depth
+#'
+#' @section Details: Creates a read depth boxplot
+#'
+#' @section Value: A [ggplot2] object
+#'
+#' @param physeq A phyloseq object
+#' @param xvar `NULL` A variable (column name) from `sample_data(physeq)` by
+#'   which to group samples on the x-axis. Must be categorical. If `NULL`, no
+#'   grouping will be performed.
+#' @param lin `FALSE` If `TRUE`, the y-axis will be on a linear scale. By
+#'   default it is log-scaled.
+#' @param cvar `NULL` A variable (column name) from `sample_data(physeq)` by
+#'   which the points should be coloured. If `NULL`, all points will be black.
+#'   May be continuous or categorical. See the `clrs` argument if you want to
+#'   specify your colours.
+#' @param clrs `NULL` If the `cvar` parameter is set and this is unset, colours
+#'   will be taken from [tax_colours] if `cvar` is categorical and use
+#'   [ggplot2]'s default (blue) if `cvar` is continuous. If `cvar` is
+#'   categorical, you may provide a named colour vector to use instead of
+#'   [tax_colours] and if it is continuous you may provide a two-colour vector.
+#'   The vector must have names "low" and "high". If `cvar` is NOT set, you may
+#'   use this parameter to provide a single colour name or hex code that will be
+#'   applied to all points.
+#' @export
+plot_read_depth = function(physeq, xvar = NULL, lin = FALSE, cvar = NULL,
+                          clrs = NULL){
+    # Check inputs
+    sample_sum_df = data.frame(Total = phyloseq::sample_sums(physeq),
+                               phyloseq::sample_data(physeq))
+
+	prd_check(sample_sum_df, cvar, xvar)
+
+	# Deal with the colour vector
+	clrs = prd_clrs(sample_sum_df, cvar, clrs)
+
+
+    # Construct the plot foundation with or without a grouping variable
+    if (is.null(xvar)){
+        depth_plot = ggplot2::ggplot(sample_sum_df,
+                                     ggplot2::aes(x = 'All', y = Total))
+    } else {
+        depth_plot = ggplot2::ggplot(sample_sum_df,
+                                     ggplot2::aes(x = .data[[xvar]], y = Total))
+    }
+
+    # Add the jittered points, taking colour into account
+    # No colouring variable
+    if (is.null(cvar)){
+        # And no specified colour
+        if (is.null(clrs)){
+            depth_plot = depth_plot + ggplot2::geom_jitter(width = 0.2)
+        # And colour is specified
+        } else {
+            depth_plot = depth_plot + ggplot2::geom_jitter(width = 0.2,
+                                                           colour = clrs)
+        }
+    # colouring variable is specified and continuous
+    } else if (is.numeric(sample_sum_df[[cvar]])) {
+        depth_plot = depth_plot + ggplot2::geom_jitter(width = 0.2,
+                                        ggplot2::aes(colour = .data[[cvar]]))
+        # if clrs not specified, do nothing. defaults will do.
+        if (is.null(clrs)){
+            NULL
+        # if clrs is specified, use it
+        } else {
+            depth_plot = depth_plot +
+                ggplot2::scale_colour_gradient(low = clrs['low'],
+                                               high = clrs['high'])
+        }
+    # if colouring variable is specified and categorical
+    } else {
+        depth_plot = depth_plot + ggplot2::geom_jitter(width = 0.2,
+                                              aes(colour = .data[[cvar]])) +
+            ggplot2::scale_colour_manual(values = clrs)
+    }
+
+    if (!lin){
+        depth_plot = depth_plot + ggplot2::scale_y_log10()
+    }
+    return(depth_plot)
+}
+
+#### prd_check() ---------------------------------------------------------------
+
+#' Check inputs for plot_read_depth()
+#'
+#' Checks the inputs of `plot_read_depth()`
+#' @param sample_sum_df data frame of sample sums and sample data
+#' @param cvar,xvar passed through from `plot_read_depth()`
+
+prd_check = function(sample_sum_df, cvar, xvar){
+    # Are cvar and xvar columns?
+    if (!is.null(xvar) && !(xvar %in% colnames(sample_sum_df))){
+        stop(paste('"xvar" must be one of the columns in sample_data(physeq).'))
+    }
+
+    if (!is.null(cvar) && !(cvar %in% colnames(sample_sum_df))){
+        stop(paste('"cvar" must be one of the columns in sample_data(physeq).'))
+    }
+}
+
+#### prd_clrs() ----------------------------------------------------------------
+
+#' Deal w colour vector for plot_read_depth()
+#'
+#' @param cvar,clrs passed through from `plot_read_depth()`
+prd_clrs = function(sample_sum_df, cvar, clrs){
+	# cvar unset, clrs set
+    if (is.null(cvar) & !is.null(clrs)){
+        if (length(clrs) > 1){
+            warn(paste('With no cvar specified, only the first colour in "clrs"',
+                       'will be used.'))
+            clrs = clrs[1]
+			return(clrs)
+        }
+    } else if (!is.null(cvar)){
+        # Built-in colour vector
+        if (is.null(clrs)){
+            # Categorical
+            if (is.factor(sample_sum_df[[cvar]]) ||
+				is.character(sample_sum_df[[cvar]])){
+                n = dplyr::n_distinct(sample_sum_df[[cvar]])
+                nx = ceiling(n/length(tax_colours))
+                clrs = rep(tax_colours, nx)
+				return(clrs)
+
+			# continuous requires no action
+            } else {
+				return(clrs)
+			}
+        # User-specified colour vector
+        } else {
+            # Categorical
+            if (is.factor(sample_sum_df[[cvar]]) |
+				is.character(sample_sum_df[[cvar]])){
+                if (is.null(names(clrs))){
+                    return(clrs)
+                    # nothing to do here. use it as is
+
+                } else if (!all(unique(sample_sum_df) %in% names(clrs))){
+                    stop(paste('If "clrs" is named, its names must contain',
+                               'all the values in the "cvar" column'))
+                }
+                # if we get this far, the colour vector is fine.
+
+				return(clrs)
+
+            # Continuous
+            } else {
+                if (is.null(names(clrs)) |
+                    !all(c('high','low') %in% names(clrs))){
+                    stop(paste('With a continuous "cvar", the provided colour',
+                             'vector "clrs" must have names "high" and "low".'))
+                } else {
+                	# if we get this far, the colour vector is fine.
+					return(clrs)
+				}
+				return(clrs)
+            }
+			return(clrs)
+        }
+		return(clrs)
+    }
+	return(clrs)
+}
+
+#### plt_read_depth -----------------------------------------------------------
 
 #' Plot read depth
 #'
-#' Creates a read depth histogram.
+#' @describeIn
+#' `r lifecycle::badge("deprecated")`
+#'
+#' Creates a read depth histogram. Deprecated in favour of `plot_read_depth()`
 #' @param physeq A phyloseq object
 #' @export
-plot_read_depth = function(physeq){
-    sample_sum_df = data.frame(Total = sample_sums(physeq),
-                               sample_data(physeq))
-    depth_plot = ggplot(sample_sum_df, aes(x = Total)) +
-        geom_histogram(colour = 'black', fill = 'grey57') +
-        scale_x_log10() +
-        xlab('Log10 read depth') +
-        ylab('Number of samples') +
-        ggtitle('Distribution of read depths') +
-        theme_bw()
+plt_read_depth = function(physeq){
+    lifecycle::deprecate_soft('0.0.1.9002', 'plt_read_depth()',
+                              'plot_read_depth()',
+                              details = paste("This is the old version of",
+                                              "plot_read_depth(), and will be",
+                                              "removed soon."))
+    sample_sum_df = data.frame(Total = phyloseq::sample_sums(physeq),
+                               phyloseq::sample_data(physeq))
+    depth_plot = ggplot2::ggplot(sample_sum_df, aes(x = Total)) +
+        ggplot2::geom_histogram(colour = 'black', fill = 'grey57') +
+        ggplot2::scale_x_log10() +
+        ggplot2::xlab('Log10 read depth') +
+        ggplot2::ylab('Number of samples') +
+        ggplot2::ggtitle('Distribution of read depths') +
+        ggplot2::theme_bw()
     return(depth_plot)
 }
diff --git a/R/prep_plot_rank_ab.R b/R/prep_plot_rank_ab.R
@@ -11,25 +11,26 @@
 #'
 #' @param phy_df A dataframe of a phyloseq object, like that generated by
 #'   [phyloseq::psmelt()] or [make_phy_df()]
-#' @param varbs (`NULL`) A character vector of grouping variables from
-#'   which the baseline values are chosen to define the abundance ordering. If
-#'   it is `NULL`, the ordering will be based on mean abundances in the
-#'   whole data frame.
-#' @param bases (`NULL`) A character vector of baseline values for the
-#'   variables given in `vars`. The ordering of the taxa will be given
-#'   based only on the samples with these baseline values for these variables.
-#'   Must be in the same order as vars.
+#' @param gvars (`NULL`) A character vector of grouping variables from which the
+#'   baseline values are chosen to define the abundance ordering. If it is
+#'   `NULL`, the ordering will be based on mean abundances in the whole data
+#'   frame.
+#' @param bases (`NULL`) A character vector of baseline values for the variables
+#'   given in `gvars`. The ordering of the taxa will be given based only on the
+#'   samples with these baseline values for these variables. Must be in the same
+#'   order as `gvars`.
 #' @param abunds (`'Abundance'`) The name of the abundance column.
-#' @param rank (`'Genus'`) The rank to base the ordering on.
+#' @param rank (`'Genus'`) The rank to base the ordering on. Must be a column in
+#'   `phy_df`
 #' @param IDcol (`'X.SampleID'`) The column name of the sample IDs
-#' @export
-rank_abund = function(phy_df, varbs = NULL, bases = NULL, abunds = 'Abundance',
-                      rank = 'Genus', IDcol = 'X.SampleID'){
+#' 
+rank_abund = function(phy_df, gvars = NULL, bases = NULL, abunds = 'Abundance',
+                      rank = 'Genus', IDcol){
     # Set up the groups for the plotting totals
     rank_abs = df_glom(phy_df, IDcol = IDcol, rank = rank, abunds = abunds)
 
     # Subset and order
-    ranked = subset_order(rank_abs, varbs, bases, rank = rank)
+    ranked = subset_order(rank_abs, gvars, bases, rank = rank)
 
     # Order the bigger data frame by the above ordering
     lev_ord = levels(ranked[,rank])

diff --git a/R/prop_tax_down.R b/R/prop_tax_down.R
@@ -36,6 +36,11 @@ prop_tax_down = function(physeq, indic, dbig = TRUE){
     colnames(tt) = colnames(phyloseq::tax_table(physeq))
     rownames(tt) = sp
 
+    # If there are no unspecified taxa, stop
+    if (!any(is.na(tt))){
+        return(physeq)
+    }
+
     tt = prop_tax_tab(tt, indic)
     phyloseq::tax_table(physeq) = tt
 

diff --git a/R/utils.R b/R/utils.R
@@ -2,6 +2,13 @@
 
 #' Order the levels of one factor by the values of another
 #'
+#' @description `r lifecycle::badge("deprecated")`
+#'
+#' This function will be deprecated because it is just recapitulating
+#' functionality provided by `ggplot2::facet_grid()`. If you are using this
+#' function and don't want it it go away, please get in touch and let me know
+#' what you're using it for.
+#'
 #' `order_levs()` takes two factors, the first of which has values that are
 #' nested within the values of the second, and orders the levels of the first
 #' factor such that they are clustered within the second factor.
@@ -14,6 +21,7 @@
 #' @param f2 The factor to use when re-ordering `f1`.
 #' @export
 order_levs = function(f1,...){
+    lifecycle::deprecate_warn('0.0.2', 'order_levs()')
 
     # if (is.numeric(f2)){
     #     ord = order(...)