Merge pull request #18 from JCSzamosi/plot_tax_bar

Merge the improvements and testing of plot_tax_bar into the updates branch
JCSzamosi · Apr 25, 2023 · 5c15691 · 5c15691
2 parents 82814fb + d5a0f06
commit 5c15691
Show file tree

Hide file tree

Showing 80 changed files with 1,620 additions and 322 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,3 +4,4 @@
 ^NAMESPACE-old$
 ^CHANGELOG\.md$
 ^data-raw$
+^benchmark/*
diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,3 @@
 test_data/*
 ring_dir
 private
-*.~lock*
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,30 @@
+* 2023-04-25 v0.0.1.9001 (development update)
+	* in `plot_tax_bar()` 
+		* the `legloc` argument is now passed directly to
+		`ggplot2::theme(legend.position)` and can take any value that argument accepts.
+		* added a `r_ticks` argument. FALSE by default (default behaviour
+		is unchanged). If TRUE, the tick text on the x-axis is rotated 90
+		degrees and reads bottom to top.
+		* introduced improved functionality when a custom colour vector is used,
+		with and without names
+		* introduce a `leglen` option to allow the user to limit how many taxa
+		are displayed in the legend without removing any taxa from the plot.
+		* soft-deprecate the `yscale` argument. Will stop supporting non-linear
+		y-axes soon
+		* improve error when the `rank` argument is missing from the input data
+		frame
+		* introduce a warning when the per-sample abundances sum to greater than
+		1 but the `mean` argument is not set to `TRUE`.
+		* prep the function so I can stop exporting the whole `ggplot2`
+		namespace
+	* introduce lifecycle management with the `lifecycle` package
+	* start using roxygen2md to use Markdown in documentation.
+	* introduce the `benchmark` folder which contains "good" plotting outputs
+	against which new versions of the package can be tested. Created a .Rmd
+	file in that folder which tests `plot_tax_bar()`.
+	* remove the files that held the old colour vectors
+
+
 * 2023-04-14 v0.0.1 (was v1.0.1)
 	* The multiple colour vectors have been replaced with a single object,
 	`tax_colours`, which will cycle if there are more than 30 taxa. Having more

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: AfterSl1p
 Type: Package
 Title: Generate Summary Graphics and Basic Analysis of 16s Data
-Version: 0.0.1.9000
+Version: 0.0.1.9001
 Author: J. C. Szamosi and Shahrokh Shekarriz
 Authors@R: c(person("JC", "Szamosi", email = "[email protected]",
                   role = c('aut','cre')), 
@@ -19,6 +19,7 @@ Depends:
 Imports:
     dplyr (>= 0.7.2),
     ggplot2 (>= 2.2.1),
+    lifecycle,
     phyloseq (>= 1.19.1),
     pipeR,
     rlang (>= 0.1.2),
@@ -28,3 +29,4 @@ RoxygenNote: 7.2.3
 Suggests: 
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
+Roxygen: list(markdown = TRUE)
diff --git a/NAMESPACE b/NAMESPACE
@@ -14,3 +14,4 @@ export(tax_colours)
 import(ggplot2)
 import(pipeR)
 import(rlang)
+importFrom(lifecycle,deprecated)
diff --git a/R/AfterSl1p-package.R b/R/AfterSl1p-package.R
@@ -2,8 +2,9 @@
 "_PACKAGE"
 
 ## usethis namespace: start
-#' @import rlang
-#' @import pipeR
 #' @import ggplot2
+#' @import pipeR
+#' @import rlang
+#' @importFrom lifecycle deprecated
 ## usethis namespace: end
 NULL
diff --git a/R/data.R b/R/data.R
@@ -3,13 +3,13 @@
 #' A vector of colours for use with taxa bar charts.
 #'
 #' @section Details: There are 30 colours, and grey will be added by the
-#'   \code{plot_tax_bar} function for the "Other" category. If a plot calls for
-#'   more than 30 colours, this will just recycle. That is usually fine because
-#'   low-abundance stuff can't be seen anyway, but if you have a situation where
-#'   you have more than 30 things that actually need to be distinguished, you'll
-#'   need to provide your own vector. Also, if you are in that situation, try to
-#'   find another way to do what you are doing. People cannot generally
-#'   distinguish anywhere near 30 colours on a single plot.
+#'   [plot_tax_bar()] function for the "Other" category. If a plot
+#'   calls for more than 30 colours, this will just recycle. That is usually
+#'   fine because low-abundance stuff can't be seen anyway, but if you have a
+#'   situation where you have more than 30 things that actually need to be
+#'   distinguished, you'll need to provide your own vector. Also, if you are in
+#'   that situation, try to find another way to do what you are doing. People
+#'   cannot generally distinguish anywhere near 30 colours on a single plot.
 #' @export
 tax_colours = c("#87c5ab","#eea27c","#a9a8d2","#ffff99","#9999ff","#fb8072",
                 "#80b1d3","#fdb462","#b3de69","#fccde5","#bc80bd","#ccebc5",

diff --git a/R/long_distance_df.R b/R/long_distance_df.R
@@ -3,13 +3,13 @@
 #' Create a long data frame of among-sample distances
 #'
 #'
-#' \code{long_distance_df} creates a long data frame of all the pairwise
+#' `long_distance_df` creates a long data frame of all the pairwise
 #' distances from a sample distance matrix (e.g. the output of
-#' \code{\link{phyloseq::distance}}) with all the metadata listed for each sample.
-#' Allows for easy within- and among-group boxplots, or whatever other
-#' comparisons are of interest.
+#' [phyloseq::distance()]) with all the metadata listed for
+#' each sample. Allows for easy within- and among-group boxplots, or whatever
+#' other comparisons are of interest.
 #'
-#' @section Value: A data frame \eqn{N(N-1)} (or \eqn{N^2} if \code{diag = TRUE}
+#' @section Value: A data frame \eqn{N(N-1)} (or \eqn{N^2} if `diag = TRUE`
 #'   is set) rows (where N is the number of samples) with sample IDs, metadata,
 #'   and pairwise distances listed for each pair of samples. Sample ID and
 #'   metadata columns have '1' or '2' appended to them so the user can tell
@@ -18,22 +18,22 @@
 #'   names as row and column names.
 #' @param metadat A data frame or data frame-like object with the data set's
 #'   metadata
-#' @param idcol (\code{'X.SampleID'}.) A string. The column in \code{metadat}
+#' @param idcol (`'X.SampleID'`.) A string. The column in `metadat`
 #'   that holds the sample names. Sample names should match the row/column names
 #'   of the distance matrix. If there are samples in the metadata data frame
 #'   that are missing from the distance matrix, they will be excluded with a
 #'   warning. If there are samples in the distance matrix that are missing from
 #'   the metadata, you will get an error.
-#' @param diag (\code{FALSE}.) Logical. Whether the diagonal elements (zeros in
+#' @param diag (`FALSE`.) Logical. Whether the diagonal elements (zeros in
 #'   a distance matrix) should be included in the long data frame. Defaults to
-#'   \code{FALSE} because we almost never want them.
-#' @param suff (\code{c('1','2')}.) A character vector of length 2. The suffixes
+#'   `FALSE` because we almost never want them.
+#' @param suff (`c('1','2')`.) A character vector of length 2. The suffixes
 #'   to be appended to the metadata column names in the output. The two elements
 #'   must not be identical.
-#' @param distcol (\code{'Distance'}.) A string. The desired column name for the
+#' @param distcol (`'Distance'`.) A string. The desired column name for the
 #'   distance column in your long data frame. Only here to avoid clashes with
 #'   existing metadata column names.
-#' @param baseline (\code{'NULL'}). A dataframe whose column names must also be
+#' @param baseline (`'NULL'`). A dataframe whose column names must also be
 #'   column names in the metadat data frame, and whose rows contain a subset of
 #'   the possible values/combinations. If this parameter is used, all the
 #'   samples whose metadata matches a row in this data frame will end up in
@@ -81,7 +81,7 @@ long_distance_df = function(dmat, metadat, idcol = 'X.SampleID', diag = FALSE,
 
 ### lddf_check -----------------------------------------------------------------
 
-#' Check the inputs of \code{long_distance_df()}
+#' Check the inputs of `long_distance_df()`
 #'
 #' For internal use only
 lddf_check = function(dmat, metadat, idcol = 'X.SampleID', diag = FALSE,
@@ -127,7 +127,7 @@ lddf_check = function(dmat, metadat, idcol = 'X.SampleID', diag = FALSE,
 
 #' Does the actual gathering and spreading without testing assumptions
 #'
-#' \code{lddf_work} Used internally by \code{long_distance_df()}. I recommend
+#' `lddf_work` is used internally by `long_distance_df()`. I recommend
 #' you use that function unless you really know what you're doing. This function
 #' does the actual gathering, spreading, and joining associated with making the
 #' lddf, but without checking if the distance matrix is sensible or removing
@@ -139,16 +139,16 @@ lddf_check = function(dmat, metadat, idcol = 'X.SampleID', diag = FALSE,
 #'   names as row and column names.
 #' @param metadat A data frame or data frame-like object with the data set's
 #'   metadata
-#' @param idcol (\code{'X.SampleID'}.) A string. The column in \code{metadat}
+#' @param idcol (`'X.SampleID'`.) A string. The column in `metadat`
 #'   that holds the sample names. Sample names should match the row/column names
 #'   of the distance matrix. If there are samples in the metadata data frame
 #'   that are missing from the distance matrix, they will be excluded with a
 #'   warning. If there are samples in the distance matrix that are missing from
 #'   the metadata, you will get an error.
-#' @param suff (\code{c('1','2')}.) A character vector of length 2. The suffixes
+#' @param suff (`c('1','2')`.) A character vector of length 2. The suffixes
 #'   to be appended to the metadata column names in the output. The two elements
 #'   must not be identical.
-#' @param distcol (\code{'Distance'}.) A string. The desired column name for the
+#' @param distcol (`'Distance'`.) A string. The desired column name for the
 #'   distance column in your long data frame. Only here to avoid clashes with
 #'   existing metadata column names.
 lddf_work = function(dmat, metadat, idcol = 'X.SampleID', suff = c('1','2'),

diff --git a/R/make_phy_df.R b/R/make_phy_df.R
@@ -2,7 +2,7 @@
 
 #' Generate a Data Frame for Taxon Bar Charts
 #'
-#' \code{make_phy_df} generates a data frame that is useful for generating taxon
+#' `make_phy_df` generates a data frame that is useful for generating taxon
 #' bar charts.
 #'
 #' @section Details: This function takes a phyloseq object and generates a data
@@ -13,23 +13,23 @@
 #'   abundance, and weird things will happen if it is not.
 #'
 #' @section Value: A data frame similar in structure to that generated by
-#'   \code{psmelt}, but with an 'Other' category added and taxon levels ordered
-#'   for use in plotting.
+#'   [phyloseq::psmelt()], but with an 'Other' category added and
+#'   taxon levels ordered for use in plotting.
 #'
 #' @param physeq A phyloseq object.
 #' @param rank The rank at which to glom taxa. Must be one of 'OTU', 'Genus',
 #'   'Family', 'Order', 'Class', 'Phylum'. Default is 'Genus'.
 #' @param cutoff The abundance cutoff below which taxa are grouped into 'Other'.
 #'   If you don't want anything grouped into 'Other', set this to 0. Default is
 #'   0.001.
-#' @param indic a flag to indicate if the taxon names have level indicators.
-#'   If FALSE, they are added.
+#' @param indic a flag to indicate if the taxon names have level indicators. If
+#'   FALSE, they are added.
 #' @param prop Specifies whether taxa need to be propagated down the taxonomy
-#'   table (default, TRUE) or if this has already been done.
-#' @param count If FALSE (default) the function will expect a relative abundance
-#'   table and create an 'Other' category for taxa below the cutoff (and will
-#'   raise an error if the table is not relative abundance). If TRUE, the
-#'   function will not check for relative abundance and will not create an
+#'   table (default, `TRUE`) or if this has already been done.
+#' @param count If `FALSE` (default) the function will expect a relative
+#'   abundance table and create an 'Other' category for taxa below the cutoff
+#'   (and will raise an error if the table is not relative abundance). If TRUE,
+#'   the function will not check for relative abundance and will not create an
 #'   'Other' category.
 #' @export
 make_phy_df = function(physeq, rank = 'Genus', cutoff = 0.001, indic = FALSE,
@@ -131,15 +131,15 @@ remain = function(x, tot = 1){
 
 #' Order Taxon Name Factors
 #'
-#' \code{order_taxa} reorders the taxon names in a taxon column (e.g. 'Class' or
+#' `order_taxa` reorders the taxon names in a taxon column (e.g. 'Class' or
 #' 'Phylum') by the taxon's mean abundance (but always makes sure to put Other
 #' first).
 #'
 #' @section Value: A data frame that is identical to the one given, but with the
 #'   specified column re-ordered by its mean abundance
 #'
 #' @param phy_df A data frame of a phyloseq object, as produced by
-#'   \code{\link{psmelt}} or \code{\link{make_phy_df}}.
+#'   [phyloseq::psmelt()] or [make_phy_df()].
 #' @param rank The name of the column to be re-ordered
 #' @param abund The name of the abundances column. Defaults to 'Abundance'
 #' @param decreasing Specifies whether the taxon order should be based on