From 37c5ef3966a86f577256603d7062d837399fd780 Mon Sep 17 00:00:00 2001 From: pfgherardini Date: Tue, 12 Feb 2019 10:52:21 -0800 Subject: [PATCH 1/2] implemented shifting of negative values --- R/cluster.R | 18 +++++++----- R/io.R | 49 ++++++++++++++++++++++++++++++--- man/cluster_fcs_files.Rd | 14 ++++++++-- man/cluster_fcs_files_groups.Rd | 13 +++++++-- man/cluster_fcs_files_in_dir.Rd | 10 ++++++- man/convert_fcs.Rd | 11 ++++++-- man/process_file.Rd | 13 +++++++-- man/process_files_groups.Rd | 13 +++++++-- man/shift_negative_values.Rd | 25 +++++++++++++++++ 9 files changed, 144 insertions(+), 22 deletions(-) create mode 100644 man/shift_negative_values.Rd diff --git a/R/cluster.R b/R/cluster.R index ec13978..59b6ddb 100644 --- a/R/cluster.R +++ b/R/cluster.R @@ -65,7 +65,8 @@ cluster_data <- function(tab, col.names, k, ...) { #' @inheritParams cluster_fcs_files #' #' -process_files_groups <- function(files, col.names, num.clusters, num.samples, asinh.cofactor, downsample.to, output.dir) { +process_files_groups <- function(files, col.names, num.clusters, num.samples, asinh.cofactor, + downsample.to, output.dir, negative.values) { tab <- NULL orig.data <- NULL @@ -123,7 +124,7 @@ process_files_groups <- function(files, col.names, num.clusters, num.samples, as #' @param f The file path #' @inheritParams cluster_fcs_files #' -process_file <- function(f, col.names, num.clusters, num.samples, asinh.cofactor, output.dir) { +process_file <- function(f, col.names, num.clusters, num.samples, asinh.cofactor, output.dir, negative.values) { fcs.file <- flowCore::read.FCS(f) orig.data <- flowCore::exprs(fcs.file) tab <- convert_fcs(fcs.file, asinh.cofactor) @@ -199,18 +200,20 @@ cluster_fcs_files_in_dir <- function(wd, ...) 
{ #' @param num.cores Number of CPU cores to use #' @param col.names A vector of column names indicating which columns should be used for clustering #' @param num.clusters The desired number of clusters -#' @param asinh.cofactor Cofactor for asinh transformation. If this is \code{NULL} no transformation is performed (see \code{convert_fcs}) #' @param num.samples Number of samples to be used for the CLARA algorithm (see \code{cluster::clara}) #' @param output.dir The name of the output directory, it will be created if it does not exist +#' @inheritParams convert_fcs #' @return Returns either \code{NULL} or a \code{try-error} object if some error occurred during the computation #' @export -cluster_fcs_files <- function(files.list, num.cores, col.names, num.clusters, asinh.cofactor, num.samples = 50, output.dir = ".") { +cluster_fcs_files <- function(files.list, num.cores, col.names, num.clusters, asinh.cofactor, + num.samples = 50, output.dir = ".", negative.values = "truncate") { if(!dir.exists(output.dir)) dir.create(output.dir, recursive = TRUE, showWarnings = TRUE) parallel::mclapply(files.list, mc.cores = num.cores, mc.preschedule = FALSE, process_file, col.names = col.names, num.clusters = num.clusters, - num.samples = num.samples, asinh.cofactor = asinh.cofactor, output.dir = output.dir) + num.samples = num.samples, asinh.cofactor = asinh.cofactor, + output.dir = output.dir, negative.values = negative.values) } @@ -227,7 +230,7 @@ cluster_fcs_files <- function(files.list, num.cores, col.names, num.clusters, as #' #' @export cluster_fcs_files_groups <- function(files.list, num.cores, col.names, num.clusters, asinh.cofactor, - num.samples = 50, downsample.to = 0, output.dir = ".") { + num.samples = 50, downsample.to = 0, output.dir = ".", negative.values = "truncate") { files.list <- lapply(names(files.list), function(x) { c(x, files.list[[x]]) @@ -238,7 +241,8 @@ cluster_fcs_files_groups <- function(files.list, num.cores, col.names, num.clust 
parallel::mclapply(files.list, mc.cores = num.cores, mc.preschedule = FALSE, process_files_groups, col.names = col.names, num.clusters = num.clusters, num.samples = num.samples, - asinh.cofactor = asinh.cofactor, downsample.to = downsample.to, output.dir = output.dir) + asinh.cofactor = asinh.cofactor, downsample.to = downsample.to, + output.dir = output.dir, negative.values = negative.values) } diff --git a/R/io.R b/R/io.R index 1c1a710..9abb6f5 100644 --- a/R/io.R +++ b/R/io.R @@ -8,14 +8,21 @@ #' #' @param f The \code{flowFrame} to convert #' @param asinh.cofactor Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed -#' @param clip.at.zero Wether to clip negative values (after transformation) at zero +#' @param negative.values How to deal with negative values in the data. If this is \code{NULL} negative values +#' are left as is. Otherwise two options are possible: +#' \itemize{ +#' \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) +#' \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will +#' be truncated to 0. This option is useful in cases where the range of data significantly extends +#' in the negatives, for instance due to compensation. 
+#' } #' @param compensate Wether to compensate the data using the compensation matrix embedded in the \code{flowFrame} (if any) #' #' @return Returns a \code{data.frame} corresponding to the data in \code{flowCore::exprs(f)} after compensation #' and transformation #' #' @export -convert_fcs <- function(f, asinh.cofactor = NULL, clip.at.zero = T, compensate = T) { +convert_fcs <- function(f, asinh.cofactor = NULL, negative.values = "truncate", compensate = T) { comp <- grep("SPILL", names(flowCore::description(f)), value = T) if(compensate && (length(comp) > 0)) { @@ -35,8 +42,13 @@ convert_fcs <- function(f, asinh.cofactor = NULL, clip.at.zero = T, compensate = if(!is.null(asinh.cofactor)) m <- asinh(m / asinh.cofactor) - if(clip.at.zero) - m[m < 0] <- 0 + if(!is.null(negative.values)) { + negative.values <- match.arg(negative.values, choices = c("truncate", "shift")) + if(negative.values == "truncate") + m[m < 0] <- 0 + else if(negative.values == "shift") + m <- shift_negative_values(m) + } tab <- data.frame(m, check.names = F, stringsAsFactors = F) @@ -51,6 +63,35 @@ convert_fcs <- function(f, asinh.cofactor = NULL, clip.at.zero = T, compensate = } +#' Shift negative values in a matrix +#' +#' This function shifts negative values in a data matrix. 
For each column vector +#' the procedure is as follows: +#' \enumerate{ +#' \item A specific quantile is calculated from the vector +#' \item If the quantile is negative then its absolute value is added to the vector +#' \item Values that are still negative are truncated at 0 +#' } +#' +#' @param m The data matrix +#' @param quantile.prob The quantile probability to use +#' @return Return the transformed data matrix +#' +shift_negative_values <- function(m, quantile.prob = 0.05) { + apply(m, 2, function(x) { + qq <- quantile(x, quantile.prob) + ret <- NULL + if(qq < 0) + ret <- x + abs(qq) + else + ret <- x + ret[ret < 0] <- 0 + return(ret) + }) +} + + + #' Get the columns that are common to a set of input tabular files #' #' @param files.list A vector of input file names. If these are text files, each file should be a tab-separated table, diff --git a/man/cluster_fcs_files.Rd b/man/cluster_fcs_files.Rd index 70fd802..3653bee 100644 --- a/man/cluster_fcs_files.Rd +++ b/man/cluster_fcs_files.Rd @@ -5,7 +5,8 @@ \title{Cluster FCS files} \usage{ cluster_fcs_files(files.list, num.cores, col.names, num.clusters, - asinh.cofactor, num.samples = 50, output.dir = ".") + asinh.cofactor, num.samples = 50, output.dir = ".", + negative.values = "truncate") } \arguments{ \item{files.list}{The files to cluster} @@ -16,11 +17,20 @@ cluster_fcs_files(files.list, num.cores, col.names, num.clusters, \item{num.clusters}{The desired number of clusters} -\item{asinh.cofactor}{Cofactor for asinh transformation. If this is \code{NULL} no transformation is performed (see \code{convert_fcs})} +\item{asinh.cofactor}{Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed} \item{num.samples}{Number of samples to be used for the CLARA algorithm (see \code{cluster::clara})} \item{output.dir}{The name of the output directory, it will be created if it does not exist} + +\item{negative.values}{How to deal with negative values in the data. 
If this is \code{NULL} negative values +are left as is. Otherwise two options are possible: +\itemize{ + \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) + \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + be truncated to 0. This option is useful in cases where the range of data significantly extends + in the negatives, for instance due to compensation. +}} } \value{ Returns either \code{NULL} or a \code{try-error} object if some error occurred during the computation diff --git a/man/cluster_fcs_files_groups.Rd b/man/cluster_fcs_files_groups.Rd index 2ed2471..f310686 100644 --- a/man/cluster_fcs_files_groups.Rd +++ b/man/cluster_fcs_files_groups.Rd @@ -6,7 +6,7 @@ \usage{ cluster_fcs_files_groups(files.list, num.cores, col.names, num.clusters, asinh.cofactor, num.samples = 50, downsample.to = 0, - output.dir = ".") + output.dir = ".", negative.values = "truncate") } \arguments{ \item{files.list}{A named list of vectors detailing how the files should be pooled before clustering. Files in the same vector will @@ -18,13 +18,22 @@ be pooled together. The name of the output is going to correspond to the name of \item{num.clusters}{The desired number of clusters} -\item{asinh.cofactor}{Cofactor for asinh transformation. If this is \code{NULL} no transformation is performed (see \code{convert_fcs})} +\item{asinh.cofactor}{Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed} \item{num.samples}{Number of samples to be used for the CLARA algorithm (see \code{cluster::clara})} \item{downsample.to}{The number of events that should be randomly sampled from each file before pooling. If this is 0, no sampling is performed} \item{output.dir}{The name of the output directory, it will be created if it does not exist} + +\item{negative.values}{How to deal with negative values in the data. If this is \code{NULL} negative values +are left as is. 
Otherwise two options are possible: +\itemize{ + \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) + \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + be truncated to 0. This option is useful in cases where the range of data significantly extends + in the negatives, for instance due to compensation. +}} } \value{ Returns either \code{NULL} or a \code{try-error} object if some error occurred during the computation diff --git a/man/cluster_fcs_files_in_dir.Rd b/man/cluster_fcs_files_in_dir.Rd index 8d05d8e..170b9e4 100644 --- a/man/cluster_fcs_files_in_dir.Rd +++ b/man/cluster_fcs_files_in_dir.Rd @@ -14,9 +14,17 @@ cluster_fcs_files_in_dir(wd, ...) \item{num.cores}{Number of CPU cores to use} \item{col.names}{A vector of column names indicating which columns should be used for clustering} \item{num.clusters}{The desired number of clusters} - \item{asinh.cofactor}{Cofactor for asinh transformation. If this is \code{NULL} no transformation is performed (see \code{convert_fcs})} \item{num.samples}{Number of samples to be used for the CLARA algorithm (see \code{cluster::clara})} \item{output.dir}{The name of the output directory, it will be created if it does not exist} + \item{asinh.cofactor}{Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed} + \item{negative.values}{How to deal with negative values in the data. If this is \code{NULL} negative values +are left as is. Otherwise two options are possible: +\itemize{ + \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) + \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + be truncated to 0. This option is useful in cases where the range of data significantly extends + in the negatives, for instance due to compensation. 
+}} }} } \value{ diff --git a/man/convert_fcs.Rd b/man/convert_fcs.Rd index 4f01978..2cfbb79 100644 --- a/man/convert_fcs.Rd +++ b/man/convert_fcs.Rd @@ -4,7 +4,7 @@ \alias{convert_fcs} \title{Convert a flowFrame to data.frame} \usage{ -convert_fcs(f, asinh.cofactor = NULL, clip.at.zero = T, +convert_fcs(f, asinh.cofactor = NULL, negative.values = "truncate", compensate = T) } \arguments{ @@ -12,7 +12,14 @@ convert_fcs(f, asinh.cofactor = NULL, clip.at.zero = T, \item{asinh.cofactor}{Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed} -\item{clip.at.zero}{Wether to clip negative values (after transformation) at zero} +\item{negative.values}{How to deal with negative values in the data. If this is \code{NULL} negative values +are left as is. Otherwise two options are possible: +\itemize{ + \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) + \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + be truncated to 0. This option is useful in cases where the range of data significantly extends + in the negatives, for instance due to compensation. +}} \item{compensate}{Wether to compensate the data using the compensation matrix embedded in the \code{flowFrame} (if any)} } diff --git a/man/process_file.Rd b/man/process_file.Rd index b7b9be9..46e52c5 100644 --- a/man/process_file.Rd +++ b/man/process_file.Rd @@ -5,7 +5,7 @@ \title{Process an individual file for clustering} \usage{ process_file(f, col.names, num.clusters, num.samples, asinh.cofactor, - output.dir) + output.dir, negative.values) } \arguments{ \item{f}{The file path} @@ -16,9 +16,18 @@ process_file(f, col.names, num.clusters, num.samples, asinh.cofactor, \item{num.samples}{Number of samples to be used for the CLARA algorithm (see \code{cluster::clara})} -\item{asinh.cofactor}{Cofactor for asinh transformation. 
If this is \code{NULL} no transformation is performed (see \code{convert_fcs})} +\item{asinh.cofactor}{Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed} \item{output.dir}{The name of the output directory, it will be created if it does not exist} + +\item{negative.values}{How to deal with negative values in the data. If this is \code{NULL} negative values +are left as is. Otherwise two options are possible: +\itemize{ + \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) + \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + be truncated to 0. This option is useful in cases where the range of data significantly extends + in the negatives, for instance due to compensation. +}} } \description{ Process an individual file for clustering diff --git a/man/process_files_groups.Rd b/man/process_files_groups.Rd index 8bebd90..e40f1d2 100644 --- a/man/process_files_groups.Rd +++ b/man/process_files_groups.Rd @@ -5,7 +5,7 @@ \title{Process a group of files for clustering} \usage{ process_files_groups(files, col.names, num.clusters, num.samples, - asinh.cofactor, downsample.to, output.dir) + asinh.cofactor, downsample.to, output.dir, negative.values) } \arguments{ \item{files}{A vector of strings. The first string in the vector corresponds to the name to be used for the clustering output, @@ -17,11 +17,20 @@ the remaining strings are the paths of the files that will be pooled together fo \item{num.samples}{Number of samples to be used for the CLARA algorithm (see \code{cluster::clara})} -\item{asinh.cofactor}{Cofactor for asinh transformation. If this is \code{NULL} no transformation is performed (see \code{convert_fcs})} +\item{asinh.cofactor}{Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed} \item{downsample.to}{The number of events that should be randomly sampled from each file before pooling. 
If this is 0, no sampling is performed} \item{output.dir}{The name of the output directory, it will be created if it does not exist} + +\item{negative.values}{How to deal with negative values in the data. If this is \code{NULL} negative values +are left as is. Otherwise two options are possible: +\itemize{ + \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) + \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + be truncated to 0. This option is useful in cases where the range of data significantly extends + in the negatives, for instance due to compensation. +}} } \description{ Process a group of files for clustering diff --git a/man/shift_negative_values.Rd b/man/shift_negative_values.Rd new file mode 100644 index 0000000..5cb4db2 --- /dev/null +++ b/man/shift_negative_values.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\name{shift_negative_values} +\alias{shift_negative_values} +\title{Shift negative values in a matrix} +\usage{ +shift_negative_values(m, quantile.prob = 0.05) +} +\arguments{ +\item{m}{The data matrix} + +\item{quantile.prob}{The quantile probability to use} +} +\value{ +Return the transformed data matrix +} +\description{ +This function shifts negative values in a data matrix. 
For each column vector +the procedure is as follows: +\enumerate{ + \item A specific quantile is calculated from the vector + \item If the quantile is negative then its absolute value is added to the vector + \item Values that are still negative are truncated at 0 +} +} From 834fd2a827ba3a2cfd6fad6bd66b512aa7fc81a8 Mon Sep 17 00:00:00 2001 From: pfgherardini Date: Fri, 21 Jun 2019 05:06:36 -0700 Subject: [PATCH 2/2] updated to use negative.values --- DESCRIPTION | 2 +- R/cluster.R | 17 +++++++++-------- R/features.R | 24 ++++++++++++++---------- R/io.R | 15 ++++++++------- inst/shinyGUI/server.R | 14 ++++++++++++-- man/cluster_fcs_files.Rd | 8 ++++++-- man/cluster_fcs_files_groups.Rd | 9 +++++++-- man/cluster_fcs_files_in_dir.Rd | 5 ++++- man/convert_fcs.Rd | 12 ++++++++---- man/get_cluster_features.Rd | 12 ++++++------ man/process_file.Rd | 8 ++++++-- man/process_files_groups.Rd | 9 +++++++-- man/shift_negative_values.Rd | 2 +- 13 files changed, 89 insertions(+), 48 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index eb1b540..d4a3fd1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: grappolo Type: Package Title: Feature generation from single-cell data -Version: 0.4.4 +Version: 0.5.1 Authors@R: "Pier Federico Gherardini [aut, cre]" Description: This is a package to cluster single-cell flow data and generate features that can be used for model building diff --git a/R/cluster.R b/R/cluster.R index 59b6ddb..217f8da 100644 --- a/R/cluster.R +++ b/R/cluster.R @@ -66,7 +66,7 @@ cluster_data <- function(tab, col.names, k, ...) 
{ #' #' process_files_groups <- function(files, col.names, num.clusters, num.samples, asinh.cofactor, - downsample.to, output.dir, negative.values) { + downsample.to, output.dir, negative.values, quantile.prob) { tab <- NULL orig.data <- NULL @@ -82,7 +82,7 @@ process_files_groups <- function(files, col.names, num.clusters, num.samples, as fcs.file <- flowCore::read.FCS(f) temp.orig.data <- flowCore::exprs(fcs.file) - temp.tab <- convert_fcs(fcs.file, asinh.cofactor) + temp.tab <- convert_fcs(fcs.file, asinh.cofactor, negative.values = negative.values, quantile.prob = quantile.prob) if(downsample.to > 0) { x <- NULL @@ -124,10 +124,10 @@ process_files_groups <- function(files, col.names, num.clusters, num.samples, as #' @param f The file path #' @inheritParams cluster_fcs_files #' -process_file <- function(f, col.names, num.clusters, num.samples, asinh.cofactor, output.dir, negative.values) { +process_file <- function(f, col.names, num.clusters, num.samples, asinh.cofactor, output.dir, negative.values, quantile.prob) { fcs.file <- flowCore::read.FCS(f) orig.data <- flowCore::exprs(fcs.file) - tab <- convert_fcs(fcs.file, asinh.cofactor) + tab <- convert_fcs(fcs.file, asinh.cofactor, negative.values = negative.values, quantile.prob = quantile.prob) m <- grappolo:::cluster_data(tab, col.names, k = num.clusters, sampsize = min(nrow(tab), 1000), samples = num.samples) colnames(m) <- gsub("groups", "cellType", colnames(m)) @@ -206,14 +206,14 @@ cluster_fcs_files_in_dir <- function(wd, ...) 
{ #' @return Returns either \code{NULL} or a \code{try-error} object if some error occurred during the computation #' @export cluster_fcs_files <- function(files.list, num.cores, col.names, num.clusters, asinh.cofactor, - num.samples = 50, output.dir = ".", negative.values = "truncate") { + num.samples = 50, output.dir = ".", negative.values = "truncate", quantile.prob = 0.05) { if(!dir.exists(output.dir)) dir.create(output.dir, recursive = TRUE, showWarnings = TRUE) parallel::mclapply(files.list, mc.cores = num.cores, mc.preschedule = FALSE, process_file, col.names = col.names, num.clusters = num.clusters, num.samples = num.samples, asinh.cofactor = asinh.cofactor, - output.dir = output.dir, negative.values = negative.values) + output.dir = output.dir, negative.values = negative.values, quantile.prob = quantile.prob) } @@ -230,7 +230,8 @@ cluster_fcs_files <- function(files.list, num.cores, col.names, num.clusters, as #' #' @export cluster_fcs_files_groups <- function(files.list, num.cores, col.names, num.clusters, asinh.cofactor, - num.samples = 50, downsample.to = 0, output.dir = ".", negative.values = "truncate") { + num.samples = 50, downsample.to = 0, output.dir = ".", negative.values = "truncate", + quantile.prob = 0.05) { files.list <- lapply(names(files.list), function(x) { c(x, files.list[[x]]) @@ -242,7 +243,7 @@ cluster_fcs_files_groups <- function(files.list, num.cores, col.names, num.clust parallel::mclapply(files.list, mc.cores = num.cores, mc.preschedule = FALSE, process_files_groups, col.names = col.names, num.clusters = num.clusters, num.samples = num.samples, asinh.cofactor = asinh.cofactor, downsample.to = downsample.to, - output.dir = output.dir, negative.values = negative.values) + output.dir = output.dir, negative.values = negative.values, quantile.prob = quantile.prob) } diff --git a/R/features.R b/R/features.R index 2c638db..2df3b1f 100644 --- a/R/features.R +++ b/R/features.R @@ -86,10 +86,10 @@ multistep_normalize <- function(tab, 
norm.template, subject.var) { #' stimulation conditions. #' In this case the \code{metadata.tab} would look like this #' \itemize{ -#' \item{\code{file}}{The names of the data files that contain data for each sample. These must match the names in the clustering results (see above)} -#' \item{\code{timepoint}}{The timepoint information} -#' \item{\code{condition}}{The stimulation condition} -#' \item{\code{subject}}{The subjet each file was derived from} +#' \item{\code{file}}{ The names of the data files that contain data for each sample. These must match the names in the clustering results (see above)} +#' \item{\code{timepoint}}{ The timepoint information} +#' \item{\code{condition}}{ The stimulation condition} +#' \item{\code{subject}}{ The subjet each file was derived from} #' } #' Let's assume a few different scenarios. #' \enumerate{ @@ -111,7 +111,7 @@ multistep_normalize <- function(tab, norm.template, subject.var) { #' (see Details). The combination of \code{predictors} and \code{endpoint.grouping} must uniquely identify every row in \code{metadata.tab}. #' The function will throw an error if this is not the case. 
#' @param filename.col The name of the column in \code{metadata.tab} that is used to identify the file names in tab -#' @return Returns a data frame whose format depends on the value of the \code{format} parameter +#' @return Returns a data frame whose format depends on the value of the \code{out.format} parameter #' \itemize{ #' \item{table}: each row corresponds to a combination of the levels of the variables specified in \code{endpoint.grouping}, and the columns are #' cluster features, which are combinations of the levels of the \code{predictors} for each feature specified in \code{features.names} @@ -119,12 +119,16 @@ multistep_normalize <- function(tab, norm.template, subject.var) { #' } #' @export -get_cluster_features <- function(tab, metadata.tab, features.names, out.format = "table", predictors = NULL, endpoint.grouping = NULL, filename.col = "file") { +get_cluster_features <- function(tab, metadata.tab, features.names, out.format = "table", predictors = NULL, + endpoint.grouping = NULL, filename.col = "file", transform.popsize = TRUE) { out.format <- match.arg(out.format, c("table", "tidy")) - m <- reshape_cluster_features(tab, features.names) + m <- reshape_cluster_features(tab, features.names, transform.popsize) df <- reshape::melt(m, varnames = c(filename.col, "variable")) + # Restore the original value of filename.col + names(df) <- gsub(make.names(filename.col), filename.col, names(df)) + df <- merge(df, metadata.tab, by = filename.col) ret <- NULL @@ -143,7 +147,7 @@ get_cluster_features <- function(tab, metadata.tab, features.names, out.format = ret <- reshape::cast(df, formula.exp) } - return(ret) + return(data.frame(ret, check.names = FALSE, stringsAsFactors = FALSE)) } @@ -171,7 +175,7 @@ transpose_feature_matrix <- function(m) { } -reshape_cluster_features <- function(input.tab, features) { +reshape_cluster_features <- function(input.tab, features, transform.popsize = TRUE) { col.names <- sapply(features, paste, "@", sep = "") col.names <- 
paste(col.names, collapse = "|") col.names <- grep(col.names, names(input.tab), value = T) @@ -183,7 +187,7 @@ reshape_cluster_features <- function(input.tab, features) { temp <- m[, grep(sprintf("%s@", s), colnames(m))] temp[is.na(temp)] <- 0 - if(s == "popsize") { + if(s == "popsize" && transform.popsize) { temp <- t(temp) temp <- temp / rowSums(temp) temp <- t(temp) diff --git a/R/io.R b/R/io.R index 9abb6f5..4e53b92 100644 --- a/R/io.R +++ b/R/io.R @@ -8,21 +8,22 @@ #' #' @param f The \code{flowFrame} to convert #' @param asinh.cofactor Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed +#' @param compensate Wether to compensate the data using the compensation matrix embedded in the \code{flowFrame} (if any) #' @param negative.values How to deal with negative values in the data. If this is \code{NULL} negative values #' are left as is. Otherwise two options are possible: #' \itemize{ #' \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) -#' \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will +#' \item{\code{shift}}: The data will be shifted so that only \code{quantile.prob} of the values for each channel will #' be truncated to 0. This option is useful in cases where the range of data significantly extends #' in the negatives, for instance due to compensation. #' } -#' @param compensate Wether to compensate the data using the compensation matrix embedded in the \code{flowFrame} (if any) -#' +#' @param quantile.prob Only used if \code{negative.value} is set to \code{shift}. The quantile of measurements +#' that are going to be truncated to 0. 
For instance if this is 0.05, the data will be shifted so that +#' only 5 percent of the values are negative and will be truncated to 0 #' @return Returns a \code{data.frame} corresponding to the data in \code{flowCore::exprs(f)} after compensation #' and transformation -#' #' @export -convert_fcs <- function(f, asinh.cofactor = NULL, negative.values = "truncate", compensate = T) { +convert_fcs <- function(f, asinh.cofactor = NULL, compensate = T, negative.values = "truncate", quantile.prob = 0.05) { comp <- grep("SPILL", names(flowCore::description(f)), value = T) if(compensate && (length(comp) > 0)) { @@ -47,7 +48,7 @@ convert_fcs <- function(f, asinh.cofactor = NULL, negative.values = "truncate", if(negative.values == "truncate") m[m < 0] <- 0 else if(negative.values == "shift") - m <- shift_negative_values(m) + m <- shift_negative_values(m, quantile.prob) } tab <- data.frame(m, check.names = F, stringsAsFactors = F) @@ -77,7 +78,7 @@ convert_fcs <- function(f, asinh.cofactor = NULL, negative.values = "truncate", #' @param quantile.prob The quantile probability to use #' @return Return the transformed data matrix #' -shift_negative_values <- function(m, quantile.prob = 0.05) { +shift_negative_values <- function(m, quantile.prob) { apply(m, 2, function(x) { qq <- quantile(x, quantile.prob) ret <- NULL diff --git a/inst/shinyGUI/server.R b/inst/shinyGUI/server.R index 87f1c09..572a6a5 100644 --- a/inst/shinyGUI/server.R +++ b/inst/shinyGUI/server.R @@ -45,11 +45,15 @@ render_clustering_ui <- function(working.directory, ...) 
{renderUI({ ), fluidRow( column(12, - numericInput("clusteringui_num_clusters", "Number of clusters", value = 200, min = 1, max = 2000), numericInput("clusteringui_num_samples", "Number of samples (lower numbers lead to faster but less accurate results)", value = 50, min = 2), numericInput("clusteringui_asinh_cofactor", "Cofactor for asinh transformation", value = 5), numericInput("clusteringui_num_cores", "Number of CPU cores to use", value = 1), + selectInput("clusteringui_negative_values", "Negative vaues", choices = c("truncate", "shift")), + conditionalPanel( + condition = "input.clusteringui_negative_values == 'shift'", + numericInput("clusteringui_quantile_prob", "Quantile probability", value = 0.05, min = 0, max = 1, step = 0.01) + ), actionButton("clusteringui_start", "Start clustering") ) ) @@ -106,6 +110,8 @@ shinyServer(function(input, output, session) { num.samples <- force(input$clusteringui_num_samples) downsample.to <- force(input$clusteringui_downsample_to) output.dir <- force(working.directory) + negative.values <- force(input$clusteringui_negative_values) + quantile.prob <- force(input$clusteringui_quantile_prob) if(input$clusteringui_clustering_mode == "Pooled") { input.files <- lapply(clusteringui_reactive_values$clustering_groups, function(s) {file.path(working.directory, s)}) @@ -116,6 +122,8 @@ shinyServer(function(input, output, session) { asinh.cofactor = asinh.cofactor, num.samples = num.samples, downsample.to = downsample.to, + negative.values = negative.values, + quantile.prob = quantile.prob, output.dir = output.dir ) } else { @@ -125,7 +133,9 @@ shinyServer(function(input, output, session) { num.clusters = num.clusters, asinh.cofactor = asinh.cofactor, num.samples = num.samples, - output.dir = output.dir + output.dir = output.dir, + negative.values = negative.values, + quantile.prob = quantile.prob ) } diff --git a/man/cluster_fcs_files.Rd b/man/cluster_fcs_files.Rd index 3653bee..0964b36 100644 --- a/man/cluster_fcs_files.Rd +++ 
b/man/cluster_fcs_files.Rd @@ -6,7 +6,7 @@ \usage{ cluster_fcs_files(files.list, num.cores, col.names, num.clusters, asinh.cofactor, num.samples = 50, output.dir = ".", - negative.values = "truncate") + negative.values = "truncate", quantile.prob = 0.05) } \arguments{ \item{files.list}{The files to cluster} @@ -27,10 +27,14 @@ cluster_fcs_files(files.list, num.cores, col.names, num.clusters, are left as is. Otherwise two options are possible: \itemize{ \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) - \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + \item{\code{shift}}: The data will be shifted so that only \code{quantile.prob} of the values for each channel will be truncated to 0. This option is useful in cases where the range of data significantly extends in the negatives, for instance due to compensation. }} + +\item{quantile.prob}{Only used if \code{negative.value} is set to \code{shift}. The quantile of measurements +that are going to be truncated to 0. For instance if this is 0.05, the data will be shifted so that +only 5 percent of the values are negative and will be truncated to 0} } \value{ Returns either \code{NULL} or a \code{try-error} object if some error occurred during the computation diff --git a/man/cluster_fcs_files_groups.Rd b/man/cluster_fcs_files_groups.Rd index f310686..b7d8905 100644 --- a/man/cluster_fcs_files_groups.Rd +++ b/man/cluster_fcs_files_groups.Rd @@ -6,7 +6,8 @@ \usage{ cluster_fcs_files_groups(files.list, num.cores, col.names, num.clusters, asinh.cofactor, num.samples = 50, downsample.to = 0, - output.dir = ".", negative.values = "truncate") + output.dir = ".", negative.values = "truncate", + quantile.prob = 0.05) } \arguments{ \item{files.list}{A named list of vectors detailing how the files should be pooled before clustering. Files in the same vector will @@ -30,10 +31,14 @@ be pooled together. 
The name of the output is going to correspond to the name of are left as is. Otherwise two options are possible: \itemize{ \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) - \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + \item{\code{shift}}: The data will be shifted so that only \code{quantile.prob} of the values for each channel will be truncated to 0. This option is useful in cases where the range of data significantly extends in the negatives, for instance due to compensation. }} + +\item{quantile.prob}{Only used if \code{negative.values} is set to \code{shift}. The quantile of measurements +that are going to be truncated to 0. For instance if this is 0.05, the data will be shifted so that +only 5 percent of the values are negative and will be truncated to 0} } \value{ Returns either \code{NULL} or a \code{try-error} object if some error occurred during the computation diff --git a/man/cluster_fcs_files_in_dir.Rd b/man/cluster_fcs_files_in_dir.Rd index 170b9e4..f12b4d2 100644 --- a/man/cluster_fcs_files_in_dir.Rd +++ b/man/cluster_fcs_files_in_dir.Rd @@ -21,10 +21,13 @@ cluster_fcs_files_in_dir(wd, ...) are left as is. Otherwise two options are possible: \itemize{ \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) - \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + \item{\code{shift}}: The data will be shifted so that only \code{quantile.prob} of the values for each channel will be truncated to 0. This option is useful in cases where the range of data significantly extends in the negatives, for instance due to compensation. }} + \item{quantile.prob}{Only used if \code{negative.values} is set to \code{shift}. The quantile of measurements +that are going to be truncated to 0. 
For instance if this is 0.05, the data will be shifted so that +only 5 percent of the values are negative and will be truncated to 0} }} } \value{ diff --git a/man/convert_fcs.Rd b/man/convert_fcs.Rd index 2cfbb79..8a13a1f 100644 --- a/man/convert_fcs.Rd +++ b/man/convert_fcs.Rd @@ -4,24 +4,28 @@ \alias{convert_fcs} \title{Convert a flowFrame to data.frame} \usage{ -convert_fcs(f, asinh.cofactor = NULL, negative.values = "truncate", - compensate = T) +convert_fcs(f, asinh.cofactor = NULL, compensate = T, + negative.values = "truncate", quantile.prob = 0.05) } \arguments{ \item{f}{The \code{flowFrame} to convert} \item{asinh.cofactor}{Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed} +\item{compensate}{Whether to compensate the data using the compensation matrix embedded in the \code{flowFrame} (if any)} + \item{negative.values}{How to deal with negative values in the data. If this is \code{NULL} negative values are left as is. Otherwise two options are possible: \itemize{ \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) - \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + \item{\code{shift}}: The data will be shifted so that only \code{quantile.prob} of the values for each channel will be truncated to 0. This option is useful in cases where the range of data significantly extends in the negatives, for instance due to compensation. }} -\item{compensate}{Wether to compensate the data using the compensation matrix embedded in the \code{flowFrame} (if any)} +\item{quantile.prob}{Only used if \code{negative.values} is set to \code{shift}. The quantile of measurements +that are going to be truncated to 0. 
For instance if this is 0.05, the data will be shifted so that +only 5 percent of the values are negative and will be truncated to 0} } \value{ Returns a \code{data.frame} corresponding to the data in \code{flowCore::exprs(f)} after compensation diff --git a/man/get_cluster_features.Rd b/man/get_cluster_features.Rd index 525ee4f..dba8d08 100644 --- a/man/get_cluster_features.Rd +++ b/man/get_cluster_features.Rd @@ -6,7 +6,7 @@ \usage{ get_cluster_features(tab, metadata.tab, features.names, out.format = "table", predictors = NULL, endpoint.grouping = NULL, - filename.col = "file") + filename.col = "file", transform.popsize = TRUE) } \arguments{ \item{tab}{A \code{data.frame} representing clustering results, as produced by \code{cluster_data} (see Details)} @@ -27,7 +27,7 @@ The function will throw an error if this is not the case.} \item{filename.col}{The name of the column in \code{metadata.tab} that is used to identify the file names in tab} } \value{ -Returns a data frame whose format depends on the value of the \code{format} parameter +Returns a data frame whose format depends on the value of the \code{out.format} parameter \itemize{ \item{table}: each row corresponds to a combination of the levels of the variables specified in \code{endpoint.grouping}, and the columns are cluster features, which are combinations of the levels of the \code{predictors} for each feature specified in \code{features.names} @@ -47,10 +47,10 @@ An example will help clarify the working of this function. Suppose you have coll stimulation conditions. In this case the \code{metadata.tab} would look like this \itemize{ - \item{\code{file}}{The names of the data files that contain data for each sample. 
These must match the names in the clustering results (see above)} - \item{\code{timepoint}}{The timepoint information} - \item{\code{condition}}{The stimulation condition} - \item{\code{subject}}{The subjet each file was derived from} + \item{\code{file}}{ The names of the data files that contain data for each sample. These must match the names in the clustering results (see above)} + \item{\code{timepoint}}{ The timepoint information} + \item{\code{condition}}{ The stimulation condition} + \item{\code{subject}}{ The subject each file was derived from} } Let's assume a few different scenarios. \enumerate{ diff --git a/man/process_file.Rd b/man/process_file.Rd index 46e52c5..f6ac51d 100644 --- a/man/process_file.Rd +++ b/man/process_file.Rd @@ -5,7 +5,7 @@ \title{Process an individual file for clustering} \usage{ process_file(f, col.names, num.clusters, num.samples, asinh.cofactor, - output.dir, negative.values) + output.dir, negative.values, quantile.prob) } \arguments{ \item{f}{The file path} @@ -24,10 +24,14 @@ process_file(f, col.names, num.clusters, num.samples, asinh.cofactor, are left as is. Otherwise two options are possible: \itemize{ \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) - \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + \item{\code{shift}}: The data will be shifted so that only \code{quantile.prob} of the values for each channel will be truncated to 0. This option is useful in cases where the range of data significantly extends in the negatives, for instance due to compensation. }} + +\item{quantile.prob}{Only used if \code{negative.values} is set to \code{shift}. The quantile of measurements +that are going to be truncated to 0. 
For instance if this is 0.05, the data will be shifted so that +only 5 percent of the values are negative and will be truncated to 0} } \description{ Process an individual file for clustering diff --git a/man/process_files_groups.Rd b/man/process_files_groups.Rd index e40f1d2..b160a3f 100644 --- a/man/process_files_groups.Rd +++ b/man/process_files_groups.Rd @@ -5,7 +5,8 @@ \title{Process a group of files for clustering} \usage{ process_files_groups(files, col.names, num.clusters, num.samples, - asinh.cofactor, downsample.to, output.dir, negative.values) + asinh.cofactor, downsample.to, output.dir, negative.values, + quantile.prob) } \arguments{ \item{files}{A vector of strings. The first string in the vector corresponds to the name to be used for the clustering output, @@ -27,10 +28,14 @@ the remaining strings are the paths of the files that will be pooled together fo are left as is. Otherwise two options are possible: \itemize{ \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0) - \item{\code{shift}}: The data will be shifted so that only 5 percent of the values for each channel will + \item{\code{shift}}: The data will be shifted so that only \code{quantile.prob} of the values for each channel will be truncated to 0. This option is useful in cases where the range of data significantly extends in the negatives, for instance due to compensation. }} + +\item{quantile.prob}{Only used if \code{negative.values} is set to \code{shift}. The quantile of measurements +that are going to be truncated to 0. 
For instance if this is 0.05, the data will be shifted so that +only 5 percent of the values are negative and will be truncated to 0} } \description{ Process a group of files for clustering diff --git a/man/shift_negative_values.Rd b/man/shift_negative_values.Rd index 5cb4db2..2aa3b9c 100644 --- a/man/shift_negative_values.Rd +++ b/man/shift_negative_values.Rd @@ -4,7 +4,7 @@ \alias{shift_negative_values} \title{Shift negative values in a matrix} \usage{ -shift_negative_values(m, quantile.prob = 0.05) +shift_negative_values(m, quantile.prob) } \arguments{ \item{m}{The data matrix}