Skip to content

Commit

Permalink
Merge pull request #258 from VEuPathDB/scatter-correlations
Browse files Browse the repository at this point in the history
Scatter correlations
  • Loading branch information
d-callan authored May 1, 2024
2 parents ecdf2b2 + 0cb5f6c commit b71a74d
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 1 deletion.
40 changes: 39 additions & 1 deletion R/class-plotdata-scatter.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ newScatterPD <- function(.dt = data.table::data.table(),
value = character(),
useGradientColorscale = FALSE,
overlayValues = veupathUtils::BinList(),
correlationMethod = character(),
sampleSizes = logical(),
completeCases = logical(),
evilMode = character(),
Expand Down Expand Up @@ -30,18 +31,34 @@ newScatterPD <- function(.dt = data.table::data.table(),
group <- veupathUtils::findColNamesFromPlotRef(variables, 'overlay')
panel <- findPanelColName(veupathUtils::findVariableSpecFromPlotRef(variables, 'facet1'),
veupathUtils::findVariableSpecFromPlotRef(variables, 'facet2'))

dtForCorr <- data.table::as.data.table(.pd)

if (useGradientColorscale) {
#series data w gradient
.pd$overlayMissingData <- is.na(.pd[[group]])
series <- collapseByGroup(.pd, group = 'overlayMissingData', panel)
.pd$overlayMissingData <- NULL
series$overlayMissingData <- NULL
data.table::setnames(series, c(panel, 'seriesX', 'seriesY', 'seriesGradientColorscale'))

# corr results w gradient, same as w/o groups so set group to NULL
dtForCorr[[group]] <- NULL
if (correlationMethod != 'none') {
corrResult <- groupCorrelation(dtForCorr, x, y, NULL, panel, correlationMethod = correlationMethod)
}
} else {
#series data w/o gradient
series <- collapseByGroup(.pd, group, panel)
data.table::setnames(series, c(group, panel, 'seriesX', 'seriesY'))

# corr results w/o gradient
if (correlationMethod != 'none') {
corrResult <- groupCorrelation(dtForCorr, x, y, group, panel, correlationMethod = correlationMethod)
}
}

veupathUtils::logWithTime('Calculated correlation results per group.', verbose)

if (xType == 'DATE') {
series$seriesX <- lapply(series$seriesX, format, '%Y-%m-%d')
} else {
Expand Down Expand Up @@ -103,6 +120,17 @@ newScatterPD <- function(.dt = data.table::data.table(),
} else {
.pd <- series
}

if (correlationMethod != 'none') {
if (!is.null(key(.pd))) {
.pd <- merge(.pd, corrResult)
} else {
.pd <- cbind(.pd, corrResult)
}

attr$correlationMethod <- jsonlite::unbox(correlationMethod)
}

attr$names <- names(.pd)
if (useGradientColorscale) attr$useGradientColorscale <- useGradientColorscale

Expand Down Expand Up @@ -160,6 +188,8 @@ validateScatterPD <- function(.scatter, verbose) {
#' to include raw data with smoothed mean. Note only 'raw' is compatible with a continuous
#' overlay variable.
#' @param overlayValues veupathUtils::BinList providing overlay values of interest
#' @param correlationMethod character indicating which correlation method to use. One of 'pearson',
#' 'spearman', 'sparcc' or 'none'. Default is 'none'.
#' @param sampleSizes boolean indicating if sample sizes should be computed
#' @param completeCases boolean indicating if complete cases should be computed
#' @param evilMode String indicating how evil this plot is ('strataVariables', 'allVariables', 'noVariables')
Expand Down Expand Up @@ -207,6 +237,7 @@ scattergl.dt <- function(data,
'density',
'raw'),
overlayValues = NULL,
correlationMethod = c('none','pearson', 'sparcc', 'spearman'),
sampleSizes = c(TRUE, FALSE),
completeCases = c(TRUE, FALSE),
evilMode = c('noVariables', 'allVariables', 'strataVariables'),
Expand All @@ -216,6 +247,7 @@ scattergl.dt <- function(data,

if (!inherits(variables, 'VariableMetadataList')) stop("The `variables` argument must be a VariableMetadataList object.")
value <- veupathUtils::matchArg(value)
correlationMethod <- veupathUtils::matchArg(correlationMethod)
sampleSizes <- veupathUtils::matchArg(sampleSizes)
completeCases <- veupathUtils::matchArg(completeCases)
evilMode <- veupathUtils::matchArg(evilMode)
Expand Down Expand Up @@ -269,6 +301,7 @@ scattergl.dt <- function(data,
value = value,
useGradientColorscale = useGradientColorscale,
overlayValues = overlayValues,
correlationMethod = correlationMethod,
sampleSizes = sampleSizes,
completeCases = completeCases,
inferredVarAxis = 'y',
Expand All @@ -277,6 +310,7 @@ scattergl.dt <- function(data,

.scatter <- validateScatterPD(.scatter, verbose)
veupathUtils::logWithTime(paste('New scatter plot object created with parameters value =', value,
', correlationMethod =', correlationMethod,
', sampleSizes = ', sampleSizes,
', completeCases = ', completeCases,
', evilMode =', evilMode,
Expand Down Expand Up @@ -318,6 +352,8 @@ scattergl.dt <- function(data,
#' 'density' estimates (no raw data returned), alternatively 'smoothedMeanWithRaw' to include raw
#' data with smoothed mean. Note only 'raw' is compatible with a continuous overlay variable.
#' @param overlayValues veupathUtils::BinList providing overlay values of interest
#' @param correlationMethod character indicating which correlation method to use. One of 'pearson',
#' 'spearman','sparcc' or 'none'. Default is 'none'.
#' @param sampleSizes boolean indicating if sample sizes should be computed
#' @param completeCases boolean indicating if complete cases should be computed
#' @param evilMode String indicating how evil this plot is ('strataVariables', 'allVariables', 'noVariables')
Expand Down Expand Up @@ -365,6 +401,7 @@ scattergl <- function(data,
'density',
'raw'),
overlayValues = NULL,
correlationMethod = c('none','pearson', 'sparcc', 'spearman'),
sampleSizes = c(TRUE, FALSE),
completeCases = c(TRUE, FALSE),
evilMode = c('noVariables', 'allVariables', 'strataVariables'),
Expand All @@ -376,6 +413,7 @@ scattergl <- function(data,
variables,
value = value,
overlayValues = overlayValues,
correlationMethod = correlationMethod,
sampleSizes = sampleSizes,
completeCases = completeCases,
evilMode = evilMode,
Expand Down
36 changes: 36 additions & 0 deletions R/group.R
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,42 @@ groupDensity <- function(data, x = NULL, y, group = NULL, panel = NULL, geo = NU
return(dt)
}

## NOTE: if this is done right, we shouldnt have to collapse the result ever
## it should be that each group gets a single value for each of dataX, dataY, correlationCoef, pValue
## the only way that wouldnt be true was if we had more than two cols in .SD, which shouldnt be the case
groupCorrelation <- function(
data,
x,
y,
group = NULL,
panel = NULL,
geo = NULL,
correlationMethod = c('pearson', 'spearman', 'sparcc')
) {
veupathUtils::matchArg(correlationMethod)

if (length(dt) > 2) {
stop('Correlation can only be computed for two variables.')
}

byCols <- colnames(data)[colnames(data) %in% c(group, geo, panel)]
if (all(is.null(c(group,geo,panel)))) {
dt <- data[, {corrResult <- correlationOrEmpty(.SD, method = correlationMethod);
list(correlationCoef = corrResult$correlationCoef,
pValue = corrResult$pValue)}]
} else {
dt <- data[, {corrResult <- correlationOrEmpty(.SD, method = correlationMethod);
list(correlationCoef = corrResult$correlationCoef,
pValue = corrResult$pValue)},
keyby=eval(byCols)]
}

indexCols <- c(panel, geo, group)
setkeyv(dt, indexCols)

return(dt)
}

#' @importFrom purrr reduce
groupSmoothedMean <- function(data, x, y, group = NULL, panel = NULL, geo = NULL, collapse = TRUE) {

Expand Down
21 changes: 21 additions & 0 deletions R/utils-stats.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,24 @@
## this wraps veupathUtils::correlation, except wont err for an empty dt
## it also returns the dt of correlation coeficients and p-values rather
## than the ComputeResult object
correlationOrEmpty <- function(dt, method = c('pearson', 'spearman', 'sparcc')) {
method <- veupathUtils::matchArg(method)

emptyResult <- data.table::data.table(dataX = NA_character_, dataY = NA_character_, correlationCoef = NA_real_, pValue = NA_real_)
if (is.null(dt)) return(emptyResult)
if (nrow(dt) == 0) return(emptyResult)

result <- try(suppressWarnings(veupathUtils::correlation(dt, NULL, method = method, format = 'data.table', verbose = FALSE)), silent = TRUE)

if (any(veupathUtils::is.error(result)) || nrow(result) == 0) {
emptyResult$dataX <- names(dt)[1]
emptyResult$dataY <- names(dt)[2]
return(emptyResult)
} else {
return(result)
}
}

formatPValue <- function(pvalue) {
if (pvalue < 0.0001) return("<0.0001")
return(as.character(signif(pvalue, 2)))
Expand Down
4 changes: 4 additions & 0 deletions man/scattergl.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions man/scattergl.dt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 37 additions & 0 deletions tests/testthat/test-scattergl.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ test_that("scatter.dt does not fail when there are no complete cases.", {
expect_equal(attr$completeCasesAllVars[1], 0)
expect_equal(is.list(dt$densityX), TRUE)
expect_equal(is.list(dt$densityY), TRUE)

dt <- scattergl.dt(df, variables, value='raw', correlationMethod = 'pearson')
attr <- attributes(dt)
expect_equal(attr$completeCasesAllVars[1], 0)
expect_equal(is.list(dt$seriesX), TRUE)
expect_equal(is.list(dt$seriesY), TRUE)

variables <- new("VariableMetadataList", SimpleList(
new("VariableMetadata",
Expand Down Expand Up @@ -129,6 +135,14 @@ test_that("scattergl.dt() returns a valid plot.data scatter object", {
namedAttrList <- getPDAttributes(dt)
expect_equal(names(namedAttrList),c('variables'))

# make sure correlation coef and pvalue is returned if there is a correlationMethod
dt <- scattergl.dt(df, variables, 'raw', correlationMethod = 'pearson')
expect_is(dt, 'plot.data')
expect_is(dt, 'scatterplot')
namedAttrList <- getPDAttributes(dt)
expect_equal(names(namedAttrList),c('variables', 'completeCasesAllVars','completeCasesAxesVars','completeCasesTable','sampleSizeTable','correlationMethod'))
expect_equal(length(namedAttrList$correlationMethod), 1)

})

test_that("scattergl.dt() returns plot data and config of the appropriate types", {
Expand Down Expand Up @@ -180,6 +194,13 @@ test_that("scattergl.dt() returns plot data and config of the appropriate types"
expect_equal(class(unlist(sampleSizes$entity.cat4)), 'character')
expect_equal(class(unlist(sampleSizes$size)), 'integer')

# check types of correlation results when there is a correlationMethod
dt <- scattergl.dt(df, variables, 'raw', correlationMethod = 'pearson')
expect_equal(class(dt$correlationCoef), 'numeric')
expect_equal(class(dt$pValue), 'numeric')
namedAttrList <- getPDAttributes(dt)
expect_equal(class(namedAttrList$correlationMethod),c('scalar', 'character'))

variables <- new("VariableMetadataList", SimpleList(
new("VariableMetadata",
variableClass = new("VariableClass", value = 'native'),
Expand Down Expand Up @@ -282,6 +303,12 @@ test_that("scattergl.dt() returns an appropriately sized data.table", {
expect_equal(nrow(dt),12)
expect_equal(names(dt),c('entity.cat4', 'entity.cat3', 'seriesX', 'seriesY', 'bestFitLineX', 'bestFitLineY', 'r2'))

# should see some new cols if we have a correlationMethod
dt <- scattergl.dt(df, variables, 'bestFitLineWithRaw', correlationMethod = 'pearson')
expect_is(dt, 'data.table')
expect_equal(nrow(dt),12)
expect_equal(names(dt),c('entity.cat4', 'entity.cat3', 'seriesX', 'seriesY', 'bestFitLineX', 'bestFitLineY', 'r2', 'correlationCoef', 'pValue'))

variables <- new("VariableMetadataList", SimpleList(
new("VariableMetadata",
variableClass = new("VariableClass", value = 'native'),
Expand Down Expand Up @@ -900,6 +927,16 @@ test_that("scattergl() returns appropriately formatted json", {
expect_equal(names(jsonList$completeCasesTable$variableDetails), c('variableId','entityId'))
expect_equal(jsonList$completeCasesTable$variableDetails$variableId, c('contA', 'contB', 'cat3', 'cat4'))

# check json for correlations when correlationMethod is not none
dt <- scattergl.dt(df, variables, 'smoothedMeanWithRaw', correlationMethod = 'pearson')
outJson <- getJSON(dt, FALSE)
jsonList <- jsonlite::fromJSON(outJson)
expect_equal(names(jsonList$scatterplot$data),c('facetVariableDetails','overlayVariableDetails','seriesX','seriesY','smoothedMeanX','smoothedMeanY','smoothedMeanSE','smoothedMeanError','correlationCoef','pValue'))
expect_equal(names(jsonList$scatterplot$config), c('variables','completeCasesAllVars','completeCasesAxesVars','correlationMethod'))
expect_equal(jsonList$scatterplot$config$correlationMethod, 'pearson')
expect_equal(class(jsonList$scatterplot$data$correlationCoef), 'numeric')
expect_equal(class(jsonList$scatterplot$data$pValue), 'numeric')

# Continuous overlay with > 8 values
variables <- new("VariableMetadataList", SimpleList(
new("VariableMetadata",
Expand Down

0 comments on commit b71a74d

Please sign in to comment.