Skip to content

Commit

Permalink
Merge pull request #132 from OxfordIHTM/dev
Browse files Browse the repository at this point in the history
add tests for process functions; fix #68; fix #69; fix #70; fix #85; fix #131
  • Loading branch information
ernestguevarra authored Sep 21, 2024
2 parents 2a85270 + 1d4b358 commit 1ee2214
Show file tree
Hide file tree
Showing 9 changed files with 238 additions and 190 deletions.
65 changes: 48 additions & 17 deletions R/pact_process.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@
#' Process variable of interest from Pandemic PACT website data by a grouping
#' variable
#'
#' @param df A data.frame of the Pandemic PACT dataset from the Figshare
#' repository.
#' @param topic A character value of the variable name in `df` for the topic of
#' interest.
#' @param pact_data_list_cols A data.frame for the Pandemic PACT dataset read
#' from the Pandemic PACT website that has already been pre-processed to have
#' list columns for nested variables. This is usually obtained via a call to
#' `pact_process_website()` with `col_list = TRUE`.
#' @param topic A character value of the variable name in `pact_data` for the
#' topic of interest.
#' @param group A character value or vector of up to two values of the variable
#' name/s in `df` to use as grouping variable/s. When specified as NULL
#' (default), no grouping is applied to come up with value of `outcome`
#' based on the `topic` of interest. When specified as NULL (default), no
#' grouping is applied.
#' name/s in `pact_data` to use as grouping variable/s. When specified as
#' NULL (default), no grouping is applied to come up with value of `outcome`
#' based on the `topic` of interest.
#' @param outcome The type of outcome. Either *"frequency"* or *"money"*.
#' Default is *"frequency"*.
#' @param na_values A character value or vector of values for strings to be
#' considered as NA for `topic` and `group`. If NULL (default), `topic` and
#' `group` are kept as is.
#'
#' @returns A data.frame structured based on specification. If `group` is NULL,
#' the data.frame presents values for `topic` as first column and then either
Expand All @@ -26,16 +30,29 @@
#'
#' @examples
#' \dontrun{
#' df <- pact_read_website()
#' df <- pact_read_website() |> pact_process_website()
#' pact_process_topic_group(df, topic = "Disease")
#' }
#'
#' @rdname pact_process
#' @export
#'

pact_process_topic_group <- function(df, topic, group = NULL,
outcome = c("frequency", "money")) {
pact_process_topic_group <- function(pact_data_list_cols,
topic, group = NULL,
outcome = c("frequency", "money"),
na_values = NULL) {
## Check for list columns ----
if (all(lapply(pact_data_list_cols[ , nested_vars], FUN = class) == "list")) {
pact_data <- pact_data_list_cols
} else {
stop(
"`pact_data_list_cols` doesn't seem to have the expected list columns. ",
"Please check that you have used `col_list = TRUE` when using ",
"`pact_process_website()` and try again."
)
}

## Get outcome value ----
outcome <- match.arg(outcome)

Expand All @@ -58,10 +75,10 @@ pact_process_topic_group <- function(df, topic, group = NULL,

## Check if topic is a nested variable and unnest if so ----
if (topic %in% nested_vars) {
tidy_df <- df |>
tidy_df <- pact_data |>
tidyr::unnest(cols = {{ topic }})
} else {
tidy_df <- df
tidy_df <- pact_data
}

## Check if group is/are nested variable/s and unnest if so ----
Expand Down Expand Up @@ -93,6 +110,17 @@ pact_process_topic_group <- function(df, topic, group = NULL,
}
}

## Handle NA values ----
if (!is.null(na_values)) {
tidy_df <- tidy_df |>
dplyr::mutate(
dplyr::across(
.cols = dplyr::contains(c(group, topic)),
.fns = ~ifelse(.x %in% na_values, NA_character_, .x)
)
)
}

## Frequencies or monies ----
if (outcome == "frequency") {
tidy_df <- parse(
Expand Down Expand Up @@ -123,14 +151,15 @@ pact_process_topic_group <- function(df, topic, group = NULL,
#' @export
#'

pact_process_disease <- function(df,
pact_process_disease <- function(pact_data_list_cols,
group = NULL,
outcome = c("frequency", "money")) {
## Get outcome ----
outcome <- match.arg(outcome)

pact_process_topic_group(
df = df, topic = "Disease", group = group, outcome = outcome
pact_data_list_cols = pact_data_list_cols,
topic = "Disease", group = group, outcome = outcome
)
}

Expand All @@ -139,13 +168,15 @@ pact_process_disease <- function(df,
#' @export
#'

pact_process_category <- function(df,
pact_process_category <- function(pact_data_list_cols,
group = NULL,
outcome = c("frequency", "money")) {
## Get outcome ----
outcome <- match.arg(outcome)

pact_process_topic_group(
df = df, topic = "ResearchSubcat", group = group, outcome = outcome
pact_data_list_cols = pact_data_list_cols,
topic = c("ResearchSubcat", "ResearchCat"),
group = group, outcome = outcome
)
}
18 changes: 9 additions & 9 deletions R/pact_process_figshare.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
pact_process_figshare <- function(df) {
## Get core variables ----
core_vars <- df |>
dplyr::select(.data$PactID:.data$Grant.Start.Year)
dplyr::select("PactID":"Grant.Start.Year")

## Process study subject variable ----
study_subject <- pact_process_figshare_category(
Expand Down Expand Up @@ -245,7 +245,7 @@ pact_process_figshare_category <- function(df, category,

## Select PactID and category variable ----
tidy_df <- tidy_df |>
dplyr::select(c(.data$PactID, {{ category }}))
dplyr::select("PactID", {{ category }})

## Return tidy_df ----
tidy_df
Expand All @@ -264,7 +264,7 @@ pact_process_figshare_category_pathogen <- function(df) {
names_prefix = "Pathogen..choice."
) |>
dplyr::filter(.data$value == "Checked") |>
dplyr::select(c(.data$PactID, .data$Pathogen)) |>
dplyr::select("PactID", "Pathogen") |>
dplyr::left_join(
df |>
tidyr::pivot_longer(
Expand All @@ -273,20 +273,20 @@ pact_process_figshare_category_pathogen <- function(df) {
names_prefix = "Coronavirus..choice."
) |>
dplyr::filter(.data$value == "Checked") |>
dplyr::select(c(.data$PactID, .data$Coronavirus)),
dplyr::select("PactID", "Coronavirus"),
by = "PactID",
relationship = "many-to-many"
) |>
dplyr::left_join(
df |>
dplyr::select(
c(.data$PactID, .data$Bunyaviridae:.data$Influenza.A)
"PactID", "Bunyaviridae":"Influenza.A"
),
by = "PactID"
) |>
dplyr::mutate(
dplyr::across(
.cols = .data$Pathogen:.data$Influenza.A,
.cols = "Pathogen":"Influenza.A",
.fns = function(x) stringr::str_remove_all(
string = x, pattern = "\\.$"
) |>
Expand All @@ -309,11 +309,11 @@ pact_process_figshare_category_pathogen <- function(df) {
.default = .data$Pathogen
)
) |>
dplyr::select(c(.data$PactID, .data$Pathogen, .data$Pathogen.Specific)) |>
dplyr::select("PactID", "Pathogen", "Pathogen.Specific") |>
dplyr::group_by(.data$PactID) |>
dplyr::mutate(
dplyr::across(
.cols = .data$Pathogen:.data$Pathogen.Specific,
.cols = "Pathogen":"Pathogen.Specific",
.fns = ~list(.x)
)
) |>
Expand Down Expand Up @@ -346,5 +346,5 @@ pact_process_figshare_category_funder <- function(df) {
dplyr::mutate(Funder.Name = list(.data$Funder.Name)) |>
dplyr::distinct() |>
dplyr::ungroup() |>
dplyr::select(c(.data$PactID, .data$Funder.Name))
dplyr::select("PactID", "Funder.Name")
}
Loading

0 comments on commit 1ee2214

Please sign in to comment.