diff --git a/NAMESPACE b/NAMESPACE index 5f7ee45..6fde7d3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(check_data) export(check_land_water) export(check_leap_issue) export(check_missing) +export(compare_polis_afp_snapshots) export(correct_flipped_geo_coords) export(create_gt_table) export(create_summary_by_group) diff --git a/R/globals.R b/R/globals.R index bb2945c..bcc4ef8 100644 --- a/R/globals.R +++ b/R/globals.R @@ -25,6 +25,9 @@ globalVariables( "geo_id_cols", "geo_name_cols", "header", "Missing in Shapefile", "Non-unique Count", "Variable", "ID", "Test", "Value", "null_count", "Lat_parzer", "Long_parzer", "parse_failed", "parsed_date", "parsed_date1", - "parsed_date2", "Total", "snapshot", "WHO_REGION" + "parsed_date2", "Total", "snapshot", "WHO_REGION", + "EPID", "ParalysisOnsetDate", "Validation Type", "WPV1", "afp_cases1", + "afp_cases2", "cVDPV1", "cVDPV2", "n_data1", "n_data2", "val_type", + "virus_type", "CollectionDate" ) ) diff --git a/R/validate_data.R b/R/validate_data.R index 6a60922..2c7e4ad 100644 --- a/R/validate_data.R +++ b/R/validate_data.R @@ -40,12 +40,12 @@ check_missing <- function(data, cols_to_check = NULL, } else { cols_to_check <- base::setdiff(cols_to_check, group_by) } - + # Prepare the grouping if (!base::is.null(group_by)) { data <- data |> dplyr::group_by(dplyr::across(dplyr::all_of(group_by))) } - + result <- data |> dplyr::summarise( dplyr::across( @@ -93,22 +93,22 @@ check_missing <- function(data, cols_to_check = NULL, `Is Completely Null` ) |> as.data.frame() - + # drop Priority col if key column # is not specified if (is.null(key_columns)) { result <- result |> dplyr::select(-Priority) } - + # Initialize a list to store results missing_results <- list() - + # Loop through each row in the result for (i in seq_len(nrow(result))) { col <- result$Column[i] missing_percent <- result$`Missing Percent`[i] - + # Check if there are missing values if (missing_percent > 0) { missing_results[[length(missing_results) + 1]] <- list( @@ -118,10 +118,10 @@ check_missing <- function(data, cols_to_check = NULL, ) } } - + # Summarize results row by row cli::cli_h1("Completely Null Columns") - + for (i in seq_len(nrow(result))) { if (result$`Is Completely Null`[i]) { cli::cli_alert_warning(paste0( @@ -129,29 +129,29 @@ check_missing <- function(data, cols_to_check = NULL, )) } } - + # Summarize results row by row cli::cli_h1("Columns with Missing Values") - + missing_cols <- result |> dplyr::filter(`Missing Count` > 0 & `Is Completely Null` == FALSE) |> dplyr::arrange(dplyr::desc(`Missing Percent`)) - + for (i in seq_len(nrow(missing_cols))) { row <- missing_cols[i, ] - + cli::cli_h2(paste("Column:", row$Column)) - + # Report on columns with missing values cli::cli_alert_warning(paste0( "Column '", crayon::blue(row$Column), "' has ", crayon::red(row$`Missing Percent`), "% (", crayon::red(row$`Missing Count`), ") missing values." 
)) - + cli::cli_text("") } - + return(result) } @@ -252,7 +252,7 @@ validate_admin_hierarchy <- function(data, column_combos, if (return_non_unique && is.null(id_columns)) { stop("id_columns must be provided when return_non_unique is TRUE") } - + # Validate that id_columns exist in the data if (!is.null(id_columns) && !all(id_columns %in% names(data))) { missing_cols <- setdiff(id_columns, names(data)) @@ -261,7 +261,7 @@ validate_admin_hierarchy <- function(data, column_combos, paste(missing_cols, collapse = ", ") )) } - + # Validate that column_combos are present in the data all_columns <- unique(unlist(column_combos)) if (!all(all_columns %in% names(data))) { @@ -271,11 +271,11 @@ validate_admin_hierarchy <- function(data, column_combos, paste(missing_cols, collapse = ", ") )) } - + # Convert columns to character data <- data |> dplyr::mutate(dplyr::across(dplyr::all_of(all_columns), as.character)) - + result_list <- purrr::map(column_combos, function(combo) { data |> dplyr::distinct(dplyr::across(dplyr::all_of(combo))) |> @@ -290,9 +290,9 @@ validate_admin_hierarchy <- function(data, column_combos, ) |> dplyr::select(total_count, non_unique_count, proportion_unique) }) - + names(result_list) <- purrr::map_chr(column_combos, paste, collapse = " ~ ") - + results_df <- result_list |> dplyr::bind_rows(.id = "Column Combination") |> dplyr::rename( @@ -301,7 +301,7 @@ validate_admin_hierarchy <- function(data, column_combos, "Proportion Unique" = proportion_unique ) |> as.data.frame() - + if (return_non_unique && !is.null(id_columns)) { non_unique_ids <- data |> dplyr::group_by(dplyr::across(dplyr::all_of(all_columns))) |> @@ -309,20 +309,20 @@ validate_admin_hierarchy <- function(data, column_combos, dplyr::ungroup() |> dplyr::pull(dplyr::all_of(id_columns)) |> unique() - + return(non_unique_ids) } else { return(results_df) } - + # Summarize results row by row cli::cli_h1("Summary of Administrative Hierarchy Validation") - + for (i in seq_len(nrow(results_df))) { row <- results_df[i, ] - + cli::cli_h2(paste("Column Combination:", row$`Column Combination`)) - + if (row$`Non-unique Count` > 0) { cli::cli_alert_warning(paste0( "Found ", crayon::red(row$`Non-unique Count`), @@ -331,7 +331,7 @@ validate_admin_hierarchy <- function(data, column_combos, } else { cli::cli_alert_success("All combinations are unique.") } - + cli::cli_text("") } } @@ -416,17 +416,17 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, id_col) { # Remove the first element from geo_id_cols geo_id_cols <- geo_id_cols[-1] - + # Use poliprep::shp_global if shapefile_data is not provided if (is.null(shapefile_data)) { shapefile_data <- poliprep::shp_global shapefile_name_cols <- c("ADM0_NAME", "ADM1_NAME", "ADM2_NAME") shapefile_id_cols <- c("ADM1_GUID", "ADM2_GUID") } - + # Ensure input integrity if (length(geo_name_cols) != length(shapefile_name_cols) || - length(geo_id_cols) != length(shapefile_id_cols)) { + length(geo_id_cols) != length(shapefile_id_cols)) { stop( paste0( "The number of name columns must match, ", @@ -434,7 +434,7 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, ) ) } - + # Ensure distinct combinations in both datasets data <- data |> dplyr::distinct(dplyr::across( @@ -444,17 +444,17 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, dplyr::distinct(dplyr::across( dplyr::all_of(c(shapefile_name_cols, shapefile_id_cols)) )) - + # Initialize results list results <- list() - + # Summary table to keep track of mismatches summary_table <- data.frame( 
Column = c(geo_name_cols, geo_id_cols), Missing_in_Shapefile = numeric(length(c(geo_name_cols, geo_id_cols))), stringsAsFactors = FALSE ) - + # Detailed mismatches (long list) detailed_mismatches <- data.frame( stringsAsFactors = FALSE @@ -462,23 +462,23 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, detailed_mismatches[[id_col]] <- character() detailed_mismatches$Geo_Column <- character() detailed_mismatches$Data_Value <- character() - + # Check names and IDs for mismatches for (i in seq_along(geo_name_cols)) { # Check if names from data exist in shapefile col <- geo_name_cols[i] shapefile_col <- shapefile_name_cols[i] - + # Mismatches where data values are not found in the shapefile missing_in_shapefile <- dplyr::anti_join( data |> dplyr::filter(!is.na(.data[[col]])), shapefile_data |> dplyr::filter(!is.na(.data[[shapefile_col]])), by = setNames(shapefile_col, col) ) - + # Update the summary table summary_table$Missing_in_Shapefile[i] <- nrow(missing_in_shapefile) - + # If there are mismatches, add them to the detailed mismatch table if (nrow(missing_in_shapefile) > 0) { detailed_mismatches <- rbind(detailed_mismatches, data.frame( @@ -489,12 +489,12 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, )) } } - + for (i in seq_along(geo_id_cols)) { # Check if IDs from data exist in shapefile col <- geo_id_cols[i] shapefile_col <- shapefile_id_cols[i] - + # Ensure the shapefile ID column is character and lowercase shapefile_data <- shapefile_data |> dplyr::mutate( @@ -507,7 +507,7 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, ) ) ) - + # Ensure the data ID column is character and lowercase data <- data |> dplyr::mutate( @@ -518,19 +518,19 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, stringr::str_replace_all(!!rlang::sym(geo_id_cols[i]), "[{}]", "") ) ) - + # Mismatches where data values are not found in the shapefile missing_in_shapefile <- dplyr::anti_join( data |> dplyr::filter(!is.na(.data[[col]])), shapefile_data |> dplyr::filter(!is.na(.data[[shapefile_col]])), by = setNames(shapefile_col, col) ) - + # Update the summary table summary_table$Missing_in_Shapefile[length( geo_name_cols ) + i] <- nrow(missing_in_shapefile) - + # If there are mismatches, add them to the detailed mismatch table if (nrow(missing_in_shapefile) > 0) { detailed_mismatches <- rbind(detailed_mismatches, data.frame( @@ -541,7 +541,7 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, )) } } - + # Rename the columns and add to results results$detailed_mismatches <- detailed_mismatches |> dplyr::rename( @@ -549,22 +549,22 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, `Name/ID Missing in Shapefile` = Data_Value ) names(results$detailed_mismatches)[1] <- id_col - + # Rename the columns and add to results results$summary_table <- summary_table |> dplyr::rename( `Geo Column Type` = Column, `Missing in Shapefile` = Missing_in_Shapefile ) - + # Summarize results row by row cli::cli_h1("Summary of Geoname and ID Mismatches") - + for (i in seq_len(nrow(results$summary_table))) { row <- results$summary_table[i, ] - + cli::cli_h2(paste("Geographic Column:", row$`Geo Column Type`)) - + if (row$`Missing in Shapefile` > 0) { cli::cli_alert_warning(paste0( "Found ", crayon::red(row$`Missing in Shapefile`), @@ -573,10 +573,10 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, } else { cli::cli_alert_success("All values in the data match the shapefile.") } - + cli::cli_text("") # Add 
a blank line for readability } - + return(results) } @@ -690,12 +690,12 @@ check_data <- function(data, )) { # Initialize results list full_results <- list() - + # Get all columns all_cols <- names(data) - + # 1. Base data information --------------------------------------------- - + # 1a. Missing data if (run_missing_check) { full_results$missing_data <- check_missing( @@ -709,21 +709,21 @@ check_data <- function(data, `Missing Count` = as.integer(`Missing Count`) ) |> as.data.frame() - + # 1b. Data type check for all columns data_type_check <- data.frame( Column = names(data), DataType = sapply(data, function(x) class(x)[1]) ) - + # add data type check to missing data full_results$missing_data <- full_results$missing_data |> dplyr::left_join(data_type_check, by = "Column") |> dplyr::select(Column, DataType, `Missing Count`, `Is Completely Null`) - + full_results$geo_missing_data <- full_results$missing_data |> dplyr::filter(Column %in% c(geo_name_cols, geo_id_cols)) - + full_results$geo_missing_id <- data |> dplyr::select(!!rlang::sym(id_col), dplyr::all_of( c(geo_name_cols, geo_id_cols) @@ -740,16 +740,16 @@ check_data <- function(data, dplyr::select(-Column) |> as.data.frame() } - + # 2a Check date columns if provided ------------------------------------ - + if (run_date_check) { date_results <- data.frame(Column = date_cols) - + for (i in seq_along(date_cols)) { col <- date_cols[i] validated_data <- validate_date(data, col) - + date_results[i, "Missing"] <- sum( validated_data[[paste0(col, "_missing")]], na.rm = TRUE @@ -774,52 +774,52 @@ check_data <- function(data, validated_data[[paste0(col, "_format_issue")]], na.rm = TRUE ) - + # Get IDs ids <- validated_data |> dplyr::filter(!!rlang::sym(paste0(col, "_missing"))) |> dplyr::distinct(!!rlang::sym(id_col)) |> dplyr::mutate(Test = glue::glue("{col} is missing")) - + ids_non_date <- validated_data |> dplyr::filter(!!rlang::sym(paste0(col, "_non_date"))) |> dplyr::distinct(!!rlang::sym(id_col)) |> dplyr::mutate(Test = glue::glue("{col} is non-date")) - + ids_invalid <- validated_data |> dplyr::filter(!!rlang::sym(paste0(col, "_invalid"))) |> dplyr::distinct(!!rlang::sym(id_col)) |> dplyr::mutate(Test = glue::glue("{col} is invalid")) - + ids_future <- validated_data |> dplyr::filter(!!rlang::sym(paste0(col, "_future"))) |> dplyr::distinct(!!rlang::sym(id_col)) |> dplyr::mutate(Test = glue::glue("{col} is in the future")) - + ids_leap_issue <- validated_data |> dplyr::filter(!!rlang::sym(paste0(col, "_leap_issue"))) |> dplyr::distinct(!!rlang::sym(id_col)) |> dplyr::mutate(Test = glue::glue("{col} has leap year issue")) - + ids_format_issue <- validated_data |> dplyr::filter(!!rlang::sym(paste0(col, "_format_issue"))) |> dplyr::distinct(!!rlang::sym(id_col)) |> dplyr::mutate(Test = glue::glue("{col} has format issue")) - + full_results$date_issue_id <- dplyr::bind_rows( ids, ids_non_date, ids_invalid, ids_future, ids_leap_issue, ids_format_issue ) |> dplyr::mutate(`Column Type` = "Date") } - + # add date results pairs to results full_results$date_results <- date_results - + # 2b Check date pairs if provided if (!is.null(date_pair_cols)) { date_results_pairs <- data.frame() - + for (pair in date_pair_cols) { start_col <- pair[1] end_col <- pair[2] @@ -834,37 +834,37 @@ check_data <- function(data, validated_data[[paste0(start_col, "_invalid_order")]], na.rm = TRUE ) - + # Get IDs for invalid order ids_invalid_order <- validated_data |> dplyr::filter(!!rlang::sym(paste0(start_col, "_invalid_order"))) |> 
dplyr::distinct(!!rlang::sym(id_col)) |> dplyr::mutate(Test = glue::glue("{pair_name} has invalid order")) - + full_results$date_pair_issue_id <- dplyr::bind_rows( full_results$date_pair_issue_id, ids_invalid_order ) |> dplyr::mutate(`Column Type` = "Date") } - + # add date results pairs to results full_results$date_results_pairs <- date_results_pairs } } - + if (run_geo_hierarchy_check) { geo_hierarchy_results <- validate_admin_hierarchy( data, column_combos = list(geo_id_cols, geo_name_cols), id_columns = id_col ) - + # add geo_hierarchy to results full_results$geo_hierarchy <- geo_hierarchy_results } - + # 4. Check geoname and ID ------------------------------------------- - + if (run_geo_mismatch_check) { geo_mismatches <- join_and_check_mismatches( data, shapefile_data, @@ -872,7 +872,7 @@ check_data <- function(data, shapefile_name_cols, shapefile_id_cols, id_col ) - + # get ID and details full_results$geo_mismatches_id <- geo_mismatches$detailed_mismatches |> dplyr::select( @@ -880,13 +880,13 @@ check_data <- function(data, Test = "Geo Column Type" ) |> dplyr::mutate(`Column Type` = "Geo") - + # add geo_mismatches to results full_results$geo_mismatches <- geo_mismatches$summary_table } - + # 5. Coordinate Checks ----------------------------------------------- - + if (run_coordinate_checks) { coord_check_results <- check_coords( data = data, @@ -903,7 +903,7 @@ check_data <- function(data, coord_check_results$var1 # add coordinate_checks to results full_results$coordinate_checks <- coord_check_results - + coord_check_results <- check_coords( data = data, shapefile_data = shapefile_data, @@ -916,7 +916,7 @@ check_data <- function(data, summary_table = FALSE, checks = coordinate_checks ) - + full_results$coord_issue_id <- dplyr::bind_rows( if ("parse" %in% coordinate_checks) { coord_check_results |> @@ -963,19 +963,19 @@ check_data <- function(data, ) |> dplyr::mutate(`Column Type` = "Coordinate") } - + if (run_detections) { # set up data to identify detections data2 <- data |> dplyr::mutate( cVDPV1 = if ("VDPV1" %in% names(data)) { dplyr::if_else(VDPV1 == TRUE & VdpvClassifications == "Circulating", - TRUE, FALSE, missing = FALSE + TRUE, FALSE, missing = FALSE ) }, cVDPV2 = if ("VDPV2" %in% names(data)) { dplyr::if_else(VDPV2 == TRUE & VdpvClassifications == "Circulating", - TRUE, FALSE, missing = FALSE + TRUE, FALSE, missing = FALSE ) }, WPV1 = if ("WILD1" %in% names(data)) { @@ -984,7 +984,7 @@ check_data <- function(data, FALSE } ) - + full_results$wpv1_count <- if ("WPV1" %in% names(data2)) { sum(data2$WPV1, na.rm = TRUE) } else { @@ -1001,7 +1001,7 @@ check_data <- function(data, 0 } } - + full_results$total_columns <- ncol(data) full_results$total_rows <- nrow(data) full_results$duplicated_rows <- sum(duplicated(data[, id_col, drop = FALSE])) @@ -1009,7 +1009,7 @@ check_data <- function(data, full_results$missing_data$`Is Completely Null`, na.rm = TRUE ) - + return(full_results) } @@ -1049,10 +1049,10 @@ check_data <- function(data, #' @export create_summary_table <- function(data) { all_summaries <- list() - + # Main summary if (!is.null(data$total_rows) && !is.null(data$total_columns) && - !is.null(data$duplicated_rows)) { + !is.null(data$duplicated_rows)) { main_summary <- data.frame( Column = c( "Total Rows", "Total Columns", "Total Duplicate", @@ -1068,20 +1068,20 @@ create_summary_table <- function(data) { dplyr::filter(!Column %in% c("Total Columns", "Total Null Columns")) all_summaries$main_summary <- main_summary } - + # Detections summary if (!is.null(data$wpv1_count) &&
!is.null(data$cvdpv1_count) && - !is.null(data$cvdpv2_count)) { + !is.null(data$cvdpv2_count)) { detections <- data.frame( Column = c("WPV1", "cVDPV1", "cVDPV2"), count = c(data$wpv1_count, data$cvdpv1_count, data$cvdpv2_count) ) |> dplyr::mutate(dplyr::across(dplyr::everything(), as.character)) |> dplyr::mutate(header = "Virus Detections") - + all_summaries$detections <- detections } - + # Coordinate checks if (!is.null(data$coordinate_checks)) { coordinate_checks <- data$coordinate_checks |> @@ -1095,7 +1095,7 @@ create_summary_table <- function(data) { dplyr::filter(Column != "Total Coords") all_summaries$coordinate_checks <- coordinate_checks } - + # Geo missing checks if (!is.null(data$geo_missing_data)) { geo_missing_checks <- data$geo_missing_data |> @@ -1103,10 +1103,10 @@ create_summary_table <- function(data) { dplyr::mutate(dplyr::across(dplyr::everything(), as.character)) |> dplyr::select(Column, count = "Missing Count") |> dplyr::mutate(header = "Geographical Names & ID Missing") - + all_summaries$geo_missing_checks <- geo_missing_checks } - + # Geographical mismatches if (!is.null(data$geo_mismatches)) { geo_mismatches <- data$geo_mismatches |> @@ -1118,7 +1118,7 @@ create_summary_table <- function(data) { ) all_summaries$geo_mismatches <- geo_mismatches } - + # Geographical hierarchy if (!is.null(data$geo_hierarchy)) { geo_hierarchy <- data$geo_hierarchy |> @@ -1130,7 +1130,7 @@ create_summary_table <- function(data) { ) all_summaries$geo_hierarchy <- geo_hierarchy } - + # Date results if (!is.null(data$date_results)) { date_results <- data$date_results |> @@ -1145,7 +1145,7 @@ create_summary_table <- function(data) { dplyr::select(Column, count, header) all_summaries$date_results <- date_results } - + # Date results pairs if (!is.null(data$date_results_pairs)) { date_results_pairs <- data$date_results_pairs |> @@ -1159,14 +1159,14 @@ create_summary_table <- function(data) { dplyr::select(Column, count, header) all_summaries$date_results_pairs <- date_results_pairs } - + # Combine all summaries all_summaries <- dplyr::bind_rows(all_summaries) - + # Create a unified summary table summary_table <- all_summaries |> dplyr::mutate(header = factor(header, - levels = unique(all_summaries$header) + levels = unique(all_summaries$header) )) |> dplyr::select(header, Column, count) |> dplyr::mutate(count = as.numeric( @@ -1238,14 +1238,14 @@ create_summary_table <- function(data) { TRUE ~ Column ) ) - - - + + + if (!is.null(data$date_issue_id) || - !is.null(data$geo_missing_id) || - !is.null(data$date_pair_issue_id) || - !is.null(data$geo_mismatches_id) || - !is.null(data$coord_issue_id)) { + !is.null(data$geo_missing_id) || + !is.null(data$date_pair_issue_id) || + !is.null(data$geo_mismatches_id) || + !is.null(data$coord_issue_id)) { # resolve ID datasets id_data <- dplyr::bind_rows( data$date_issue_id, @@ -1292,7 +1292,7 @@ create_summary_table <- function(data) { ) |> dplyr::select(1, 3, 2) } - + return( list( summary_table = summary_table, @@ -1395,12 +1395,12 @@ create_summary_by_group <- function(data, group_var, id_col, geo_name_cols, unique_groups <- unique(data[[group_var]]) |> sort(decreasing = decreasing) |> utils::tail(n_groups) - + res_list <- list() - + for (group in unique_groups) { group_data <- dplyr::filter(data, .data[[group_var]] == group) - + res_list[[as.character(group)]] <- check_data( data = group_data, id_col = id_col, @@ -1425,8 +1425,8 @@ create_summary_by_group <- function(data, group_var, id_col, geo_name_cols, ) |> create_summary_table() } - - + + # 
Process summary table summary_table <- dplyr::bind_rows( lapply(res_list, `[[`, "summary_table"), @@ -1445,13 +1445,13 @@ create_summary_by_group <- function(data, group_var, id_col, geo_name_cols, ) ) |> dplyr::ungroup() - + # Process id data id_data <- dplyr::bind_rows( lapply(res_list, `[[`, "id_data"), .id = group_var ) - + return(list(summary_table = summary_table, id_data = id_data)) } @@ -1496,12 +1496,12 @@ create_gt_table <- function(summary_data, autoscale_nanoplot = FALSE) { # Conditional loading for packages required_packages <- c("scales", "gt", "glue") - + missing_packages <- required_packages[!sapply( required_packages, requireNamespace, quietly = TRUE )] - + if (length(missing_packages) > 0) { stop( paste0( @@ -1511,7 +1511,7 @@ create_gt_table <- function(summary_data, call. = FALSE ) } - + gt_table <- gt::gt(summary_data) |> gt::data_color( columns = -c(Column, header), @@ -1520,11 +1520,11 @@ create_gt_table <- function(summary_data, # Handle potential negative values or NAs for this row x_clean <- pmax(x, 0, na.rm = TRUE) row_max <- max(x_clean, na.rm = TRUE) * 1.3 - + if (row_max == 0) { return(rep("white", length(x))) } - + scales::col_numeric( palette = c("#FFFFFF", "#FF9999", "#FF0000"), domain = c(0, row_max) @@ -1558,7 +1558,7 @@ create_gt_table <- function(summary_data, style = gt::cell_text(weight = "bold"), locations = gt::cells_column_labels(everything()) ) - + if (add_nanoplot) { gt_table <- gt_table |> gt::cols_nanoplot( columns = -c(1, 2), @@ -1571,7 +1571,7 @@ create_gt_table <- function(summary_data, ) ) } - + return(gt_table) } @@ -1664,12 +1664,12 @@ validate_polis <- function(data, type = "AFP", vheight = 1400, vwidth = 1550, ...) { # Conditional loading for packages required_packages <- c("scales", "gt", "glue", "webshot") - + missing_packages <- required_packages[!sapply( required_packages, requireNamespace, quietly = TRUE )] - + if (length(missing_packages) > 0) { stop( paste0( @@ -1679,7 +1679,7 @@ validate_polis <- function(data, type = "AFP", call. = FALSE ) } - + # Define parameters based on data type default_params <- list( AFP = list( @@ -1716,14 +1716,14 @@ validate_polis <- function(data, type = "AFP", ) ) ) - + if (!type %in% names(default_params)) { stop( "Invalid data type. Supported types are: ", paste(names(default_params), collapse = ", ") ) } - + # Use provided parameters if not NULL, otherwise use defaults params <- list( group_var = if (!is.null(group_var)) { @@ -1762,7 +1762,7 @@ validate_polis <- function(data, type = "AFP", default_params[[type]]$date_pair_cols } ) - + # Check if all specified columns exist in the dataset all_cols <- c( params$group_var, params$id_col, params$geo_name_cols, @@ -1770,15 +1770,15 @@ validate_polis <- function(data, type = "AFP", unlist(params$date_pair_cols) ) missing_cols <- setdiff(all_cols, names(data)) - + if (length(missing_cols) > 0) { stop(paste( "The following columns are not present in the dataset:", paste(missing_cols, collapse = ", ") )) } - - + + summary <- create_summary_by_group( data = data, group_var = params$group_var, @@ -1792,7 +1792,7 @@ validate_polis <- function(data, type = "AFP", decreasing = decreasing, ... 
) - + title <- if (is.null(custom_title)) { glue::glue( "POLIS {type} Data Quality Checks ", @@ -1801,7 +1801,7 @@ validate_polis <- function(data, type = "AFP", } else { custom_title } - + gt_table <- create_gt_table( summary$summary_table |> dplyr::filter( Column != "Total Null Columns" @@ -1809,25 +1809,25 @@ validate_polis <- function(data, type = "AFP", title = title, autoscale_nanoplot = autoscale_nanoplot ) - + if (save_output) { if (is.null(plots_path)) { stop("plots_path must be provided when save_output is TRUE") } - + today <- format(Sys.Date(), "%Y%m%d") file_prefix <- glue::glue("polis_{type}_validation_{today}_{polis_version}") - + html_path <- file.path(plots_path, glue::glue("{file_prefix}.html")) png_path <- file.path(plots_path, glue::glue("{file_prefix}.png")) - + gt_table |> gt::gtsave(html_path) - + webshot::webshot(html_path, png_path, vheight = vheight, vwidth = vwidth) - + file.remove(html_path) } - + return(list(gt_table = gt_table, id_data = summary$id_data)) } @@ -1873,34 +1873,34 @@ validate_polis_snapshots <- function(path, data_type = "AFP", "POLIS %s Data Quality Checks For Data Snapshots (%d-%d)", data_type, year_of_interest - 1, year_of_interest ) - + if (is.null(pattern)) { pattern <- paste0("^polis_raw_data_", year_of_interest, "_") } - + files <- list.files( path, pattern = pattern, full.names = TRUE ) - + weeks <- sort( as.numeric(gsub(".*_([0-9]+)\\.rds$", "\\1", files)), decreasing = TRUE )[1:snapshots] - + date_col <- if (data_type == "AFP") "CaseDate" else "CollectionDate" - + if (is.null(data_type_string)) { data_type_string <- if (data_type == "AFP") "human" else "env" } - + df <- purrr::map_dfr(weeks, function(week) { file_path <- file.path( path, sprintf("polis_raw_data_%d_%d.rds", year_of_interest, week) ) - + poliprep::read(file_path)[[data_type_string]] |> dplyr::filter( lubridate::year(!!rlang::sym(date_col)) >= year_of_interest - 1 @@ -1913,7 +1913,7 @@ validate_polis_snapshots <- function(path, data_type = "AFP", levels = sprintf("%d W%d", year_of_interest, rev(weeks)) ) ) - + results <- poliprep::validate_polis( data = df, group_var = "snapshot", @@ -1926,7 +1926,7 @@ validate_polis_snapshots <- function(path, data_type = "AFP", vwidth = vwidth, ... ) - + today <- format(Sys.Date(), "%Y%m%d") save_path <- file.path( plots_path, @@ -1935,11 +1935,11 @@ validate_polis_snapshots <- function(path, data_type = "AFP", tolower(data_type), "_snapshots_", year_of_interest, "_", today, ".RData" ) ) - + base::save(results, file = save_path) - + return(results) - } +} #' Validate AFRO Data #' @@ -2028,12 +2028,12 @@ validate_afro <- function(data, type = "AFP", required_packages <- c( "scales", "zoo", "gt", "glue", "webshot" ) - + missing_packages <- required_packages[!sapply( required_packages, requireNamespace, quietly = TRUE )] - + if (length(missing_packages) > 0) { stop( paste0( @@ -2043,7 +2043,7 @@ validate_afro <- function(data, type = "AFP", call. = FALSE ) } - + # Define parameters based on data type default_params <- list( AFP = list( @@ -2074,14 +2074,14 @@ validate_afro <- function(data, type = "AFP", lat_long_cols = c("Latitude", "Longitude") ) ) - + if (!type %in% names(default_params)) { stop( "Invalid data type. 
Supported types are: ", paste(names(default_params), collapse = ", ") ) } - + # Use provided parameters if not NULL, otherwise use defaults params <- list( group_var = if (!is.null(group_var)) { @@ -2120,7 +2120,7 @@ validate_afro <- function(data, type = "AFP", default_params[[type]]$date_pair_cols } ) - + # Check if all specified columns exist in the dataset all_cols <- c( params$group_var, params$id_col, params$geo_name_cols, @@ -2128,15 +2128,15 @@ validate_afro <- function(data, type = "AFP", unlist(params$date_pair_cols) ) missing_cols <- setdiff(all_cols, names(data)) - + if (length(missing_cols) > 0) { stop(paste( "The following columns are not present in the dataset:", paste(missing_cols, collapse = ", ") )) } - - + + summary <- create_summary_by_group( data = data, group_var = params$group_var, @@ -2150,14 +2150,14 @@ validate_afro <- function(data, type = "AFP", decreasing = decreasing, ... ) - + # set up time lab time_labs <- paste0( "For ", zoo::as.yearmon(Sys.Date()), " in Epiweek ", lubridate::epiweek(Sys.Date()) ) - + title <- if (is.null(custom_title)) { glue::glue( "AFRO {type} Data Quality Checks ", @@ -2166,7 +2166,7 @@ validate_afro <- function(data, type = "AFP", } else { custom_title } - + gt_table <- create_gt_table( summary$summary_table |> dplyr::filter( Column != "Total Null Columns" @@ -2174,27 +2174,27 @@ validate_afro <- function(data, type = "AFP", autoscale_nanoplot = autoscale_nanoplot, title = title ) - + if (save_output) { if (is.null(plots_path)) { stop("plots_path must be provided when save_output is TRUE") } - + today <- format(Sys.Date(), "%Y%m%d") file_prefix <- glue::glue( "afro_quality_check_{type}_validation_{today}}" ) - + html_path <- file.path(plots_path, glue::glue("{file_prefix}.html")) png_path <- file.path(plots_path, glue::glue("{file_prefix}.png")) - + gt_table |> gt::gtsave(html_path) - + webshot::webshot(html_path, png_path, vheight = vheight, vwidth = vwidth) - + file.remove(html_path) } - + return(list(gt_table = gt_table, id_data = summary$id_data)) } @@ -2236,7 +2236,7 @@ summarize_validation_results <- function(data, if (!id_col %in% names(data)) { stop("id_col not found in the dataset") } - + if (length(agg_vars) == 0) { stop("At least one aggregation variable must be provided") } @@ -2248,12 +2248,12 @@ summarize_validation_results <- function(data, if (!test_type_name %in% metadata$Test) { stop("test_type not found in metadata") } - + # Get relevant IDs for the specified test type ids <- metadata |> dplyr::filter(Test %in% test_type_name) |> dplyr::pull(!!rlang::sym(id_col)) - + # Summarize test results summary <- data |> dplyr::filter(!!rlang::sym(id_col) %in% ids) |> @@ -2262,7 +2262,7 @@ summarize_validation_results <- function(data, dplyr::summarise(Total = dplyr::n_distinct(!!rlang::sym(id_col))) |> dplyr::arrange(dplyr::desc(Total)) |> dplyr::ungroup() - + # Get culprit columns if return_cols is provided culprits <- NULL if (!is.null(return_cols)) { @@ -2270,7 +2270,7 @@ summarize_validation_results <- function(data, dplyr::filter(!!rlang::sym(id_col) %in% ids) |> dplyr::select(!!rlang::sym(id_col), !!!rlang::syms(return_cols)) } - + plot <- NULL if (create_plot) { if (length(agg_vars) == 1) { @@ -2317,6 +2317,351 @@ summarize_validation_results <- function(data, warning("Plot can only be created for one or two aggregation variables.") } } - + list(summary = summary, plot = plot, culprits = culprits) } + +#' Get detections from POLIS data +#' +#' @param data A POLIS dataset +#' @param case_type Type of cases to filter 
("ES" or "AFP") +#' @return Filtered dataset containing only specified case type +get_polis_detections <- function(data, case_type = c("ES", "AFP")) { + case_type <- match.arg(case_type) + + data |> + dplyr::filter( + !is.na(if (case_type == "ES") CollectionDate else ParalysisOnsetDate) + ) |> + dplyr::mutate( + cVDPV1 = "VDPV1" %in% names(data) & VDPV1 & + VdpvClassifications == "Circulating", + cVDPV2 = "VDPV2" %in% names(data) & VDPV2 & + VdpvClassifications == "Circulating", + WPV1 = "WILD1" %in% names(data) & WILD1 + ) |> + dplyr::mutate( + virus_type = dplyr::case_when( + cVDPV1 ~ "cVDPV1", + cVDPV2 ~ "cVDPV2", + WPV1 ~ "WPV1", + TRUE ~ NA_character_ + ) + ) +} + +#' Compare AFP POLIS Snapshots +#' +#' This function performs a detailed comparison between two POLIS datasets, +#' focusing on Acute Flaccid Paralysis (AFP) cases. It conducts various +#' validations and analyses to identify differences and changes. +#' +#' Key operations include: +#' 1. Checking for removed columns +#' 2. Identifying new blank columns +#' 3. Detecting removed EPIDs +#' 4. Finding lost detections +#' 5. Analyzing changes in AFP detections, comparing various fields +#' +#' The function outputs informative CLI messages throughout the process, +#' providing insights into the differences between the datasets. +#' +#' @param data1 First POLIS dataset (data frame) +#' @param data2 Second POLIS dataset (data frame) +#' +#' @return A list containing: +#' - results: Detailed validation results +#' - summary: A summary data frame of key changes +#' - gt_tab: A formatted GT table for easy visualization +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' results <- compare_polis_afp_snapshots(old_data, new_data) +#' print(results$gt_tab) +#' } +compare_polis_afp_snapshots <- function(data1, data2) { + # Get detections table + detections1 <- get_polis_detections(data1, "AFP") |> + dplyr::filter(cVDPV1 | cVDPV2 | WPV1) + detections2 <- get_polis_detections(data2, "AFP") |> + dplyr::filter(cVDPV1 | cVDPV2 | WPV1) + + # Establish labels for the different datasets + lab_date1 <- format(as.Date(max(data1$LastUpdateDate)), "%d %b %Y") + lab_date2 <- format(as.Date(max(data2$LastUpdateDate)), "%d %b %Y") + + # Create labels for CLI messages and table headers + cli_label1 <- paste0("Dataset 1 (", lab_date1, ")") + cli_label2 <- paste0("Dataset 2 (", lab_date2, ")") + table_header1 <- paste0(lab_date1, " (Dataset 1)") + table_header2 <- paste0(lab_date2, " (Dataset 2)") + + results <- list() + + # 1. Check for removed columns --------------------------------------------- + + cli::cli_h1("Checking for removed columns") + removed_cols <- setdiff(names(data1), names(data2)) + if (length(removed_cols) > 0) { + cli::cli_alert_info(paste0( + "Columns removed in {cli_label2}: ", paste(removed_cols, collapse = ", ") + )) + } else { + cli::cli_alert_success( + "All columns from {cli_label1} are present in {cli_label2}" + ) + } + + results$removed_columns <- removed_cols + results$removed_columns_count <- length(removed_cols) + + # 2. 
+ # 2. Check for blank columns ------------------------------------------------ + + cli::cli_h1("Checking for blank columns") + + # Convert blank spaces to NA in both datasets + data1 <- data1 |> + dplyr::mutate(dplyr::across(where(is.character), ~ dplyr::na_if(., ""))) + data2 <- data2 |> + dplyr::mutate(dplyr::across(where(is.character), ~ dplyr::na_if(., ""))) + + # Find columns that are completely NA in each dataset + blank_cols1 <- names(data1)[sapply(data1, \(x) all(is.na(x)))] + blank_cols2 <- names(data2)[sapply(data2, \(x) all(is.na(x)))] + + # Compare blank columns between datasets + blank_cols_diff <- setdiff(blank_cols1, blank_cols2) + new_blank_cols <- setdiff(blank_cols2, blank_cols1) + + if (length(blank_cols_diff) > 0 || length(new_blank_cols) > 0) { + cli::cli_alert_info(paste0( + "Differences in blank columns:\n", + " Columns blank in {cli_label1} but not in {cli_label2}: ", + paste(blank_cols_diff, collapse = ", "), "\n", + " New blank columns in {cli_label2}: ", + paste(new_blank_cols, collapse = ", ") + )) + } else { + cli::cli_alert_success("Blank columns are identical in both datasets") + } + + results$blank_columns_new <- new_blank_cols + results$blank_columns_new_count <- length(new_blank_cols) + + # 3. Check for removed EPIDs ----------------------------------------------- + cli::cli_h1("Checking for removed EPIDs") + + removed_epids <- setdiff(data1$EPID, data2$EPID) + + # Count how many of the removed EPIDs were detections in the first dataset + number_detections <- detections1 |> + dplyr::filter(EPID %in% removed_epids) |> + nrow() + + if (length(removed_epids) > 0) { + cli::cli_alert_warning(paste0( + "Number of EPIDs from {cli_label1} removed in {cli_label2}: ", + crayon::red(big_mark(length(removed_epids))), "\n", + "Number of these that were detections in {cli_label1}: ", + crayon::red(big_mark(number_detections)) + )) + } else { + cli::cli_alert_success( + "All EPIDs from {cli_label1} are present in {cli_label2}" + ) + } + results$removed_epids <- removed_epids + results$removed_epids_count <- length(removed_epids) + + # 4. Check for removed detections in dataset -------------------------------- + + cli::cli_h1("Checking for lost AFP detections") + + data1_counts <- detections1 |> + dplyr::group_by(EPID) |> + dplyr::summarise(n = dplyr::n_distinct(virus_type)) + + data2_counts <- detections2 |> + dplyr::group_by(EPID) |> + dplyr::summarise(n = dplyr::n_distinct(virus_type)) + + lost_detections <- dplyr::full_join(data1_counts, data2_counts, + by = "EPID", + suffix = c("_data1", "_data2") + ) |> + dplyr::filter(!is.na(n_data1), !is.na(n_data2), n_data2 < n_data1) |> + dplyr::arrange(dplyr::desc(n_data1 - n_data2)) + + if (nrow(lost_detections) > 0) { + cli::cli_alert_warning(paste0( + "Number of EPIDs with lost detections between ", + "{cli_label1} and {cli_label2}: ", + crayon::red(big_mark(nrow(lost_detections))) + )) + } else { + cli::cli_alert_success("No detections were lost between datasets") + } + + results$lost_detections <- unique(lost_detections$EPID) + results$lost_detections_count <- nrow(lost_detections) +
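+  # Note: only EPIDs present in both snapshots are compared above; EPIDs
+  # missing entirely from data2 (n_data2 is NA) are excluded, as they are
+  # already counted as removed EPIDs in section 3.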
+ # 5. Check for changes in detections dataset --------------------------------- + cli::cli_h1("Checking for changes in AFP detections dataset") + + # Compare with corresponding rows in data2 and identify changes; case_when() + # returns the first matching condition, so each row is labelled with the + # first field found to differ + changed_rows <- detections1 |> + dplyr::left_join(data2, by = "EPID") |> + dplyr::mutate( + changes = dplyr::case_when( + CountryISO3Code.x != CountryISO3Code.y ~ "CountryISO3Code", + Admin0Name.x != Admin0Name.y ~ "Admin0Name", + Admin0GUID.x != Admin0GUID.y ~ "Admin0GUID", + Admin1GUID.x != Admin1GUID.y ~ "Admin1GUID", + Admin2GUID.x != Admin2GUID.y ~ "Admin2GUID", + Admin1Name.x != Admin1Name.y ~ "Admin1Name", + Admin2Name.x != Admin2Name.y ~ "Admin2Name", + Latitude.x != Latitude.y ~ "Latitude", + Longitude.x != Longitude.y ~ "Longitude", + PersonSex.x != PersonSex.y ~ "PersonSex", + PersonAgeInMonths.x != PersonAgeInMonths.y ~ "PersonAgeInMonths", + PersonAgeInYears.x != PersonAgeInYears.y ~ "PersonAgeInYears", + DosesIPVNumber.x != DosesIPVNumber.y ~ "DosesIPVNumber", + DosesTotal.x != DosesTotal.y ~ "DosesTotal", + DosesIPVRoutine.x != DosesIPVRoutine.y ~ "DosesIPVRoutine", + TotalNumberOfDoses.x != TotalNumberOfDoses.y ~ "TotalNumberOfDoses", + ParalysisOnsetDate.x != ParalysisOnsetDate.y ~ "ParalysisOnsetDate", + VdpvClassifications.x != VdpvClassifications.y ~ "VdpvClassifications", + Stool1Condition.x != Stool1Condition.y ~ "Stool1Condition", + Stool2Condition.x != Stool2Condition.y ~ "Stool2Condition", + NotificationDate.x != NotificationDate.y ~ "NotificationDate", + InvestigationDate.x != InvestigationDate.y ~ "InvestigationDate", + Stool1CollectionDate.x != Stool1CollectionDate.y ~ + "Stool1CollectionDate", + Stool2CollectionDate.x != Stool2CollectionDate.y ~ + "Stool2CollectionDate", + DateNotificationtoHQ.x != DateNotificationtoHQ.y ~ + "DateNotificationtoHQ", + SurveillanceTypeName.x != SurveillanceTypeName.y ~ + "SurveillanceTypeName", + PolioVirusTypes.x != PolioVirusTypes.y ~ "PolioVirusTypes", + VdpvEmergenceGroupNames.x != VdpvEmergenceGroupNames.y ~ + "VdpvEmergenceGroupNames", + # Classification.x != Classification.y ~ "Classification", + FinalCultureResult.x != FinalCultureResult.y ~ "FinalCultureResult", + NtChanges.x != NtChanges.y ~ "NtChanges", + TRUE ~ NA_character_ + ) + ) |> + dplyr::filter(!is.na(changes)) + + # Count changes by column + change_summary <- changed_rows |> + dplyr::count(changes, name = "count") |> + dplyr::arrange(dplyr::desc(count)) + + # Total number of changed rows + total_changed <- nrow(changed_rows) + cli::cli_alert_info(paste0( + "Number of changed rows in AFP detections dataset from ", + "{cli_label1} to {cli_label2}: ", + crayon::red(big_mark(total_changed)) + )) + # Output the columns changed + if (nrow(change_summary) > 0) { + cli::cli_alert_info("Columns changed:") + for (i in seq_len(nrow(change_summary))) { + cli::cli_alert_info(paste0( + " ", crayon::blue(change_summary$changes[i]), ": ", + crayon::red(big_mark(change_summary$count[i])) + )) + } + } else { + cli::cli_alert_success("No changes detected in the columns of interest.") + } + + results$changed_detection_rows <- changed_rows + results$changed_detection_rows_count <- total_changed + results$changed_detection_rows_summary <- change_summary +
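+  # Caution: the left join on EPID above is one-to-many when an EPID has
+  # several rows in data2, so a single detection can contribute more than
+  # one row to total_changed.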
+ # 6. Create summary dataframe ---------------------------------------------- + summary_df <- data.frame( + val_type = c( + "Removed Columns", + "New Blank Columns", + "Removed EPIDs", + "Lost Detections" + ), + Validation = c( + glue::glue("Removed Columns in {table_header2}"), + glue::glue("New Blank Columns in {table_header2}"), + glue::glue("Removed EPIDs in {table_header2}"), + glue::glue("Lost Detections in {table_header2}") + ), + Difference = c( + results$removed_columns_count, + results$blank_columns_new_count, + results$removed_epids_count, + results$lost_detections_count + ) + ) |> + dplyr::rename(`Validation Type` = val_type) |> + dplyr::bind_rows( + results$changed_detection_rows_summary |> + dplyr::mutate( + `Validation Type` = "Changes in Detections Dataset", + changes = glue::glue( + "Changes in {changes} in AFP Detections in {table_header2}" + ) + ) |> dplyr::select( + `Validation Type`, + Validation = changes, Difference = count + ) + ) + + gt_tab <- summary_df |> + gt::gt() |> + gt::data_color( + columns = 3, + fn = function(x) { + # Handle potential negative values or NAs for this row + x_clean <- pmax(x, 0, na.rm = TRUE) + row_max <- max(x_clean, na.rm = TRUE) * 1.3 + + if (row_max == 0) { + return(rep("white", length(x))) + } + + scales::col_numeric( + palette = c("#FFFFFF", "#FF9999", "#FF0000"), + domain = c(0, row_max) + )(x_clean) + } + ) |> + gt::tab_header( + title = "Comparison of POLIS AFP Snapshots", + subtitle = glue::glue("{table_header1} vs {table_header2}") + ) |> + gt::cols_align(align = "left", columns = 2) |> + gt::fmt_number( + columns = dplyr::where(is.numeric), + decimals = 0 + ) |> + gt::tab_style( + style = gt::cell_text(weight = "bold"), + locations = gt::cells_column_labels(everything()) + ) + + + # Return results + list( + gt_tab = gt_tab, + results = results, + summary = summary_df + ) +} diff --git a/man/compare_polis_afp_snapshots.Rd b/man/compare_polis_afp_snapshots.Rd new file mode 100644 index 0000000..ac07190 --- /dev/null +++ b/man/compare_polis_afp_snapshots.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_data.R +\name{compare_polis_afp_snapshots} +\alias{compare_polis_afp_snapshots} +\title{Compare AFP POLIS Snapshots} +\usage{ +compare_polis_afp_snapshots(data1, data2) +} +\arguments{ +\item{data1}{First POLIS dataset (data frame)} + +\item{data2}{Second POLIS dataset (data frame)} +} +\value{ +A list containing: +\itemize{ +\item results: Detailed validation results +\item summary: A summary data frame of key changes +\item gt_tab: A formatted GT table for easy visualization +} +} +\description{ +This function performs a detailed comparison between two POLIS datasets, +focusing on Acute Flaccid Paralysis (AFP) cases. It conducts various +validations and analyses to identify differences and changes. +} +\details{ +Key operations include: +\enumerate{ +\item Checking for removed columns +\item Identifying new blank columns +\item Detecting removed EPIDs +\item Finding lost detections +\item Analyzing changes in AFP detections, comparing various fields +} + +The function outputs informative CLI messages throughout the process, +providing insights into the differences between the datasets.
+} +\examples{ +\dontrun{ +results <- compare_polis_afp_snapshots(old_data, new_data) +print(results$gt_tab) +} +} diff --git a/man/get_polis_detections.Rd b/man/get_polis_detections.Rd new file mode 100644 index 0000000..57b4ace --- /dev/null +++ b/man/get_polis_detections.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_data.R +\name{get_polis_detections} +\alias{get_polis_detections} +\title{Get detections from POLIS data} +\usage{ +get_polis_detections(data, case_type = c("ES", "AFP")) +} +\arguments{ +\item{data}{A POLIS dataset} + +\item{case_type}{Type of cases to filter ("ES" or "AFP")} +} +\value{ +Filtered dataset containing only specified case type +} +\description{ +Get detections from POLIS data +}
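
A minimal usage sketch of the new comparison workflow (file paths and week numbers are hypothetical; it assumes snapshots saved in the layout read by validate_polis_snapshots() above, where poliprep::read() returns a list whose "human" element holds the AFP data; get_polis_detections() itself stays internal, since it is not exported in NAMESPACE):

# Load two consecutive weekly AFP snapshots (hypothetical paths)
old_data <- poliprep::read("data/polis_raw_data_2024_38.rds")[["human"]]
new_data <- poliprep::read("data/polis_raw_data_2024_39.rds")[["human"]]

# Run the comparison; CLI messages summarise each check as it runs
res <- compare_polis_afp_snapshots(old_data, new_data)

res$summary                 # data frame of key changes
res$results$removed_epids   # EPIDs present only in the older snapshot
res$gt_tab                  # formatted gt table, e.g. for a report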