Skip to content

Commit

Permalink
Merge pull request nationalparkservice#64 from iquevedo123/master
Browse files Browse the repository at this point in the history
Slight Modifications to Flagging Summary Function
  • Loading branch information
RobLBaker authored Jan 31, 2024
2 parents ab960bd + d9a305c commit c228430
Showing 1 changed file with 51 additions and 61 deletions.
112 changes: 51 additions & 61 deletions R/summarize_qc_flags.R
Original file line number Diff line number Diff line change
Expand Up @@ -345,13 +345,12 @@ get_dc_flags <- function(directory = here::here()) {
#' @description
#' `r lifecycle::badge("experimental")`
#' get_custom_flags returns data frames that that summarize data
#' quality control flags (one that summarizes at the data package level, one at
#' the data file level, and one for each column). The summaries include all data
#' quality control flags (one that summarizes at the data file level and one for each column). The summaries include all data
#' with quality control flagging (a column name that ends in "_flag") and
#' optionally any additional custom columns the user specifies, either by column
#' name or number.
#'
#' The use can specify which of the 3 data frames (or all 3, as a list of
#' The use can specify which of the 2 data frames (or all as a list of
#' dataframes) should be returned.
#'
#' The number of each flag type for each column (A, AE, R, P) is reported.
Expand All @@ -361,9 +360,7 @@ get_dc_flags <- function(directory = here::here()) {
#' Unweighted Relative Response (RRU) is calculated as the total number of
#' accepted data points (A, AE, and data that are not flagged) divided by the
#'total number of data points (excluding missing values) in all specified
#' columns (and the flagged columns). Standard deviations are reported for RRU.
#' The geometric mean of percent missing and the standard deviation of percent
#' missing are also reported.
#' columns (and the flagged columns).
#'
#' @details Flagged columns must have names ending in "_flag". Missing values
#' must be specified as NA. The function counts cells within "*_flag" columns
Expand All @@ -382,9 +379,7 @@ get_dc_flags <- function(directory = here::here()) {
#' @param output A string indicating what output should be provided. "columns"
#' returns a summary table of QC flags and RRU values in each specified column
#' for every data file. "files" returns a summary table of total QC flags and
#' mean and standard deviations across each data file. "package" provides a
#' summary table with total, mean (per file) and standard deviation values
#' for flagged data across the entire data package. "all" will return all three
#' mean across each data file. "all" will return all three
#' data frames in a single list.
#'
#' @return a dataframe with quality control summary information summarized at
Expand All @@ -402,12 +397,11 @@ get_dc_flags <- function(directory = here::here()) {
#' "locality"),
#' output="all")
#' cols <- colnames(read.csv("mydata.csv"))[c(1:4, 7, 10)]
#' get_custom_flags(cols = cols, output="package")
#' get_custom_flags(cols = cols, output="files")
#' }
get_custom_flags <- function(directory = here::here(),
cols = (""),
output = c("all",
"package",
"files",
"columns")) {

Expand All @@ -419,45 +413,44 @@ get_custom_flags <- function(directory = here::here(),

names(dfList) <- base::basename(fileList)



cust_flags <- NULL

for (i in seq_along(dfList)) {
# get custom columns:
cust_cols <- dfList[[i]] %>% dplyr::select(any_of(cols))
cust_cols <- dfList[[i]] %>% dplyr::select(any_of(cols) & !contains("_flag"))
if (ncol(cust_cols) > 0) {
for (j in seq_along(cust_cols)) {
A_flag <- sum(!is.na(cust_cols[j]))
AE_flag <- 0
R_flag <- 0
P_flag <- 0
RRU <- A_flag / (nrow(cust_cols[j]))
Cell_count <- A_flag
percent_missing <- (sum(is.na(cust_cols[j]))) / nrow(cust_cols[j])

Cell_count <- nrow(cust_cols[j])
filename <- names(dfList)[i]
column <- colnames(cust_cols)[j]
flags <- assign(
paste0(names(dfList)[i]),
data.frame(
names(dfList[i]),
tibble::tibble(
filename = names(dfList[i]),
column,
Cell_count,
A_flag,
AE_flag,
R_flag,
P_flag,
Cell_count,
percent_missing,
RRU
)
)
colnames(flags)[1] <- "filename"

# add to df_flags dataframe:
cust_flags <- rbind(cust_flags, flags)
}
}

# get just flagging columns:
flags_only <- dfList[[i]] %>% dplyr::select(ends_with("_flag"))
flags_only <- dfList[[i]] %>% dplyr::select(contains("_flag"))

if (ncol(flags_only) > 0) {
# for each column in data and each data flags:
Expand All @@ -481,35 +474,28 @@ get_custom_flags <- function(directory = here::here(),
flags_only[j],
"\\bP"
), na.rm = TRUE))
Cell_count <- nrow(flags_only[j])

# get cell count in file, exclude NAs and flags:
Cell_count <- sum(!is.na(flags_only[j]))

percent_missing <- (sum(is.na(flags_only[j]))) / nrow(flags_only[j])

RRU <- (A_flag + AE_flag) / nrow(flags_only[j])
RRU <- ((A_flag + AE_flag) / Cell_count)

filename <- names(dfList)[i]
column <- colnames(flags_only)[j]

# make a dataframe with data:
flags <- assign(
paste0(names(dfList)[i]),
data.frame(
names(dfList)[i],
tibble::tibble(
filename = names(dfList)[i],
column,
Cell_count,
A_flag,
AE_flag,
R_flag,
P_flag,
Cell_count,
percent_missing,
RRU
)
)

colnames(flags)[1] <- "filename"

# add to df_flags dataframe:
cust_flags <- rbind(cust_flags, flags)
}
Expand All @@ -524,51 +510,55 @@ get_custom_flags <- function(directory = here::here(),
R_flag <- NA
P_flag <- NA
Cell_count <- NA
percent_missing <- NA
RRU <- NA

flags <- data.frame(
filename, column, A_flag, AE_flag, R_flag, P_flag,
Cell_count, percent_missing, RRU
filename = names(dfList)[i],
column,
Cell_count,
A_flag,
AE_flag,
R_flag,
P_flag,
RRU
)

cust_flags <- rbind(cust_flags, flags)
}
}
colnames(cust_flags)[7] <- "Data Totals"

#generate summary statistics for each column:
data_file_summaries <- cust_flags %>%
dplyr::group_by(filename) %>%
dplyr::summarize(A_total = sum(A_flag),
AE_total = sum(AE_flag),
P_total = sum(P_flag),
R_total = sum(R_flag),
missing_mean_percent = mean(percent_missing),
RRU_mean = mean(RRU),
RRU_sd = stats::sd(RRU))

#generate data package level summaries
data_package_summary <- cust_flags %>%
plyr::summarize(A_total = sum(A_flag, na.rm = TRUE),
AE_total = sum(AE_flag, na.rm = TRUE),
P_total = sum(P_flag, na.rm = TRUE),
R_total = sum(R_flag, na.rm = TRUE),
missing_mean_percent = mean(percent_missing, na.rm = TRUE),
RRU_mean = mean(RRU, na.rm = TRUE),
RRU_sd = sd(RRU, na.rm = TRUE))
dplyr::summarize("A" = sum(A_flag),
"AE" = sum(AE_flag),
"P" = sum(P_flag),
"R" = sum(R_flag),
"% Accepted" = mean(RRU)) %>%
dplyr::rename("File Name" = filename) %>%
dplyr::mutate(`% Accepted` = paste0(formatC(100 * `% Accepted`, format = "f", digits = 1), "%"))

cust_flags <- cust_flags %>%
dplyr::mutate(
column = stringr::str_remove(column, "_flag"),
RRU = paste0(formatC(100 * RRU, format = "f", digits = 1), "%")) %>%
dplyr::select(
"File Name" = filename,
"Measure" = column,
"Number of Records" = Cell_count,
"A" = A_flag,
"AE" = AE_flag,
"R" = R_flag,
"P" = P_flag,
"% Accepted" = RRU)


qc_summary <- list(cust_flags,
data_file_summaries,
data_package_summary)
data_file_summaries)

names(qc_summary) <- c("Column Level QC Summaries",
"Data File Level QC Summaries",
"Data Package Level QC Summaries")
"Data File Level QC Summaries")

if (output == "package") {
return(qc_summary[[3]])
}
if (output == "files") {
return(qc_summary[[2]])
}
Expand Down

0 comments on commit c228430

Please sign in to comment.