diff --git a/R/get_ona.R b/R/get_ona.R index faf633c..7f64818 100644 --- a/R/get_ona.R +++ b/R/get_ona.R @@ -944,7 +944,7 @@ get_updated_ona_data <- function(base_url = "https://api.whonghub.org", cat("\n") if (nrow(full_data_orig) == 0) { - cli::cli_alert_info("Initial download detected. Saving full dataset...") + cli::cli_alert_info("Initial download complete. Saving full dataset...") poliprep::save(full_data, file_name) } else if (nrow(new_data) > 0) { cli::cli_alert_success( diff --git a/R/prep_geonames.R b/R/prep_geonames.R index 2f18000..e1b1146 100644 --- a/R/prep_geonames.R +++ b/R/prep_geonames.R @@ -21,46 +21,55 @@ #' @keywords internal handle_file_save <- function(data_to_save, default_save_path = NULL) { cache_path <- default_save_path - + while (TRUE) { - # prompt the user to confirm if they want to save the cleaned cache file + # Prompt user for confirmation confirm_save <- tolower( readline("Do you want to save the cleaned cache file? [y/n]: ") ) - + if (confirm_save == "y") { - if (is.null(cache_path) || !file.exists(cache_path)) { + # Validate or set a default cache path + if (is.null(cache_path) || dir.exists(cache_path)) { cli::cli_alert_warning( - "The specified file path is null or the file does not exist." + "The specified path is null or is a directory." ) - - # Ask for a new file path + + # Prompt for a valid file path cache_path <- readline( - prompt = "Enter the new file path for saving: " + prompt = "Enter the new file path (including filename) for saving: " ) - - # check if user wants to create a new file if path does not exist - if (!file.exists(cache_path)) { - # give alternative name for saving if none given - cache_path <- paste0(getwd(), "/prepped_geoname_cache.rds") - - cli::cli_alert_info(paste0( - "The specified path does not exist, a default path and", - " name will be used." - )) + + # Ensure the file path includes a filename + if (cache_path == "" || dir.exists(cache_path)) { + cache_path <- file.path(getwd(), "prepped_geoname_cache.rds") + cli::cli_alert_info( + "No valid path provided. Using default: {cache_path}" + ) } } - + + # Ensure parent directory exists + cache_dir <- dirname(cache_path) + if (!dir.exists(cache_dir)) { + dir.create(cache_dir, recursive = TRUE) + } + + # Ensure we are working with a file, not a directory + if (dir.exists(cache_path)) { + stop("`cache_path` should be a file path, not a directory.") + } + # Set up file lock to prevent concurrent overwrites lock_path <- paste0(cache_path, ".lock") lock <- filelock::lock(lock_path, timeout = 10000) on.exit(filelock::unlock(lock)) - + # Load existing cache if available existing_cache <- if (file.exists(cache_path)) readRDS(cache_path) else NULL - - # Merge with existing cache and retain latest changes + + # Merge with existing cache if applicable if (!is.null(existing_cache) && nrow(existing_cache) > 0) { merged_cache <- dplyr::bind_rows(existing_cache, data_to_save) |> dplyr::arrange(dplyr::desc(created_time)) |> @@ -68,24 +77,16 @@ handle_file_save <- function(data_to_save, default_save_path = NULL) { } else { merged_cache <- data_to_save } - - # save the file - saveRDS( - data_to_save, cache_path - ) - cli::cli_alert_success( - "File saved successfully to {cache_path}." - ) + + # Save the merged cache + saveRDS(merged_cache, cache_path) + cli::cli_alert_success("File saved successfully to {cache_path}.") break } else if (confirm_save == "n") { - cli::cli_alert_info( - "File not saved. Proceeding without saving..." - ) + cli::cli_alert_info("File not saved. Proceeding without saving...") break } else { - cli::cli_alert_warning( - "Invalid input. Please respond with 'y' for yes or 'n' for no." - ) + cli::cli_alert_warning("Invalid input. Please respond with 'y' or 'n'.") } } } @@ -122,7 +123,7 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, level3 = NULL, level4 = NULL) { # Calculate unique matches for each admin level results <- list() - + if (!is.null(level0)) { matches_level0 <- sum(unique(data[[level0]]) %in% unique( lookup_data[[level0]] @@ -131,7 +132,7 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, "matches" = matches_level0, "total" = length(unique(data[[level0]])) ) } - + if (!is.null(level1)) { matches_level1 <- sum( unique(data[[level1]]) %in% unique(lookup_data[[level1]]) @@ -140,7 +141,7 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, "matches" = matches_level1, "total" = length(unique(data[[level1]])) ) } - + if (!is.null(level2)) { matches_level2 <- sum( unique(data[[level2]]) %in% unique(lookup_data[[level2]]) @@ -150,7 +151,7 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, length(unique(data[[level2]])) ) } - + if (!is.null(level3)) { matches_level3 <- sum( unique(data[[level3]]) %in% unique(lookup_data[[level3]]) @@ -160,7 +161,7 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, length(unique(data[[level3]])) ) } - + if (!is.null(level4)) { matches_level4 <- sum( unique(data[[level4]]) %in% unique(lookup_data[[level4]]) @@ -170,11 +171,11 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, length(unique(data[[level4]])) ) } - + # Presenting the results using cli cli::cli_alert_info("Match Summary:") cli::cli_ul() - + if (!is.null(level0)) { cli::cli_li( glue::glue( @@ -183,14 +184,14 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, ) ) } - + if (!is.null(level1)) { cli::cli_li(glue::glue( "{level1} (level 1): {big_mark(results$level1['matches'])} ", "out of {big_mark(results$level1['total'])} matched" )) } - + if (!is.null(level2)) { cli::cli_li( glue::glue( @@ -199,7 +200,7 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, ) ) } - + if (!is.null(level3)) { cli::cli_li( glue::glue( @@ -208,7 +209,7 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, ) ) } - + if (!is.null(level4)) { cli::cli_li( glue::glue( @@ -217,7 +218,7 @@ calculate_match_stats <- function(data, lookup_data, level0 = NULL, ) ) } - + cli::cli_end() invisible(NULL) } @@ -249,7 +250,7 @@ format_choice <- function(index, choice, width) { format_choices <- function(choices, num_columns, column_width = 45) { num_choices <- length(choices) rows_per_column <- ceiling(num_choices / num_columns) - + formatted_choices <- character(rows_per_column) for (i in 1:rows_per_column) { row_parts <- character(num_columns) @@ -290,21 +291,21 @@ display_custom_menu <- function(title, main_header, choices_input, special_actions, prompt) { cli::cli_h1(main_header) cli::cli_h2(title) - + num_choices <- length(choices_input) num_columns <- if (num_choices > 50) 3 else if (num_choices > 25) 2 else 1 - + formatted_choices <- format_choices(choices_input, num_columns, - column_width = 45 + column_width = 45 ) cat("\n") cat(formatted_choices, sep = "\n") cat("\n") - + for (key in names(special_actions)) { cat(sprintf("%s: %s\n", key, special_actions[[key]])) } - + cat("\n") repeat { choice <- tolower(readline(prompt = paste0(prompt))) @@ -316,7 +317,7 @@ display_custom_menu <- function(title, main_header, choices_input, } cat("Invalid choice, please try again.\n") } - + choice } @@ -349,7 +350,7 @@ calculate_string_distance <- function( name_to_match = admins_to_clean, matched_names = lookup_admins ) |> dplyr::distinct() - + # Calculate distances for each pair results <- results |> dplyr::rowwise() |> @@ -360,24 +361,24 @@ calculate_string_distance <- function( ) ) |> dplyr::ungroup() - + # Sort results for each name_to_match based on distance results <- results |> dplyr::group_by(name_to_match) |> dplyr::arrange(distance, .by_group = TRUE) |> dplyr::mutate(match_rank = dplyr::row_number()) |> dplyr::ungroup() - + # Add algorithm name results$algorithm_name <- method - + # Reorder columns results <- results |> dplyr::select( algorithm_name, name_to_match, matched_names, distance, match_rank ) - + return(results) } @@ -409,7 +410,7 @@ handle_user_interaction <- function(input_data, levels, level, clear_console = TRUE, stratify, max_options) { # Interactivity -------------------------------------------------------------- - + # set up the messaging prompts at the start of the function prompts <- c( "What'll it be?:", @@ -425,20 +426,20 @@ handle_user_interaction <- function(input_data, levels, level, "Where to next?:" ) prompt <- sample(prompts)[1] - + # filter out missing cachees input_data <- input_data |> dplyr::filter( !is.na(matched_names) & !is.na(name_to_match) ) - + # set cachees for looping unique_names <- unique(input_data$name_to_match) - + # initialize empty lists to store user choices user_choices <- list() user_choice <- NULL - + # loop through unmatched records in input_data # Initialize the index i <- 1 @@ -448,14 +449,14 @@ handle_user_interaction <- function(input_data, levels, level, cat("\014") cat("\033[2J", "\033[H") } - + # Define color using crayon function red <- crayon::red bl <- crayon::blue gr <- crayon::green b <- crayon::bold p <- crayon::underline - + # set up choices ----------------------------------------------------------- # select the cache to clean and suggested replacements name_to_clean <- unique_names[i] @@ -468,21 +469,21 @@ handle_user_interaction <- function(input_data, levels, level, dplyr::slice(0:max_options) |> dplyr::pull() |> stringr::str_to_title() - + # get unique long names unique_geo_long <- input_data |> dplyr::filter( name_to_match == name_to_clean ) |> dplyr::distinct() - + # set output title --------------------------------------------------------- - + # set up main header to keep track main_header <- glue::glue( "{stringr::str_to_title(level)} {i} of {length(unique_names)}" ) - + if (!is.na(levels[2]) && stratify && level == levels[2]) { level_label <- "level1" long_geo <- unique_geo_long$long_geo[1] @@ -551,7 +552,7 @@ handle_user_interaction <- function(input_data, levels, level, "{b(red(str_cache))} with?" ) } - + # action set up ------------------------------------------------------------ special_actions <- list( "B" = "Go Back", @@ -560,7 +561,7 @@ handle_user_interaction <- function(input_data, levels, level, "Q" = "Exit without saving", "M" = "Enter name manually" ) - + # present the menu to the user --------------------------------------------- user_choice <- display_custom_menu( title, main_header, @@ -568,7 +569,7 @@ handle_user_interaction <- function(input_data, levels, level, special_actions, prompt = prompt ) - + # handle user choices ------------------------------------------------------ if (user_choice == "b") { # Go Back if (i > 1) { @@ -640,15 +641,15 @@ handle_user_interaction <- function(input_data, levels, level, i <- i + 1 } } - + # Aggregation user-chosen replacements into df ------------------------------- - + # clear console if (clear_console) { cat("\014") cat("\033[2J", "\033[H") } - + if (length(user_choices) != 0) { # Combine user choices into a single data frame user_choices_df <- dplyr::bind_rows(user_choices) |> @@ -665,7 +666,7 @@ handle_user_interaction <- function(input_data, levels, level, dplyr::group_by(longname_to_match) |> dplyr::slice_max(created_time, with_ties = FALSE) |> dplyr::ungroup() - + cli::cli_alert_success( "Your selections have been successfully saved. Exiting..." ) @@ -839,7 +840,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, interactive = TRUE, max_options = 200) { # Validation ----------------------------------------------------------------- - + # Ensure higher levels cannot be used without corresponding lower levels if (stratify && !is.null(level1) && is.null(level0)) { stop("You cannot specify level1 without level0.") @@ -852,7 +853,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, )) { stop("You cannot specify level3 without level0, level1, and level2.") } - + # Prompt the user if using levels beyond level2 without a custom lookup_df if (is.null(lookup_df) && (!is.null(level3) || !is.null(level4))) { cli::cli_alert_warning( @@ -862,7 +863,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, "lookup_df is provided." ) ) - + user_choice <- tolower( readline( paste0( @@ -874,7 +875,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, ) ) ) - + if (user_choice == "1") { cli::cli_alert_info( "Proceeding with cleaning up to level2 only. Ignoring level3 and level4." @@ -891,7 +892,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, return(invisible(NULL)) } } - + # Ensure lookup_df contains necessary columns if provided if (!is.null(lookup_df)) { required_columns <- NULL @@ -900,7 +901,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, if (!is.null(level2)) required_columns <- c(required_columns, level2) if (!is.null(level3)) required_columns <- c(required_columns, level3) if (!is.null(level4)) required_columns <- c(required_columns, level4) - + missing_columns <- setdiff(required_columns, colnames(lookup_df)) if (length(missing_columns) > 0) { stop( @@ -911,7 +912,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, ) } } - + # Ensure target_df contains necessary columns required_columns <- NULL if (!is.null(level0)) required_columns <- c(required_columns, level0) @@ -919,7 +920,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, if (!is.null(level2)) required_columns <- c(required_columns, level2) if (!is.null(level3)) required_columns <- c(required_columns, level3) if (!is.null(level4)) required_columns <- c(required_columns, level4) - + missing_columns <- setdiff(required_columns, colnames(target_df)) if (length(missing_columns) > 0) { stop( @@ -929,7 +930,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, ) ) } - + # Ensure method is supported supported_methods <- c( "jw", "osa", "lv", "dl", "hamming", "lcs", "qgram", @@ -944,27 +945,27 @@ prep_geonames <- function(target_df, lookup_df = NULL, ) ) } - + # Ensure stratify is logical if (!is.logical(stratify)) { stop("stratify must be a logical value (TRUE or FALSE).") } - + # Ensure interactive is logical if (!is.logical(interactive)) { stop("interactive must be a logical value (TRUE or FALSE).") } - + # Ensure cache_path is a valid file path if provided if (!is.null(cache_path) && !dir.exists(dirname(cache_path))) { stop("The directory for cache_path does not exist.") } - + # Validation: Ensure lookup_df is not empty if provided if (!is.null(lookup_df) && nrow(lookup_df) == 0) { stop("The lookup_df is empty.") } - + # Ensure level0, level1, level2, and level3 are valid column names if (!is.null(level0) && !(level0 %in% colnames(target_df))) { stop(paste("The column", level0, "is not in target_df.")) @@ -981,17 +982,17 @@ prep_geonames <- function(target_df, lookup_df = NULL, if (!is.null(level4) && !(level4 %in% colnames(target_df))) { stop(paste("The column", level4, "is not in target_df.")) } - + # Step 0: Setup target and lookup datasets ----------------------------------- - + # Get the internal shapefile if lookup data is not provided if (is.null(lookup_df)) { lookup_df <- poliprep::shp_global - + if (!is.null(who_region)) { lookup_df <- dplyr::filter(lookup_df, WHO_REGION == toupper(who_region)) } - + if (!is.null(level0)) { lookup_df <- dplyr::rename(lookup_df, !!level0 := ADM0_NAME) } @@ -1008,7 +1009,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, lookup_df <- dplyr::rename(lookup_df, !!level4 := ADM4_NAME) } } - + # Create the levels vector levels <- c( if (exists("level0")) level0 else NULL, @@ -1017,7 +1018,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, if (exists("level3")) level3 else NULL, if (exists("level4")) level4 else NULL ) - + # Ensure administrative names are uppercase target_df <- target_df |> dplyr::mutate( @@ -1025,22 +1026,22 @@ prep_geonames <- function(target_df, lookup_df = NULL, dplyr::any_of(levels), toupper ) ) - + lookup_df <- lookup_df |> dplyr::mutate( dplyr::across( dplyr::any_of(levels), toupper ) ) - + # Step 1: Configure cache if saved cache file exists available --------------- - + if (!is.null(cache_path) && !file.exists(cache_path)) { # Alert the user about the missing cache file, including the path cli::cli_alert_info( paste0("The specified cache file '", cache_path, "' does not exist.") ) - + # Ask the user if they want to proceed and create a new cache file user_input <- readline( paste0( @@ -1048,7 +1049,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, " Proceed to create a new one? (yes/no): " ) ) - + # Check the user's response if (!(tolower(user_input) %in% c("yes", "y"))) { cli::cli_alert_info("Exiting without creating a new cache file.") @@ -1057,12 +1058,12 @@ prep_geonames <- function(target_df, lookup_df = NULL, cli::cli_alert_info("Proceeding to create a new cache file...") } } - + # load saved cache file if (!is.null(cache_path) && file.exists(cache_path)) { # load the cache file saved_cache_df <- readRDS(cache_path) - + # harmonise column names in case using old version of cache file saved_cache_df <- saved_cache_df |> dplyr::rename(level0_prepped = any_of("country_prepped")) |> @@ -1098,7 +1099,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, saved_cache_df <- data.frame() target_todo <- target_df } - + # if the cache file exists, merge it with the target data and replace # incorrect names with correct ones. if (!is.null(saved_cache_df) && nrow(saved_cache_df) > 0) { @@ -1117,7 +1118,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, !!level0 := dplyr::coalesce(level0_prepped, .data[[level0]]) ) } - + if (!is.null(level1)) { target_df <- target_df |> dplyr::left_join( @@ -1133,7 +1134,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, !!level1 := dplyr::coalesce(level1_prepped, .data[[level1]]) ) } - + if (!is.null(level2)) { target_df <- target_df |> dplyr::left_join( @@ -1153,7 +1154,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, !!level2 := dplyr::coalesce(level2_prepped, .data[[level2]]) ) } - + if (!is.null(level3)) { target_df <- target_df |> dplyr::left_join( @@ -1176,7 +1177,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, !!level3 := dplyr::coalesce(level3_prepped, .data[[level3]]) ) } - + if (!is.null(level4)) { target_df <- target_df |> dplyr::left_join( @@ -1199,25 +1200,25 @@ prep_geonames <- function(target_df, lookup_df = NULL, !!level4 := dplyr::coalesce(level4_prepped, .data[[level4]]) ) } - + # remove prepped columns target_df <- target_df |> dplyr::select(-matches("_prepped$")) } - + # Step 2: Filter out for those where there is a match ------------------------ - + # get the original data orig_df <- target_df - + # Dynamically filter for missing geolocations filter_na_expr <- purrr::map(levels, ~ rlang::expr(is.na(!!.x))) target_df_na <- target_df |> dplyr::filter(!!!filter_na_expr) - + # Dynamically filter for non-missing geolocations filter_not_na_expr <- purrr::map(levels, ~ rlang::expr(!is.na(!!.x))) target_df <- target_df |> dplyr::filter(!!!filter_not_na_expr) - + # dynamically construct the long geonames on target data target_df <- construct_geo_names( target_df, level0, level1, level2, level3, level4 @@ -1225,89 +1226,89 @@ prep_geonames <- function(target_df, lookup_df = NULL, lookup_df <- construct_geo_names( lookup_df, level0, level1, level2, level3, level4 ) - + # filter to matched rows target_done <- target_df |> dplyr::filter( (long_geo %in% - unique(lookup_df[["long_geo"]])) + unique(lookup_df[["long_geo"]])) ) - + # reduce down to only unmatched rows target_todo <- target_df |> dplyr::filter( !(long_geo %in% unique(lookup_df[["long_geo"]])) ) - + calculate_match_stats( target_df, lookup_df, level0, level1, level2, level3, level4 ) - + # Early return with finalised_df if (nrow(target_todo) == 0) { cli::cli_alert_success( "All records matched; process completed. Exiting..." ) - + return(orig_df) } - + # return if non-interactive. if (!interactive) { cli::cli_alert_success( "In non-interactive mode. Exiting after matching with cache..." ) - + return(orig_df) } - + cli::cli_alert_info( "Partial match completed. There are still matches to be made." ) - + user_input <- readline("Would you like to do interactive matching? (yes/no):") - + if (!(tolower(user_input) %in% c("yes", "y"))) { cli::cli_alert_info( "Exiting without interactive matching..." ) return(orig_df) } - + # Step 3: String distance matching in interactivity -------------------------- - + # initialize empty lists to store results unmatched_df_group <- list() cleaned_dfs <- list() - + # Initialize flag variable skip_to_end <- FALSE - + for (level in levels) { top_res_list <- list() replacement_df <- NULL - + # Check if the current level should be stratified if (stratify && level %in% c(levels[2], levels[3], levels[4], levels[5])) { # Set up the grouping level (previous level in hierarchy) grouping_level <- levels[which(levels == level) - 1] - + for (group in unique(target_todo[[grouping_level]])) { if (!(group %in% unique(lookup_df[[grouping_level]]))) { skip_to_end <- TRUE break } - + lookup_df_group <- lookup_df |> dplyr::filter(.data[[grouping_level]] == group) - + unmatched_df_group <- target_todo |> dplyr::filter(.data[[grouping_level]] == group) |> dplyr::filter(!(.data[[level]] %in% unique(lookup_df_group[[level]]))) - + if (nrow(unmatched_df_group) == 0) next - + # Dynamically create a long_geo from the previous levels if (level == levels[2]) { long_geo_group <- paste( @@ -1338,7 +1339,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, } else { long_geo_group <- group } - + top_res <- calculate_string_distance( unmatched_df_group[[level]], @@ -1353,21 +1354,21 @@ prep_geonames <- function(target_df, lookup_df = NULL, dplyr::select(name_to_match, matched_names) |> dplyr::ungroup() |> dplyr::mutate(long_geo = long_geo_group) - + top_res_list[[group]] <- top_res } - + if (skip_to_end) { break } - + top_res <- do.call(rbind, top_res_list) } else { unmatched_df_group <- target_todo |> dplyr::filter(!(.data[[level]] %in% unique(lookup_df[[level]]))) - + if (nrow(unmatched_df_group) == 0) next - + top_res <- calculate_string_distance( unmatched_df_group[[level]], @@ -1383,14 +1384,14 @@ prep_geonames <- function(target_df, lookup_df = NULL, dplyr::ungroup() |> dplyr::mutate(long_geo = name_to_match) } - + if (!is.null(top_res) && nrow(top_res) > 0) { cli::cli_alert_info("Handling user interaction for level: {level}") replacement_df <- handle_user_interaction( input_data = top_res, levels = levels, level = level, stratify = stratify, max_options = max_options ) - + if (!is.null(replacement_df) && nrow(replacement_df) > 0) { cleaned_dfs[[level]] <- replacement_df cli::cli_alert_success("Replacements made for level: {level}") @@ -1398,7 +1399,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, cli::cli_alert_warning("No replacements made for level: {level}") } } - + # Always update target_todo, even if no replacements were made if (!is.null(replacement_df) && nrow(replacement_df) > 0) { target_todo <- target_todo |> @@ -1416,19 +1417,19 @@ prep_geonames <- function(target_df, lookup_df = NULL, ) |> dplyr::select(-replacement) } - + target_todo <- construct_geo_names( target_todo, level0, level1, level2, level3, level4 ) - + if (skip_to_end) { cli::cli_alert_danger("Skipping to end due to unmatched higher level") break } } - + # Step 4: clean up the cache file and save ----------------------------------- - + if (length(cleaned_dfs) > 0 && any(sapply(cleaned_dfs, nrow) > 0)) { # clean up the cache df suppressWarnings( @@ -1462,7 +1463,7 @@ prep_geonames <- function(target_df, lookup_df = NULL, # add username dplyr::mutate(name_of_creator = Sys.getenv("RSTUDIO_USER_IDENTITY")) ) - + # combine cleaned data frames final_cache_dfs <- dplyr::bind_rows(saved_cache_df, cleaned_cache_joined) |> @@ -1474,18 +1475,18 @@ prep_geonames <- function(target_df, lookup_df = NULL, paste(level0_prepped, name_to_match, sep = "_"), is.na(longname_to_match) & level == "level2" ~ paste(level0_prepped, level1_prepped, - name_to_match, - sep = "_" + name_to_match, + sep = "_" ), is.na(longname_to_match) & level == "level3" ~ paste(level0_prepped, level1_prepped, level2_prepped, - name_to_match, - sep = "_" + name_to_match, + sep = "_" ), is.na(longname_to_match) & level == "level4" ~ paste(level0_prepped, level1_prepped, level2_prepped, - level3_prepped, name_to_match, - sep = "_" + level3_prepped, name_to_match, + sep = "_" ) ) ) |> @@ -1499,30 +1500,30 @@ prep_geonames <- function(target_df, lookup_df = NULL, ) |> dplyr::arrange(created_time) |> dplyr::distinct(longname_to_match, - .keep_all = TRUE + .keep_all = TRUE ) |> dplyr::select(-longname_to_match) - + # file saving - handle_file_save(final_cache_dfs, cache_path) + handle_file_save(final_cache_dfs, default_save_path = cache_path) } else { cli::cli_alert_warning( "No cleanings were made. Cache will not be updated." ) } - + # Step 5: Combine the cleaned data frames ------------------------------------ - + finalised_df <- dplyr::bind_rows(target_done, target_todo, target_df_na) |> dplyr::select(-long_geo) - + # get stats calculate_match_stats( finalised_df, lookup_df, level0, level1, level2, level3, level4 ) - + gc() # clean up memory - + # return the final data frame return(finalised_df) }