Skip to content

Commit

Permalink
Update prep_geoname to filter ourtduplicate cache rows
Browse files Browse the repository at this point in the history
  • Loading branch information
truenomad committed May 21, 2024
1 parent 2295fed commit d911bdf
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 12 deletions.
2 changes: 1 addition & 1 deletion R/globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ globalVariables(
"match_rank", "matched_names", "form_id_num", "last_date_in_chunk", "x",
"date_last_updated", "form_id", "name_of_creator", "created_time",
"user_name", "ADM0_NAME", "ADM1_NAME", "ADM2_NAME", "ENDDATE",
"CENTER_LAT", "CENTER_LON", "method"
"CENTER_LAT", "CENTER_LON", "method", "longname_to_match"
)
)

26 changes: 15 additions & 11 deletions R/prep_geonames.R
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ handle_user_interaction <- function(input_data, level,
level = level,
name_to_match = name_to_clean,
replacement = replace_int,
# longname_to_match = paste(long_geo, name_to_clean, sep = "_"),
longname_to_match = paste(long_geo, name_to_clean, sep = "_"),
longname_corrected = paste(
long_geo, replace_int, sep = "_"),
created_time = format(Sys.time(), tz = "UTC", usetz = TRUE)
Expand All @@ -506,9 +506,9 @@ handle_user_interaction <- function(input_data, level,
user_choices_df <- dplyr::bind_rows(user_choices) |>
# fix longname_corrected for country
dplyr::mutate(
# longname_to_match = dplyr::if_else(
# level == "country", replacement, longname_to_match
# ),
longname_to_match = dplyr::if_else(
level == "country", replacement, longname_to_match
),
longname_corrected = dplyr::if_else(
level == "country", replacement, longname_corrected
)
Expand All @@ -527,7 +527,7 @@ handle_user_interaction <- function(input_data, level,
level = NULL,
name_to_match = NULL,
replacement = NULL,
# longname_to_match = NULL,
longname_to_match = NULL,
longname_corrected = NULL,
created_time = NULL
)
Expand Down Expand Up @@ -683,7 +683,7 @@ prep_geonames <- function(target_df, lookup_df = NULL,
# is not provided
if (is.null(lookup_df)) {
lookup_df <- poliprep::shp_global |>
# dplyr::filter(ENDDATE == "9999-12-31 01:00:00") |>
# dplyr::filter(ENDDATE == "9999-12-31 01:00:00") |>
dplyr::select(
!!level0 := ADM0_NAME,
!!level1 := ADM1_NAME,
Expand Down Expand Up @@ -774,8 +774,10 @@ prep_geonames <- function(target_df, lookup_df = NULL,
dplyr::left_join(
saved_cache_df |>
dplyr::filter(level == level2) |>
dplyr::select(name_to_match, country_prepped,
province_prepped, district_prepped) |>
dplyr::distinct(name_to_match, country_prepped,
province_prepped, district_prepped),
province_prepped, .keep_all = TRUE),
by = stats::setNames(
c("country_prepped", "province_prepped", "name_to_match"),
c(level0, level1, level2))
Expand Down Expand Up @@ -949,9 +951,8 @@ prep_geonames <- function(target_df, lookup_df = NULL,
foreach::foreach(
method = methods,
.combine = 'bind_rows'
# .packages = c("dplyr", "tibble", "jsonlite", "httr")
), {
p() # Update progress
p()
calculate_string_distance(
unmatched_df_group[[level]],
lookup_df[[level]],
Expand Down Expand Up @@ -983,10 +984,11 @@ prep_geonames <- function(target_df, lookup_df = NULL,
cleaned_dfs[[level]] <- replacement_df
} else {
cleaned_dfs <- NULL
replacement_df <- NULL
}


if (length(replacement_df) > 0) {
if (!is.null(replacement_df) || length(replacement_df) > 0) {
# lets update the dataset
target_todo <- target_todo |>
dplyr::left_join(
Expand Down Expand Up @@ -1044,7 +1046,9 @@ prep_geonames <- function(target_df, lookup_df = NULL,
) |>
dplyr::mutate(name_of_creator = stringr::str_to_title(user_name)) |>
dplyr::arrange(created_time) |>
dplyr::distinct()
dplyr::distinct() |>
dplyr::distinct(longname_to_match, .keep_all = TRUE) |>
dplyr::select(-longname_to_match)

# file saving
handle_file_save(final_cache_dfs, cache_path)
Expand Down

0 comments on commit d911bdf

Please sign in to comment.