Skip to content

Commit

Permalink
add info on targets pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
wlangera committed Sep 17, 2024
1 parent b05bad0 commit fccbf71
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 1 deletion.
5 changes: 5 additions & 0 deletions source/markdown/data_controle_2024.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@ targets_store <- here::here("source", "targets", "data_preparation", "_targets")
# Doel

Data preparatie verloopt via een targets pipeline waarbij verschillende stappen elkaar opvolgen.
We maken gebruik van ["dynamic branching"](https://books.ropensci.org/targets/dynamic.html) in de targets pipeline.
Dit is een manier om nieuwe targets te definiëren terwijl de pipeline actief is.
Hierbij wordt een nieuwe target gemaakt voor elk bestand.
Bij het toevoegen van een nieuwe dataset van een jaar, zal de pipeline bijgevolg enkel de berekeningen voor de data van het nieuwe jaar moeten doen en niet opnieuw de berekeningen voor de vorige jaren.
De volledige pipeline ziet er als volgt uit:

```{r, message=FALSE}
tar_visnetwork(script = paste0(targets_store, ".R"),
Expand Down
36 changes: 35 additions & 1 deletion source/targets/data_preparation/_targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,20 @@ source(file.path(mbag_dir, "source", "R", "taxon_mapping.R"))
# Target list
list(
# 1. Read in observation data
## We use "dynamic branching" in the targets pipeline.
## The pipeline creates new targets at runtime for each file.
## When we add a new dataset file for a certain year, the pipeline will only
## do calculations for the data of that year and not again for the other years
## if nothing changed there.

# Collect the input file paths (one file per year).
# tarchetypes::tar_files_input() tracks the files themselves, so downstream
# branches are rebuilt only for files whose contents changed.
tarchetypes::tar_files_input(
  name = mas_counts_sovon_files,
  files = paths_to_counts_sovon(proj_path = target_dir)
),
# Read data from file paths
tar_target(
name = mas_counts_sovon,
command = sf::st_read(
Expand All @@ -69,22 +77,27 @@ list(
pattern = map(mas_counts_sovon),
iteration = "list"
),

# 2. Read in sample points of MBAG MAS

# Track the CSV file that holds the MBAG MAS sample points.
# tar_file() invalidates downstream targets whenever this file changes.
tarchetypes::tar_file(
  name = sample_file,
  command = path_to_samples(proj_path = mbag_dir,
                            file = "steekproef_avimap_mbag_mas.csv")
),
# Read the sample-point table from the tracked file path.
# show_col_types = FALSE silences the column-specification message.
tar_target(
  name = sample,
  command = readr::read_csv(file = sample_file, show_col_types = FALSE)
),
# Select locations in data that belong to sample points of MBAG MAS
# Select locations in MAS data that belong to sample points of MBAG MAS
# We still branch per year
tar_target(
name = select_sampled_points,
command = join_with_sample(
Expand All @@ -94,7 +107,10 @@ list(
pattern = map(crs_pipeline),
iteration = "list"
),

# 3. Data selection and preparation steps

# Select data that fall within valid time periods
tar_target(
name = select_time_periods,
command = select_within_time_periods(
Expand All @@ -103,6 +119,7 @@ list(
pattern = map(select_sampled_points),
iteration = "list"
),
# Calculate distances to observer
tar_target(
name = calculate_obs_distance,
command = calculate_obs_dist(
Expand All @@ -111,13 +128,15 @@ list(
pattern = map(select_time_periods),
iteration = "list"
),
# Keep only observations inside the sampling unit circle: a 300 m radius
# around each sample point (distance2plot is computed upstream by
# calculate_obs_distance).
# Namespaced dplyr::filter() for consistency with the other targets in this
# list (see the select_species_groups target), instead of relying on a bare
# filter() being attached.
tar_target(
  name = select_within_radius,
  command = dplyr::filter(calculate_obs_distance, .data$distance2plot <= 300),
  pattern = map(calculate_obs_distance),
  iteration = "list"
),
# Select data for birds and mammals
tar_target(
name = select_species_groups,
command = dplyr::filter(
Expand All @@ -127,6 +146,8 @@ list(
pattern = map(select_within_radius),
iteration = "list"
),
# Remove data from counts that were performed twice within the same time
# period
tar_target(
name = remove_double_counts,
command = process_double_counted_data(
Expand All @@ -135,6 +156,7 @@ list(
pattern = map(select_species_groups),
iteration = "list"
),
# Set all taxon names to species level
tar_target(
name = remove_subspecies_names,
command = adjust_subspecies_names_nl(
Expand All @@ -143,22 +165,29 @@ list(
pattern = map(remove_double_counts),
iteration = "list"
),
# Stop branching over years: bind the per-year branch results (a list of
# data frames) into one data frame.
# make.row.names = FALSE prevents rbind from generating mangled row names.
tar_target(
name = mas_data_full,
command = do.call(
what = rbind.data.frame,
args = c(remove_subspecies_names, make.row.names = FALSE)
)
),
# Drop columns that are not needed further down the pipeline.
tar_target(name = mas_data_clean, command = remove_columns(mas_data_full)),

# 4. Prepare data for publication on GBIF

# Map the cleaned data onto Darwin Core column names.
tar_target(name = darwincore_mapping, command = dwc_mapping(mas_data_clean)),

# Get taxon names and split dataframe in groups of `size`
tarchetypes::tar_group_size(
name = prepare_taxon_mapping,
command = darwincore_mapping %>%
Expand All @@ -170,6 +199,8 @@ list(
dplyr::arrange(dwc_vernacularName),
size = 50
),
# Get taxonomic info from the GBIF taxonomic backbone; branch over the groups
# to limit the number of simultaneous connections to the GBIF backbone
tar_target(
name = taxon_mapping,
command = map_taxa_from_vernacular(
Expand All @@ -184,6 +215,7 @@ list(
),
pattern = map(prepare_taxon_mapping)
),
# Add taxon names manually if required
tar_target(
name = manual_taxon_mapping,
command = map_taxa_manual(
Expand All @@ -198,13 +230,15 @@ list(
"species", "authorship", "rank", "key")
)
),
# Join the taxonomic information onto the mapped data and put the columns
# in their final order.
tar_target(
  name = dwc_mapping_final,
  command = finalise_dwc_df(data_df = darwincore_mapping,
                            taxonomy_df = manual_taxon_mapping)
),
# Write out GBIF dataset
tar_target(
name = create_dwc_csv,
command = create_output_csv(
Expand Down

0 comments on commit fccbf71

Please sign in to comment.