Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set up Dockerfile for running the R script #46

Merged
merged 6 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,5 @@ version.py
# pixi environments
.pixi
*.egg-info

outputs/
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ Imports:
pbapply,
furrr,
parallel,
logger
logger,
tidyverse
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
39 changes: 39 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Base image: rocker/r-ver pinned to R 4.2.0.
FROM rocker/r-ver:4.2.0

# Install the compiler toolchain and system development libraries in a single
# layer, then delete the apt package lists so they are not kept in the image.
# NOTE(review): presumably these -dev packages are what devtools/tidyverse and
# the geospatial R packages (e.g. terra) compile against -- confirm before
# pruning the list.
RUN apt-get update && apt-get install -y \
build-essential \
libgit2-dev \
libcurl4-openssl-dev \
libssl-dev \
libxml2-dev \
libfontconfig1-dev \
zlib1g-dev \
libharfbuzz-dev \
libfribidi-dev \
libfreetype6-dev \
libpng-dev \
libtiff5-dev \
libgdal-dev \
libgeos-dev \
libproj-dev \
libudunits2-dev \
&& rm -rf /var/lib/apt/lists/*

# Install terra, remotes, and devtools from CRAN (cloud.r-project.org),
# one image layer per package.
RUN Rscript -e "install.packages('terra', repos='https://cloud.r-project.org')"
RUN Rscript -e "install.packages('remotes', repos='https://cloud.r-project.org')"
RUN Rscript -e "install.packages('devtools', repos='https://cloud.r-project.org', dependencies=TRUE)"

# All following relative paths (COPY targets, '.') resolve inside this directory.
WORKDIR /home/biodiversity-horizons

# Copy the package metadata and sources; remotes::install_local('.') below
# installs the package from these files, so they must be present first.
COPY DESCRIPTION .
COPY NAMESPACE .

COPY R ./R
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you move this line below the install_local, we should be able to change files in the ./R folder without having to reinstall the dependencies.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a great point; I should move this line.

Copy link
Collaborator Author

@IshikaKhandelwal IshikaKhandelwal Jan 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just tried this and found that running install_local() before COPY R ./R doesn’t work: the biodiversityhorizons package was never installed, and the script failed with the error:

Error: there is no package called ‘biodiversityhorizons’
Execution halted

Because install_local() ran before the R/ folder was copied, the package source code wasn’t available at installation time, so the package wasn’t installed inside the Docker image.

So COPY R ./R has to come before install_local(): the package source must be present for the package to be installed correctly inside Docker. Hope that makes sense :)


RUN Rscript -e "remotes::install_local('.', dependencies=TRUE)"

COPY scripts ./scripts

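# By default, run the script with "data-raw/" as the data path and "multisession" as the plan type;
# the script itself defaults the worker count to (availableCores()-1).
# The user can override these by passing arguments at runtime, e.g.:
#   docker run <image> Rscript scripts/VISS_Sample_Data.R /path/to/data multisession 4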
CMD ["Rscript", "scripts/VISS_Sample_Data.R", "data-raw/", "multisession"]
64 changes: 51 additions & 13 deletions scripts/VISS_Sample_Data.R
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,49 @@ library(pbapply)
library(sf)
library(parallel)
library(logger)
library(future)

# Initialize logger
log_threshold(INFO)
log_info("Starting VISS Sample Data script.")

# Set the folder containing the files as the working directory
path <- "data-raw/"
# Parse command-line arguments.
# Usage: Rscript VISS_Sample_Data.R /path/to/data [plan_type] [workers]
#   args[1]  data folder (required when run non-interactively)
#   args[2]  future plan type (optional, default "multisession")
#   args[3]  number of parallel workers (optional, default: all cores but one)
args <- commandArgs(trailingOnly = TRUE)

# Set data path
if (length(args) >= 1) {
  path <- args[1]
} else if (interactive()) {
  # Only interactive sessions fall back to data-raw/; a non-interactive run
  # without a path is treated as a usage error.
  path <- "data-raw/"
  message("No data path argument provided. Using default: ", path)
} else {
  stop("No data folder argument provided.\nUsage: Rscript VISS_Sample_Data.R /path/to/data [plan_type] [workers]")
}
log_info("Data path set to: {path}")

# Plan type
if (length(args) >= 2) {
  plan_type <- args[2]
} else {
  plan_type <- "multisession"
}

# Number of workers
if (length(args) >= 3) {
  # Reject non-numeric or non-positive values here, with a clear message,
  # instead of letting NA/0 reach plan() or the cluster setup later on.
  workers <- suppressWarnings(as.integer(args[3]))
  if (is.na(workers) || workers < 1L) {
    stop("Invalid workers argument '", args[3], "': expected a positive integer.")
  }
} else {
  # Leave one core free, but never drop below one worker: on a single-core
  # machine (or a CPU-limited container) availableCores() - 1 would be 0.
  workers <- max(1L, availableCores() - 1L)
}
# Load data
log_info("Loading data...")
historical_climate <- readRDS(paste0(path, "historical_climaate_data.rds"))
future_climate <- readRDS(paste0(path, "future_climaate_data.rds"))
grid <- readRDS(paste0(path, "grid.rds"))
primates_shp <- readRDS(paste0(path, "primates_shapefiles.rds"))
historical_climate <- readRDS(file.path(path, "historical_climaate_data.rds"))
future_climate <- readRDS(file.path(path, "future_climaate_data.rds"))
grid <- readRDS(file.path(path, "grid.rds"))
primates_shp <- readRDS(file.path(path, "primates_shapefiles.rds"))
log_info("Data loaded successfully.")

# Log data details
Expand Down Expand Up @@ -58,8 +86,8 @@ colnames(future_climate_df) <- c("world_id", 2015:2100)
log_info("Column renaming complete.")

# 3. Compute the thermal limits for each species
log_info("Computing thermal limits for each species.")
plan("multisession", workers = availableCores() - 1)
log_info("Computing thermal limits for each species using {workers} workers and a '{plan_type}' parallelization plan.")
plan(plan_type, workers = workers)

niche_limits <- future_map_dfr(
primates_range_data,
Expand All @@ -72,7 +100,7 @@ log_info("Thermal limit computation complete.")
# 4. Calculate exposure
log_info("Calculating exposure for each species.")
exposure_list <- future_map(
1:length(primates_range_data),
seq_along(primates_range_data),
~ exposure(.x, primates_range_data, future_climate_df, niche_limits),
.progress = TRUE
)
Expand All @@ -84,11 +112,10 @@ log_info("Calculating exposure times.")
exposure_df <- exposure_list %>%
bind_rows() %>%
mutate(sum = rowSums(select(., starts_with("2")))) %>%
filter(sum < 82) %>% # Select only cells with < 82 suitable years
filter(sum < 82) %>% # Select only cells with < 82 suitable years
select(-sum)

cl <- makeCluster(availableCores() - 1)
log_info("Parallel cluster created with {availableCores() - 1} workers.")
cl <- future::makeClusterPSOCK(workers, port = 12000, outfile = NULL, verbose = TRUE)
clusterEvalQ(cl, library(dplyr))
clusterExport(cl, "exposure_times")

Expand All @@ -110,14 +137,25 @@ res_final <- res_final %>%
na.omit()

log_info("Exposure time calculation complete.")

stopCluster(cl)
log_info("Cluster stopped.")

# Final data frame with exposure times for each species at each grid cell
log_info("Final data frame contains {nrow(res_final)} rows.")
print(res_final)

# 6. Save the output to "outputs/" directory
output_dir <- "outputs"
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}

saveRDS(
res_final,
file.path(output_dir, "res_final.rds")
)
log_info("Saved results to {file.path(output_dir, 'res_final.rds')}")

# Reset parallel processing plan
future::plan("sequential")
log_info("VISS Sample Data script completed successfully.")
Loading