From 5a7b49ddf718c621fe0e4d6da5a79b47ee1ff2c0 Mon Sep 17 00:00:00 2001 From: rafapereirabr Date: Fri, 22 Mar 2024 14:02:33 -0300 Subject: [PATCH 1/5] update prep health factilities --- data_prep/R/health_facilities.R | 38 +++++++++++++++++++++------------ data_prep/R/support_fun.R | 4 ++-- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/data_prep/R/health_facilities.R b/data_prep/R/health_facilities.R index 0e5f64af..c0aafdfd 100644 --- a/data_prep/R/health_facilities.R +++ b/data_prep/R/health_facilities.R @@ -84,20 +84,23 @@ update_health_facilities <- function(){ 'date_update', 'year_update')) - + # deal with points with missing coordinates head(dt) - # dt[is.na(lat) | is.na(lon),] - # dt[lat==0,] + dt[is.na(lat) | is.na(lon),] + dt[lat==0,] + + # identify which points should have empty geo + dt[is.na(lat) | is.na(lon), empty_geo := T] + + dt[code_cnes=='0000930', lat] + dt[code_cnes=='0000930', lon] - # dt[code_cnes=='0000930', lat] - # dt[code_cnes=='0000930', lon] - # - # # replace NAs with 0 - # data.table::setnafill(dt, - # type = "const", - # fill = 0, - # cols=c("lat","lon") - # ) + # replace NAs with 0 + data.table::setnafill(dt, + type = "const", + fill = 0, + cols=c("lat","lon") + ) @@ -107,18 +110,25 @@ update_health_facilities <- function(){ crs = "+proj=longlat +datum=WGS84") + # convert to point empty + # solution from: https://gis.stackexchange.com/questions/459239/how-to-set-a-geometry-to-na-empty-for-some-features-of-an-sf-dataframe-in-r + temp_sf$geometry[temp_sf$empty_geo == T] = sf::st_point() + + subset(temp_sf, code_cnes=='0000930') + + # Change CRS to SIRGAS Geodetic reference system "SIRGAS2000" , CRS(4674). 
temp_sf <- harmonize_projection(temp_sf) # create folder to save the data - dest_dir <- paste0('./data/health_facilities/', geobr_date) + dest_dir <- paste0('./data/health_facilities/', geobr_date,'/') dir.create(path = dest_dir, recursive = TRUE, showWarnings = FALSE) # Save raw file in sf format sf::st_write(temp_sf, - dsn= paste0(dest_dir, 'cnes_', date_update,".gpkg"), + dsn= paste0(dest_dir, 'cnes_', geobr_date,".gpkg"), overwrite = TRUE, append = FALSE, delete_dsn = T, diff --git a/data_prep/R/support_fun.R b/data_prep/R/support_fun.R index d271d68d..6d741213 100644 --- a/data_prep/R/support_fun.R +++ b/data_prep/R/support_fun.R @@ -1,7 +1,7 @@ #### Support functions to use in the preprocessing of the data -# library(dplyr) -# library(data.table) +library(dplyr) +library(data.table) # library(mapview) # mapviewOptions(platform = 'deckgl') From 067e7004bc7a794f3b92aa86e5c255691632a6b8 Mon Sep 17 00:00:00 2001 From: rafapereirabr Date: Fri, 22 Mar 2024 14:22:40 -0300 Subject: [PATCH 2/5] update health factilities to add parameter "date" --- r-package/DESCRIPTION | 2 +- r-package/R/read_health_facilities.R | 8 +++--- r-package/R/utils.R | 6 ++--- r-package/man/geobr.Rd | 1 - r-package/man/read_health_facilities.Rd | 4 ++- r-package/prep_data/update_metadata_table.R | 7 +++-- .../testthat/test-read_health_facilities.R | 26 ++++++++++++++++--- 7 files changed, 40 insertions(+), 14 deletions(-) diff --git a/r-package/DESCRIPTION b/r-package/DESCRIPTION index 881097f7..7d53fc3a 100644 --- a/r-package/DESCRIPTION +++ b/r-package/DESCRIPTION @@ -46,6 +46,6 @@ Suggests: rmarkdown, scales, testthat -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Roxygen: list(markdown = TRUE) VignetteBuilder: knitr diff --git a/r-package/R/read_health_facilities.R b/r-package/R/read_health_facilities.R index 85879211..011b9d6f 100644 --- a/r-package/R/read_health_facilities.R +++ b/r-package/R/read_health_facilities.R @@ -16,6 +16,8 @@ #' update is registered in the database in the columns 
`date_update` and #' `year_update`. More information in the CNES data set available at \url{https://dados.gov.br/}. #' These data use Geodetic reference system "SIRGAS2000" and CRS(4674). +#' +#' @param date Numeric. Date of the data in YYYYMM format. Defaults to `202303`. #' @template showProgress #' #' @return An `"sf" "data.frame"` object @@ -25,12 +27,12 @@ #' #' @examplesIf identical(tolower(Sys.getenv("NOT_CRAN")), "true") #' # Read all health facilities of the whole country -#' h <- read_health_facilities() +#' h <- read_health_facilities( date = 202303) #' -read_health_facilities <- function( showProgress=TRUE ){ +read_health_facilities <- function(date = 202303, showProgress = TRUE){ # Get metadata with data url addresses - temp_meta <- select_metadata(geography="health_facilities", year=2015, simplified=F) + temp_meta <- select_metadata(geography="health_facilities", year=date, simplified=F) # list paths of files to download file_url <- as.character(temp_meta$download_path) diff --git a/r-package/R/utils.R b/r-package/R/utils.R index 82b44611..1afcbe3e 100644 --- a/r-package/R/utils.R +++ b/r-package/R/utils.R @@ -38,16 +38,16 @@ select_data_type <- function(temp_meta, simplified=NULL){ select_year_input <- function(temp_meta, y=year){ # NULL - if (is.null(y)){ stop(paste0("Error: Invalid Value to argument 'year'. It must be one of the following: ", + if (is.null(y)){ stop(paste0("Error: Invalid Value to argument 'year/date'. It must be one of the following: ", paste(unique(temp_meta$year),collapse = " "))) } # invalid input - else if (y %in% temp_meta$year){ message(paste0("Using year ", y)) + else if (y %in% temp_meta$year){ message(paste0("Using year/date ", y)) temp_meta <- subset(temp_meta, year == y) return(temp_meta) } # invalid input - else { stop(paste0("Error: Invalid Value to argument 'year'. It must be one of the following: ", + else { stop(paste0("Error: Invalid Value to argument 'year/date'. 
It must be one of the following: ", paste(unique(temp_meta$year), collapse = " "))) } } diff --git a/r-package/man/geobr.Rd b/r-package/man/geobr.Rd index b56585cd..9028646e 100644 --- a/r-package/man/geobr.Rd +++ b/r-package/man/geobr.Rd @@ -3,7 +3,6 @@ \docType{package} \name{geobr} \alias{geobr} -\alias{_PACKAGE} \alias{geobr-package} \title{geobr: Download Official Spatial Data Sets of Brazil} \description{ diff --git a/r-package/man/read_health_facilities.Rd b/r-package/man/read_health_facilities.Rd index 6008cd3f..2028eb98 100644 --- a/r-package/man/read_health_facilities.Rd +++ b/r-package/man/read_health_facilities.Rd @@ -8,6 +8,8 @@ read_health_facilities(showProgress = TRUE) } \arguments{ \item{showProgress}{Logical. Defaults to \code{TRUE} display progress bar.} + +\item{date}{Numeric. Date of the data in YYYYMM format. Defaults to \code{202303}.} } \value{ An \verb{"sf" "data.frame"} object @@ -32,7 +34,7 @@ These data use Geodetic reference system "SIRGAS2000" and CRS(4674). 
\examples{ \dontshow{if (identical(tolower(Sys.getenv("NOT_CRAN")), "true")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} # Read all health facilities of the whole country -h <- read_health_facilities() +h <- read_health_facilities( date = 202303) \dontshow{\}) # examplesIf} } \seealso{ diff --git a/r-package/prep_data/update_metadata_table.R b/r-package/prep_data/update_metadata_table.R index 6fb88c51..f750154c 100644 --- a/r-package/prep_data/update_metadata_table.R +++ b/r-package/prep_data/update_metadata_table.R @@ -95,6 +95,8 @@ pb_new_release("ipeaGIT/geobr", metadata[geo=='municipality' & year==2022] +a <- metadata[geo=='health_facilities'] + ######### Step 3 - upload data to github ---------------------- all_files <- list.files("//storage1/geobr/data_gpkg", full.names = T, recursive = T) @@ -149,12 +151,13 @@ piggyback::pb_upload(to_go, # metadata$file_name <- NULL -# reorder columns -setcolorder(metadata, c("geo", "year", "code", "download_path", "code_abbrev")) ######### Step 5 - check and save metadata ---------------------- + # reorder columns + setcolorder(metadata, c("geo", "year", "code", "download_path", "code_abbrev")) + # to avoid conflict with data.table metadata <- as.data.frame(metadata) table(metadata$geo) diff --git a/r-package/tests/testthat/test-read_health_facilities.R b/r-package/tests/testthat/test-read_health_facilities.R index a27d06ce..d4d351e2 100644 --- a/r-package/tests/testthat/test-read_health_facilities.R +++ b/r-package/tests/testthat/test-read_health_facilities.R @@ -11,12 +11,32 @@ testthat::skip_on_cran() test_that("read_health_facilities", { # read data - test_sf <- read_health_facilities() + test_sf <- read_health_facilities(showProgress = FALSE) # check sf object expect_true(is(test_sf, "sf")) - # check number of micro - expect_equal(nrow(test_sf), 360177) + # read data + test_sf_202303 <- read_health_facilities(date = 202303) + + # check number of observations + 
expect_equal(nrow(test_sf_202303), 517629) }) + + + + +# ERRORS and messages ----------------------- +test_that("read_health_facilities", { + + # Wrong date + testthat::expect_error(read_health_facilities(date = 9999999)) + testthat::expect_error(read_health_facilities(year = "banana")) + + # wrong showProgress + testthat::expect_error(read_health_facilities(showProgress = 'banana')) + +}) + + From a26c996285dbd9d1fc385d6e133caafef6fa6001 Mon Sep 17 00:00:00 2001 From: rafapereirabr Date: Fri, 22 Mar 2024 14:40:38 -0300 Subject: [PATCH 3/5] update read_health_facilities documentation --- r-package/R/read_health_facilities.R | 3 ++- r-package/man/read_health_facilities.Rd | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/r-package/R/read_health_facilities.R b/r-package/R/read_health_facilities.R index 011b9d6f..513d8dc9 100644 --- a/r-package/R/read_health_facilities.R +++ b/r-package/R/read_health_facilities.R @@ -17,7 +17,8 @@ #' `year_update`. More information in the CNES data set available at \url{https://dados.gov.br/}. #' These data use Geodetic reference system "SIRGAS2000" and CRS(4674). #' -#' @param date Numeric. Date of the data in YYYYMM format. Defaults to `202303`. +#' @param date Numeric. Date of the data in YYYYMM format. Defaults to `202303`, +#' which was the latest data available by the time of this update. #' @template showProgress #' #' @return An `"sf" "data.frame"` object diff --git a/r-package/man/read_health_facilities.Rd b/r-package/man/read_health_facilities.Rd index 2028eb98..ff048aef 100644 --- a/r-package/man/read_health_facilities.Rd +++ b/r-package/man/read_health_facilities.Rd @@ -4,12 +4,13 @@ \alias{read_health_facilities} \title{Download geolocated data of health facilities} \usage{ -read_health_facilities(showProgress = TRUE) +read_health_facilities(date = 202303, showProgress = TRUE) } \arguments{ -\item{showProgress}{Logical. Defaults to \code{TRUE} display progress bar.} +\item{date}{Numeric. 
Date of the data in YYYYMM format. Defaults to \code{202303}, +which was the latest data available by the time of this update.} -\item{date}{Numeric. Date of the data in YYYYMM format. Defaults to \code{202303}.} +\item{showProgress}{Logical. Defaults to \code{TRUE} display progress bar.} } \value{ An \verb{"sf" "data.frame"} object From 822d13277e6c8a0e7c1e8b579ef1af3663c5cedd Mon Sep 17 00:00:00 2001 From: rafapereirabr Date: Fri, 22 Mar 2024 14:54:43 -0300 Subject: [PATCH 4/5] update NEWS dev version --- r-package/DESCRIPTION | 2 +- r-package/NEWS.md | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/r-package/DESCRIPTION b/r-package/DESCRIPTION index 7d53fc3a..7f94d887 100644 --- a/r-package/DESCRIPTION +++ b/r-package/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: geobr Title: Download Official Spatial Data Sets of Brazil -Version: 1.8.2 +Version: 1.8.9999 Authors@R: c(person(given="Rafael H. M.", family="Pereira", email="rafa.pereira.br@gmail.com", role=c("aut", "cre"), comment = c(ORCID = "0000-0003-2125-7465")), person(given="Caio Nogueira", family="Goncalves", role=c("aut")), person(given="Paulo Henrique Fernandes de", family="Araujo", role=c("ctb")), diff --git a/r-package/NEWS.md b/r-package/NEWS.md index 3d3ece56..70893e03 100644 --- a/r-package/NEWS.md +++ b/r-package/NEWS.md @@ -1,3 +1,11 @@ +# geobr v1.9.0 + +**Major changes** + +- Function `read_health_facilities()` now has a new parameter `date`, which will allow users to access data for different dates of reference. The plan is to have at least one update of this data set per year. 
+ + + # geobr v1.8.2 **CRAN request** From 5d795774638914ef9f2e1053eb23c91dc6f410d8 Mon Sep 17 00:00:00 2001 From: rafapereirabr Date: Fri, 22 Mar 2024 16:34:22 -0300 Subject: [PATCH 5/5] update schools --- data_prep/R/schools.R | 130 ++++++++++++++++++++++++ data_prep/R/support_fun.R | 2 +- r-package/NEWS.md | 4 + r-package/prep_data/prep_schools.R | 156 ----------------------------- 4 files changed, 135 insertions(+), 157 deletions(-) create mode 100644 data_prep/R/schools.R delete mode 100644 r-package/prep_data/prep_schools.R diff --git a/data_prep/R/schools.R b/data_prep/R/schools.R new file mode 100644 index 00000000..eb56bfbf --- /dev/null +++ b/data_prep/R/schools.R @@ -0,0 +1,130 @@ +#> DATASET: schools 2020 +#> Source: INEP - +#> https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/inep-data/catalogo-de-escolas +#> +#: scale +#> Metadata: +# Titulo: schools +#' Frequencia de atualizacao: anual +#' +#' Forma de apresentação: Shape +#' Linguagem: Pt-BR +#' Character set: Utf-8 +#' +#' Resumo: Pontos com coordenadas geográficas das escolas do censo escolar +#' Informações adicionais: Dados produzidos pelo INEP. Os dados de escolas e sua +#' geolocalização são atualizados pelo INEP continuamente. Para finalidade do geobr, +#' esses dados precisam ser baixados uma vez ao ano + + + + +update_schools <- function(){ + + + # If the data set is updated regularly, you should create a function that will have + # a `date` argument to download the data + update <- 2023 + date_update <- Sys.Date() + + # date shown to geobr user + geobr_date <- gsub('-', '' , date_update) + geobr_date <- substr(geobr_date, 1, 6) + + + # download manual + # https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/inep-data/catalogo-de-escolas + dt <- fread('C:/Users/r1701707/Downloads/Análise - Tabela da lista das escolas - Detalhado.csv', + encoding = 'UTF-8') + head(dt) + + + ##### 4. 
Rename columns ------------------------- + head(dt) + + df <- dplyr::select(dt, + abbrev_state = 'UF', + name_muni = 'Município', + code_school = 'Código INEP', + name_school = 'Escola', + education_level = 'Etapas e Modalidade de Ensino Oferecidas', + education_level_others = 'Outras Ofertas Educacionais', + admin_category = 'Categoria Administrativa', + address = 'Endereço', + phone_number = 'Telefone', + government_level = 'Dependência Administrativa', + private_school_type = 'Categoria Escola Privada', + private_government_partnership = 'Conveniada Poder Público', + regulated_education_council = 'Regulamentação pelo Conselho de Educação', + service_restriction ='Restrição de Atendimento', + size = 'Porte da Escola', + urban = 'Localização', + location_type = 'Localidade Diferenciada', + date_update = 'date_update', + y = 'Latitude', + x = 'Longitude' + ) + + + + + head(df) + + + # add update date columns + df[, date_update := as.character(date_update)] + + + # deal with points with missing coordinates + head(df) + df[is.na(x) | is.na(y),] + df[x==0,] + + # identify which points should have empty geo + df[is.na(x) | is.na(y), empty_geo := T] + + df[code_school=='11000180', x] + + + # replace NAs with 0 + data.table::setnafill(df, + type = "const", + fill = 0, + cols=c("x","y") + ) + + + + # Convert original data frame into sf + temp_sf <- sf::st_as_sf(x = df, + coords = c("x", "y"), + crs = "+proj=longlat +datum=WGS84") + + + # convert to empty point + # solution from: https://gis.stackexchange.com/questions/459239/how-to-set-a-geometry-to-na-empty-for-some-features-of-an-sf-dataframe-in-r + temp_sf$geometry[temp_sf$empty_geo == T] = sf::st_point() + + subset(temp_sf, code_school=='11000180') + + + # Change CRS to SIRGAS Geodetic reference system "SIRGAS2000" , CRS(4674). 
+ temp_sf <- harmonize_projection(temp_sf) + + + # create folder to save the data + dest_dir <- paste0('./data/schools/', update,'/') + dir.create(path = dest_dir, recursive = TRUE, showWarnings = FALSE) + + + # Save raw file in sf format + sf::st_write(temp_sf, + dsn= paste0(dest_dir, 'schools_', update,".gpkg"), + overwrite = TRUE, + append = FALSE, + delete_dsn = T, + delete_layer = T, + quiet = T + ) + +} diff --git a/data_prep/R/support_fun.R b/data_prep/R/support_fun.R index 6d741213..ccbc6a6f 100644 --- a/data_prep/R/support_fun.R +++ b/data_prep/R/support_fun.R @@ -146,7 +146,7 @@ add_region_info <- function(temp_sf, column){ code_region==2, 'Nordeste', code_region==3, 'Sudeste', code_region==4, 'Sul', - code_region==5, 'Centro Oeste', + code_region==5, 'Centro-Oeste', default = NA)) return(temp_sf) } diff --git a/r-package/NEWS.md b/r-package/NEWS.md index 70893e03..be4cec52 100644 --- a/r-package/NEWS.md +++ b/r-package/NEWS.md @@ -5,6 +5,10 @@ - Function `read_health_facilities()` now has a new parameter `date`, which will allow users to access data for different dates of reference. The plan is to have at least one update of this data set per year. +**New data** +- schools for 2023 +- health facilities for 202303 + # geobr v1.8.2 diff --git a/r-package/prep_data/prep_schools.R b/r-package/prep_data/prep_schools.R deleted file mode 100644 index 88213335..00000000 --- a/r-package/prep_data/prep_schools.R +++ /dev/null @@ -1,156 +0,0 @@ -#> DATASET: schools 2020 -#> Source: INEP - -#> https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/inep-data/catalogo-de-escolas -#> -#: scale -#> Metadata: -# Titulo: schools -#' Frequencia de atualizacao: anual -#' -#' Forma de apresentação: Shape -#' Linguagem: Pt-BR -#' Character set: Utf-8 -#' -#' Resumo: Pontos com coordenadas gegráficas das escolas do censo escolar -#' Informações adicionais: Dados produzidos pelo INEP. Os dados de escolas e sua -#' geolocalização são atualizados pelo INEP continuamente. 
Para finalidade do geobr, -#' esses dados precisam ser baixados uma vez ao ano -# - -### Libraries (use any library as necessary) - -library(RCurl) -library(stringr) -library(sf) -library(dplyr) -library(readr) -library(data.table) -library(magrittr) -library(lwgeom) -library(stringi) -library(sfheaders) -library(mapview) -library(ggplot2) - -mapviewOptions(platform = 'leafgl') -# mapviewOptions(platform = 'mapdeck') - -####### Load Support functions to use in the preprocessing of the data - -source("./prep_data/prep_functions.R") - -# Root directory -root_dir <- "L:////# DIRUR #//ASMEQ//geobr//data-raw" -setwd(root_dir) - - - -###### 0. Create folders to save the data ----------------- - -# If the data set is updated regularly, you should create a function that will have -# a `date` argument download the data -update <- 2020 -date_update <- '2020-10-18' - - -# Root directory -root_dir <- "L:\\# DIRUR #\\ASMEQ\\geobr\\data-raw" -setwd(root_dir) - -# Directory to keep raw zipped files -dir.create("./schools") -destdir_raw <- paste0("./schools/",update) -dir.create(destdir_raw) - - - - -#### 1. Download manual do dado ----------------- - - -# download manual do dado a partir de - - -# leitura do dado bruto -df <- fread('C:/Users/r1701707/Downloads/Análise - Tabela da lista das escolas - Detalhado (1).csv', - encoding = 'UTF-8') - -head(df) - - - - - - -##### 4. 
Rename columns ------------------------- -df$date_update <- date_update - - -df2 <- - dplyr::select(df, - abbrev_state = 'UF', - name_muni = 'Município', - code_school = 'Código INEP', - name_school = 'Escola', - education_level = 'Etapas e Modalidade de Ensino Oferecidas', - education_level_others = 'Outras Ofertas Educacionais', - admin_category = 'Categoria Administrativa', - address = 'Endereço', - phone_number = 'Telefone', - government_level = 'Dependência Administrativa', - private_school_type = 'Categoria Escola Privada', - private_government_partnership = 'Conveniada Poder Público', - regulated_education_council = 'Regulamentação pelo Conselho de Educação', - service_restriction ='Restrição de Atendimento', - size = 'Porte da Escola', - urban = 'Localização', - location_type = 'Localidade Diferenciada', - date_update = 'date_update', - y = 'Latitude', - x = 'Longitude' - ) - -head(df2) - - - - -# fix spatial coordinates -summary(df2$x) -temp_sf <- sfheaders::sf_point(df2, x='x', y='y', keep = T) -# temp_sf <- sfheaders::sf_point(subset(df2, !is.na(x)), x='x', y='y', keep = T) - - -# temp_sf = st_as_sf(subset(df2, !is.na(x)), coords = c("x", "y")) - - - -country <- geobr::read_country() -sirgas <- st_crs(country) -st_crs(temp_sf) <- sirgas -st_crs(temp_sf) <- 4674 - -# st_crs(temp_sf) -# head(temp_sf) -# -# a <- temp_sf[1:100,] -# -# plot(a) -mapview(temp_sf) - -ggplot() + - geom_sf(data= country) + - geom_sf(data= temp_sf) - - -##### Save file ------------------------- - -# save raw file -fwrite(df, paste0(destdir_raw, '/schools_', update, '_raw.csv')) - -# Save sf -sf::st_write(temp_sf, dsn= paste0(destdir_raw ,"/schools_", update,".gpkg"), update = TRUE) - - - -