diff --git a/scripts/bra_00_boundaries.R b/scripts/bra_00_boundaries.R new file mode 100644 index 0000000..7a625d3 --- /dev/null +++ b/scripts/bra_00_boundaries.R @@ -0,0 +1,129 @@ +# set up ------------------------------------------------------------------ +# Clear environment +rm(list = ls()) + +# Load necessary libraries +library(dplyr) +library(geobr) +library(sf) +library(lwgeom) +library(readxl) + +ids <- read_excel("00_doc/Documentacao/Documentação/Divisão Territorial do Brasil/Unidades da Federação, Mesorregiões, microrregiões e municípios 2010_modif.xls") + +setwd("01_data/01_raw/boundaries/") + +# downloading ------------------------------------------------------------- +nat <- read_country(year=2010) |> + st_write("nat.geojson") + +reg <- read_region(year = 2010) |> + setNames(c("reg_code", "reg_name", "geom")) |> + st_write("reg.geojson", delete_dsn = TRUE) + +uf <- read_state(year=2010) |> + setNames(c("uf_code", "uf_name", "reg_code", "reg_name", "geom")) |> + st_write("uf.geojson") + +uf_codes <- unique(uf$abbrev_state) # get a vector of uf codes to download census tract files + +uf <- read_state(year=2010) |> + select(-abbrev_state) + +mesoreg <- read_meso_region(year=2010) |> + select(-abbrev_state) |> + setNames(c("uf_code", "uf_name", "mesoreg_id", "mesoreg_name", "geom")) |> + st_write("mesoreg.geojson") + +microreg <- read_micro_region(year=2010) |> + select(-abbrev_state) |> + setNames(c("uf_code", "uf_name", "microreg_id", "microreg_name", "geom")) |> + st_write("microreg.geojson") + +metro <- read_metro_area(year=2010) |> + select(-abbrev_state) |> + setNames(c("mun_id", "mun_name", "uf_code", "metro_name", "metro_type", "metro_legislation_date", "geom"))|> + st_write("metro.geojson") + +mun_10 <- read_municipality(year=2010) +mun_20 <- read_municipality(year=2020) +mun <- read_municipality(year=2010) |> + select(-abbrev_state) |> + setNames(c("mun_id", "mun_name", "uf_code", "geom")) |> + st_write("mun.geojson") + +wa <- read_weighting_area(year=2010) |> + select(-abbrev_state) |> + setNames(c("wa_id","mun_id", "mun_name", "uf_code", "reg_code", "reg_name","geom")) |> + st_write("wa.geojson") + +# geo_levels_names -------------------------------------------------------- +# Define a function to process each data frame +process_df <- function(df, select_cols = NULL) { + df <- df %>% + st_drop_geometry() %>% + as.data.frame() %>% + mutate(across(everything(), as.character)) + + if (!is.null(select_cols)) { + df <- df %>% select(all_of(select_cols)) + } + + return(df) +} + +# Process each data frame +mun_names <- process_df(mun) +uf_names <- process_df(uf) +reg_names <- process_df(reg) +mesoreg_names <- process_df(mesoreg, c("mesoreg_id", "mesoreg_name")) +microreg_names <- process_df(microreg, c("microreg_id", "microreg_name")) +metro_names <- process_df(metro) %>% select(-mun_name) + +# Join data frames +names0 <- full_join(ids, mun_names, by = c("uf_code", "mun_id")) +names <- full_join(uf_names, reg_names) %>% + full_join(names0) %>% + full_join(mesoreg_names) %>% + full_join(microreg_names) %>% + full_join(metro_names) + +# Write to CSV +write.csv(names, "geo_levels_names_full.csv", row.names = FALSE, fileEncoding = "latin1") +# census tract ------------------------------------------------------------ + +# Create directories for census tract data +dir.create("tract_uf/", showWarnings = FALSE) + +# Function to process and save census tract data for each state +process_census_tract <- function(state_code) { + current_state_data <- read_census_tract( + code_tract = state_code, + year = 2010, + showProgress = FALSE + ) %>% + mutate(uf_code = substr(code_subdistrict, 1, 2)) %>% + select(code_state, code_muni, name_muni, code_tract) %>% + rename(uf_code = code_state, + tract_id = code_tract, + mun_id = code_muni, + mun_name = name_muni) + + st_write(current_state_data, + paste0("01_data/01_raw/boundaries/tract_uf/", state_code, ".geojson"), + delete_dsn = TRUE) +} + +# Process census tract data for each state +lapply(uf_codes, process_census_tract) + +# Combine all state GeoJSON files into one +folder_path <- "tract_uf" +geojson_files <- list.files(path = folder_path, pattern = "\\.geojson$", + full.names = TRUE) +tract <- geojson_files %>% + lapply(st_read) %>% + bind_rows() %>% + select(-uf_code, -mun_name) + +st_write(tract, "tract.geojson", delete_dsn = TRUE) diff --git a/scripts/bra_00_data.R b/scripts/bra_00_data.R new file mode 100644 index 0000000..f6a4629 --- /dev/null +++ b/scripts/bra_00_data.R @@ -0,0 +1,215 @@ + +# This code: +# > downloads 2010 Brazilian Census microdata from IBGE +# > unzip the microdata +# > reads .txt microdata into data frame +# > saves data sets as .csv files + +# By, Rafael Pereira +# you can fund my contacts at https://sites.google.com/site/rafaelhenriquemoraespereira/ +# 17-June-2016, Oxford, UK +# R version: RRO 3.2.2 (64 bits) + + +############################################################################ +## ATTENTION: This is the only modification you have to do in this script, I hope ;) +rm(list=ls()) # Clear environment +setwd("01_data/01_raw") # set working Directory +############################################################################ + + +##################### Load packages ---------------------------------------- + +# install devel version 1.9.7 of data.table +# GUIDELINES here : https://github.com/Rdatatable/data.table/wiki/Installation +library(data.table) # to manipulate data frames (fread and fwrite are ultrafast for reading and writing CSV files) +library(readr) #fast read of fixed witdh files +library(readxl) # read excel spreadsheets +options(scipen=999) # disable scientific notation +library(zip) +######## Download Census DATA ----------------------------------------------------- + +# create subdirectories where we'll save files +dir.create(file.path(".", "dados_txt2010")) +dir.create(file.path(".", "dados_csv2010")) + + +destfolder <- "./dados_txt2010/" +UFlist <- c("AC","AL","AM","AP","BA","CE","DF","ES","GO","MA","MG","MS","MT","PA","PB","PE","PI","PR","RJ","RN","RO","RR","RS","SC","SE","SP1","SP2_RM","TO") +ftppath <- "ftp://ftp.ibge.gov.br/Censos/Censo_Demografico_2010/Resultados_Gerais_da_Amostra/Microdados/" +tf <- tempfile() +td <- tempdir() + +for (i in UFlist){ + tf <- paste0(ftppath, i, ".zip") + td <- paste0("./dados_txt2010/", i, ".zip") + print(i) + download.file(tf, td, mode="wb") +} + +# unzip all Files +filenames <- list.files("./dados_txt2010", pattern=".zip", full.names=TRUE) +lapply(filenames,unzip, exdir = "./dados_txt2010") + +######## Download Census Documentation ----------------------------------------------------- + +file_url <- "ftp://ftp.ibge.gov.br/Censos/Censo_Demografico_2010/Resultados_Gerais_da_Amostra/Microdados/Documentacao.zip" +download.file(file_url,"Documentacao.zip", mode="wb") +zip::unzip("Documentacao.zip", exdir="documentacao2010", junkpaths=T) + + +######## Prepare Documentation files to read .txt ----------------------------------------------------- + +# Open variables layout from Excel file +dic_dom <- read_excel("./documentacao2010/Layout_microdados_Amostra.xls", sheet =1, skip = 1) +dic_pes <- read_excel("./documentacao2010/Layout_microdados_Amostra.xls", sheet =2, skip = 1) + + +# convert to data table +setDT(dic_dom) +setDT(dic_pes) + + +# compute width of each variable + +# Create function to compute width +computeWidth <- function(dataset){dataset[is.na(DEC), DEC := 0] # Convert NA to 0 + dataset[, width := INT + DEC] # create width variable + setnames(dataset,colnames(dataset)[which(colnames(dataset) == "POSIÇÃO INICIAL")],"pos.ini") # change name of variable initial position + setnames(dataset,colnames(dataset)[which(colnames(dataset) == "POSIÇÃO FINAL")],"pos.fin") # change name of variable final position +} + +# Apply function +lapply(list(dic_dom,dic_pes), computeWidth) + + +### In case you need to work with a smaller subset of the data (e.g. because of memory limits), +### I would suggest you read from the .txt file only those variables you want +# myvariblesPES <- c("V0001", "V6400", "V0011", "V0300", "V0601", "V6036", "V0606", "V0010") # list of variables you want +# dic_pes <- dic_pes[VAR %in% myvariblesPES] # filter documentation, continue the code + +### Alternatively, you could save the whole data set file as .csv and load only the variables you want +# individuals <- fread("pesBrasil.csv", select= myvariblesPES) + + +### HOUSEHOLD Files (13 minutes) ---------------------------------------------------- +# readr and bind ALL .txt files one by one +# still quite fast +# memory intensive for big data sets +# Create the output directory if it does not exist +# Create the output directory if it does not exist +dir.create(file.path("dados_csv2010", "dom"), recursive = TRUE) + + +# List with all Household files +data_files <- list.files(path = "./dados_txt2010", + recursive = TRUE, + pattern = "Dom", + full.names = TRUE) + +# Create function to read Household files +readDOM <- function(f) { + cat(f, "\n") + data <- read_fwf(f, + fwf_positions(dput(dic_dom[, pos.ini]), + dput(dic_dom[, pos.fin]), + col_names = dput(dic_dom[, VAR])), + progress = interactive()) + return(data) +} + +# Loop through each file and save individually by V0001 and V0002 +for (f in data_files) { + temp <- readDOM(f) + setDT(temp) + + # Update decimals in the data + var.decimals <- dic_dom[DEC > 0, ] # identify variables with decimals to update + var.decimals <- var.decimals[, .(VAR, DEC)] + var <- var.decimals$VAR # list of variables to update decimals + + for (j in seq_along(var)) { + set(temp, j = var[j], value = as.numeric(temp[[var[j]]]) / 10^var.decimals[j, DEC]) + } + + # Save each subset of data by V0001 and V0002 + unique_states <- unique(temp$V0001) + for (state in unique_states) { + state_data <- temp[V0001 == state] + + # Create directory for the state if it doesn't exist + state_dir <- file.path("dados_csv2010", "dom", as.character(state)) + if (!dir.exists(state_dir)) { + dir.create(state_dir, recursive = TRUE) + } + + unique_municipalities <- unique(state_data$V0002) + for (municipality in unique_municipalities) { + municipality_data <- state_data[V0002 == municipality] + fwrite(municipality_data, file = file.path(state_dir, paste0(municipality, ".csv"))) + } + } +} + +rm(temp, readDOM); gc() + +### INDIVIDUALS Files (41 minutes) ---------------------------------------------------- +# readr each .txt files and save it appending to one single data set +# not as fast +# not memory intensive for big data sets +# Create the output directory if it does not exist +dir.create(file.path("dados_csv2010", "pes"), recursive = TRUE) + +# List with all Household files +data_files <- list.files(path = "./dados_txt2010", + recursive = TRUE, + pattern = "Pes", + full.names = TRUE) + +# Prepare documentation to update decimals in the data +var.decimals <- dic_pes[DEC > 0, ] # identify variables with decimals to update +var.decimals <- var.decimals[, .(VAR, DEC)] +var <- var.decimals$VAR # list of variables to update decimals + + +# Loop through each file and process +for (i in seq_along(data_files)) { + + # Select state file + file <- data_files[i] + + # Read data into temp + temp <- read_fwf(file, + fwf_positions(dput(dic_pes[, pos.ini]), + dput(dic_pes[, pos.fin]), + col_names = dput(dic_pes[, VAR])), + progress = interactive()) + setDT(temp) # Set as Data Table + + # Update decimals in the data + for (j in seq_along(var)) { + set(temp, j = var[j], value = as.numeric(temp[[var[j]]]) / 10^var.decimals[j, DEC]) + } + + # Save each subset of data by V0001 and V0002 + unique_states <- unique(temp$V0001) + for (state in unique_states) { + state_data <- temp[V0001 == state] + + # Create directory for the state if it doesn't exist + state_dir <- file.path("dados_csv2010", "pes", as.character(state)) + if (!dir.exists(state_dir)) { + dir.create(state_dir, recursive = TRUE) + } + + unique_municipalities <- unique(state_data$V0002) + for (municipality in unique_municipalities) { + municipality_data <- state_data[V0002 == municipality] + fwrite(municipality_data, file = file.path(state_dir, paste0(municipality, ".csv"))) + } + } + + cat("saving", i, "out of", length(data_files), file, "\n") # Update status of the loop + rm(temp); gc() +} +