Skip to content

Commit

Permalink
dados e scraping🎲🕸️
Browse files Browse the repository at this point in the history
  • Loading branch information
jtrecenti committed Sep 3, 2023
1 parent 1b10881 commit 7a5032d
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 3 deletions.
103 changes: 103 additions & 0 deletions R/gaas.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#' States listed on the GAA website
#'
#' Requests the GAA listing page and collects the `value` attribute of every
#' `<option>` found inside a `<select>` element.
#'
#' @return Character vector with the non-missing, non-empty option values.
#' @export
gaa_estados <- function() {
  u_gaas <- "https://www.angaad.org.br/portal/gaas/"
  r <- httr::GET(u_gaas)
  estados <- r |>
    xml2::read_html() |>
    xml2::xml_find_all("//select/option") |>
    xml2::xml_attr("value")
  # xml_attr() yields NA for an <option> without a `value` attribute, and
  # nzchar(NA) is TRUE, so missing values are filtered explicitly together
  # with the empty placeholder option.
  estados[!is.na(estados) & nzchar(estados)]
}

# Parse one item extracted from the GAA listing page
#
# For the given node, looks up the first descendant <a>, <img>, <h4> and
# <div class="pt-cv-ctf-value"> and returns a tibble with the columns
# link (href), img_link (src), title (heading text) and city (div text).
gaa_parse_item <- function(item) {
  # First descendant of `item` matching the XPath expression.
  first_match <- function(xpath) xml2::xml_find_first(item, xpath)

  tibble::tibble(
    link = xml2::xml_attr(first_match(".//a"), "href"),
    img_link = xml2::xml_attr(first_match(".//img"), "src"),
    title = xml2::xml_text(first_match(".//h4")),
    city = xml2::xml_text(first_match(".//div[@class='pt-cv-ctf-value']"))
  )
}

#' Institutions listed on the GAA website for one state
#'
#' Requests the listing page with `estado` as the `tx_category` query value
#' and parses every element whose class contains `pt-cv-content-item`.
#'
#' @param estado State value accepted by the site's `tx_category` query
#'   parameter (one of the values returned by `gaa_estados()`).
#' @return Tibble with one row per item: `id` (position of the item in the
#'   page), the columns built by `gaa_parse_item()` and `estado`.
#' @export
gaa_instituicoes <- function(estado) {
  u <- paste0("https://www.angaad.org.br/portal/gaas/?tx_category=", estado)
  r <- httr::GET(u)
  # The item parser defined in this file is gaa_parse_item(); the name
  # `parse_item` used before is not defined here and fails at run time.
  r |>
    xml2::read_html() |>
    xml2::xml_find_all("//div[contains(@class,'pt-cv-content-item')]") |>
    purrr::map(gaa_parse_item, .progress = TRUE) |>
    purrr::list_rbind(names_to = "id") |>
    dplyr::mutate(estado = estado)
}

#' Download one GAA page from its link
#'
#' The page is saved as `<basename(link)>.html` inside `path`. A file that
#' already exists is not downloaded again.
#'
#' @param link URL of the GAA page.
#' @param path Directory where the file is saved (created when missing).
#' @return Path of the downloaded (or already existing) file.
#' @export
gaa_download <- function(link, path) {
  fs::dir_create(path)
  f <- file.path(path, paste0(basename(link), ".html"))
  if (!file.exists(f)) {
    # A failed or interrupted request may leave an empty or partial file
    # behind; remove it so the next run retries instead of treating the page
    # as already downloaded. The original error still propagates.
    done <- FALSE
    on.exit(if (!done) unlink(f), add = TRUE)
    httr::GET(link, httr::write_disk(f, overwrite = TRUE))
    done <- TRUE
  }
  f
}

#' Parse a downloaded GAA page
#'
#' Each `<p>` directly under `div.the_content` is split at its first `:` into
#' a `key`/`value` pair. Rows for the view counter (`views`) and for the full
#' text of the content div (`txt_completo`) are appended at the end.
#'
#' @param f Path of the HTML file.
#' @return Tibble with the character columns `key` and `value`.
#' @export
gaa_parse <- function(f) {
  page <- xml2::read_html(f)

  # Text of every view-counter span, with whitespace normalised.
  views <- stringr::str_squish(
    xml2::xml_text(
      xml2::xml_find_all(page, "//span[@class='post-views-count']")
    )
  )
  # Whole text of the first content div.
  txt_completo <- xml2::xml_text(
    xml2::xml_find_first(page, "//div[@class='the_content']")
  )
  # One string per paragraph of the content div.
  paragraphs <- xml2::xml_text(
    xml2::xml_find_all(page, "//div[@class='the_content']/p")
  )

  # extra = "merge" keeps any further ":" inside the value; fill = "right"
  # leaves the value missing for paragraphs without a ":".
  fields <- tidyr::separate(
    tibble::tibble(value = paragraphs),
    value,
    into = c("key", "value"),
    sep = ":",
    extra = "merge",
    fill = "right"
  )
  fields <- dplyr::mutate(
    fields,
    dplyr::across(c(key, value), stringr::str_squish)
  )

  fields <- tibble::add_row(fields, key = "views", value = views)
  tibble::add_row(fields, key = "txt_completo", value = txt_completo)
}
110 changes: 110 additions & 0 deletions data-raw/gaas-data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
devtools::load_all()

# Downloading the list of GAAs ----

# Values returned by gaa_estados(), one per state listed on the site.
estados <- gaa_estados()

# One tibble of institutions per state, stacked into a single tibble; rows
# without a city are dropped.
items_parsed <- dplyr::filter(
  purrr::list_rbind(
    purrr::map(estados, gaa_instituicoes, .progress = TRUE)
  ),
  !is.na(city)
)

# Two-letter UF code for each state value used by the site. Values that are
# not listed here end up as NA in `uf`.
uf_por_estado <- c(
  "acre" = "AC",
  "alagoas" = "AL",
  "amapa" = "AP",
  "amazonas" = "AM",
  "bahia" = "BA",
  "ceara" = "CE",
  "distrito-federal" = "DF",
  "espirito-santo" = "ES",
  "goias" = "GO",
  "maranhao" = "MA",
  "mato-grosso" = "MT",
  "mato-grosso-do-sul" = "MS",
  "minas-gerais" = "MG",
  "para" = "PA",
  "paraiba" = "PB",
  "parana" = "PR",
  "pernambuco" = "PE",
  "piaui" = "PI",
  "rio-de-janeiro" = "RJ",
  "rio-grande-do-norte" = "RN",
  "rio-grande-do-sul" = "RS",
  "rondonia" = "RO",
  "roraima" = "RR",
  "santa-catarina" = "SC",
  "sao-paulo" = "SP",
  "sergipe" = "SE",
  "tocantins" = "TO"
)

items_parsed_ibge <- items_parsed |>
  dplyr::mutate(uf = unname(uf_por_estado[estado])) |>
  # Keep only the city name: drop everything from the first "-" or "/"
  # onwards, then the first "GAA " occurrence.
  dplyr::mutate(
    city = city |>
      stringr::str_remove("-.*") |>
      stringr::str_remove("/.*") |>
      stringr::str_remove("GAA ")
  ) |>
  # NOTE(review): `muni_join` is not created in this script; presumably it is
  # added by munifacil::limpar_colunas() — confirm.
  munifacil::limpar_colunas(city, uf) |>
  # Manual fixes for municipality names before the IBGE lookup.
  dplyr::mutate(
    muni_join = dplyr::case_when(
      muni_join == "sao luiz" ~ "sao luis",
      muni_join == "garanhus" ~ "garanhuns",
      muni_join == "jaboatao" ~ "jaboatao dos guararapes",
      muni_join == "pernambuco" ~ "recife",
      .default = muni_join
    )
  ) |>
  # NOTE(review): `id_municipio` presumably comes from
  # munifacil::incluir_codigo_ibge() — confirm.
  munifacil::incluir_codigo_ibge() |>
  dplyr::select(
    id:uf, ibge = id_municipio,
    -estado
  ) |>
  # `slug` is the key later used to match each item to its downloaded file.
  dplyr::mutate(slug = basename(link))

# Downloading all files ----
# Calls gaa_download() once per listed link, saving under data-raw/gaas.
purrr::walk(
items_parsed$link,
gaa_download,
path = "data-raw/gaas",
.progress = TRUE
)

# Every entry found in the download folder is parsed below.
files <- fs::dir_ls("data-raw/gaas")

# Stack the key/value tibbles returned by gaa_parse() (the list names end up
# in `file`), drop rows with an empty key, collapse repeated keys within a
# file into one " | "-separated string of unique values, then spread the keys
# into columns and clean the column names.
# NOTE(review): later steps select column ranges, so the column order produced
# here matters — confirm before reordering any step.
aux_gaas <- purrr::map(files, gaa_parse, .progress = TRUE) |>
purrr::list_rbind(names_to = "file") |>
dplyr::filter(key != "") |>
dplyr::group_by(file, key) |>
dplyr::summarise(
value = paste(unique(value), collapse = " | "),
.groups = "drop"
) |>
tidyr::pivot_wider(names_from = key, values_from = value) |>
janitor::clean_names()

# Parse ----
# This part can be improved later

# Builds the final dataset:
# 1. merges several founding-date columns into `data_de_fundacao`, taking the
#    first non-missing value in the order listed (presumably these are
#    spelling variants of the same key on the scraped pages — confirm);
# 2. keeps `file` plus the columns from `cep` to `txt_completo`, minus the
#    social-network column;
# 3. turns "" and the literal string "NA" into missing values in every column;
# 4. derives `slug` from the file name (without directory and extension) and
#    keeps only the files that match a row of `items_parsed_ibge`.
# NOTE(review): the range selections depend on the column order produced by
# the previous steps — confirm before reordering.
da_gaas <- aux_gaas |>
dplyr::mutate(
data_de_fundacao = dplyr::coalesce(
data_de_fundacao,
dada_de_fundacao,
ano_de_fundacao,
fundado_em,
data_fundacao,
ano_de_fundacao_2
)
) |>
dplyr::select(
file, cep:txt_completo,
-nos_acompanhe_em_nossas_redes_sociais
) |>
dplyr::mutate(
dplyr::across(dplyr::everything(), \(x) dplyr::na_if(x, "")),
dplyr::across(dplyr::everything(), \(x) dplyr::na_if(x, "NA"))
) |>
dplyr::mutate(slug = basename(tools::file_path_sans_ext(file))) |>
dplyr::inner_join(items_parsed_ibge, "slug") |>
dplyr::select(id:ibge, file:txt_completo)

# Saves `da_gaas` as package data, replacing any existing version.
usethis::use_data(da_gaas, overwrite = TRUE)
3 changes: 0 additions & 3 deletions data-raw/gaas.R

This file was deleted.

Binary file added data/da_gaas.rda
Binary file not shown.

0 comments on commit 7a5032d

Please sign in to comment.