Skip to content

Commit

Permalink
Add TSE crawler to raw layer
Browse files Browse the repository at this point in the history
  • Loading branch information
TeoCalvo authored and TeoCalvo committed Jan 18, 2024
1 parent cd6a671 commit cb1493d
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 0 deletions.
6 changes: 6 additions & 0 deletions databases/create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,9 @@ CREATE DATABASE IF NOT EXISTS silver.pizza_query;
-- DBTITLE 1,Olist
-- Create `bronze.olist` and `silver.olist` when they do not exist yet;
-- IF NOT EXISTS makes the cell safe to run again.
CREATE DATABASE IF NOT EXISTS bronze.olist;
CREATE DATABASE IF NOT EXISTS silver.olist;

-- COMMAND ----------

-- DBTITLE 1,TSE
-- Create `bronze.tse` and `silver.tse` when they do not exist yet;
-- IF NOT EXISTS makes the cell safe to run again.
CREATE DATABASE IF NOT EXISTS bronze.tse;
CREATE DATABASE IF NOT EXISTS silver.tse;
120 changes: 120 additions & 0 deletions src/01.raw/tse/collect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Databricks notebook source
import requests
import json
import os
import zipfile
from multiprocessing import Pool

# COMMAND ----------

# Load the catalogue of data sources. The path is relative, so it is resolved
# against the current working directory. The encoding is pinned so the result
# does not depend on the cluster's locale.
with open("datasources.json", "r", encoding="utf-8") as open_file:
    datasources = json.load(open_file)

# Offer every top-level key of the JSON as a dropdown option; the first key is
# the default. A concrete list is passed because a `dict_keys` view is only
# list-like, while the widget API is documented with a sequence of choices.
datasource_names = list(datasources)
dbutils.widgets.dropdown("tipo_coleta",
                         defaultValue=datasource_names[0],
                         choices=datasource_names,
                         label="Tipo Coleta")

# COMMAND ----------

# Resolve the data source selected in the "tipo_coleta" widget and read its
# settings once instead of indexing `datasources` on every line.
datasource = dbutils.widgets.get("tipo_coleta")
datasource_config = datasources[datasource]

url = datasource_config['url']
base_path = datasource_config['path']
# makedirs creates missing parent directories as well and does not fail when
# the directory already exists, so no separate existence check is needed.
os.makedirs(base_path, exist_ok=True)

# range() excludes its end value, so 1 is added to make the configured "stop"
# year part of the collection.
anos_start = datasource_config['anos']['start']
anos_stop = datasource_config['anos']['stop'] + 1
anos_step = datasource_config['anos']['step']
anos = list(range(anos_start, anos_stop, anos_step))

# Temporary location of the downloaded archive inside the target directory.
download_path = os.path.join(base_path, "download.zip")

sufix_not_remove = datasource_config['sufix_not_remove']
sufix_manage = datasource_config['sufix_manage']

# State codes substituted into the `{uf}` placeholder of per-state URLs.
# NOTE(review): 'DF' is not listed — presumably on purpose for the years
# collected per state; confirm before adding it.
ufs = [
    'AC', 'AL', 'AP', 'AM',
    'BA',
    'CE',
    'ES',
    'GO',
    'MA', 'MT', 'MS', 'MG',
    'PA', 'PB', 'PR', 'PE', 'PI', 'RJ',
    'RN', 'RS', 'RO', 'RR',
    'SC', 'SP', 'SE', 'TO'
]

# COMMAND ----------

def check_sufix(text, sufixes):
    """Return True when ``text`` ends with at least one of ``sufixes``.

    Args:
        text: String to test, e.g. a file name.
        sufixes: Iterable of suffix strings. A single string is accepted too
            and is treated as one suffix rather than being iterated character
            by character.

    Returns:
        bool: True if any suffix matches the end of ``text``; False otherwise,
        including when ``sufixes`` is empty.
    """
    if isinstance(sufixes, str):
        sufixes = (sufixes,)
    # str.endswith accepts a tuple and tests the candidates in order.
    return text.endswith(tuple(sufixes))


def get_data(url, download_path, timeout=60):
    """Download ``url`` and store the response body at ``download_path``.

    The file is written only for a 200 response, so the body of an error
    response never ends up on disk in place of the expected archive.

    Args:
        url: Address to fetch.
        download_path: Local file path that receives the response body.
        timeout: Seconds handed to ``requests.get`` so an unresponsive server
            cannot block the collection indefinitely.

    Returns:
        The ``requests`` response object, whatever its status code; callers
        inspect ``status_code`` to decide what to do next.

    Raises:
        Whatever ``requests.get`` raises (for instance on a timeout or a
        connection failure) and ``OSError`` if the file cannot be written.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return response
    with open(download_path, "wb") as open_file:
        open_file.write(response.content)
    return response


def unzip_download(download_path, unzip_path):
    """Extract every member of the zip at ``download_path`` into ``unzip_path``.

    Args:
        download_path: Path of the zip archive to read.
        unzip_path: Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(download_path, mode='r')
    with archive:
        archive.extractall(path=unzip_path)


def remove_files(path, sufix_not_remove=("BR.csv", "BRASIL.csv")):
    """Delete every entry directly under ``path`` whose name lacks a kept suffix.

    Args:
        path: Directory to clean. A leading "/dbfs" component is dropped
            before the path is handed to ``dbutils.fs``; only the leading
            prefix is removed, never an occurrence further down the path.
        sufix_not_remove: Suffixes of the file names that must be kept. The
            default is an immutable tuple so it cannot be altered between
            calls.
    """
    dbfs_prefix = "/dbfs"
    if path == dbfs_prefix or path.startswith(dbfs_prefix + "/"):
        path = path[len(dbfs_prefix):]

    for entry in dbutils.fs.ls(path):
        if not check_sufix(entry.name, sufix_not_remove):
            dbutils.fs.rm(entry.path)


def manage_files(path, sufix_to_move=('csv', 'txt')):
    """Move the files directly under ``path`` into one sub-directory per suffix.

    For each suffix ``s`` a directory ``<path>/<s>`` is created and every
    listed entry whose name ends with ``s`` is moved into it.

    Args:
        path: Directory holding the extracted files. A leading "/dbfs"
            component is dropped before the path is handed to ``dbutils.fs``;
            only the leading prefix is removed.
        sufix_to_move: Suffixes to sort by; each one is also used as the name
            of its target sub-directory. The default is an immutable tuple.
    """
    dbfs_prefix = "/dbfs"
    db_path = path
    if path == dbfs_prefix or path.startswith(dbfs_prefix + "/"):
        db_path = path[len(dbfs_prefix):]

    # Listed once, before any directory is created, so the freshly created
    # sub-directories are never part of what gets moved.
    all_files = dbutils.fs.ls(db_path)
    for s in sufix_to_move:
        target_dir = os.path.join(db_path, s)
        dbutils.fs.mkdirs(target_dir)
        for entry in all_files:
            if entry.name.endswith(s):
                dbutils.fs.mv(entry.path, target_dir)


def get_years(url, anos, uf, download_path, base_path):
    """Download and extract one archive per year in ``anos``.

    Args:
        url: URL template; it is formatted with the keywords ``ano`` and
            ``uf`` for every year.
        anos: Iterable of years to collect.
        uf: Value substituted for ``uf`` in the template and shown in the
            progress output.
        download_path: File path where each downloaded archive is stored
            before extraction (reused for every year).
        base_path: Directory into which the archives are extracted.

    A year whose download does not return status 200 is reported and skipped.
    A failed extraction is reported, with its cause, and the loop moves on to
    the next year.
    """
    for ano in anos:
        print(f"{uf}-{ano}")
        resp = get_data(url.format(ano=ano, uf=uf), download_path)
        if resp.status_code != 200:
            print(f"Ignorado, razão: Código {resp.status_code}.")
            continue
        try:
            unzip_download(download_path, base_path)
        except Exception as err:
            # Stay best-effort, but let KeyboardInterrupt/SystemExit through
            # and show the cause instead of hiding it.
            print("erro ao tentar extrair os dados:", ano, "-", uf, "-", repr(err))


# COMMAND ----------

# Data sources whose name ends with "uf" are fetched once per state; every
# other source is fetched a single time with "BR" as the state value.
if datasource.endswith("uf"):
    print("Coletando dados de eleições por estado")

    # Give each state its own archive file. The name is derived from the file
    # part of `download_path` only, so directory names are never rewritten.
    download_root, download_ext = os.path.splitext(download_path)
    data = [
        (url, anos, uf, f"{download_root}_{uf}{download_ext}", base_path)
        for uf in ufs
    ]
    # Pool(1) has a single worker, so the states are handled one at a time.
    with Pool(1) as p:
        p.starmap(get_years, data)

else:
    print("Coletando dados de eleições nível nacional")
    get_years(url, anos, "BR", download_path, base_path)

print("Movendo arquivos...")
# Sort the files with the configured suffixes, the same list the cleanup loop
# below walks, so both steps always agree on which sub-directories exist.
manage_files(base_path, sufix_manage)

for i in sufix_manage:
    # remove_files drops the "/dbfs" prefix itself.
    remove_path = os.path.join(base_path, i)
    remove_files(remove_path, sufix_not_remove)
50 changes: 50 additions & 0 deletions src/01.raw/tse/datasources.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"candidatos":{
"url": "https://cdn.tse.jus.br/estatistica/sead/odsele/consulta_cand/consulta_cand_{ano}.zip",
"path": "/dbfs/mnt/datalake/tse/candidatos/",
"sufix_not_remove": ["BRASIL.csv"],
"sufix_manage": ["csv", "txt"],
"anos": {
"start": 2018,
"stop": 2022,
"step": 2
}
},

"bem_candidatos":{
"url": "https://cdn.tse.jus.br/estatistica/sead/odsele/bem_candidato/bem_candidato_{ano}.zip",
"path": "/dbfs/mnt/datalake/tse/bem_candidatos/",
"sufix_not_remove": ["BRASIL.csv"],
"sufix_manage": ["csv", "txt"],
"anos": {
"start": 2000,
"stop": 2022,
"step": 2
}
},

"votacao":{
"url": "https://cdn.tse.jus.br/estatistica/sead/odsele/votacao_secao/votacao_secao_{ano}_BR.zip",
"path": "/dbfs/mnt/datalake/tse/votacao/",
"sufix_not_remove": [".csv", ".txt"],
"sufix_manage": ["csv", "txt"],
"anos": {
"start": 2002,
"stop": 2022,
"step": 4
}
},

"votacao_uf":{
"url": "https://cdn.tse.jus.br/estatistica/sead/odsele/votacao_secao/votacao_secao_{ano}_{uf}.zip",
"path": "/dbfs/mnt/datalake/tse/votacao/",
"sufix_not_remove": [".csv", ".txt"],
"sufix_manage": ["csv", "txt"],
"anos": {
"start": 2000,
"stop": 2022,
"step": 4
}
}
}

0 comments on commit cb1493d

Please sign in to comment.