-
Notifications
You must be signed in to change notification settings - Fork 16
Original preparation
Peter edited this page Oct 16, 2017
·
2 revisions
After Loading external data to SQL, the table was prepared with the following procedures,
SELECT s.subdivision, c.nomemunic , c.Codmun, oficial.name2lex(c.nomemunic) as xx
FROM tmpcsv_ibge_cities c INNER JOIN tmpcsv_state_codes s ON s.idIBGE=c.CodUF::int;
-- -- -- -- --
--
-- Preparo das fontes de dados:
-- php src/wikitext2CSV.php < ../br-cities.wiki.txt > /tmp/cities_from_wiki.csv
-- wget -c -O /tmp/localidade-v1.csv https://github.com/okfn-brasil/lexml-vocabulary/raw/master/data/localidade-v1.csv
-- wget -c -O /tmp/br-state-codes.csv https://github.com/datasets-br/state-codes/raw/master/data/br-state-codes.csv
CREATE VIEW tmpcsv_cities_wiki_lex AS
SELECT c.*, concat('br;', s.lexlabel, ';', oficial.name2lex(c.name)) as lexlabel
FROM tmpcsv_cities_wiki c INNER JOIN tmpcsv_state_codes s ON s.subdivision=c.state
;
-- bug report:
CREATE VIEW tmpcsv_cities_wiki_difflexv1 AS -- differences from lexml_v1
SELECT * FROM tmpcsv_cities_wiki_lex WHERE lexlabel NOT IN (
SELECT w.lexlabel from tmpcsv_lexml_loc l INNER JOIN tmpcsv_cities_wiki_lex w ON w.lexlabel=l.about;
);
-- prepare:
CREATE VIEW tmpcsv_cities_wiki_full AS
SELECT name, state, wdid as "wdId", base36_encode(oficial.br_city_id_gap(name)*(row_number() OVER ()),4) as "idSeq",
oficial.name2lex(name) as "lexLabel",
NULL::int AS creation, NULL::int AS extinction, NULL::text AS "postalCode_ranges", NULL::text AS notes
FROM tmpcsv_cities_wiki
ORDER BY name
;
-- Simulating ideal idSeq for firsts, base36_encode(oficial.br_city_id_gap(name)*(row_number() OVER ()),3) as "idSeq"
COPY (
SELECT f.name, f.state, f."wdId", c.Codmun AS "idIBGE", f."lexLabel",
f.creation, f.extinction, f."postalCode_ranges", f.notes
FROM (tmpcsv_ibge_cities c INNER JOIN tmpcsv_state_codes s ON s.idIBGE=c.CodUF::int)
LEFT JOIN tmpcsv_cities_wiki_full f ON f.state=s.subdivision AND oficial.name2lex(c.nomemunic)=f."lexLabel"
) TO '/tmp/br-cities.csv' DELIMITER ',' CSV HEADER;
Self-load to SQL and some other scripts.
CREATE FOREIGN TABLE tmpcsv_city_codes (
-- cd /tmp; git clone https://github.com/datasets-br/city-codes.git
name text,
state text,
wdId text,
idIBGE int,
lexLabel text,
creation int,
extinction int,
postalCode_ranges text,
notes text
) SERVER csv_files OPTIONS (
filename '/tmp/city-codes/data/br-city-codes.csv', --change
format 'csv',
header 'true'
);
-- obtido em https://github.com/chandez/Estados-Cidades-IBGE ou outro
-- \d municipio Table "public.municipio" tem id, codigo, uf, nome.
create view vw_municipio AS select oficial.name2lex(nome) as lexlabel, nome, uf, codigo from municipio;
select t.*, c.codigo
from vw_municipio c inner join tmpcsv_city_codes t ON t.lexlabel=c.lexlabel AND t.state=c.uf;
---
SELECT cep.metahash1, cep.cidade, c2.name, c2.state, c2.wdid, c2.idibge, cep.cep
FROM tmpcsv_city_codes c2 INNER JOIN vw_cep cep ON cep.metahash1 = c2.state||metaphone(c2.name,4)
WHERE (lower(c2.state) ||';'|| c2.lexlabel) NOT IN (
select distinct v.lexlabel
FROM vw_cep v INNER JOIN tmpcsv_city_codes c
ON lower(c.state)||';'||c.lexLabel = v.lexlabel
)
ORDER BY 1