Skip to content

Original preparation

Peter edited this page Oct 16, 2017 · 2 revisions

After Loading external data to SQL, the table was prepared with the following procedures,

SELECT s.subdivision, c.nomemunic , c.Codmun, oficial.name2lex(c.nomemunic) as xx
FROM tmpcsv_ibge_cities c INNER JOIN tmpcsv_state_codes s ON s.idIBGE=c.CodUF::int;

-- -- -- -- --

--
-- Preparo das fontes de dados:
--   php src/wikitext2CSV.php < ../br-cities.wiki.txt > /tmp/cities_from_wiki.csv
--   wget -c -O /tmp/localidade-v1.csv https://github.com/okfn-brasil/lexml-vocabulary/raw/master/data/localidade-v1.csv
--   wget -c -O /tmp/br-state-codes.csv https://github.com/datasets-br/state-codes/raw/master/data/br-state-codes.csv

CREATE VIEW tmpcsv_cities_wiki_lex AS
  SELECT c.*, concat('br;', s.lexlabel, ';', oficial.name2lex(c.name)) as lexlabel
  FROM tmpcsv_cities_wiki c INNER JOIN tmpcsv_state_codes s ON s.subdivision=c.state
;

-- bug report:
CREATE VIEW tmpcsv_cities_wiki_difflexv1 AS  -- differences from lexml_v1
SELECT * FROM tmpcsv_cities_wiki_lex WHERE lexlabel NOT IN (
  SELECT w.lexlabel  from tmpcsv_lexml_loc l INNER JOIN tmpcsv_cities_wiki_lex w ON w.lexlabel=l.about; 
);

-- prepare:
CREATE VIEW tmpcsv_cities_wiki_full AS
    SELECT name, state, wdid as "wdId", base36_encode(oficial.br_city_id_gap(name)*(row_number() OVER ()),4) as "idSeq", 
           oficial.name2lex(name) as "lexLabel",
           NULL::int AS creation, NULL::int AS extinction, NULL::text AS "postalCode_ranges", NULL::text AS notes 
    FROM tmpcsv_cities_wiki 
    ORDER BY name
;
-- Simulating ideal idSeq for firsts, base36_encode(oficial.br_city_id_gap(name)*(row_number() OVER ()),3) as "idSeq"  

COPY (
  SELECT f.name, f.state, f."wdId", c.Codmun AS "idIBGE", f."lexLabel",
       f.creation, f.extinction, f."postalCode_ranges", f.notes
  FROM (tmpcsv_ibge_cities c INNER JOIN tmpcsv_state_codes s ON s.idIBGE=c.CodUF::int) 
     LEFT JOIN tmpcsv_cities_wiki_full f ON f.state=s.subdivision AND oficial.name2lex(c.nomemunic)=f."lexLabel"
) TO '/tmp/br-cities.csv' DELIMITER ',' CSV HEADER;

Checking by IBGE

Self-load to SQL and some other scripts.

CREATE FOREIGN TABLE tmpcsv_city_codes (
	-- cd /tmp; git clone https://github.com/datasets-br/city-codes.git
	name text,
	state text,
	wdId text,
	idIBGE int,
	lexLabel text,
	creation int,
	extinction int,
	postalCode_ranges text,
	notes text
) SERVER csv_files OPTIONS ( 
     filename '/tmp/city-codes/data/br-city-codes.csv', --change
     format 'csv', 
     header 'true'
);

-- obtido em https://github.com/chandez/Estados-Cidades-IBGE ou outro
-- \d municipio     Table "public.municipio"  tem id, codigo, uf, nome.

create view vw_municipio AS select oficial.name2lex(nome) as lexlabel, nome, uf, codigo from municipio;

select t.*, c.codigo
from vw_municipio c inner join tmpcsv_city_codes t ON t.lexlabel=c.lexlabel AND t.state=c.uf;



---

SELECT cep.metahash1, cep.cidade, c2.name, c2.state, c2.wdid, c2.idibge, cep.cep
FROM tmpcsv_city_codes c2 INNER JOIN vw_cep cep ON cep.metahash1 = c2.state||metaphone(c2.name,4)
WHERE (lower(c2.state) ||';'|| c2.lexlabel) NOT IN (
  select distinct v.lexlabel 
  FROM vw_cep v INNER JOIN tmpcsv_city_codes c 
       ON lower(c.state)||';'||c.lexLabel = v.lexlabel
) 
ORDER BY 1