-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalise.sql
155 lines (133 loc) · 5.5 KB
/
analise.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
-- copiar arquivo Cartorios~utf8.csv para /tmp
DROP FOREIGN TABLE IF EXISTS vw_cartorios_original CASCADE;
CREATE FOREIGN TABLE vw_cartorios_original (
"UF" text, "CNPJ" text, "CNS" text,
"Data de Instalação" text, "Nome Oficial" text,
"Nome Fantasia" text, "Endereço" text, "Bairro" text,
"Município" text, "CEP" text, "Nome do Titular" text,
"Nome do Substituto" text, "Nome do Juiz" text, "Homepage" text,
"Email" text, "Telefone" text, "Fax" text, "Observação" text,
"Última Atualização" text, "Horário de Funcionamento" text,
"Área de Abrangência" text, "Atribuições" text, "Comarca" text, "Entrância" text,
nada text
) SERVER csv_files OPTIONS (
filename '/tmp/Cartorios2013~utf8.csv', -- pasta permitida para qualquer PostgreSQL
format 'csv',
delimiter ';',
quote '"',
header 'true'
);
CREATE SCHEMA IF NOT EXISTS lib;
CREATE or replace FUNCTION lib.brdate2isodate(text,text default NULL) RETURNS text AS $f$
SELECT CASE
WHEN array_length(x,1)>1 THEN concat(x[3],'-',x[2],'-',x[1])
ELSE CASE WHEN $2 IS NOT NULL THEN $2||$1 ELSE NULL END
END
FROM (SELECT regexp_split_to_array(COALESCE($1,''), '/')) t(x)
$f$ language SQL IMMUTABLE;
CREATE or replace FUNCTION array_distinct_sort (ANYARRAY) RETURNS ANYARRAY AS $f$
SELECT ARRAY(SELECT DISTINCT unnest($1) ORDER BY 1)
$f$ language SQL strict IMMUTABLE;
CREATE or replace FUNCTION lib.supertrim(
-- sanitize text
text, -- 1. input string (many words separed by spaces or punctuation)
text DEFAULT ' ' -- 2. output separator
) RETURNS text AS $f$
SELECT
TRIM( regexp_replace( -- for review: regex(regex()) for ` , , ` remove
TRIM(regexp_replace($1,E'[\\n\\r \\+/,;:\\(\\)\\{\\}\\[\\]="\\s ]*[\\+/,;:\\(\\)\\{\\}\\[\\]="]+[\\+/,;:\\(\\)\\{\\}\\[\\]="\\s ]*|[\\s ]+[–\\-][\\s ]+',
$2, 'g'),' ,'), -- s*ps*|s-s
E'[\\s ;\\|"]+[\\.\'][\\s ;\\|"]+|[\\s ;\\|"]+', -- s.s|s
$2,
'g'
))
$f$ LANGUAGE SQL IMMUTABLE;
CREATE or replace FUNCTION lib.cart_sanitize_abrangencia(text,text default ';') RETURNS text AS $f$
-- falta debug de trim, maldito ainda retorna espaco.
SELECT array_to_string( array_distinct_sort(array_agg(lower(x))), $2 )
FROM (
SELECT CASE WHEN x='' OR x IS NULL THEN NULL ELSE trim(lib.supertrim(x),chr(160)) END as x
FROM (
SELECT trim( regexp_split_to_table(COALESCE($1,''), ',') , ' ;.,')
) t(x)
) t2
$f$ language SQL IMMUTABLE;
-- == ==
-- DROP VIEW vw_cartorios_basico cascade;
CREATE or replace VIEW vw_cartorios_basico AS
SELECT "UF" uf, trim("CNPJ") cnpj, trim("CNS") cns,
lower(trim("Nome Oficial",' ,.')) nome_oficial,
trim("Município",' ,;.') municipio, "CEP" cep, lower(trim("Homepage")) homepage,
lib.brdate2isodate("Data de Instalação")::date as data_instalacao, -- falta sanitizar
lib.cart_sanitize_abrangencia("Atribuições") atribuicoes,
lib.cart_sanitize_abrangencia("Área de Abrangência") area_abrangencia,
lower(trim("Comarca",' ,.;')) as comarca
FROM vw_cartorios_original
;
-- -- -- -- --
-- RELATORIOS:
CREATE or replace VIEW vw_cartorios_relat01_datas AS
SELECT distinct data_instalacao as data_instalacao
FROM vw_cartorios_basico
ORDER BY 1 desc
; -- 2012-11-19, 2012-11-09, 2012-09-19. Determinou 2013 como ano de referencia.
CREATE or replace VIEW vw_cartorios_relat02_atribs AS
SELECT regexp_split_to_table(atribuicoes,';') as atribuicoes,
count(*) as n
FROM vw_cartorios_basico
GROUP BY 1 ORDER BY 1
;
CREATE or replace VIEW vw_cartorios_relat03_areas AS
SELECT uf, replace(regexp_split_to_table(area_abrangencia,';'),'município de','') as area_abrangencia,
count(*) as n
FROM vw_cartorios_basico
GROUP BY 1,2
ORDER BY 3 desc, 1,2
;
------
CREATE or replace VIEW vw_cartorios_relat04_nonrepeat_v1 AS
SELECT cns, array_distinct_sort(array_agg(cnpj)) as cnpjs,
jsonb_agg(jsonb_build_object('cnpj',cnpj, 'uf',uf, 'cep',cep, 'atribuicoes',atribuicoes)) as info
FROM vw_cartorios_basico
group by 1,2 having count(*)>1
ORDER BY 1
;
----
CREATE or replace VIEW vw_cartorios_relat04_repetidos_v2 AS
SELECT cns,
round(avg(1+length(regexp_replace(atribuicoes, '[^;]+', '','g')))) as atrib_len_avg,
count(*) n,
count(distinct cnpj) as cnpjs,
count(distinct cep) as ceps
FROM vw_cartorios_basico
group by 1 having count(*)>1
ORDER BY 3 desc, 1
;
CREATE or replace VIEW vw_cartorios_relat04_repetidos_v3 AS
SELECT cnpj,
round(avg(1+length(regexp_replace(atribuicoes, '[^;]+', '','g')))) as atrib_len_avg,
count(*) n,
count(distinct cns) as cnss,
count(distinct cep) as ceps
FROM vw_cartorios_basico
group by 1 having count(*)>1
ORDER BY 3 desc, 1
; -- 13.100.722/0001-60 é o Tribunal de Justica do Estado da Bahia.
-- 09.444.530/0001-01 é o Tribunal de Justica do Estado do Ceara.
-- -- -- -- --
-- OUTPUT:
COPY (select * from vw_cartorios_basico)
TO '/tmp/cartorios2013_basico.csv' HEADER CSV;
COPY (select * from vw_cartorios_relat02_atribs)
TO '/tmp/cartorios2013_relat02_atribs.csv' HEADER CSV;
COPY (select uf, trim(substr(area_abrangencia,0,30)) area_abrangencia,
sum(n) as n
from vw_cartorios_relat03_areas
group by 1,2
having sum(n)>2
order by 3 desc, 1,2
) TO '/tmp/cartorios2013_relat03_areas_multi.csv' HEADER CSV;
COPY (select * from vw_cartorios_relat04_repetidos_v2)
TO '/tmp/cartorios_relat04_repetidos_v2.csv' HEADER CSV;
COPY (select * from vw_cartorios_relat04_repetidos_v3)
TO '/tmp/cartorios_relat04_repetidos_v3.csv' HEADER CSV;