Skip to content

Commit

Permalink
adding updated merges through to roster inactive. should remove settlement merge steps since they are not being used on the site/we're using the chicago reporter data directly from airtable
Browse files Browse the repository at this point in the history
  • Loading branch information
ashwin934 committed Oct 2, 2024
1 parent c17dd68 commit cde8541
Show file tree
Hide file tree
Showing 524 changed files with 6,232 additions and 2,001 deletions.
21 changes: 21 additions & 0 deletions merge/01_roster_1936-2017_2017-04_p058155/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Derive this merge step's number, the previous step's folder, and the
# name of the individual dataset from the name of the directory that
# contains this Makefile (e.g. 01_roster_1936-2017_2017-04_p058155).

# use absolute path of this makefile in case this is called elsewhere with include
merge_path := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
# $(dir ...) keeps a trailing slash, so the last "/"-separated word is the
# directory name itself
merge_dirname := $(lastword $(subst /, ,$(merge_path)))
split := $(subst _, ,$(merge_dirname))

# leading word of the directory name is the (zero-padded) merge number;
# bc strips the padding, so 01 - 1 evaluates to 0
merge_number := $(firstword $(split))
prev_merge_number := $(shell echo $(merge_number) - 1 | bc)

# The first merge step has no predecessor. The variable must be expanded
# with $(...) here: comparing the bare word prev_merge_number against 0 is
# never true and would always fall through to the wildcard lookup.
ifeq ($(prev_merge_number),0)
prev_merge_folder :=
else
# NOTE(review): this glob matches any sibling whose number merely ends in
# $(prev_merge_number) (e.g. 1 matches 01_, 11_, 21_) -- confirm whether a
# stricter pattern is wanted.
prev_merge_folder := $(wildcard ../*$(prev_merge_number)_*)
endif

without_number := $(wordlist 2,$(words $(split)),$(split))
# re-join the remaining words with "_" ("$(eval) " expands to a single space)
# NOTE(review): consecutive underscores in the directory name collapse to one
# here, since make treats runs of whitespace as a single word separator.
individual := $(subst $(eval) ,_,$(without_number))

# output/*.csv.gz: src/*.py ../$(individual)/export/output/*.csv.gz
.PHONY: all
all:
	echo $(prev_merge_folder)
3 changes: 1 addition & 2 deletions merge/01_roster_1936-2017_2017-04_p058155/output/merge.log
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
running src/merge.py
running merge.py
Creating reference data from filewith intrafile ID: roster_1936-2017_2017-04_ID
609 rows dropped due to merge!=1. 31789 rows remaining
Data reshaped wide (31789 rows) to long (45539 rows) by star columns
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# from /Users/ashwin/dev/inv/update/chicago-police-data/merge_update/01_roster_1936-2017_2017-04_p058155/src/merge.py
1 change: 1 addition & 0 deletions merge/01_roster_1936-2017_2017-04_p058155/src/foia_data.py
23 changes: 11 additions & 12 deletions merge/01_roster_1936-2017_2017-04_p058155/src/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pandas as pd
import __main__

from merge_functions import ReferenceData
from reference_data import ReferenceData
import setup


Expand Down Expand Up @@ -35,14 +35,13 @@ def get_setup():
return setup.do_setup(script_path, args)


cons, log = get_setup()
data_df = pd.read_csv(cons.input_profiles_file)
ReferenceData(data_df=data_df, uid=cons.universal_id,
data_id=cons.intrafile_id,
starting_uid = cons.starting_uid,
log=log)\
.remerge_to_file(cons.input_remerge_file,
cons.output_remerge_file,
cons.csv_opts)\
.write_reference(cons.output_reference_file,
cons.csv_opts)
if __name__ == "__main__":
cons, log = get_setup()

df = pd.read_csv(cons.input_profiles_file)
rd = ReferenceData.from_first_file(df,
id=cons.universal_id,
data_id=cons.intrafile_id,
starting_id=cons.starting_uid,
log=log)\
.write_reference(cons.output_reference_file, cons.csv_opts)
132 changes: 102 additions & 30 deletions merge/02_unit-history__2016-12_p052262/output/merge.log
Original file line number Diff line number Diff line change
@@ -1,36 +1,108 @@
running src/merge.py
Loading reference data with 45539 rows and 31789 universal IDs
Adding file with intrafile ID: unit-history__2016-12_ID
running merge.py
Adding column [F4FN] to data with ID = UID
Adding column [F4LN] to data with ID = UID
Adding column [L4FN] to data with ID = UID
Adding column [BY_to_CA] to data with ID = UID
Adding column [F4FN] to data with ID = unit-history__2016-12_ID
Adding column [F4LN] to data with ID = unit-history__2016-12_ID
Adding column [L4FN] to data with ID = unit-history__2016-12_ID
Beginning loop_merge.
Merge Report:
31355 Total Merged. 98.63% of ref and 97.97% of sup Merged.
434 Unmerged in ref. 1.37% Unmerged.
651 Unmerged in sup. 2.03% Unmerged.

first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-race-current_unit 22227
first_name_NS-last_name_NS-appointed_date-birth_year-gender-race-current_unit 4562
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-race 2543
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-race-suffix_name-current_unit 906
first_name_NS-last_name_NS-appointed_date-birth_year-gender-race 524
first_name_NS-last_name_NS-appointed_date-birth_year-gender-race-suffix_name-current_unit 399
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-race-suffix_name 128
first_name_NS-last_name_NS-appointed_date-birth_year-gender-race-suffix_name 23
L4FN-birth_year-race-gender-current_unit-appointed_date 15
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-middle_initial2-gender-race-current_unit 7
first_name_NS-last_name_NS-birth_year-gender 6
first_name_NS-F4LN-appointed_date-birth_year-middle_initial-gender-race-current_unit 4
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-current_unit 3
race-gender-birth_year-appointed_date-last_name_NS 2
first_name_NS-last_name_NS-gender-race-current_unit 1
F4FN-last_name_NS-appointed_date-birth_year-gender-race-current_unit 1
first_name_NS-last_name_NS-appointed_date-birth_year-gender-current_unit 1
first_name_NS-last_name_NS-appointed_date-birth_year-gender-suffix_name-current_unit 1
first_name_NS-last_name_NS-appointed_date-middle_initial-gender-race-current_unit 1
first_name_NS-last_name_NS-appointed_date-middle_initial-gender-race 1
Name: matched_on, dtype: int64
Adding column [BY_to_CA] to data with ID = unit-history__2016-12_ID
Running merge (1 of 9): base
Required column types: first_name, last_name, appointed_date
Optional column types: star, birth_year, middle_initial, gender, race, suffix_name, current_unit
Alternate columns for types: first_name: first_name_NS, F4FN; last_name: last_name_NS, F4LN; birth_year: birth_year, current_age, current_age_m1, current_age_p1,
31331 rows merged. 98.56% of ref and 97.89% of sup merged.
458 unmerged in ref. 1.44% unmerged.
675 unmerged in sup. 2.11% unmerged.

Running merge (2 of 9): multirow merges
Required column types: first_name
Optional column types: star, last_name, appointed_date, birth_year, middle_initial, gender, race, suffix_name, current_unit
Alternate columns for types: first_name: first_name_NS, F4FN; last_name: last_name_NS, F4LN, ; birth_year: birth_year, current_age, current_age_m1, current_age_p1,
0 rows merged. 0.0% of ref and 0.0% of sup merged.
458 unmerged in ref. 1.44% unmerged.
675 unmerged in sup. 2.11% unmerged.

Running merge (3 of 9): null appointed date (reference) merge
Required column types: first_name, last_name, birth_year
Optional column types: star, appointed_date, middle_initial, gender, race, suffix_name, current_unit
Alternate columns for types: first_name: first_name_NS, F4FN; last_name: last_name_NS, F4LN; birth_year: birth_year, current_age
5 rows merged. 0.02% of ref and 0.02% of sup merged.
453 unmerged in ref. 1.43% unmerged.
670 unmerged in sup. 2.09% unmerged.

Running merge (4 of 9): null appointed date (supplemental) merge
Required column types: first_name, last_name, birth_year
Optional column types: star, appointed_date, middle_initial, gender, race, suffix_name, current_unit
Alternate columns for types: first_name: first_name_NS, F4FN; last_name: last_name_NS, F4LN; birth_year: birth_year, current_age
0 rows merged. 0.0% of ref and 0.0% of sup merged.
453 unmerged in ref. 1.43% unmerged.
670 unmerged in sup. 2.09% unmerged.

Running merge (5 of 9): null appointed date and birth year reference merge
Required column types: first_name, last_name
Optional column types: star, appointed_date, birth_year, middle_initial, gender, race, suffix_name, current_unit
Alternate columns for types: first_name: first_name_NS, F4FN; last_name: last_name_NS, F4LN; birth_year: birth_year, current_age,
1 rows merged. 0.0% of ref and 0.0% of sup merged.
452 unmerged in ref. 1.42% unmerged.
669 unmerged in sup. 2.09% unmerged.

Running merge (6 of 9): null appointed date and birth year supplemental merge
Required column types: first_name, last_name
Optional column types: star, appointed_date, birth_year, middle_initial, gender, race, suffix_name, current_unit
Alternate columns for types: first_name: first_name_NS, F4FN; last_name: last_name_NS, F4LN; birth_year: birth_year, current_age,
0 rows merged. 0.0% of ref and 0.0% of sup merged.
452 unmerged in ref. 1.42% unmerged.
669 unmerged in sup. 2.09% unmerged.

Running merge (7 of 9): married - last name change merge
Required column types: star, first_name, appointed_date
Optional column types: last_name, birth_year, middle_initial, gender, race, suffix_name, current_unit
Alternate columns for types: first_name: first_name_NS, F4FN; last_name: last_name_NS, F4LN, ; birth_year: birth_year, current_age, current_age_m1, current_age_p1,
13 rows merged. 0.04% of ref and 0.04% of sup merged.
439 unmerged in ref. 1.38% unmerged.
656 unmerged in sup. 2.05% unmerged.

Running merge (8 of 9): last name substring merge
Required column types: first_name, appointed_date
Optional column types: star, last_name, birth_year, middle_initial, gender, race, suffix_name, current_unit
Alternate columns for types: first_name: first_name_NS, F4FN; last_name: last_name_NS, L4LN, ; birth_year: birth_year, current_age, current_age_m1, current_age_p1,
1 rows merged. 0.0% of ref and 0.0% of sup merged.
438 unmerged in ref. 1.38% unmerged.
655 unmerged in sup. 2.05% unmerged.

Running merge (9 of 9): manual/custom merges
4 rows merged. 0.01% of ref and 0.01% of sup merged.
434 unmerged in ref. 1.37% unmerged.
651 unmerged in sup. 2.03% unmerged.

Final Merge Report:
31355 rows merged. 98.63% of ref and 97.97% of sup merged.
434 unmerged in ref. 1.37% unmerged.
651 unmerged in sup. 2.03% unmerged.
matched_on
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-race-current_unit 22234
first_name_NS-last_name_NS-appointed_date-birth_year-gender-race-current_unit 4562
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-race 2543
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-race-suffix_name-current_unit 906
first_name_NS-last_name_NS-appointed_date-birth_year-gender-race 524
first_name_NS-last_name_NS-appointed_date-birth_year-gender-race-suffix_name-current_unit 399
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-race-suffix_name 128
first_name_NS-last_name_NS-appointed_date-birth_year-gender-race-suffix_name 23
first_name_NS-appointed_date-birth_year-middle_initial-gender-race-current_unit 11
first_name_NS-F4LN-appointed_date-birth_year-middle_initial-gender-race-current_unit 4
first_name_NS-last_name_NS-appointed_date-birth_year-middle_initial-gender-current_unit 3
first_name_NS-appointed_date-birth_year-gender-race-current_unit 3
first_name_NS-last_name_NS-birth_year-gender-race 3
first_name_NS-last_name_NS-birth_year-gender 2
last_name_NS-appointed_date-birth_year-middle_initial-gender-race 1
L4FN-birth_year-race-gender-current_unit-appointed_date 1
first_name_NS-last_name_NS-birth_year-middle_initial-gender-race-current_unit 1
first_name_NS-last_name_NS-appointed_date-middle_initial-gender-race 1
first_name_NS-last_name_NS-gender-race-current_unit 1
F4FN-last_name_NS-appointed_date-birth_year-gender-race-current_unit 1
first_name_NS-last_name_NS-appointed_date-middle_initial-gender-race-current_unit 1
first_name_NS-last_name_NS-appointed_date-birth_year-gender-current_unit 1
first_name_NS-last_name_NS-appointed_date-birth_year-gender-suffix_name-current_unit 1
last_name_NS-appointed_date-birth_year-gender-race-current_unit 1
Name: count, dtype: int64
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# from /Users/ashwin/dev/inv/update/chicago-police-data/merge_update/02_unit-history__2016-12_p052262/src/merge.py
1 change: 1 addition & 0 deletions merge/02_unit-history__2016-12_p052262/output/merge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# from src/merge.py
1 change: 1 addition & 0 deletions merge/02_unit-history__2016-12_p052262/src/filters.py
1 change: 1 addition & 0 deletions merge/02_unit-history__2016-12_p052262/src/foia_data.py
70 changes: 47 additions & 23 deletions merge/02_unit-history__2016-12_p052262/src/merge.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
#!/usr/bin/env python3
#
# Author(s): Roman Rivera (Invisible Institute)
# Author(s): Roman Rivera (Invisible Institute) / Ashwin Sharma (Invisible Institute)

'''merge script for 02_unit-history__2016-12_p052262'''

import pandas as pd
import __main__

from merge_functions import ReferenceData
from reference_data import ReferenceData
from default_merges import default_merges, whitelist_merge
import setup


Expand All @@ -23,17 +24,30 @@ def get_setup():
'input_reference_file': 'input/officer-reference.csv.gz',
'output_reference_file': 'output/officer-reference.csv.gz',
'universal_id': 'UID',
'data_id': 'unit-history__2016-12_ID',
'input_profiles_file' : 'input/unit-history__2016-12_profiles.csv.gz',
'add_cols' : ['F4FN', 'F4LN', 'L4FN'],
'loop_merge' : {
'custom_merges' : [
['L4FN', 'birth_year', 'race', 'gender', 'current_unit', 'appointed_date'],
['race', 'gender', 'birth_year', 'appointed_date', 'last_name_NS'],
['first_name_NS', 'last_name_NS', 'birth_year', 'gender'],
['first_name_NS', 'last_name_NS', 'gender', 'race', 'current_unit']
],
'verbose' : True,
},
'add_cols' : ['F4FN', 'F4LN', 'L4FN', "BY_to_CA"],
'null_flag_cols': ['appointed_date', 'birth_year'],
'whitelist': {
'matched_on':
{0: 'first_name_NS-last_name_NS-birth_year-middle_initial-gender-race-current_unit',
1: 'first_name_NS-last_name_NS-gender-race-current_unit',
2: 'L4FN-birth_year-race-gender-current_unit-appointed_date',
3: 'last_name_NS-appointed_date-birth_year-middle_initial-gender-race',
4: 'last_name_NS-appointed_date-birth_year-gender-race-current_unit'},
'UID':
{0: 100808,
1: 116595,
2: 121423,
3: 111109,
4: 104139},
'unit-history__2016-12_ID':
{0: 798.0,
1: 16688.0,
2: 7219.0,
3: 11101.0,
4: 19573.0}
},
'input_remerge_file' : 'input/unit-history__2016-12.csv.gz',
'output_remerge_file' : 'output/unit-history__2016-12.csv.gz'
}
Expand All @@ -45,17 +59,27 @@ def get_setup():

return setup.do_setup(script_path, args)

if __name__ == "__main__":
cons, log = get_setup()

cons, log = get_setup()
ref_df = pd.read_csv(cons.input_reference_file)
sup_df = pd.read_csv(cons.input_profiles_file)

ref_df = pd.read_csv(cons.input_reference_file)
sup_df = pd.read_csv(cons.input_profiles_file)
ref_df.appointed_date = pd.to_datetime(ref_df.appointed_date)
sup_df.appointed_date = pd.to_datetime(sup_df.appointed_date)

ReferenceData(ref_df, uid=cons.universal_id, log=log)\
.add_sup_data(sup_df, add_cols=cons.add_cols)\
.loop_merge(**cons.loop_merge)\
.append_to_reference()\
.remerge_to_file(cons.input_remerge_file,
cons.output_remerge_file,
cons.csv_opts)\
.write_reference(cons.output_reference_file, cons.csv_opts)
# 2 manual values: amy petrouski had an appointed date update
# jacqueline mizera = jackie mizera
whitelist_df = pd.DataFrame(cons.whitelist)
merges = default_merges + [whitelist_merge(whitelist_df, cons.universal_id, cons.data_id)]

rd = ReferenceData(ref_df, id=cons.universal_id,add_cols=cons.add_cols,
null_flag_cols=cons.null_flag_cols,
log=log)\
.add_sup_data(sup_df, data_id=cons.data_id, add_cols=cons.add_cols)\
.loop_merge(merges=merges)\
.append_to_reference() \
.remerge_to_file(cons.input_remerge_file,
cons.output_remerge_file,
cons.csv_opts)\
.write_reference(cons.output_reference_file, cons.csv_opts)
1 change: 1 addition & 0 deletions merge/02_unit-history__2016-12_p052262/src/merge_data.py
Loading

0 comments on commit cde8541

Please sign in to comment.