Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: pull v2 api #361

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/workflows/pr_check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Python tests

# Run the Python test suite for every PR targeting master, or on demand.
on:
  pull_request:
    branches:
      - master
  workflow_dispatch:

env:
  # All the Python code under test lives in the download/ directory.
  working_dir: download

jobs:
  run_tests:
    name: Run tests on Python download code
    runs-on: ubuntu-22.04
    defaults:
      run:
        # Execute every `run:` step from the download/ directory.
        working-directory: ${{ env.working_dir }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          # Interpreter version is pinned in download/.python-version;
          # pip downloads are cached keyed on requirements.txt.
          python-version-file: ${{ env.working_dir }}/.python-version
          cache: pip
          cache-dependency-path: ${{ env.working_dir }}/requirements.txt
      - run: pip install -r requirements.txt
      - name: Run tests
        run: pytest tests/test_*.py

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ inputs
.local
**/__pycache__
build/candidates.xlsx
.vscode/
21 changes: 21 additions & 0 deletions download/.github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: "Check Google Drive Access"
# Manually-triggered smoke test: verifies the service account can pull the
# redacted files from the configured Google Drive folder, then archives
# whatever was downloaded as a workflow artifact for inspection.
on:
  workflow_dispatch:
jobs:
  check:
    runs-on: ubuntu-latest
    env:
      REPO_OWNER: ${{ github.repository_owner }}
      REPO_BRANCH: ${{ github.ref_name }}
      # Drive credentials and target folder come from repo secrets/variables.
      SERVICE_ACCOUNT_KEY_JSON: ${{ secrets.SERVICE_ACCOUNT_KEY_JSON }}
      GDRIVE_FOLDER: ${{ vars.GDRIVE_FOLDER }}
    steps:
      # checkout@v3 and upload-artifact@v2 are deprecated; GitHub now rejects
      # v1/v2 artifact uploads outright, so pin the current major versions.
      - uses: actions/checkout@v4
      - run: "pip install -r gdrive_requirements.txt"
      - run: "python test_pull_from_gdrive.py"
      - name: Archive pulled files
        uses: actions/upload-artifact@v4
        with:
          name: redacted-netfile-files
          path: .local/downloads

7 changes: 7 additions & 0 deletions download/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.venv/
__pycache__
.env
.idea
.vscode
SERVICE_ACCOUNT_KEY_JSON.json
.local
1 change: 1 addition & 0 deletions download/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11
7 changes: 7 additions & 0 deletions download/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Query NetFile V2 API to Load Disclosure-Backend DB

Run main.py to download redacted JSON files from Google Drive and create csv files. To download, the key for the service account used to access Google Drive has to be placed in the file .local/SERVICE_ACCOUNT_KEY_JSON.json.

# Run tests

Tests are in the folder **tests**. To run them all, simply do `pytest tests`.
6 changes: 6 additions & 0 deletions download/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
''' Pytest config '''

pytest_plugins = [
# Autoload all fixtures in every test because they are kept in a separate file from the tests themselves
"tests.fixtures.data_fixtures"
]
80 changes: 80 additions & 0 deletions download/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
""" main, to run everything """
import json
from model.a_contributions import A_Contributions
from model.committee import Committees
# Next line ignored because Pylint reports it cannot find `election` in `model`
from model.election import Elections # pylint: disable=import-error,no-name-in-module
from model.filing import Filings
from model.transaction import Transactions

from gdrive_datastore.gdrive import pull_data

DATA_DIR_PATH = '.local/downloads'
OUTPUT_DIR = '.local'

def unique_statuses(filers):
    """ Collect the distinct `status` values across all filers' status lists. """
    statuses = set()
    for filer in filers:
        for entry in filer['statusList']:
            statuses.add(entry['status'])
    return statuses

def main():
    """ Do everything: pull the raw NetFile JSON from Google Drive, build the
    model dataframes, print a sample of Schedule A contributions, and write
    the CSV outputs to OUTPUT_DIR. """
    # pull data from gdrive and put it in .local/downloads
    pull_data(subfolder='main', default_folder='OpenDisclosure')

    with open(f'{DATA_DIR_PATH}/elections.json', encoding='utf8') as f:
        elections_json = json.loads(f.read())

    elections = Elections(elections_json)

    with open(f'{DATA_DIR_PATH}/filers.json', encoding='utf8') as f:
        filers = json.loads(f.read())

    committees = Committees(filers, elections.pl)

    # A-Contribs:
    # join filers + filings + elections + transactions
    #   transactions.filing_nid -> filings.filing_nid
    #   filings.filer_nid -> committees.filer_nid
    #   committees.Ballot_Measure_Election -> elections.Ballot_Measure_Election
    #   where trans['transaction']['calTransactionType'] == 'F460A'
    with open(f'{DATA_DIR_PATH}/filings.json', encoding='utf8') as f:
        filings = Filings(json.loads(f.read())).pl

    with open(f'{DATA_DIR_PATH}/transactions.json', encoding='utf8') as f:
        records = json.loads(f.read())
    transactions = Transactions(records).pl

    a_contributions = A_Contributions(transactions, filings, committees.pl)
    a_contribs_df = a_contributions.df
    # BUG FIX: `is_empty` is a method — the old `not a_contribs_df.is_empty`
    # tested the truthiness of the bound method itself (always True), so the
    # sample below never printed. Call it.
    if not a_contribs_df.is_empty():
        # Columns are passed positionally: polars' DataFrame.drop no longer
        # accepts a `columns=` keyword in current releases.
        # Cap the sample size so small result sets don't raise.
        print(a_contribs_df.drop([
            'BakRef_TID',
            'Bal_Name',
            'Bal_Juris',
            'Bal_Num',
            'Dist_No',
            'Form_Type',
            'Int_CmteId',
            'Juris_Cd',
            'Juris_Dscr',
            'Loan_Rate',
            'Memo_Code',
            'Memo_RefNo',
            'Off_S_H_Cd',
            'tblCover_Offic_Dscr',
            'tblCover_Office_Cd',
            'tblDetlTran_Office_Cd',
            'tblDetlTran_Offic_Dscr',
            'XRef_SchNm',
            'XRef_Match',
        ]).sample(n=min(20, a_contribs_df.height)))

    elections.pl.write_csv(f'{OUTPUT_DIR}/elections.csv')
    committees.pl.write_csv(f'{OUTPUT_DIR}/committees.csv')
    a_contributions.df.write_csv(f'{OUTPUT_DIR}/a_contributions.csv')

if __name__ == '__main__':
    main()
Empty file added download/model/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions download/model/a_contributions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Schedule A, Contributions
Hopefully this can be joined with other Schedule classes into a single Transaction class
"""
import polars as pl
from .schedule import ScheduleBase

class A_Contributions(ScheduleBase):
    """
    Each record represents Schedule A - Contributions from form 460
    """
    def __init__(
        self,
        transactions: pl.DataFrame,
        filings: pl.DataFrame,
        committees: pl.DataFrame
    ):
        # Schedule A rows are identified by the F460A transaction type;
        # ScheduleBase does the actual filtering and joining.
        self._form_id = 'F460A'
        super().__init__(self._form_id, transactions, filings, committees)
51 changes: 51 additions & 0 deletions download/model/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
""" This is the base model, upon all others shall be based """
import pandas as pd
import polars as pl

class BaseModel:
    """ Base model other models inherit from """

    def __init__(self, data):
        # Raw records as loaded from JSON; subclasses populate the dtype
        # maps and SQL metadata after calling this constructor.
        self._data = data
        # Lazily-built dataframe caches (pandas and polars respectively).
        self._df = None
        self._pl = None
        self._dtypes = []
        self._pl_dtypes = []
        self._sql_dtypes = []
        self._sql_cols = []
        self._sql_table_name = ''

    @property
    def data(self):
        """ Just return the data """
        return self._data

    @property
    def pl(self):
        ''' Return a Polars dataframe, building it on first access '''
        needs_build = self._pl is None or self._pl.is_empty()
        if needs_build:
            self._pl = pl.DataFrame(self._data, schema=self._pl_dtypes)
        return self._pl

    @property
    def df(self):
        """ Get a pandas dataframe of the data, building it on first access """
        if self._df is not None and not self._df.empty:
            return self._df
        self._df = pd.DataFrame(self._data).astype(self._dtypes)
        return self._df

    def to_sql(self, connection, **kwargs):
        """ Write to a postgresql table; kwargs override the pandas defaults """
        options = dict(index_label='id', if_exists='replace')
        options.update(kwargs)

        self.df[self._sql_cols].to_sql(
            self._sql_table_name,
            connection,
            dtype=self._sql_dtypes,
            **options
        )
123 changes: 123 additions & 0 deletions download/model/committee.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
""" This is the Committee model """
from typing import List
import polars as pl
from sqlalchemy.types import String
from . import base

class Committees(base.BaseModel):
    """ A collection of committees

    Flattens NetFile filer records into one row per (filer, electionInfluence)
    pair matching the legacy committee schema, then sets the dtype maps that
    BaseModel uses for its pandas/polars frames and SQL export.
    """
    def __init__(self, filers:List[dict], elections:pl.DataFrame):
        # Placeholder influence so filers with no electionInfluences still
        # produce exactly one output row in the comprehension below.
        empty_election_influence = {
            'electionDate': None,
            'measure': None,
            'candidate': None,
            'doesSupport': None,
            'startDate': None,
            'endDate': None
        }

        # One row per (filer, influence); only filers registered with the
        # CA Secretary of State are included (see the trailing `if`).
        super().__init__([
            {
                'filer_nid': int(f['filerNid']),
                # 'Ballot_Measure_Election': [ *elections[elections['date'] == infl['electionDate']]['name'].array, None ][0],
                'Ballot_Measure_Election': self._get_possibly_empty_ballot_measure_election(
                    elections,
                    infl
                ),
                'Filer_ID': f['registrations'].get('CA SOS'),
                # Prefer the influence-level committee name; fall back to the filer name.
                'Filer_NamL': infl.get('committeeName', f['filerName']),
                '_Status': 'INACTIVE' if f['isTerminated'] else 'ACTIVE',
                '_Committee_Type': (f['committeeTypes'][0]
                    if len(f['committeeTypes']) == 1
                    else 'Multiple Types'),
                'Ballot_Measure': infl['measure'].get('measureNumber') if infl['measure'] else None,
                'Support_Or_Oppose': self.support_or_oppose(infl),
                'candidate_controlled_id': None, # TODO: link to candidates if candidate committee
                'Start_Date': infl['startDate'],
                'End_Date': infl['endDate'],
                'data_warning': None,
                'Make_Active': None
            } for f in filers
            for infl in (
                # TODO: This is slightly effed because some filers have duplicate electionInfluences
                # See: filer with filerName "Families in Action For Justice Fund"
                # I guess we have to dedupe electionInfluences blurg
                f['electionInfluences']
                if f['electionInfluences']
                else [ empty_election_influence ]
            )
            if f['registrations'].get('CA SOS')
        ])
        # pandas dtypes consumed by BaseModel.df
        self._dtypes = {
            'filer_nid': int,
            'Ballot_Measure_Election': 'string',
            'Filer_ID': 'string',
            'Filer_NamL': 'string',
            '_Status': 'string',
            '_Committee_Type': 'string',
            'Ballot_Measure': 'string',
            'Support_Or_Oppose': 'string',
            'candidate_controlled_id': 'string',
            'Start_Date': 'string',
            'End_Date': 'string',
            'data_warning': 'string',
            'Make_Active': 'string'
        }
        # polars schema consumed by BaseModel.pl
        self._pl_dtypes = {
            'filer_nid': pl.UInt64,
            'Ballot_Measure_Election': pl.Utf8,
            'Filer_ID': pl.Utf8,
            'Filer_NamL': pl.Utf8,
            '_Status': pl.Utf8,
            '_Committee_Type': pl.Utf8,
            'Ballot_Measure': pl.Utf8,
            'Support_Or_Oppose': pl.Utf8,
            'candidate_controlled_id': pl.Utf8,
            'Start_Date': pl.Utf8,
            'End_Date': pl.Utf8,
            'data_warning': pl.Utf8,
            'Make_Active': pl.Utf8
        }
        # SQLAlchemy column types for BaseModel.to_sql; note filer_nid is
        # deliberately absent, so it is excluded from the SQL export columns.
        self._sql_dtypes = {
            'Ballot_Measure_Election': String,
            'Filer_ID': String,
            'Filer_NamL': String,
            '_Status': String,
            '_Committee_Type': String,
            'Ballot_Measure': String,
            'Support_Or_Oppose': String,
            'candidate_controlled_id': String,
            'Start_Date': String,
            'End_Date': String,
            'data_warning': String,
            'Make_Active': String
        }
        self._sql_cols = self._sql_dtypes.keys()
        self._sql_table_name = 'committees'

    @staticmethod
    def support_or_oppose(influence):
        """
        Return 'S' or 'O' code only for committees that support or oppose measures,
        or committees that oppose candidates
        """
        sup_opp_cd = 'S' if influence['doesSupport'] else 'O'

        # NOTE: `and` binds tighter than `or`, so this reads as
        #   measure is not None OR (candidate is truthy AND code == 'O')
        # i.e. any measure influence gets a code, but candidate influences
        # only when opposing. Implicitly returns None otherwise.
        if (influence['measure'] is not None or influence['candidate'] and sup_opp_cd == 'O'):
            return sup_opp_cd

    @staticmethod
    def _get_possibly_empty_ballot_measure_election(elections: pl.DataFrame, influence: dict):
        '''
        The Ballot Measure Election is the election's slugified `name` like "oakland-march-2020".
        To get the BME for a committee, we match the `electionDate` of an `influence` object
        of the committee against election `date`. Then we unpack the results into a list,
        appending None in case no matches were found. Finally we return the first index of the
        list, which will contain either the matched election slug or None.
        '''
        return [
            *elections.lazy().filter(
                pl.col('date') == influence['electionDate']
            ).first().collect().get_column('name'),
            None
        ][0]
23 changes: 23 additions & 0 deletions download/model/d_expenditures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
'''
FPPC Form 460, Schedule D, Expenditures
'''
import polars as pl
from .schedule import ScheduleBase

class DExpenditures(ScheduleBase):
    '''
    Schedule D - Expenditures from FPPC Form 460
    '''
    def __init__(
        self,
        transactions: pl.DataFrame,
        filings: pl.DataFrame,
        committees: pl.DataFrame
    ):
        # Schedule D rows are identified by the F460D transaction type;
        # ScheduleBase does the actual filtering and joining.
        self._form_id = 'F460D'
        super().__init__(self._form_id, transactions, filings, committees)
Loading
Loading