Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: pull v2 api #361

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/workflows/pr_check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Python tests

# Run the Python test suite for every PR targeting master, or on demand.
on:
  pull_request:
    branches:
      - master
  workflow_dispatch:

env:
  # All the Python code under test lives in the download/ directory.
  working_dir: download

jobs:
  run_tests:
    name: Run tests on Python download code
    runs-on: ubuntu-22.04
    defaults:
      run:
        # Execute every `run:` step from the download/ directory.
        working-directory: ${{ env.working_dir }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          # Interpreter version is pinned in download/.python-version;
          # pip downloads are cached keyed on requirements.txt.
          python-version-file: ${{ env.working_dir }}/.python-version
          cache: pip
          cache-dependency-path: ${{ env.working_dir }}/requirements.txt
      - run: pip install -r requirements.txt
      - name: Run tests
        run: pytest tests/test_*.py

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ inputs
.local
**/__pycache__
build/candidates.xlsx
.vscode/
21 changes: 21 additions & 0 deletions download/.github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: "Check Google Drive Access"
# Manually-triggered smoke test: verifies the service account can pull the
# redacted files from the configured Google Drive folder, then archives
# whatever was downloaded as a workflow artifact for inspection.
on:
  workflow_dispatch:
jobs:
  check:
    runs-on: ubuntu-latest
    env:
      REPO_OWNER: ${{ github.repository_owner }}
      REPO_BRANCH: ${{ github.ref_name }}
      # Drive credentials and target folder come from repo secrets/variables.
      SERVICE_ACCOUNT_KEY_JSON: ${{ secrets.SERVICE_ACCOUNT_KEY_JSON }}
      GDRIVE_FOLDER: ${{ vars.GDRIVE_FOLDER }}
    steps:
      # checkout@v3 and upload-artifact@v2 are deprecated; GitHub now rejects
      # v1/v2 artifact uploads outright, so pin the current major versions.
      - uses: actions/checkout@v4
      - run: "pip install -r gdrive_requirements.txt"
      - run: "python test_pull_from_gdrive.py"
      - name: Archive pulled files
        uses: actions/upload-artifact@v4
        with:
          name: redacted-netfile-files
          path: .local/downloads

7 changes: 7 additions & 0 deletions download/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.venv/
__pycache__
.env
.idea
.vscode
SERVICE_ACCOUNT_KEY_JSON.json
.local
1 change: 1 addition & 0 deletions download/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11
7 changes: 7 additions & 0 deletions download/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Query NetFile V2 API to Load Disclosure-Backend DB

Run main.py to download redacted JSON files from Google Drive and create csv files. To download, the key for the service account used to access Google Drive has to be placed in the file .local/SERVICE_ACCOUNT_KEY_JSON.json.

# Run tests

Tests are in the folder **tests**. To run them all, simply do `pytest tests`.
6 changes: 6 additions & 0 deletions download/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
''' Pytest config '''

pytest_plugins = [
# Autoload all fixtures in every test because they are kept in a separate file from the tests themselves
"tests.fixtures.data_fixtures"
]
80 changes: 80 additions & 0 deletions download/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
""" main, to run everything """
import json
from model.a_contributions import A_Contributions
from model.committee import Committees
# Next line ignored because Pylint reports it cannot find `election` in `model`
from model.election import Elections # pylint: disable=import-error,no-name-in-module
from model.filing import Filings
from model.transaction import Transactions

from gdrive_datastore.gdrive import pull_data

DATA_DIR_PATH = '.local/downloads'
OUTPUT_DIR = '.local'

def unique_statuses(filers):
    """ Collect the distinct `status` values across all filers' status lists. """
    statuses = set()
    for filer in filers:
        for entry in filer['statusList']:
            statuses.add(entry['status'])
    return statuses

def main():
    """ Do everything: pull the raw NetFile JSON from Google Drive, build the
    model dataframes, print a sample of Schedule A contributions, and write
    the CSV outputs to OUTPUT_DIR. """
    # pull data from gdrive and put it in .local/downloads
    pull_data(subfolder='main', default_folder='OpenDisclosure')

    with open(f'{DATA_DIR_PATH}/elections.json', encoding='utf8') as f:
        elections_json = json.loads(f.read())

    elections = Elections(elections_json)

    with open(f'{DATA_DIR_PATH}/filers.json', encoding='utf8') as f:
        filers = json.loads(f.read())

    committees = Committees(filers, elections.pl)

    # A-Contribs:
    # join filers + filings + elections + transactions
    #   transactions.filing_nid -> filings.filing_nid
    #   filings.filer_nid -> committees.filer_nid
    #   committees.Ballot_Measure_Election -> elections.Ballot_Measure_Election
    #   where trans['transaction']['calTransactionType'] == 'F460A'
    with open(f'{DATA_DIR_PATH}/filings.json', encoding='utf8') as f:
        filings = Filings(json.loads(f.read())).pl

    with open(f'{DATA_DIR_PATH}/transactions.json', encoding='utf8') as f:
        records = json.loads(f.read())
    transactions = Transactions(records).pl

    a_contributions = A_Contributions(transactions, filings, committees.pl)
    a_contribs_df = a_contributions.df
    # BUG FIX: `is_empty` is a method — the old `not a_contribs_df.is_empty`
    # tested the truthiness of the bound method itself (always True), so the
    # sample below never printed. Call it.
    if not a_contribs_df.is_empty():
        # Columns are passed positionally: polars' DataFrame.drop no longer
        # accepts a `columns=` keyword in current releases.
        # Cap the sample size so small result sets don't raise.
        print(a_contribs_df.drop([
            'BakRef_TID',
            'Bal_Name',
            'Bal_Juris',
            'Bal_Num',
            'Dist_No',
            'Form_Type',
            'Int_CmteId',
            'Juris_Cd',
            'Juris_Dscr',
            'Loan_Rate',
            'Memo_Code',
            'Memo_RefNo',
            'Off_S_H_Cd',
            'tblCover_Offic_Dscr',
            'tblCover_Office_Cd',
            'tblDetlTran_Office_Cd',
            'tblDetlTran_Offic_Dscr',
            'XRef_SchNm',
            'XRef_Match',
        ]).sample(n=min(20, a_contribs_df.height)))

    elections.pl.write_csv(f'{OUTPUT_DIR}/elections.csv')
    committees.pl.write_csv(f'{OUTPUT_DIR}/committees.csv')
    a_contributions.df.write_csv(f'{OUTPUT_DIR}/a_contributions.csv')

if __name__ == '__main__':
    main()
Empty file added download/model/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions download/model/a_contributions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Schedule A, Contributions
Hopefully this can be joined with other Schedule classes into a single Transaction class
"""
import polars as pl
from .schedule import ScheduleBase

class A_Contributions(ScheduleBase):
    """
    Each record represents Schedule A - Contributions from form 460
    """
    def __init__(
        self,
        transactions: pl.DataFrame,
        filings: pl.DataFrame,
        committees: pl.DataFrame
    ):
        # Schedule A rows are identified by the F460A transaction type;
        # ScheduleBase does the actual filtering and joining.
        self._form_id = 'F460A'
        super().__init__(self._form_id, transactions, filings, committees)
51 changes: 51 additions & 0 deletions download/model/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
""" This is the base model, upon all others shall be based """
import pandas as pd
import polars as pl

class BaseModel:
    """ Base model other models inherit from """

    def __init__(self, data):
        # Raw records as loaded from JSON; subclasses populate the dtype
        # maps and SQL metadata after calling this constructor.
        self._data = data
        # Lazily-built dataframe caches (pandas and polars respectively).
        self._df = None
        self._pl = None
        self._dtypes = []
        self._pl_dtypes = []
        self._sql_dtypes = []
        self._sql_cols = []
        self._sql_table_name = ''

    @property
    def data(self):
        """ Just return the data """
        return self._data

    @property
    def pl(self):
        ''' Return a Polars dataframe, building it on first access '''
        needs_build = self._pl is None or self._pl.is_empty()
        if needs_build:
            self._pl = pl.DataFrame(self._data, schema=self._pl_dtypes)
        return self._pl

    @property
    def df(self):
        """ Get a pandas dataframe of the data, building it on first access """
        if self._df is not None and not self._df.empty:
            return self._df
        self._df = pd.DataFrame(self._data).astype(self._dtypes)
        return self._df

    def to_sql(self, connection, **kwargs):
        """ Write to a postgresql table; kwargs override the pandas defaults """
        options = dict(index_label='id', if_exists='replace')
        options.update(kwargs)

        self.df[self._sql_cols].to_sql(
            self._sql_table_name,
            connection,
            dtype=self._sql_dtypes,
            **options
        )
123 changes: 123 additions & 0 deletions download/model/committee.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
""" This is the Committee model """
from typing import List
import polars as pl
from sqlalchemy.types import String
from . import base

class Committees(base.BaseModel):
    """ A collection of committees

    Flattens NetFile filer records into one row per (filer, electionInfluence)
    pair matching the legacy committee schema, then sets the dtype maps that
    BaseModel uses for its pandas/polars frames and SQL export.
    """
    def __init__(self, filers:List[dict], elections:pl.DataFrame):
        # Placeholder influence so filers with no electionInfluences still
        # produce exactly one output row in the comprehension below.
        empty_election_influence = {
            'electionDate': None,
            'measure': None,
            'candidate': None,
            'doesSupport': None,
            'startDate': None,
            'endDate': None
        }

        # One row per (filer, influence); only filers registered with the
        # CA Secretary of State are included (see the trailing `if`).
        super().__init__([
            {
                'filer_nid': int(f['filerNid']),
                # 'Ballot_Measure_Election': [ *elections[elections['date'] == infl['electionDate']]['name'].array, None ][0],
                'Ballot_Measure_Election': self._get_possibly_empty_ballot_measure_election(
                    elections,
                    infl
                ),
                'Filer_ID': f['registrations'].get('CA SOS'),
                # Prefer the influence-level committee name; fall back to the filer name.
                'Filer_NamL': infl.get('committeeName', f['filerName']),
                '_Status': 'INACTIVE' if f['isTerminated'] else 'ACTIVE',
                '_Committee_Type': (f['committeeTypes'][0]
                    if len(f['committeeTypes']) == 1
                    else 'Multiple Types'),
                'Ballot_Measure': infl['measure'].get('measureNumber') if infl['measure'] else None,
                'Support_Or_Oppose': self.support_or_oppose(infl),
                'candidate_controlled_id': None, # TODO: link to candidates if candidate committee
                'Start_Date': infl['startDate'],
                'End_Date': infl['endDate'],
                'data_warning': None,
                'Make_Active': None
            } for f in filers
            for infl in (
                # TODO: This is slightly effed because some filers have duplicate electionInfluences
                # See: filer with filerName "Families in Action For Justice Fund"
                # I guess we have to dedupe electionInfluences blurg
                f['electionInfluences']
                if f['electionInfluences']
                else [ empty_election_influence ]
            )
            if f['registrations'].get('CA SOS')
        ])
        # pandas dtypes consumed by BaseModel.df
        self._dtypes = {
            'filer_nid': int,
            'Ballot_Measure_Election': 'string',
            'Filer_ID': 'string',
            'Filer_NamL': 'string',
            '_Status': 'string',
            '_Committee_Type': 'string',
            'Ballot_Measure': 'string',
            'Support_Or_Oppose': 'string',
            'candidate_controlled_id': 'string',
            'Start_Date': 'string',
            'End_Date': 'string',
            'data_warning': 'string',
            'Make_Active': 'string'
        }
        # polars schema consumed by BaseModel.pl
        self._pl_dtypes = {
            'filer_nid': pl.UInt64,
            'Ballot_Measure_Election': pl.Utf8,
            'Filer_ID': pl.Utf8,
            'Filer_NamL': pl.Utf8,
            '_Status': pl.Utf8,
            '_Committee_Type': pl.Utf8,
            'Ballot_Measure': pl.Utf8,
            'Support_Or_Oppose': pl.Utf8,
            'candidate_controlled_id': pl.Utf8,
            'Start_Date': pl.Utf8,
            'End_Date': pl.Utf8,
            'data_warning': pl.Utf8,
            'Make_Active': pl.Utf8
        }
        # SQLAlchemy column types for BaseModel.to_sql; note filer_nid is
        # deliberately absent, so it is excluded from the SQL export columns.
        self._sql_dtypes = {
            'Ballot_Measure_Election': String,
            'Filer_ID': String,
            'Filer_NamL': String,
            '_Status': String,
            '_Committee_Type': String,
            'Ballot_Measure': String,
            'Support_Or_Oppose': String,
            'candidate_controlled_id': String,
            'Start_Date': String,
            'End_Date': String,
            'data_warning': String,
            'Make_Active': String
        }
        self._sql_cols = self._sql_dtypes.keys()
        self._sql_table_name = 'committees'

    @staticmethod
    def support_or_oppose(influence):
        """
        Return 'S' or 'O' code only for committees that support or oppose measures,
        or committees that oppose candidates
        """
        sup_opp_cd = 'S' if influence['doesSupport'] else 'O'

        # NOTE: `and` binds tighter than `or`, so this reads as
        #   measure is not None OR (candidate is truthy AND code == 'O')
        # i.e. any measure influence gets a code, but candidate influences
        # only when opposing. Implicitly returns None otherwise.
        if (influence['measure'] is not None or influence['candidate'] and sup_opp_cd == 'O'):
            return sup_opp_cd

    @staticmethod
    def _get_possibly_empty_ballot_measure_election(elections: pl.DataFrame, influence: dict):
        '''
        The Ballot Measure Election is the election's slugified `name` like "oakland-march-2020".
        To get the BME for a committee, we match the `electionDate` of an `influence` object
        of the committee against election `date`. Then we unpack the results into a list,
        appending None in case no matches were found. Finally we return the first index of the
        list, which will contain either the matched election slug or None.
        '''
        return [
            *elections.lazy().filter(
                pl.col('date') == influence['electionDate']
            ).first().collect().get_column('name'),
            None
        ][0]
23 changes: 23 additions & 0 deletions download/model/d_expenditures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
'''
FPPC Form 460, Schedule D, Expenditures
'''
import polars as pl
from .schedule import ScheduleBase

class DExpenditures(ScheduleBase):
    '''
    Schedule D - Expenditures from FPPC Form 460
    '''
    def __init__(
        self,
        transactions: pl.DataFrame,
        filings: pl.DataFrame,
        committees: pl.DataFrame
    ):
        # Schedule D rows are identified by the F460D transaction type;
        # ScheduleBase does the actual filtering and joining.
        self._form_id = 'F460D'
        super().__init__(self._form_id, transactions, filings, committees)
Loading
Loading