[326b] NiH Curation/Aggregation (#336)
* setting up new dir structure
* updated orm, simplified collect pipeline
* added a lot of preprocessing
* added cleaning and preprocessing steps and tests
* migrated from health_data to nih
* added test for auto splitting json fields
* test running
* added cleaning for terms fields, added date parsing
* added upsert logic
* add docstrings
* updated orm
* speedups for inserts
* closed session
* added vectors to nih
* nih links and clinical trials
* file renaming factored from 326_nih
* revert autobatch
* removed references to old health_data terminology in favour of nih
* traded lambda for partial
* factored out utils
* factored in utils from 326_rename
* added docstrings and comments, and a small amount of refactoring
* added pk tests
* test for auto pkey check
* test for generate pk
* added retrieve pk tests
* added delete stmt test
* added merge tests
* added bucket keys tests
* backwards compatibility for older pipelines
* fixes w.r.t. the new insert method
* updated range
* removed debugging lines
* removing todo comments
* tidying up
* added minor fix to gtr routine due to tanzania
* added faiss and sim search
* rebuilding docker images with faiss removed from reqs
* added comments to faiss processor
* tasks run after committing more regularly
* added logging
* updated orm
* rewritten dedupe task
* faiss speedups
* dev runs
* added filters to query
* dedupe runs
* removed testing comments
* added explanatory comments
* removed testing assert
* removed commented-out lines
* added docs to read
* added helpful comments to similarity
* added index to date fields
* fine-tuned some hyperparams to speed up large searches
* removed experimental feature
* adding curate
* needed to drop edge case abstracts for dedupe FK
* Update nih_orm.py
* added faiss and pytorch to reqs
* stashing changes
* updated reqs
* added exception for dud ids
* adding orm and impute functions
* added more info to the readme
* added tests
* updating curate
* similar ids routine
* added aggregation
* dev runs
* comments into run.py
* removed dangling test file
* refactored getattr
* removed whitespace
* added a zillion tests
* added tests
* imputing base id
* added tests
* added missing import
* added option
* switched to instance checking
* added country edge cases
* bad dict
* try both title and non-title
* added config
* fixed test after options removed and individual fields introduced
* needed to clear lru cache between tests
Authored by Joel Klinger on Nov 26, 2020
1 parent 647e46f · commit 8d1c29e
Showing 18 changed files with 1,151 additions and 129 deletions.
nesta/core/batchables/general/nih/curate/tests/test_nih_curate.py (358 additions, 0 deletions)
@@ -0,0 +1,358 @@
from unittest import mock

from nesta.core.batchables.general.nih.curate import run

PATH = "nesta.core.batchables.general.nih.curate.run.{}"
dt = run.datetime


@mock.patch(PATH.format("db_session")) | ||
@mock.patch(PATH.format("object_to_dict")) | ||
def test_get_projects_by_appl_id(mocked_obj2dict, | ||
mocked_db_session): | ||
appl_ids = ['a', 'b', 1, 2, 3] | ||
|
||
# Mock the session and query | ||
mocked_session = mock.Mock() | ||
q = mocked_session.query().filter().order_by().limit() | ||
q.all.return_value = appl_ids # <-- will just return the input | ||
# Assign the session to the context manager | ||
mocked_db_session().__enter__.return_value = mocked_session | ||
|
||
# Just return the value itself | ||
mocked_obj2dict.side_effect = lambda obj, shallow, properties: obj | ||
|
||
# Test that single-member groups are created | ||
groups = run.get_projects_by_appl_id(None, appl_ids) | ||
assert groups == [[id_] for id_ in appl_ids] | ||
|
||
|
||
def _result_factory(value):
    m = mock.Mock()
    m.base_core_project_num = value
    return m


@mock.patch(PATH.format("db_session")) | ||
@mock.patch(PATH.format("object_to_dict")) | ||
def test_group_projects_by_core_id(mocked_obj2dict, | ||
mocked_db_session): | ||
core_ids = ['a', 1, 'b', 'b', 1, 2, 1] | ||
results = [_result_factory(v) for v in core_ids] | ||
groups = [[{'base_core_project_num': 'a'}], # Group 1 | ||
[{'base_core_project_num': 1}, # Group 2 | ||
{'base_core_project_num': 1}, | ||
{'base_core_project_num': 1}], | ||
[{'base_core_project_num': 'b'}, # Group 3 | ||
{'base_core_project_num': 'b'}], | ||
[{'base_core_project_num': 2}]] # Group 4 | ||
|
||
# Mock the session and query | ||
mocked_session = mock.Mock() | ||
q = mocked_session.query().filter().order_by().limit() | ||
q.all.return_value = results # <-- will just return the input | ||
# Assign the session to the context manager | ||
mocked_db_session().__enter__.return_value = mocked_session | ||
|
||
# Just return the value itself | ||
mocked_obj2dict.side_effect = lambda obj, shallow, properties: obj | ||
|
||
    # Test that projects are grouped by their core project number,
    # in order of first appearance
    found_groups = run.group_projects_by_core_id(None, core_ids)
    assert found_groups == groups


def test_get_sim_weights():
    appl_ids = [1, 2, 3, 4]
    dupes = [{'application_id_1': 1,
              'application_id_2': 5,
              'weight': 0.4},
             {'application_id_1': 1,
              'application_id_2': 6,
              'weight': 0.9},
             {'application_id_1': 2,
              'application_id_2': 6,
              'weight': 0.8},
             {'application_id_1': 3,
              'application_id_2': 5,
              'weight': 0.3}]
    # The max weight of ids not in `appl_ids`
    sim_weights = {5: 0.4, 6: 0.9}

    assert run.get_sim_weights(dupes, appl_ids) == sim_weights

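# For reference, a minimal sketch of what `get_sim_weights` might look
# like, reconstructed purely from the test above (an assumption, not the
# verbatim run.py implementation): for each id *not* in `appl_ids`, keep
# the maximum weight over all duplicate pairs mentioning that id.
def _sketch_get_sim_weights(dupes, appl_ids):
    sim_weights = {}
    for dupe in dupes:
        for key in ('application_id_1', 'application_id_2'):
            id_ = dupe[key]
            if id_ in appl_ids:
                continue  # only ids outside the input set count as "similar"
            sim_weights[id_] = max(dupe['weight'], sim_weights.get(id_, 0))
    return sim_weights

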
@mock.patch(PATH.format("db_session")) | ||
@mock.patch(PATH.format("object_to_dict")) | ||
@mock.patch(PATH.format("get_sim_weights")) | ||
def test_retrieve_similar_projects(mocked_get_sim_weights, | ||
mocked_obj2dict, | ||
mocked_db_session): | ||
sim_weights = {5: 0.4, 6: 0.9} | ||
mocked_get_sim_weights.return_value = sim_weights | ||
sim_ids = [(id,) for id in set(sim_weights.keys())] | ||
sim_projs = [{"application_id": id} for id, in sim_ids] | ||
|
||
# Mock the session and query | ||
mocked_session = mock.MagicMock() | ||
q = mocked_session.query().filter() | ||
q.all.return_value = sim_ids # <-- will just return the input | ||
# Assign the session to the context manager | ||
mocked_db_session().__enter__.return_value = mocked_session | ||
|
||
# Just return the value itself | ||
mocked_obj2dict.side_effect = lambda obj, shallow: obj | ||
assert run.retrieve_similar_projects(None, []) == (sim_projs, sim_weights) | ||
|
||
|
||
def test_earliest_date_good_dates():
    project = {'fy': 2020,
               'project_start': '2022-1-20',
               'project_end': None,
               'award_notice_date': '2021-1-20',
               'budget_end': '2021-1-20',
               'budget_start': None}
    assert run.earliest_date(project) == dt(year=2021, month=1, day=20)


def test_earliest_date_only_year():
    project = {'fy': 2020,
               'project_start': None,
               'project_end': None,
               'award_notice_date': None,
               'budget_end': None,
               'budget_start': None}
    assert run.earliest_date(project) == dt(year=2020, month=1, day=1)


def test_earliest_date_no_dates():
    project = {'fy': None,
               'project_start': None,
               'project_end': None,
               'award_notice_date': None,
               'budget_end': None,
               'budget_start': None}
    assert run.earliest_date(project) == dt.min

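# A plausible sketch of `earliest_date`, consistent with the three tests
# above (an assumption; the real run.py version may parse dates
# differently): take the minimum of all populated date fields, fall back
# to 1st January of the fiscal year, and finally to datetime.min.
def _sketch_earliest_date(project):
    date_fields = ('project_start', 'project_end', 'award_notice_date',
                   'budget_start', 'budget_end')
    dates = [dt.strptime(project[field], '%Y-%m-%d')
             for field in date_fields
             if project[field] is not None]
    if dates:
        return min(dates)
    if project['fy'] is not None:
        return dt(year=project['fy'], month=1, day=1)
    return dt.min

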
@mock.patch(PATH.format('retrieve_similar_projects'))
@mock.patch(PATH.format('group_projects_by_core_id'))
@mock.patch(PATH.format('earliest_date'))
def test_retrieve_similar_proj_ids(mocked_earliest_date,
                                   mocked_group_projects,
                                   mocked_rsp):
    projs = [{'application_id': 1,
              'base_core_project_num': None},
             {'application_id': 2,
              'base_core_project_num': 'two'},
             {'application_id': 3,
              'base_core_project_num': 'three'},
             {'application_id': 4,
              'base_core_project_num': None}]
    weights = {1: 0.5, 2: 0.9, 3: 0.05, 4: 0.5,
               22: 0.1, 33: 0.7}
    groups = [[{'application_id': 22,
                'base_core_project_num': 'two'},
               {'application_id': 2,
                'base_core_project_num': 'two'}],
              [{'application_id': 33,
                'base_core_project_num': 'three'},
               {'application_id': 3,
                'base_core_project_num': 'three'}]]

    mocked_rsp.return_value = projs, weights
    mocked_group_projects.return_value = groups
    # The following will pick 22 and 33 from their groups, since they
    # are the largest values (rather than fully implementing
    # `earliest_date` in this test)
    mocked_earliest_date.side_effect = lambda x: x['application_id']
    # Note that 22 picks up the weight of 2, and 33 keeps its own
    # weight, since the largest weight in the group wins
    expected = {'near_duplicate_ids': [22],
                'very_similar_ids': [33],
                'fairly_similar_ids': [1, 4]}
    assert run.retrieve_similar_proj_ids(None, None) == expected


def test_combine():
    list_of_dict = [{'a': 1}, {'a': -1}, {'a': None}]
    assert run.combine(max, list_of_dict, 'a') == 1
    assert run.combine(min, list_of_dict, 'a') == -1

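# A minimal sketch of `combine`, implied by the test above (an assumption,
# not the verbatim run.py code): apply `func` over all non-null values of
# `key` across the group, or return None if no values survive.
def _sketch_combine(func, list_of_dict, key):
    values = [d[key] for d in list_of_dict if d.get(key) is not None]
    return func(values) if values else None

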
def test_first_non_null():
    values = [None, None, 'foo', None, 'bar', None]
    assert run.first_non_null(values) == 'foo'

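# A one-line sketch matching the test above (the run.py implementation
# may differ): return the first value that is not None, else None.
def _sketch_first_non_null(values):
    return next((v for v in values if v is not None), None)

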
def test_join_and_dedupe():
    values = [[None, None, 'foo', None, 'bar', None],
              [None, None, 'foo', None, 'baz', None]]
    expected = ['foo', 'bar', None, 'baz']
    found = run.join_and_dedupe(values)
    assert len(expected) == len(found)
    assert set(expected) == set(found)

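# A plausible sketch of `join_and_dedupe`, consistent with the unordered
# comparison in the test above (an assumption about run.py): flatten the
# lists and drop duplicates, keeping None as a legitimate value.
def _sketch_join_and_dedupe(list_of_lists):
    return list({item for sublist in list_of_lists for item in sublist})

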
def test_format_us_zipcode():
    assert run.format_us_zipcode('123456789') == '12345-6789'
    assert run.format_us_zipcode('23456789') == '02345-6789'
    assert run.format_us_zipcode('3456789') == '00345-6789'
    assert run.format_us_zipcode('456789') == '00045-6789'
    assert run.format_us_zipcode('56789') == '56789'
    assert run.format_us_zipcode('6789') == '06789'
    assert run.format_us_zipcode('789') == '00789'
    assert run.format_us_zipcode('89') == '00089'
    assert run.format_us_zipcode('9') == '00009'

    assert run.format_us_zipcode('anything else') == 'anything else'
    assert run.format_us_zipcode('?') == '?'

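# A sketch of `format_us_zipcode` that satisfies every assertion above
# (an assumption about run.py, not a copy of it): numeric strings of at
# most 5 digits are zero-padded to a plain 5-digit ZIP; longer numeric
# strings are zero-padded to 9 digits and hyphenated as ZIP+4; anything
# non-numeric is returned untouched.
def _sketch_format_us_zipcode(zipcode):
    if not zipcode.isdigit():
        return zipcode
    if len(zipcode) <= 5:
        return zipcode.zfill(5)
    zipcode = zipcode.zfill(9)
    return f'{zipcode[:5]}-{zipcode[5:]}'

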
@mock.patch(PATH.format('_geocode'))
def test_geocode(mocked__geocode):
    assert run.geocode(None, None, None, None) is None

    mocked__geocode.side_effect = [None, 'bar']
    assert run.geocode(None, None, None, postalcode='something') == 'bar'

    mocked__geocode.side_effect = [None, None, 'foo']
    assert run.geocode(None, None, None, postalcode='something') == 'foo'

    mocked__geocode.side_effect = [None, 'baz']
    assert run.geocode(None, None, country='something',
                       postalcode=None) == 'baz'


def test_aggregate_group():
    proj1 = {'application_id': 1,
             'base_core_project_num': 'first',
             'fy': 2001,
             'org_city': 'Kansas City',
             'org_country': 'United States',
             'org_name': 'Big Corp',
             'org_state': None,
             'org_zipcode': '123456789',
             'project_title': 'first title',
             'ic_name': None,
             'phr': None,
             'abstract_text': 'first abstract',
             'total_cost': 100,
             # List fields
             'clinicaltrial_ids': [1, 2, 3],
             'clinicaltrial_titles': ['title 1', 'title 3'],
             'patent_ids': [2, 3, 4, 5],
             'patent_titles': ['patent 1', 'patent 2'],
             'pmids': ['a', 'c', 'd'],
             'project_terms': ['AAA', 'CCC'],
             # Date fields
             'project_start': '2022-1-20',
             'project_end': None,
             'award_notice_date': '2021-1-20',
             'budget_end': '2021-1-20',
             'budget_start': None}

    proj2 = {'application_id': 2,
             'base_core_project_num': 'first',
             'fy': 2002,
             'org_city': 'Kansas City',
             'org_country': 'United States',
             'org_name': 'Big Corp',
             'org_state': None,
             'org_zipcode': '123456789',
             'project_title': 'second title',
             'ic_name': None,
             'phr': 'second phr',
             'abstract_text': 'second abstract',
             'total_cost': 200,
             # List fields
             'clinicaltrial_ids': [1, 2, 4],
             'clinicaltrial_titles': ['title 1', 'title 2'],
             'patent_ids': [1, 3, 4, 5],
             'patent_titles': ['patent 1', 'patent 3'],
             'pmids': ['a', 'c', 'b'],
             'project_terms': ['AAA', 'BBB'],
             # Date fields
             'project_start': '1990-1-20',
             'project_end': None,
             'award_notice_date': '2021-1-20',
             'budget_end': '2021-1-20',
             'budget_start': None}

    proj3 = {'application_id': 2,
             'base_core_project_num': 'first',
             'fy': 2002,
             'org_city': 'Kansas City',
             'org_country': 'United States',
             'org_name': 'Big Corp',
             'org_state': 'third state',
             'org_zipcode': '123456789',
             'project_title': 'third title',
             'ic_name': 'ms third',
             'phr': None,
             'abstract_text': None,
             'total_cost': 300,
             # List fields
             'clinicaltrial_ids': [1, 2, 4],
             'clinicaltrial_titles': ['title 0', 'title 2'],
             'patent_ids': [1, 3, 4, 5],
             'patent_titles': ['patent 0', 'patent 3'],
             'pmids': ['a', 'c', 'e'],
             'project_terms': ['AAA', 'DDD'],
             # Date fields
             'project_start': '1999-1-20',
             'project_end': '2025-1-20',
             'award_notice_date': '2021-1-20',
             'budget_end': '2021-1-20',
             'budget_start': None}

    group = [proj1, proj2, proj3]
    aggregated_group = {'grouped_ids': [1, 2, 2],
                        'grouped_titles': ['first title', 'third title',
                                           'second title'],
                        'application_id': 1,
                        'base_core_project_num': 'first',
                        'fy': 2001,
                        'org_city': 'Kansas City',
                        'org_country': 'United States',
                        'org_name': 'Big Corp',
                        'org_state': 'third state',
                        'org_zipcode': '123456789',
                        'project_title': 'first title',
                        'ic_name': 'ms third',
                        'phr': 'second phr',
                        'abstract_text': 'first abstract',
                        'clinicaltrial_ids': [1, 2, 3, 4],
                        'clinicaltrial_titles': ['title 0', 'title 1',
                                                 'title 2', 'title 3'],
                        'patent_ids': [1, 2, 3, 4, 5],
                        'patent_titles': ['patent 0', 'patent 2',
                                          'patent 3', 'patent 1'],
                        'pmids': ['a', 'b', 'c', 'd', 'e'],
                        'project_terms': ['AAA', 'BBB', 'CCC', 'DDD'],
                        'project_start': '1990-1-20',
                        'project_end': '2025-1-20',
                        'total_cost': 600,
                        'yearly_funds': [{'year': 1990,
                                          'project_start': '1990-1-20',
                                          'project_end': None,
                                          'total_cost': 200},
                                         {'year': 1999,
                                          'project_start': '1999-1-20',
                                          'project_end': '2025-1-20',
                                          'total_cost': 300},
                                         {'year': 2021,
                                          'project_start': '2022-1-20',
                                          'project_end': None,
                                          'total_cost': 100}]}

    # Check that all elements are the same
    result = run.aggregate_group(group)
    assert result.keys() == aggregated_group.keys()
    for k, v in result.items():
        _v = aggregated_group[k]
        if type(v) is list and type(v[0]) is not dict:
            assert sorted(v) == sorted(_v)
        else:
            assert v == _v