diff --git a/dataimporter/emu/views/preparation.py b/dataimporter/emu/views/preparation.py new file mode 100644 index 0000000..ba652c7 --- /dev/null +++ b/dataimporter/emu/views/preparation.py @@ -0,0 +1,82 @@ +from dataimporter.emu.views.utils import ( + NO_PUBLISH, + DISALLOWED_STATUSES, + DEPARTMENT_COLLECTION_CODES, + INVALID_STATUS, + INVALID_DEPARTMENT, + INVALID_TYPE, + is_web_published, + is_valid_guid, + INVALID_GUID, +) +from dataimporter.emu.views.utils import emu_date +from dataimporter.model import SourceRecord +from dataimporter.view import View, FilterResult, SUCCESS_RESULT + +INVALID_SUBDEPARTMENT = FilterResult(False, "Invalid subdepartment") + + +class PreparationView(View): + """ + View for preparation records. + + This view populates the preparation resource on the Data Portal. + """ + + def is_member(self, record: SourceRecord) -> FilterResult: + """ + Filters the given record, determining whether it should be included in the + preparation resource or not. + + :param record: the record to filter + :return: a FilterResult object + """ + if record.get_first_value("ColRecordType", default="").lower() != "preparation": + return INVALID_TYPE + + if not is_web_published(record): + return NO_PUBLISH + + if not is_valid_guid(record): + return INVALID_GUID + + if record.get_first_value("SecRecordStatus") in DISALLOWED_STATUSES: + return INVALID_STATUS + + if record.get_first_value("ColDepartment") not in DEPARTMENT_COLLECTION_CODES: + return INVALID_DEPARTMENT + + if record.get_first_value("ColSubDepartment") != "Molecular Collections": + return INVALID_SUBDEPARTMENT + + return SUCCESS_RESULT + + def make_data(self, record: SourceRecord) -> dict: + """ + Converts the record's raw data to a dict which will be the data presented on the + Data Portal. 
+ + :param record: the record to project + :return: a dict containing the data for this record that should be displayed on + the Data Portal + """ + # cache these for perf + get_all = record.get_all_values + get_first = record.get_first_value + + return { + "_id": record.id, + "created": emu_date( + get_first("AdmDateInserted"), get_first("AdmTimeInserted") + ), + "modified": emu_date( + get_first("AdmDateModified"), get_first("AdmTimeModified") + ), + "project": get_all("NhmSecProjectName"), + "preparationNumber": get_first("EntPreNumber"), + "preparationType": get_first("EntPrePreparationKind"), + "mediumType": get_first("EntPreStorageMedium"), + "preparationProcess": get_first("EntPrePreparationMethod"), + "preparationContents": get_first("EntPreContents"), + "preparationDate": get_first("EntPreDate"), + } diff --git a/dataimporter/importer.py b/dataimporter/importer.py index 08bffb7..56fade3 100644 --- a/dataimporter/importer.py +++ b/dataimporter/importer.py @@ -20,10 +20,16 @@ from dataimporter.emu.views.image import ImageView from dataimporter.emu.views.indexlot import IndexLotView from dataimporter.emu.views.mss import MSSView +from dataimporter.emu.views.preparation import PreparationView from dataimporter.emu.views.specimen import SpecimenView from dataimporter.emu.views.taxonomy import TaxonomyView from dataimporter.ext.gbif import GBIFView, get_changed_records -from dataimporter.links import MediaLink, TaxonomyLink, GBIFLink +from dataimporter.links import ( + MediaLink, + TaxonomyLink, + GBIFLink, + PreparationSpecimenLink, +) from dataimporter.model import SourceRecord from dataimporter.view import View, ViewLink @@ -76,6 +82,7 @@ def __init__(self, config: Config): artefact_view = ArtefactView(self.views_path / "artefact", ecatalogue_db) indexlot_view = IndexLotView(self.views_path / "indexlot", ecatalogue_db) specimen_view = SpecimenView(self.views_path / "specimen", ecatalogue_db) + prep_view = PreparationView(self.views_path / "preparation", ecatalogue_db) # CREATE THE VIEW LINKS # first artefact links @@ -108,6 +115,11 @@ def __init__(self, config: Config): self.links_path / "specimen_gbif", specimen_view, gbif_view ) + # next preparation view + preparation_specimen = PreparationSpecimenLink( + self.links_path / "preparation_specimen", prep_view, specimen_view + ) + # SETUP STATE # store all the dbs, view, and links in dicts for easy access via their names self.dbs: Dict[str, DataDB] = { @@ -123,6 +135,7 @@ def __init__(self, config: Config): artefact_view, indexlot_view, specimen_view, + prep_view, ] } self.links: Dict[str, ViewLink] = { @@ -134,6 +147,7 @@ def __init__(self, config: Config): specimen_images, specimen_taxonomy, specimen_gbif, + preparation_specimen, ] } @@ -146,12 +160,13 @@ def __init__(self, config: Config): "indexlot": SplitgillDatabase(config.indexlot_id, self.client), "artefact": SplitgillDatabase(config.artefact_id, self.client), "mss": SplitgillDatabase("mss", self.client), + "preparation": SplitgillDatabase(config.preparation_id, self.client), } # a database for each data db's redacted IDs to be stored in self.redaction_database = RedactionDB(config.data_path / "redactions") - def _queue_changes(self, records: Iterable[SourceRecord], db_name: str): + def queue_changes(self, records: Iterable[SourceRecord], db_name: str): """ Update the records in the data DB with the given name. The views based on the DB that is being updated will also be updated. 
@@ -197,7 +212,7 @@ def queue_emu_changes(self): # record refers to a potentially different table from which it is # deleting a record if dump.table != "eaudit": - self._queue_changes(dump.read(), dump.table) + self.queue_changes(dump.read(), dump.table) else: # wrap the dump stream in a filter to only allow through records we # want to process @@ -211,7 +226,7 @@ def queue_emu_changes(self): ): # convert the raw audit records into delete records as we queue # them - self._queue_changes( + self.queue_changes( map(convert_eaudit_to_delete, records), table ) # we've handled all the dumps from this date, update the last date stored on @@ -223,7 +238,7 @@ def queue_gbif_changes(self): Retrieve the latest GBIF records, check which ones have changed compared to the ones stored in the gbif data DB, and then queue them into the GBIF view. """ - self._queue_changes( + self.queue_changes( get_changed_records( self.dbs["gbif"], self.config.gbif_username, self.config.gbif_password ), diff --git a/dataimporter/links.py b/dataimporter/links.py index 46bdefc..bfb4bb4 100644 --- a/dataimporter/links.py +++ b/dataimporter/links.py @@ -291,3 +291,110 @@ def clear_from_foreign(self): Clears out the gbif (foreign) ID to occurrence ID map. """ self.gbif_id_map.clear() + + +class PreparationSpecimenLink(ViewLink): + """ + A ViewLink representing the link between a preparation record and the specimen + voucher record it was created from. + + The mapping is one-to-one with exactly one ID sourced from the base prep record. + When transforming the base record using the linked specimen record, we copy some + fields from the specimen record over to the base prep record, essentially for + searching convenience. The full list of fields that are copied is below. + """ + + # the EMu field on the prep records which links to the specimen voucher record + SPECIMEN_ID_REF_FIELD = "EntPreSpecimenRef" + # the Portal fields which are copied from the specimen to the prep data dict + # TODO: missing CollEventDateVisitedFrom, CollEventName_tab, and kinda ColSite + MAPPED_SPECIMEN_FIELDS = [ + "barcode", + "scientificName", + "order", + "identifiedBy", + # this is a ColSite substitute which uses sumPreciseLocation + "locality", + "decimalLatitude", + "decimalLongitude", + ] + + def __init__(self, path: Path, prep_view: View, specimen_view: View): + """ + :param path: the path to store the ViewLink data in + :param prep_view: the preparation view + :param specimen_view: the specimen view + """ + super().__init__(path.name, prep_view, specimen_view) + self.path = path + # a one-to-one index from prep id -> specimen id + self.id_map = Index(path / "id_map") + + def update_from_base(self, prep_records: List[SourceRecord]): + """ + Extracts the linked specimen ID from each of the given prep records and adds + them to the ID map. + + :param prep_records: the changed prep records + """ + self.id_map.put_one_to_one( + (prep_record.id, specimen_id) + for prep_record in prep_records + if ( + specimen_id := prep_record.get_first_value( + PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD + ) + ) + ) + + def update_from_foreign(self, specimen_records: List[SourceRecord]): + """ + Propagate the changes in the given specimen records to the base prep records + linked to them. 
+ + :param specimen_records: the updated specimen records + """ + base_ids = { + base_id + for specimen_record in specimen_records + for base_id in self.id_map.reverse_get(specimen_record.id) + } + + if base_ids: + base_records = list(self.base_view.db.get_records(base_ids)) + if base_records: + # if there are associated base records, queue changes to them on the + # base view + self.base_view.queue(base_records) + + def transform(self, prep_record: SourceRecord, data: dict): + """ + Transform the given prep record's data with data from the linked voucher + specimen, if one exists. + + :param prep_record: the prep record + :param data: the data dict to update + """ + specimen_id = prep_record.get_first_value( + PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD + ) + if specimen_id: + specimen = self.foreign_view.get_and_transform(specimen_id) + if specimen is not None: + # from DwC + data[ + "associatedOccurrences" + ] = f"Voucher: {specimen.pop('occurrenceID')}" + # not from DwC + data["specimenID"] = specimen.pop("_id") + data.update( + (field, value) + for field in PreparationSpecimenLink.MAPPED_SPECIMEN_FIELDS + if (value := specimen.get(field)) is not None + ) + + def clear_from_base(self): + """ + Clears out the ID map. + """ + self.id_map.clear() diff --git a/tests/emu/views/test_preparation.py b/tests/emu/views/test_preparation.py new file mode 100644 index 0000000..f79a531 --- /dev/null +++ b/tests/emu/views/test_preparation.py @@ -0,0 +1,73 @@ +from contextlib import closing +from pathlib import Path +from typing import List, Tuple + +import pytest + +from dataimporter.dbs import DataDB +from dataimporter.emu.views.utils import ( + NO_PUBLISH, + INVALID_TYPE, + INVALID_GUID, + INVALID_STATUS, + INVALID_DEPARTMENT, +) +from dataimporter.model import SourceRecord +from dataimporter.view import FilterResult, SUCCESS_RESULT +from dataimporter.emu.views.preparation import PreparationView, INVALID_SUBDEPARTMENT +from tests.helpers.samples.preparation import ( + SAMPLE_PREPARATION_DATA, + SAMPLE_PREPARATION_ID, +) + + +@pytest.fixture +def prep_view(tmp_path: Path) -> PreparationView: + with closing( + PreparationView(tmp_path / "prep_view", DataDB(tmp_path / "prep_data")) + ) as view: + yield view + + +is_member_scenarios: List[Tuple[dict, FilterResult]] = [ + ({"ColRecordType": "Specimen"}, INVALID_TYPE), + ({"AdmPublishWebNoPasswordFlag": "n"}, NO_PUBLISH), + ({"AdmGUIDPreferredValue": "not a valid guid!"}, INVALID_GUID), + ({"SecRecordStatus": "INVALID"}, INVALID_STATUS), + ({"ColDepartment": "DDI"}, INVALID_DEPARTMENT), + ({"ColSubDepartment": "Informatics"}, INVALID_SUBDEPARTMENT), + ({}, SUCCESS_RESULT), +] + + +@pytest.mark.parametrize("overrides, result", is_member_scenarios) +def test_is_member(overrides: dict, result: FilterResult, prep_view: PreparationView): + data = {**SAMPLE_PREPARATION_DATA, **overrides} + record = SourceRecord(SAMPLE_PREPARATION_ID, data, "test") + assert prep_view.is_member(record) == result + + +def test_transform_deleted(prep_view: PreparationView): + record = SourceRecord(SAMPLE_PREPARATION_ID, {}, "test") + assert record.is_deleted + + data = prep_view.transform(record) + assert data == {} + + +def test_make_data(prep_view: PreparationView): + record = SourceRecord(SAMPLE_PREPARATION_ID, SAMPLE_PREPARATION_DATA, "test") + + data = prep_view.make_data(record) + assert data == { + "_id": record.id, + "created": "2022-09-12T17:07:51+00:00", + "modified": "2022-09-12T17:21:14+00:00", + "project": "Darwin Tree of Life", + "preparationNumber": "C9K02TWP_B2", 
+ "preparationType": "DNA Extract", + "mediumType": None, + "preparationProcess": None, + "preparationContents": "**OTHER_SOMATIC_ANIMAL_TISSUE**", + "preparationDate": "2022-05-09", + } diff --git a/tests/helpers/samples/dumps.py b/tests/helpers/samples/dumps.py index 98196ae..1f2d6fa 100644 --- a/tests/helpers/samples/dumps.py +++ b/tests/helpers/samples/dumps.py @@ -1,116 +1,56 @@ -import gzip -import shutil -from itertools import count -from pathlib import Path -from typing import List, Dict - -ECATALOGUE_ARTEFACT_SAMPLE_DUMP = ( - Path(__file__).parent / "emu" / "ecatalogue_artefact_53_sample.gz" -) -ECATALOGUE_INDEXLOT_SAMPLE_DUMP = ( - Path(__file__).parent / "emu" / "ecatalogue_indexlot_2000_sample.gz" -) -ECATALOGUE_SPECIMEN_SAMPLE_DUMP = ( - Path(__file__).parent / "emu" / "ecatalogue_specimen_10000_sample.gz" -) - -EMULTIMEDIA_ARTEFACT_SAMPLE_DUMP = ( - Path(__file__).parent / "emu" / "emultimedia_artefact_565_sample.gz" -) -EMULTIMEDIA_INDEXLOT_SAMPLE_DUMP = ( - Path(__file__).parent / "emu" / "emultimedia_indexlot_406_sample.gz" -) -EMULTIMEDIA_SPECIMEN_SAMPLE_DUMP = ( - Path(__file__).parent / "emu" / "emultimedia_specimen_11271_sample.gz" -) - -ETAXONOMY_ARTEFACT_SAMPLE_DUMP = ( - Path(__file__).parent / "emu" / "etaxonomy_artefact_1_sample.gz" -) -ETAXONOMY_INDEXLOT_SAMPLE_DUMP = ( - Path(__file__).parent / "emu" / "etaxonomy_indexlot_1880_sample.gz" -) -ETAXONOMY_SPECIMEN_SAMPLE_DUMP = ( - Path(__file__).parent / "emu" / "etaxonomy_specimen_1_sample.gz" -) - - -def create_ecatalogue_dump( - path: Path, - date: str, - include_artefacts: bool = True, - include_indexlots: bool = True, - include_specimens: bool = True, -): - dump_file = path / f"ecatalogue.export.{date}.gz" - dumps = [] - if include_artefacts: - dumps.append(ECATALOGUE_ARTEFACT_SAMPLE_DUMP) - if include_indexlots: - dumps.append(ECATALOGUE_INDEXLOT_SAMPLE_DUMP) - if include_specimens: - dumps.append(ECATALOGUE_SPECIMEN_SAMPLE_DUMP) - - with dump_file.open("wb") as g: - for dump in dumps: - with dump.open("rb") as f: - shutil.copyfileobj(f, g) - - -def create_emultimedia_dump( - path: Path, - date: str, - include_artefacts: bool = True, - include_indexlots: bool = True, - include_specimens: bool = True, -): - dump_file = path / f"emultimedia.export.{date}.gz" - dumps = [] - if include_artefacts: - dumps.append(EMULTIMEDIA_ARTEFACT_SAMPLE_DUMP) - if include_indexlots: - dumps.append(EMULTIMEDIA_INDEXLOT_SAMPLE_DUMP) - if include_specimens: - dumps.append(EMULTIMEDIA_SPECIMEN_SAMPLE_DUMP) - - with dump_file.open("wb") as g: - for dump in dumps: - with dump.open("rb") as f: - shutil.copyfileobj(f, g) - - -def create_etaxonomy_dump( - path: Path, - date: str, - include_artefacts: bool = True, - include_indexlots: bool = True, - include_specimens: bool = True, -): - dump_file = path / f"etaxonomy.export.{date}.gz" - dumps = [] - if include_artefacts: - dumps.append(ETAXONOMY_ARTEFACT_SAMPLE_DUMP) - if include_indexlots: - dumps.append(ETAXONOMY_INDEXLOT_SAMPLE_DUMP) - if include_specimens: - dumps.append(ETAXONOMY_SPECIMEN_SAMPLE_DUMP) - - with dump_file.open("wb") as g: - for dump in dumps: - with dump.open("rb") as f: - shutil.copyfileobj(f, g) - - -def create_eaudit_dump(path: Path, irns_to_delete: Dict[str, List[str]], date: str): - dump_file = path / f"eaudit.deleted-export.{date}.gz" - - irn_generator = count(1) - - with gzip.GzipFile(dump_file, "wb") as g: - for table, irns in irns_to_delete.items(): - for irn in irns: - g.write(f"irn:1={next(irn_generator)}\n".encode("utf-8")) - 
g.write(f"AudOperation:1=delete\n".encode("utf-8")) - g.write(f"AudTable:1={table}\n".encode("utf-8")) - g.write(f"AudKey:1={irn}\n".encode("utf-8")) - g.write("###\n".encode("utf-8")) +from enum import Enum +from typing import Optional +from uuid import uuid4 + + +class EcatalogueType(Enum): + specimen = "Specimen" + indexlot = "Index Lot" + artefact = "Artefact" + preparation = "Preparation" + + +def create_ecatalogue( + irn: str, ecatalogue_type: EcatalogueType, guid: Optional[str] = None, **extras +) -> dict: + base = { + "irn": irn, + "ColRecordType": ecatalogue_type.value, + "AdmPublishWebNoPasswordFlag": "Y", + "AdmGUIDPreferredValue": guid if guid is not None else str(uuid4()), + "ColDepartment": "Entomology", + } + if ecatalogue_type == EcatalogueType.preparation: + base["ColSubDepartment"] = "Molecular Collections" + base.update(extras) + return base + + +def create_emultimedia(irn: str, guid: Optional[str] = None, **extras): + return { + "irn": irn, + "MulMimeType": "image", + "AdmGUIDPreferredValue": guid if guid is not None else str(uuid4()), + "AdmPublishWebNoPasswordFlag": "Y", + # image doesn't need this, but MSS does so might as well include it + "DocIdentifier": "banana.jpg", + **extras, + } + + +def create_etaxonomy(irn: str, **extras): + return { + "irn": irn, + "AdmPublishWebNoPasswordFlag": "Y", + **extras, + } + + +def create_eaudit(irn_to_delete: str, table_to_delete_from: str) -> dict: + return { + # doesn't matter what the irn of this record is so just always set it to -1 + "irn": "-1", + "AudOperation": "delete", + "AudTable": table_to_delete_from, + "AudKey": irn_to_delete, + } diff --git a/tests/helpers/samples/preparation.py b/tests/helpers/samples/preparation.py new file mode 100644 index 0000000..bc1ae08 --- /dev/null +++ b/tests/helpers/samples/preparation.py @@ -0,0 +1,73 @@ +from tests.helpers.samples.specimen import SAMPLE_SPECIMEN_ID +from tests.helpers.samples.utils import read_emu_extract + +# this is taken from ecatalogue.export.20231008.gz but with the EntPreSpecimenRef field +# replaced with a single reference to the SAMPLE_SPECIMEN_ID +raw_data = f""" +rownum=3645 +irn:1=9968955 +SummaryData:1=no Collection Kind for preparation (irn 9968955) +ExtendedData:1=9968955 +ExtendedData:2= +ExtendedData:3=no Collection Kind for preparation (irn 9968955) +ColDepartment:1=Zoology +ColSubDepartment:1=Molecular Collections +ColRecordType:1=Preparation +GeneralCatalogueNumber:1=irn: 9968955 +EntIdeQualifiedNameAutomatic:1=Yes +EntPreSpecimenRef:1={SAMPLE_SPECIMEN_ID} +EntPreSpecimenRefLocal:1={SAMPLE_SPECIMEN_ID} +EntPreSpecimenTaxonLocal:1=Eurythenes maldoror d'Udekem d'Acoz & Havermans, 2015 -- Eurytheneidae; Amphipoda; Malacostraca +EntPreSpecimenTaxonLocalRef:1=790675 +EntPreCatalogueNumberLocal:1=014453676 +EntPreContents:1=**OTHER_SOMATIC_ANIMAL_TISSUE** +EntPrePreparationKind:1=DNA Extract +EntPrePreparatorRef:1=406667 +EntPrePreparatorRefLocal:1=406667 +EntPrePreparatorSumDataLocal:1=Chris Fletcher; Natural History Museum; Life Sciences; Fletcher +EntPreDate=2022-05-09 +EntPreNumber:1=C9K02TWP_B2 +EntPreTaxonSummaryDataLocal:1=Eurythenes maldoror d'Udekem d'Acoz & Havermans, 2015 -- Eurytheneidae; Amphipoda; Malacostraca +EntPreSpecimenCatNumLocal:1=014453676 +EntPreSpecimenPresLocal:1=Dry frozen (-80°C) +AcqHistoric:1=No +LocIndependentlyMoveable:1=Yes +AcqLegTransferOfTitle:1=No +AcqLegPurAgree:1=No +AcqLegConfirmationOfGift:1=No +AcqLegDueDilligence:1=No +AcqLegCollectionImpact:1=No +NteText0:1=S +NteText1:1=Purpose of specimen: DNA barcoding only 
+NteText2:1=Pleopod +NteType:1=Size +AdmPublishWebNoPasswordFlag:1=Y +AdmPublishWebNoPassword:1=Yes +AdmPublishWebPasswordFlag:1=Y +AdmPublishWebPassword:1=Yes +AdmGUIDPreferredType:1=UUID4 +AdmGUIDPreferredValue:1=f11c9c35-4da5-45e5-9dbb-6f8f55b26aa7 +AdmGUIDIsPreferred:1=Yes +AdmGUIDType:1=UUID4 +AdmGUIDValue:1=f11c9c35-4da5-45e5-9dbb-6f8f55b26aa7 +AdmInsertedBy:1=Heather Allen +AdmDateInserted=2022-09-12 +AdmImportIdentifier:1=12092022_JC231_Prep +AdmTimeInserted=17:07:51.000 +AdmSystemIdentifier:1=heata2-220912-1706 +AdmModifiedBy:1=Heather Allen +AdmDateModified=2022-09-12 +AdmTimeModified=17:21:14.000 +AdmDateRecordModified=2023-10-06 +AdmTimeRecordModified=15:01:03.000 +SecRecordStatus:1=Active +SecCanDisplay:1=Group Default +SecCanEdit:1=Group Default +SecCanDelete:1=Group Default +SecDepartment:1=Entomology +SecLookupRoot:1=Entomology +NhmSecOpenDataPolicyException:1=none +NhmSecProjectName:1=Darwin Tree of Life +""" + +SAMPLE_PREPARATION_ID, SAMPLE_PREPARATION_DATA = read_emu_extract(raw_data) diff --git a/tests/test_importer.py b/tests/test_importer.py index bfb6826..ab604fd 100644 --- a/tests/test_importer.py +++ b/tests/test_importer.py @@ -3,19 +3,21 @@ from unittest.mock import patch, MagicMock import pytest +from elasticsearch_dsl import Search from freezegun import freeze_time -from splitgill.manager import SplitgillDatabase from splitgill.utils import to_timestamp from dataimporter.config import Config, MongoConfig, ElasticsearchConfig from dataimporter.emu.dumps import FIRST_VERSION from dataimporter.importer import DataImporter, EMuStatus from dataimporter.model import SourceRecord +from tests.helpers.dumps import create_dump from tests.helpers.samples.dumps import ( - create_ecatalogue_dump, - create_emultimedia_dump, - create_etaxonomy_dump, - create_eaudit_dump, + create_ecatalogue, + EcatalogueType, + create_emultimedia, + create_etaxonomy, + create_eaudit, ) @@ -29,6 +31,7 @@ def config(tmp_path: Path) -> Config: specimen_id="specimen-id", artefact_id="artefact-id", indexlot_id="indexlot-id", + preparation_id="preparation-id", iiif_base_url="https://not.a.real.domain.com/media", mongo_config=mongo_config, es_config=elasticsearch_config, @@ -57,70 +60,154 @@ def test_init(self, config: Config): assert "artefact" in importer.views assert "indexlot" in importer.views assert "specimen" in importer.views + assert "preparation" in importer.views + + def check_view_link(name): + assert name in importer.links + base_name, foreign_name = name.split("_") + assert importer.links[name].base_view is importer.views[base_name] + assert importer.links[name].foreign_view is importer.views[foreign_name] # check that the view links we expect are created - assert "artefact_image" in importer.links - assert "indexlot_image" in importer.links - assert "indexlot_taxonomy" in importer.links - assert "specimen_image" in importer.links - assert "specimen_taxonomy" in importer.links - assert "specimen_gbif" in importer.links + check_view_link("artefact_image") + check_view_link("indexlot_image") + check_view_link("indexlot_taxonomy") + check_view_link("specimen_image") + check_view_link("specimen_taxonomy") + check_view_link("specimen_gbif") + check_view_link("preparation_specimen") # check that the Splitgill databases we expect are created assert "specimen" in importer.sg_dbs assert "indexlot" in importer.sg_dbs assert "artefact" in importer.sg_dbs assert "mss" in importer.sg_dbs + assert "preparation" in importer.sg_dbs importer.close() def test_queue_emu_changes(self, config: Config): - 
config.dumps_path.mkdir(exist_ok=True) - date_1 = "20230905" - create_ecatalogue_dump(config.dumps_path, date_1) - create_emultimedia_dump(config.dumps_path, date_1) - create_etaxonomy_dump(config.dumps_path, date_1) - - with DataImporter(config) as importer: - importer.queue_emu_changes() - - assert importer.dbs["ecatalogue"].size() == 53 + 2000 + 10000 - assert importer.dbs["emultimedia"].size() == 12242 - assert importer.dbs["etaxonomy"].size() == 1879 - - assert importer.views["specimen"].changes.size() == 10000 - assert importer.views["indexlot"].changes.size() == 2000 - assert importer.views["artefact"].changes.size() == 53 - assert importer.views["image"].changes.size() == 12195 - assert importer.views["mss"].changes.size() == 12195 - - # flush all the queues - for view in importer.views.values(): - view.flush() - assert view.changes.size() == 0 - - # now create an audit dump with one image deleted which is associated with 4 - # index lots, and one artefact deleted - indexlot_image_irn_to_delete = "4712705" - artefact_irn_to_delete = "2475123" - create_eaudit_dump( - config.dumps_path, - { - "emultimedia": [indexlot_image_irn_to_delete], - "ecatalogue": [artefact_irn_to_delete], - }, - "20231005", - ) - - importer.queue_emu_changes() + importer = DataImporter(config) - # the deleted image should be in the image queue - assert importer.views["image"].changes.size() == 1 - # the deleted image should be propagated to the 4 index lots that reference - # it, plus the deleted artefact will be queued here too - assert importer.views["indexlot"].changes.size() == 5 - # the deleted artefact should be in the artefact queue - assert importer.views["artefact"].changes.size() == 1 + first_dump_date = date(2023, 10, 3) + # create an ecatalogue dump with one record per view + create_dump( + config.dumps_path, + "ecatalogue", + first_dump_date, + create_ecatalogue( + "1", EcatalogueType.specimen, MulMultiMediaRef="1", CardParasiteRef="1" + ), + create_ecatalogue( + "2", + EcatalogueType.indexlot, + MulMultiMediaRef="2", + EntIndIndexLotTaxonNameLocalRef="2", + ), + create_ecatalogue("3", EcatalogueType.artefact, MulMultiMediaRef="3"), + create_ecatalogue("4", EcatalogueType.preparation, EntPreSpecimenRef="1"), + ) + # create an emultimedia dump with 3 images each with an ID that matches the + # linked IDs above in the ecatalogue dump via the MulMultiMediaRef field + create_dump( + config.dumps_path, + "emultimedia", + first_dump_date, + create_emultimedia("1"), + create_emultimedia("2"), + create_emultimedia("3"), + ) + # create an etaxonomy dump with 2 records one matching the specimen made above + # and one matching the index lot + create_dump( + config.dumps_path, + "etaxonomy", + first_dump_date, + create_etaxonomy("1"), + create_etaxonomy("2"), + ) + + importer.queue_emu_changes() + + assert importer.dbs["ecatalogue"].size() == 4 + assert importer.dbs["emultimedia"].size() == 3 + assert importer.dbs["etaxonomy"].size() == 2 + assert importer.views["specimen"].changes.size() == 1 + assert importer.views["indexlot"].changes.size() == 1 + assert importer.views["artefact"].changes.size() == 1 + assert importer.views["preparation"].changes.size() == 1 + assert importer.views["image"].changes.size() == 3 + assert importer.views["mss"].changes.size() == 3 + + # flush all the view queues + for view in importer.views.values(): + view.flush() + assert view.changes.size() == 0 + + second_dump_date = date(2023, 10, 4) + create_dump( + config.dumps_path, + "eaudit", + second_dump_date, + # delete the index 
lot + create_eaudit("2", "ecatalogue"), + # delete the media on the artefact + create_eaudit("3", "emultimedia"), + # delete the taxonomy of the specimen + create_eaudit("1", "etaxonomy"), + ) + + importer.queue_emu_changes() + + # these should all be the same + assert importer.dbs["ecatalogue"].size() == 4 + assert importer.dbs["emultimedia"].size() == 3 + assert importer.dbs["etaxonomy"].size() == 2 + # 1 indexlot delete + specimen update because of the taxonomy delete + assert importer.views["specimen"].changes.size() == 2 + # 1 indexlot delete + assert importer.views["indexlot"].changes.size() == 1 + # 1 indexlot delete + artefact update because of the multimedia delete + assert importer.views["artefact"].changes.size() == 2 + # 1 indexlot delete, 1 specimen change by taxonomy change which is pushed down + assert importer.views["preparation"].changes.size() == 2 + # 1 multimedia delete + assert importer.views["image"].changes.size() == 1 + # 1 multimedia delete + assert importer.views["mss"].changes.size() == 1 + + for view in importer.views.values(): + view.flush() + assert view.changes.size() == 0 + + third_dump_date = date(2023, 10, 8) + # update all the multimedia records + a new record unlinked to anything + create_dump( + config.dumps_path, + "emultimedia", + third_dump_date, + create_emultimedia("1"), + create_emultimedia("2"), + create_emultimedia("3"), + create_emultimedia("4"), + ) + + importer.queue_emu_changes() + + assert importer.dbs["ecatalogue"].size() == 4 + # there's a new emultimedia record now + assert importer.dbs["emultimedia"].size() == 4 + assert importer.dbs["etaxonomy"].size() == 2 + # an image update on an associated image, so 1 + assert importer.views["specimen"].changes.size() == 1 + # an image update on an associated image, so 1 + assert importer.views["indexlot"].changes.size() == 1 + # an image update on an associated image, so 1 + assert importer.views["artefact"].changes.size() == 1 + # an image update on an associated specimen's image, so 1 + assert importer.views["preparation"].changes.size() == 1 + assert importer.views["image"].changes.size() == 4 + assert importer.views["mss"].changes.size() == 4 def test_queue_gbif_changes(self, config: Config): gbif_records = [ @@ -137,29 +224,33 @@ def test_queue_gbif_changes(self, config: Config): assert importer.views["gbif"].changes.size() == 3 @freeze_time("2023-10-20 11:04:31") - @pytest.mark.usefixtures("reset_mongo") - @pytest.mark.parametrize( - ("name", "count"), - [("artefact", 53), ("indexlot", 2000), ("specimen", 10000), ("mss", 12195)], - ) - def test_add_to_mongo(self, name: str, count: int, config: Config): - config.dumps_path.mkdir(exist_ok=True) - dump_date = "20230905" - - if name == "mss": - # just use emultimedia dumps for the mss view - create_emultimedia_dump(config.dumps_path, dump_date) - else: - # for the other views, only use the data associated with each view, makes - # things faster - dump_options = { - "include_artefacts": name == "artefact", - "include_indexlots": name == "indexlot", - "include_specimens": name == "specimen", - } - create_ecatalogue_dump(config.dumps_path, dump_date, **dump_options) - create_emultimedia_dump(config.dumps_path, dump_date, **dump_options) - create_etaxonomy_dump(config.dumps_path, dump_date, **dump_options) + @pytest.mark.usefixtures("reset_mongo", "reset_elasticsearch") + def test_add_to_mongo_and_sync_to_elasticsearch_artefact(self, config: Config): + name = "artefact" + # before the frozen time + dump_date = date(2023, 10, 3) + # create an 
ecatalogue dump with 8 artefacts + create_dump( + config.dumps_path, + "ecatalogue", + dump_date, + *[ + create_ecatalogue( + str(i), + EcatalogueType[name], + MulMultiMediaRef=str(i), + PalArtObjectName=f"{i} beans", + ) + for i in range(1, 9) + ], + ) + # create an emultimedia dump with 8 images + create_dump( + config.dumps_path, + "emultimedia", + dump_date, + *[create_emultimedia(str(i), MulTitle=f"image {i}") for i in range(1, 9)], + ) with DataImporter(config) as importer: importer.queue_emu_changes() @@ -167,58 +258,319 @@ def test_add_to_mongo(self, name: str, count: int, config: Config): importer.add_to_mongo(name) sg_db = importer.sg_dbs[name] + assert sg_db.get_mongo_version() == to_timestamp( + datetime(2023, 10, 20, 11, 4, 31) + ) + assert sg_db.data_collection.count_documents({}) == 8 + + # having parallel=True seems to break in testing, maybe it's something to do + # with the test setup or something to do with pytest, who knows + importer.sync_to_elasticsearch(name, parallel=False) + + assert sg_db.get_elasticsearch_version() == to_timestamp( + datetime(2023, 10, 20, 11, 4, 31) + ) + + search_base = Search( + using=config.get_elasticsearch_client(), index=sg_db.latest_index_name + ) + assert search_base.count() == 8 + assert ( + search_base.filter( + "term", **{"parsed.artefactName.k": "3 beans"} + ).count() + == 1 + ) + # this comes from the image + assert ( + search_base.filter( + "term", **{"parsed.associatedMedia.title.k": "image 4"} + ).count() + == 1 + ) + + @freeze_time("2023-10-20 11:04:31") + @pytest.mark.usefixtures("reset_mongo", "reset_elasticsearch") + def test_add_to_mongo_and_sync_to_elasticsearch_indexlot(self, config: Config): + name = "indexlot" + # before the frozen time + dump_date = date(2023, 10, 3) + # create an ecatalogue dump with 8 indexlots + create_dump( + config.dumps_path, + "ecatalogue", + dump_date, + *[ + create_ecatalogue( + str(i), + EcatalogueType[name], + MulMultiMediaRef=str(i), + EntIndIndexLotTaxonNameLocalRef=str(i), + EntIndMaterial=f"{i} lemons", + ) + for i in range(1, 9) + ], + ) + # create an emultimedia dump with 8 images + create_dump( + config.dumps_path, + "emultimedia", + dump_date, + *[create_emultimedia(str(i), MulTitle=f"image {i}") for i in range(1, 9)], + ) + # create an etaxonomy dump with 8 records + create_dump( + config.dumps_path, + "etaxonomy", + dump_date, + *[create_etaxonomy(str(i), ClaKingdom=f"kingdom {i}") for i in range(1, 9)], + ) + + with DataImporter(config) as importer: + importer.queue_emu_changes() + + importer.add_to_mongo(name) + sg_db = importer.sg_dbs[name] assert sg_db.get_mongo_version() == to_timestamp( datetime(2023, 10, 20, 11, 4, 31) ) - assert sg_db.data_collection.count_documents({}) == count + assert sg_db.data_collection.count_documents({}) == 8 + + # having parallel=True seems to break in testing, maybe it's something to do + # with the test setup or something to do with pytest, who knows + importer.sync_to_elasticsearch(name, parallel=False) + + assert sg_db.get_elasticsearch_version() == to_timestamp( + datetime(2023, 10, 20, 11, 4, 31) + ) + search_base = Search( + using=config.get_elasticsearch_client(), index=sg_db.latest_index_name + ) + assert search_base.count() == 8 + assert ( + search_base.filter("term", **{"parsed.material.k": "3 lemons"}).count() + == 1 + ) + # this comes from the image + assert ( + search_base.filter( + "term", **{"parsed.associatedMedia.title.k": "image 4"} + ).count() + == 1 + ) + # this comes from the taxonomy + assert ( + search_base.filter("term", 
**{"parsed.kingdom.k": "kingdom 4"}).count() + == 1 + ) @freeze_time("2023-10-20 11:04:31") @pytest.mark.usefixtures("reset_mongo", "reset_elasticsearch") - @pytest.mark.parametrize( - ("name", "count"), - [("artefact", 53), ("indexlot", 2000), ("specimen", 10000), ("mss", 12195)], - ) - def test_sync_to_elasticsearch(self, name: str, count: int, config: Config): - config.dumps_path.mkdir(exist_ok=True) - dump_date = "20230905" - - # setup the EMu dumps we're going to use - if name == "mss": - # just use emultimedia dumps for the mss view - create_emultimedia_dump(config.dumps_path, dump_date) - else: - # for the other views, only use the data associated with each view, makes - # things faster - dump_options = { - "include_artefacts": name == "artefact", - "include_indexlots": name == "indexlot", - "include_specimens": name == "specimen", - } - create_ecatalogue_dump(config.dumps_path, dump_date, **dump_options) - create_emultimedia_dump(config.dumps_path, dump_date, **dump_options) - create_etaxonomy_dump(config.dumps_path, dump_date, **dump_options) + def test_add_to_mongo_and_sync_to_elasticsearch_specimen(self, config: Config): + name = "specimen" + # before the frozen time + dump_date = date(2023, 10, 3) + # create an ecatalogue dump with 8 specimens + create_dump( + config.dumps_path, + "ecatalogue", + dump_date, + *[ + create_ecatalogue( + str(i), + EcatalogueType[name], + MulMultiMediaRef=str(i), + CardParasiteRef=str(i), + sumPreciseLocation=f"{i} Number Road", + ) + for i in range(1, 9) + ], + ) + # create an emultimedia dump with 8 images + create_dump( + config.dumps_path, + "emultimedia", + dump_date, + *[create_emultimedia(str(i), MulTitle=f"image {i}") for i in range(1, 9)], + ) + # create an etaxonomy dump with 8 records + create_dump( + config.dumps_path, + "etaxonomy", + dump_date, + *[create_etaxonomy(str(i), ClaKingdom=f"kingdom {i}") for i in range(1, 9)], + ) + + with DataImporter(config) as importer: + importer.queue_emu_changes() + + importer.add_to_mongo(name) + + sg_db = importer.sg_dbs[name] + assert sg_db.get_mongo_version() == to_timestamp( + datetime(2023, 10, 20, 11, 4, 31) + ) + assert sg_db.data_collection.count_documents({}) == 8 + + # having parallel=True seems to break in testing, maybe it's something to do + # with the test setup or something to do with pytest, who knows + importer.sync_to_elasticsearch(name, parallel=False) + + assert sg_db.get_elasticsearch_version() == to_timestamp( + datetime(2023, 10, 20, 11, 4, 31) + ) + search_base = Search( + using=config.get_elasticsearch_client(), index=sg_db.latest_index_name + ) + assert search_base.count() == 8 + assert ( + search_base.filter( + "term", **{"parsed.locality.k": "3 Number Road"} + ).count() + == 1 + ) + # this comes from the image + assert ( + search_base.filter( + "term", **{"parsed.associatedMedia.title.k": "image 4"} + ).count() + == 1 + ) + # this comes from the taxonomy + assert ( + search_base.filter("term", **{"parsed.kingdom.k": "kingdom 4"}).count() + == 1 + ) + + @freeze_time("2023-10-20 11:04:31") + @pytest.mark.usefixtures("reset_mongo", "reset_elasticsearch") + def test_add_to_mongo_and_sync_to_elasticsearch_mss(self, config: Config): + name = "mss" + # before the frozen time + dump_date = date(2023, 10, 3) + # create an emultimedia dump with 8 images + create_dump( + config.dumps_path, + "emultimedia", + dump_date, + *[ + create_emultimedia(str(i), DocIdentifier=f"banana-{i}.jpg") + for i in range(1, 9) + ], + ) with DataImporter(config) as importer: - # queue the changes from 
the dumps importer.queue_emu_changes() - # add the data to mongo + importer.add_to_mongo(name) + sg_db = importer.sg_dbs[name] + assert sg_db.get_mongo_version() == to_timestamp( + datetime(2023, 10, 20, 11, 4, 31) + ) + assert sg_db.data_collection.count_documents({}) == 8 + # having parallel=True seems to break in testing, maybe it's something to do # with the test setup or something to do with pytest, who knows importer.sync_to_elasticsearch(name, parallel=False) - sg_db: SplitgillDatabase = importer.sg_dbs[name] + assert sg_db.get_elasticsearch_version() == to_timestamp( + datetime(2023, 10, 20, 11, 4, 31) + ) + search_base = Search( + using=config.get_elasticsearch_client(), index=sg_db.latest_index_name + ) + assert search_base.count() == 8 + assert ( + search_base.filter("term", **{"parsed.file.k": "banana-4.jpg"}).count() + == 1 + ) + + @freeze_time("2023-10-20 11:04:31") + @pytest.mark.usefixtures("reset_mongo", "reset_elasticsearch") + def test_add_to_mongo_and_sync_to_elasticsearch_preparation(self, config: Config): + name = "preparation" + # before the frozen time + dump_date = date(2023, 10, 3) + # create an ecatalogue dump with 8 specimens and 8 preparations + ecat_records = [ + *[ + create_ecatalogue( + str(i), + EcatalogueType[name], + EntPreSpecimenRef=str(i + 8), + EntPreStorageMedium=f"Ethanol ({i}%)", + ) + for i in range(1, 9) + ], + *[ + create_ecatalogue( + str(i), + EcatalogueType.specimen, + MulMultiMediaRef=str(i), + CardParasiteRef=str(i), + EntCatBarcode=f"000-00-0-{i}", + ) + for i in range(9, 17) + ], + ] + create_dump(config.dumps_path, "ecatalogue", dump_date, *ecat_records) + # create an emultimedia dump with 8 images + create_dump( + config.dumps_path, + "emultimedia", + dump_date, + *[create_emultimedia(str(i), MulTitle=f"image {i}") for i in range(9, 17)], + ) + # create an etaxonomy dump with 8 records + create_dump( + config.dumps_path, + "etaxonomy", + dump_date, + *[create_etaxonomy(str(i), ClaOrder=f"order {i}") for i in range(9, 17)], + ) + + with DataImporter(config) as importer: + importer.queue_emu_changes() + + importer.add_to_mongo(name) + + sg_db = importer.sg_dbs[name] + assert sg_db.get_mongo_version() == to_timestamp( + datetime(2023, 10, 20, 11, 4, 31) + ) + assert sg_db.data_collection.count_documents({}) == 8 + + # having parallel=True seems to break in testing, maybe it's something to do + # with the test setup or something to do with pytest, who knows + importer.sync_to_elasticsearch(name, parallel=False) assert sg_db.get_elasticsearch_version() == to_timestamp( datetime(2023, 10, 20, 11, 4, 31) ) + search_base = Search( + using=config.get_elasticsearch_client(), index=sg_db.latest_index_name + ) + assert search_base.count() == 8 + assert ( + search_base.filter( + "term", **{"parsed.mediumType.k": "Ethanol (6%)"} + ).count() + == 1 + ) + # check a field that should have been copied from the voucher specimen + assert ( + search_base.filter( + "term", **{"parsed.barcode.k": "000-00-0-12"} + ).count() + == 1 + ) + # check a field that should have been copied from the voucher specimen's + # taxonomy assert ( - config.get_elasticsearch_client().count( - body={}, index=sg_db.latest_index_name - )["count"] - == count + search_base.filter("term", **{"parsed.order.k": "order 11"}).count() + == 1 ) def test_queue_changes_redactions(self, config: Config): @@ -234,7 +586,7 @@ def test_queue_changes_redactions(self, config: Config): importer.redaction_database.add_ids("ecatalogue", ["2", "3"], "reason_1") # queue all the change records - 
importer._queue_changes(changed_records, "ecatalogue") + importer.queue_changes(changed_records, "ecatalogue") assert "1" in importer.dbs["ecatalogue"] assert "2" not in importer.dbs["ecatalogue"] @@ -250,7 +602,7 @@ def test_redact_records(self, config: Config): ] # queue all the records - importer._queue_changes(records, "ecatalogue") + importer.queue_changes(records, "ecatalogue") # redact records 2 and 3 redacted_count = importer.redact_records("ecatalogue", ["2", "3"], "reason1") diff --git a/tests/test_links.py b/tests/test_links.py index efde964..6631bfa 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -2,7 +2,12 @@ from unittest.mock import MagicMock from dataimporter.dbs import DataDB -from dataimporter.links import MediaLink, TaxonomyLink, GBIFLink +from dataimporter.links import ( + MediaLink, + TaxonomyLink, + GBIFLink, + PreparationSpecimenLink, +) from dataimporter.model import SourceRecord from dataimporter.view import View @@ -480,3 +485,182 @@ def test_clear_from_foreign(self, tmp_path: Path): gbif_link.clear_from_foreign() assert gbif_link.gbif_id_map.size() == 0 + + +class TestPreparationSpecimenLink: + def test_update_from_base(self, tmp_path: Path): + base_view = View(tmp_path / "base_view", DataDB(tmp_path / "base_data")) + specimen_view = View( + tmp_path / "specimen_view", DataDB(tmp_path / "specimen_view") + ) + prep_link = PreparationSpecimenLink( + tmp_path / "prep_spec_link", base_view, specimen_view + ) + + base_records = [ + SourceRecord( + "p1", {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: "s1"}, "base" + ), + # this scenario is not expected, but sensible to check for it given EMu can + # do anything at any time + SourceRecord( + "p2", + {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: ("s2", "s3")}, + "base", + ), + SourceRecord("p3", {"not_the_field": "s4"}, "base"), + SourceRecord( + "p4", {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: "s1"}, "base" + ), + ] + + prep_link.update_from_base(base_records) + + assert prep_link.id_map.get_one("p1") == "s1" + assert prep_link.id_map.get_one("p2") == "s2" + assert prep_link.id_map.get_one("p3") is None + assert prep_link.id_map.get_one("p4") == "s1" + + def test_update_from_foreign(self, tmp_path: Path): + base_view = View(tmp_path / "base_view", DataDB(tmp_path / "base_data")) + specimen_view = View( + tmp_path / "specimen_view", DataDB(tmp_path / "specimen_view") + ) + prep_link = PreparationSpecimenLink( + tmp_path / "prep_spec_link", base_view, specimen_view + ) + + base_records = [ + SourceRecord( + "p1", {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: "s1"}, "base" + ), + # this scenario is not expected, but sensible to check for it given EMu can + # do anything at any time + SourceRecord( + "p2", + {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: ("s2", "s3")}, + "base", + ), + SourceRecord("p3", {"not_the_field": "s4"}, "base"), + SourceRecord( + "p4", {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: "s1"}, "base" + ), + ] + base_view.db.put_many(base_records) + prep_link.update_from_base(base_records) + + specimen_records = [ + SourceRecord("s1", {"x": "1"}, "specimen"), + SourceRecord("s2", {"x": "2"}, "specimen"), + SourceRecord("s3", {"x": "3"}, "specimen"), + SourceRecord("s4", {"x": "4"}, "specimen"), + ] + + # replace the queue method on the base view with a mock + base_view.queue = MagicMock() + + prep_link.update_from_foreign(specimen_records) + + queued_base_records = base_view.queue.call_args.args[0] + assert len(queued_base_records) == 3 + # p1 + assert base_records[0] in 
queued_base_records
+        # p2
+        assert base_records[1] in queued_base_records
+        # p4
+        assert base_records[3] in queued_base_records
+
+    def test_transform_missing(self, tmp_path: Path):
+        base_view = View(tmp_path / "base_view", DataDB(tmp_path / "base_data"))
+        specimen_view = View(
+            tmp_path / "specimen_view", DataDB(tmp_path / "specimen_view")
+        )
+        prep_link = PreparationSpecimenLink(
+            tmp_path / "prep_spec_link", base_view, specimen_view
+        )
+
+        base_record = SourceRecord(
+            "p1", {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: "s1"}, "base"
+        )
+        prep_link.update_from_base([base_record])
+        data = {"beans": "always"}
+        prep_link.transform(base_record, data)
+
+        assert data == {"beans": "always"}
+
+    def test_transform(self, tmp_path: Path):
+        base_view = View(tmp_path / "base_view", DataDB(tmp_path / "base_data"))
+        specimen_view = View(
+            tmp_path / "specimen_view", DataDB(tmp_path / "specimen_view")
+        )
+        prep_link = PreparationSpecimenLink(
+            tmp_path / "prep_spec_link", base_view, specimen_view
+        )
+
+        base_record = SourceRecord(
+            "p1", {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: "s1"}, "base"
+        )
+        prep_link.update_from_base([base_record])
+
+        mapped_field_data = {
+            field: f"{field} data"
+            for field in PreparationSpecimenLink.MAPPED_SPECIMEN_FIELDS
+        }
+        # set one of the fields to None
+        mapped_none_test_field = PreparationSpecimenLink.MAPPED_SPECIMEN_FIELDS[0]
+        mapped_field_data[mapped_none_test_field] = None
+        specimen_record = SourceRecord(
+            "s1",
+            {
+                "occurrenceID": "5",
+                "_id": "8",
+                "an_additional_field": "some value which shouldn't be copied over",
+                **mapped_field_data,
+            },
+            "specimen",
+        )
+        specimen_view.db.put_many([specimen_record])
+
+        data = {"x": "3", "z": "9"}
+        prep_link.transform(base_record, data)
+
+        assert mapped_none_test_field not in data
+        del mapped_field_data[mapped_none_test_field]
+        assert data == {
+            "x": "3",
+            "z": "9",
+            "associatedOccurrences": "Voucher: 5",
+            "specimenID": "8",
+            **mapped_field_data,
+        }
+
+    def test_clear_from_base(self, tmp_path: Path):
+        base_view = View(tmp_path / "base_view", DataDB(tmp_path / "base_data"))
+        specimen_view = View(
+            tmp_path / "specimen_view", DataDB(tmp_path / "specimen_view")
+        )
+        prep_link = PreparationSpecimenLink(
+            tmp_path / "prep_spec_link", base_view, specimen_view
+        )
+
+        base_records = [
+            SourceRecord(
+                "p1", {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: "s1"}, "base"
+            ),
+            SourceRecord(
+                "p2",
+                {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: ("s2", "s3")},
+                "base",
+            ),
+            SourceRecord("p3", {"not_the_field": "s4"}, "base"),
+            SourceRecord(
+                "p4", {PreparationSpecimenLink.SPECIMEN_ID_REF_FIELD: "s1"}, "base"
+            ),
+        ]
+        base_view.db.put_many(base_records)
+        prep_link.update_from_base(base_records)
+        assert prep_link.id_map.size() > 0
+
+        prep_link.clear_from_base()
+
+        assert prep_link.id_map.size() == 0
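
Reviewer note (illustration only, not part of the diff): the field-copy behaviour that PreparationSpecimenLink.transform introduces can be sketched in isolation as below. The helper name merge_voucher_fields and the sample dicts are made up for illustration; the field list mirrors MAPPED_SPECIMEN_FIELDS in links.py, and the renaming of occurrenceID/_id follows the transform implementation above.

# Minimal standalone sketch of the merge performed by PreparationSpecimenLink.transform()
# on an already-transformed specimen dict (assumed names, not part of the importer API).
MAPPED_SPECIMEN_FIELDS = [
    "barcode",
    "scientificName",
    "order",
    "identifiedBy",
    "locality",
    "decimalLatitude",
    "decimalLongitude",
]


def merge_voucher_fields(prep_data: dict, specimen: dict) -> dict:
    """Copy searchable voucher specimen fields onto a preparation data dict."""
    merged = dict(prep_data)
    # the specimen's occurrenceID and _id are renamed rather than copied verbatim
    merged["associatedOccurrences"] = f"Voucher: {specimen.pop('occurrenceID')}"
    merged["specimenID"] = specimen.pop("_id")
    # only copy mapped fields that actually have a value on the specimen
    merged.update(
        (field, value)
        for field in MAPPED_SPECIMEN_FIELDS
        if (value := specimen.get(field)) is not None
    )
    return merged


prep = {"_id": "p1", "preparationType": "DNA Extract"}
voucher = {"_id": "s1", "occurrenceID": "abc-123", "barcode": "000-00-0-1", "order": None}
print(merge_voucher_fields(prep, voucher))
# -> {'_id': 'p1', 'preparationType': 'DNA Extract',
#     'associatedOccurrences': 'Voucher: abc-123', 'specimenID': 's1',
#     'barcode': '000-00-0-1'}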