diff --git a/Dockerfile b/Dockerfile index a322c67..26246a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1 -FROM alpine/git as ms-tool-src +FROM alpine/git AS ms-tool-src RUN git clone https://github.com/microsoft/Tools-for-Health-Data-Anonymization.git /app FROM mcr.microsoft.com/dotnet/sdk:6.0 AS ms-tool @@ -25,7 +25,7 @@ RUN --mount=type=cache,target=/root/.cache \ pip3 install /app/[tests] RUN rm -r /app -ENV JAVA_HOME /opt/java/openjdk +ENV JAVA_HOME=/opt/java/openjdk ENTRYPOINT ["cumulus-etl"] @@ -42,6 +42,6 @@ RUN --mount=type=cache,target=/root/.cache \ pip3 install /app RUN rm -r /app -ENV JAVA_HOME /opt/java/openjdk +ENV JAVA_HOME=/opt/java/openjdk ENTRYPOINT ["cumulus-etl"] diff --git a/cumulus_etl/etl/tasks/basic_tasks.py b/cumulus_etl/etl/tasks/basic_tasks.py index 2dacc97..8de799a 100644 --- a/cumulus_etl/etl/tasks/basic_tasks.py +++ b/cumulus_etl/etl/tasks/basic_tasks.py @@ -95,7 +95,8 @@ class MedicationRequestTask(tasks.EtlTask): tags: ClassVar = {"cpu"} # We may write to a second Medication table as we go. - # MedicationRequest can have inline medications via CodeableConcepts, or external Medication references. + # MedicationRequest can have inline medications via CodeableConcepts, or external Medication + # references. # If external, we'll download them and stuff them in this output table. # We do all this special business logic because Medication is a special, "reference" resource, # and many EHRs don't let you simply bulk export them. @@ -110,20 +111,27 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Keep a cache of medication IDs that we've already downloaded. - # If this ends up growing too large in practice, we can wipe it during a call to table_batch_cleanup(). + # If this ends up growing too large in practice, we can wipe it during a call to + # table_batch_cleanup(). # But let's try initially with keeping it around for the whole task. self.medication_ids = set() + # Track whether we already warned about downloading Medications, to avoid spamming. + self.warned_connection_error = False + def scrub_medication(self, medication: dict | None) -> bool: """Scrub incoming medication resources, returns False if it should be skipped""" if not medication or not self.scrubber.scrub_resource(medication): # standard scrubbing return False # Normally the above is all we'd need to do. - # But this resource just came hot from the FHIR server, and we did not run the MS anonymizer on it. + # But this resource just came hot from the FHIR server, and we did not run the MS + # anonymizer on it. # Since Medications are not patient-specific, we don't need the full MS treatment. - # But still, we should probably drop some bits that might more easily identify the *institution*. - # This is a poor-man's MS config tool (and a blocklist rather than allow-list, but it's a very simple resource) + # But still, we should probably drop some bits that might more easily identify the + # *institution*. + # This is a poor-man's MS config tool (and a blocklist rather than allow-list, but it's a + # very simple resource) # *should* remove extensions at all layers, but this will catch 99% of them medication.pop("extension", None) @@ -141,8 +149,9 @@ async def fetch_medication(self, resource: dict) -> dict | None: if not reference.startswith("#"): # Don't duplicate medications we've already seen this run. - # This will still duplicate medications from previous runs, but avoiding that feels like more work than it's - # worth - just download em again and push em through (there might be updates to the resources, too!) + # This will still duplicate medications from previous runs, but avoiding that feels + # like more work than it's worth - just download em again and push em through (there + # might be updates to the resources, too!) if reference in self.medication_ids: return None self.medication_ids.add(reference) @@ -150,7 +159,10 @@ async def fetch_medication(self, resource: dict) -> dict | None: try: medication = await fhir.download_reference(self.task_config.client, reference) except Exception as exc: - logging.warning("Could not download Medication reference: %s", exc) + if not self.warned_connection_error: + logging.warning("Could not download Medication reference: %s", exc) + self.warned_connection_error = True + self.summaries[1].had_errors = True if self.task_config.dir_errors: diff --git a/tests/etl/test_tasks.py b/tests/etl/test_tasks.py index b417d49..7bc406b 100644 --- a/tests/etl/test_tasks.py +++ b/tests/etl/test_tasks.py @@ -441,7 +441,8 @@ async def test_external_medications_with_error(self, mock_download): ValueError("bad haircut"), ] - await basic_tasks.MedicationRequestTask(self.job_config, self.scrubber).run() + with self.assertLogs(level="WARNING") as logs: + await basic_tasks.MedicationRequestTask(self.job_config, self.scrubber).run() med_format = self.format med_req_format = self.format2 @@ -451,6 +452,11 @@ async def test_external_medications_with_error(self, mock_download): batch = med_req_format.write_records.call_args[0][0] self.assertEqual(3, len(batch.rows)) + # Confirm we only logged about the first problem (to avoid spamming console) + self.assertEqual( + logs.output, ["WARNING:root:Could not download Medication reference: bad hostname"] + ) + # Confirm we still wrote out the medication for B self.assertEqual(1, med_format.write_records.call_count) batch = med_format.write_records.call_args[0][0] @@ -520,6 +526,11 @@ async def test_external_medications_skips_unknown_modifiers(self, mock_download) self.make_json( "MedicationRequest", "B", medicationReference={"reference": "Medication/good"} ) + self.make_json( + "MedicationRequest", + "Skipped", # this will be ignored due to uknown modifier in the MedReq itself + modifierExtension=[{"url": "unrecognized"}], + ) mock_download.side_effect = [ { "resourceType": "Medication", @@ -534,7 +545,15 @@ async def test_external_medications_skips_unknown_modifiers(self, mock_download) await basic_tasks.MedicationRequestTask(self.job_config, self.scrubber).run() - med_format = self.format + med_format = self.format # Medication + medreq_format = self.format2 # MedicationRequest + + self.assertEqual(1, medreq_format.write_records.call_count) + batch = medreq_format.write_records.call_args[0][0] + self.assertEqual( # no "Skipped" + {self.codebook.db.resource_hash("A"), self.codebook.db.resource_hash("B")}, + {row["id"] for row in batch.rows}, + ) self.assertEqual(1, med_format.write_records.call_count) batch = med_format.write_records.call_args[0][0]