diff --git a/.github/workflows/export-deploy.yml b/.github/workflows/export-deploy.yml index d013891b3..aa46d8142 100644 --- a/.github/workflows/export-deploy.yml +++ b/.github/workflows/export-deploy.yml @@ -29,13 +29,9 @@ jobs: working-directory: data-serving/scripts/export-data env: REGISTRY: ${{ steps.login-ecr.outputs.registry }} - REPO_COUNTRY: gdh-tcp-country-exporter REPO_FULL: gdh-tcp-full-exporter IMAGE_TAG: ${{ github.sha }} run: | - docker build -t $REGISTRY/$REPO_COUNTRY:$IMAGE_TAG -t $REGISTRY/$REPO_COUNTRY . - docker push $REGISTRY/$REPO_COUNTRY:$IMAGE_TAG - docker push $REGISTRY/$REPO_COUNTRY:latest docker build -f ./Dockerfile_full_export -t $REGISTRY/$REPO_FULL:$IMAGE_TAG -t $REGISTRY/$REPO_FULL . docker push $REGISTRY/$REPO_FULL:$IMAGE_TAG docker push $REGISTRY/$REPO_FULL:latest diff --git a/.github/workflows/export-tests.yml b/.github/workflows/export-tests.yml deleted file mode 100644 index e03694ca0..000000000 --- a/.github/workflows/export-tests.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: Export tests - -on: - push: - branches: [main] - paths: - - '.github/workflows/export-tests.yml' - - 'data-serving/scripts/export-data/*.py' - - 'data-serving/scripts/export-data/*.sh' - pull_request: - paths: - - '.github/workflows/export-tests.yml' - - 'data-serving/scripts/export-data/*.py' - - 'data-serving/scripts/export-data/*.sh' - workflow_dispatch: - -jobs: - ci: - runs-on: ubuntu-20.04 - defaults: - run: - working-directory: data-serving/scripts/export-data - steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.10 - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Lint with shellcheck - run: | - shellcheck *.sh - - name: Lint with flake8 - run: | - pip install flake8 - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test - run: | - pip install pytest - pytest diff --git a/data-serving/scripts/export-data/Dockerfile b/data-serving/scripts/export-data/Dockerfile deleted file mode 100644 index 821c180d5..000000000 --- a/data-serving/scripts/export-data/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM alpine:3.16 - -RUN apk update -RUN apk add bash mongodb-tools python3 aws-cli - -COPY common.sh . -COPY country_export.sh . -COPY logger.py . -COPY transform.py . -COPY fields.txt . -ENTRYPOINT ["./country_export.sh"] diff --git a/data-serving/scripts/export-data/Dockerfile_full_export b/data-serving/scripts/export-data/Dockerfile_full_export index 57520a22d..4a6b789af 100644 --- a/data-serving/scripts/export-data/Dockerfile_full_export +++ b/data-serving/scripts/export-data/Dockerfile_full_export @@ -1,13 +1,9 @@ FROM alpine:3.16 RUN apk update -RUN apk add bash aws-cli mongodb-tools python3 curl +RUN apk add bash aws-cli mongodb-tools curl -COPY common.sh . COPY full_export.sh . -COPY data_dictionary.txt . -COPY citation.txt . -COPY country_export.sh . -COPY logger.py . -COPY transform.py . +COPY query.json . COPY fields.txt . + ENTRYPOINT ["./full_export.sh"] diff --git a/data-serving/scripts/export-data/citation.txt b/data-serving/scripts/export-data/citation.txt deleted file mode 100644 index 953fd7fdb..000000000 --- a/data-serving/scripts/export-data/citation.txt +++ /dev/null @@ -1,30 +0,0 @@ -In order to cite the current version of the dataset please use the -citation below. 
As the data are updated regularly, please update the -retrieval date in the howpublished field. - -@misc{Global.health, -    author={Global.health team}, -    title={{Line List Epidemiological Data from the COVID-19 Outbreak}}, -    howpublished={Accessed on yyyy-mm-dd from \url{https://global.health}}, -    year=2021 -} - -In order to cite the original dataset and methodology of how it was -collected please use: - -@article{xu2020Epidemiological, -    author = {Xu, Bo and Gutierrez, Bernardo and Mekaru, Sumiko - and Sewalk, Kara and Goodwin, Lauren and Loskill, Alyssa - and Cohn, Emily and Hswen, Yulin and Hill, Sarah C. and - Cobo, Maria M and Zarebski, Alexander and Li, Sabrina and - Wu, Chieh-Hsi and Hulland, Erin and Morgan, Julia and - Wang, Lin and O'Brien, Katelynn and Scarpino, Samuel V. - and Brownstein, John S. and Pybus, Oliver G. - and Pigott, David M. and Kraemer, Moritz U. G.}, -    doi={doi.org/10.1038/s41597-020-0448-0}, -    journal={Scientific Data}, -    number={106}, -    title={{Epidemiological data from the COVID-19 outbreak, real-time case information}}, -    volume={7}, -    year={2020} -} diff --git a/data-serving/scripts/export-data/common.sh b/data-serving/scripts/export-data/common.sh deleted file mode 100644 index ba9760e2d..000000000 --- a/data-serving/scripts/export-data/common.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -export ECR=612888738066.dkr.ecr.eu-central-1.amazonaws.com -# ingestion role contains necessary permissions to access S3 buckets -export JOB_ROLE_ARN="arn:aws:iam::612888738066:role/gdh-ingestion-job-role" - -require_env() { - if [ -z "$1" ]; then - echo "$2" - exit 1 - fi -} diff --git a/data-serving/scripts/export-data/country_export.sh b/data-serving/scripts/export-data/country_export.sh deleted file mode 100755 index c11cd78c6..000000000 --- a/data-serving/scripts/export-data/country_export.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source ./common.sh -require_env "${CONN:-}" "Specify MongoDB connection string in CONN" -require_env "${BUCKET:-}" "Specify S3 bucket to output files in BUCKET" -require_env "${COUNTRY:-}" "Specify which country code to export in COUNTRY" - -SCRATCH="$(mktemp -d)" -BUCKETS="${SCRATCH}/buckets.json" -trap 'rm -rf "$SCRATCH"' EXIT # Cleanup before exit - -FORMAT="${FORMAT:-csv,tsv,json}" -QUERY="{\"location.countryISO3\": \"${COUNTRY}\", \"curators.verifiedBy\": { \"\$exists\": \"true\"}}" - -mongoexport --uri="$CONN" --collection=ageBuckets --type=json --jsonArray -o "${BUCKETS}" -mongoexport --query="$QUERY" --uri="$CONN" --collection=day0cases \ - --fieldFile=fields.txt --type=csv | python3 transform.py -f "$FORMAT" -b "${BUCKETS}" "$COUNTRY" - -# ignore shellcheck warning on word splitting, as it's actually needed here -# shellcheck disable=SC2086 -for fmt in ${FORMAT//,/ } -do - test -f "${COUNTRY}.${fmt}.gz" && aws s3 cp "${COUNTRY}.${fmt}.gz" "s3://${BUCKET}/${fmt}/" -done diff --git a/data-serving/scripts/export-data/data_dictionary.txt b/data-serving/scripts/export-data/data_dictionary.txt deleted file mode 100644 index c3ad62535..000000000 --- a/data-serving/scripts/export-data/data_dictionary.txt +++ /dev/null @@ -1,318 +0,0 @@ -DATA DICTIONARY -2022-01-06 - -CHANGELOG - -2022-01-06 Removed empty genomeSequences.* -2021-12-01 Added SGTF field - -Unless mentioned all fields are text strings. - -[] next to a field indicates that it is a comma separated -array. - -* marks mandatory fields - -METADATA - - 1. _id * - - Internal ID used by Global.health database. 
- This is not expected to be stable. - - 2. caseReference.additionalSources [] - - Additional sources (URLs) for this case - - 3. caseReference.sourceId * - - Unique source ID for this case. Each case is ingested from - a specific source URL, which has an unique ID. This is - stable for a particular source. - - 4. caseReference.sourceUrl * - - Data URL from which this case was ingested. - - 5. caseReference.uploadIds [] * - - Subsequent uploads following the initial upload of a case can - change the data of a case (only in sources that provide an - unique ID in caseReference.sourceEntryId). This field records - the unique upload IDs that updated this case. - - 6. caseReference.verificationStatus * - - Case verification status - Values: VERIFIED | UNVERIFIED | EXCLUDED - - VERIFIED: Case was verified by a curator after ingestion - UNVERIFIED: Case was automatically ingested without verification - EXCLUDED: Case has been excluded from the line list - - Most of our automated data ingestion is from authoritative - government datasets, with a few from volunteer-operated datasets. - -DEMOGRAPHICS - -Generally, we prefer to ingest demographic information over -location, if they are not available in the same dataset. - - 7. demographics.ageRange.end - - Upper age range of individual (0 - 120) - - 8. demographics.ageRange.start - - Lower age range of individual (0 - 120) - -9. demographics.ethnicity - - Ethnicity of individual - -10. demographics.gender - - Gender of individual (Male | Female | Non-binary/Third gender | Other) - -11. demographics.nationalities [] - - All the nationalities of the individual - -12. demographics.occupation - - Occupation of the individual - -EVENTS - -All .date values are dates in YYYY-MM-DD format. - -13. events.confirmed.date * -14. events.confirmed.value - - Confirmed date. If value is present, indicates - method of confirmation. - -15. events.firstClinicalConsultation.date - - First clinical consultation date - -16. events.hospitalAdmission.date -17. events.hospitalAdmission.value - - Hospital admission date, value (Yes | No) - -18. events.icuAdmission.date -19. events.icuAdmission.value - - Intensive Care Unit admission date, value (Yes | No) - -20. events.onsetSymptoms.date - - Date of onset of symptoms - -21. events.outcome.date -22. events.outcome.value - - Outcome date, values are - Death | Recovered | hospitalAdmission | icuAdmission | Unknown - -23. events.selfIsolation.date - - Date that individual started self-isolating - -LOCATION - -24. location.administrativeAreaLevel1 - - Admin1 level location of individual (usually state or province) - -25. location.administrativeAreaLevel2 - - Admin2 level location of individual (usually district) - -26. location.administrativeAreaLevel3 - - Admin3 level location of individual (usually city) - -27. location.country * - - Country that case was reported in. - -28. location.geoResolution * - - Geo-resolution of location (how coarse the location is) - Country | Admin1 | Admin2 | Admin3 | Point - -29. location.geometry.latitude * - - Geolocated latitude (-90 to 90) - Positive values are North, negative values are South - -30. location.geometry.longitude * - - Geolocated longitude (-180 to 180) - Positive values are East, negative values are West - -31. location.name - - Full name of location - (example: Lyon, Auvergne-Rhône-Alpes, France) - -32. location.place - - Name of the place this location refers to - (example: Boston Children's Hospital) - -PATHOGENS - -33. 
pathogens [] - - Pathogens other than SARS-CoV-2 - - -PRE-EXISTING CONDITIONS - -34. preexistingConditions.hasPreexistingConditions - - Whether the patient has pre-existing conditions - Boolean: True | False - -35. preexistingConditions.values [] - - List of pre-existing conditions - -REVISION METADATA - -36. revisionMetadata.creationMetadata.date - - Date this case was first created - -37. revisionMetadata.creationMetadata.notes - - Notes added by the curator for this case - -38. revisionMetadata.editMetadata.date - - Date this case was last edited - -39. revisionMetadata.editMetadata.notes - - Notes added by the curator for last edit - -40. revisionMetadata.revisionNumber - - Revision number of the case (positive integer) - -SGTF - -41. SGTF - - S-Gene Target failure (0 = no deletion, 1 = deletion (S-)) - -SYMPTOMS - -42. symptoms.status - - Symptom status (Asymptomatic | Symptomatic | Presymptomatic | null) - -43. symptoms.values [] - - List of symptoms - -TRANSMISSION - -How this case got infected and by who if known - -44. transmission.linkedCaseIds [] - - UUID of a related case in the system - -45. transmission.places [] - - Places where transmission occurred - -46. transmission.routes [] - - Routes of transmission - -TRAVEL HISTORY - -47. travelHistory.travel.dateRange.end -48. travelHistory.travel.dateRange.start - - Start and end dates for travel history - -49. travelHistory.travel.location.administrativeAreaLevel1 [] -50. travelHistory.travel.location.administrativeAreaLevel2 [] -51. travelHistory.travel.location.administrativeAreaLevel3 [] -52. travelHistory.travel.location.country [] -53. travelHistory.travel.location.geoResolution [] - - These have the same meaning as in LOCATION, except that these - pertain to travel history of the individual. Unlike the fields in - location, the fields here are all comma-separated arrays, with each - item corresponding to a travel location in the last 30 days. - -54. travelHistory.travel.location.geometry.coordinates [] - - Comma-separated tuples of latitude and longitude. If the individual - visited latitude m1 and longitude n1 this would be represented as - "(m1, n1)". If there was another travel coordinate (m2, n2), then - this would be represented as "(m1, n1),(m2, n2)" - -55. travelHistory.travel.location.name [] -56. travelHistory.travel.location.place [] - - Same as LOCATION, except these are arrays - -57. travelHistory.travel.methods [] - - Corresponding travel methods (such as air, ship, rail ...) - -58. travelHistory.travel.purpose [] - - Purpose of travel - -59. travelHistory.traveledPrior30Days - - Whether the patient has travelled in the past 30 days - Boolean: True | False - -VACCINES - -60. vaccines.0.batch - - First vaccine batch - -61. vaccines.0.date - - Date of first vaccine - -62. vaccines.0.name - - Name of first vaccine - -63. vaccines.0.sideEffects [] - - List of side-effects experienced after vaccine - -64. vaccines.1.batch -65. vaccines.1.date -66. vaccines.1.name -67. vaccines.1.sideEffects -68. vaccines.2.batch -69. vaccines.2.date -70. vaccines.2.name -71. vaccines.2.sideEffects -72. vaccines.3.batch -73. vaccines.3.date -74. vaccines.3.name -75. vaccines.3.sideEffects - - Same as before, for subsequent vaccines taken by the same individual - -VARIANT OF CONCERN - -76. variantOfConcern - - Variant of concern that was detected. This uses the Pango lineage. 
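The data dictionary deleted above documents a few non-obvious encodings in the exported line list: all .date values are YYYY-MM-DD, fields marked [] are comma-separated strings, and travelHistory.travel.location.geometry.coordinates packs latitude/longitude pairs as "(m1, n1),(m2, n2)". As a rough consumer-side illustration only (the helper names and sample values below are hypothetical and not part of this repository):

```python
# Illustrative parsing of the encodings documented in the (removed) data
# dictionary above; helper names and sample values are hypothetical.
import csv
import io
import re
from datetime import date


def split_list(value: str) -> list[str]:
    """Fields marked [] are comma-separated strings."""
    return [item for item in value.split(",") if item] if value else []


def parse_date(value: str) -> date | None:
    """All .date values are YYYY-MM-DD; empty means not reported."""
    return date.fromisoformat(value) if value else None


def parse_coordinates(value: str) -> list[tuple[float, float]]:
    """Coordinates are packed as (lat, lon),(lat, lon) tuples."""
    return [
        (float(lat), float(lon))
        for lat, lon in re.findall(r"\(([^,]+),\s*([^)]+)\)", value)
    ]


sample = io.StringIO(
    "events.confirmed.date,travelHistory.travel.methods,"
    "travelHistory.travel.location.geometry.coordinates\n"
    '2021-03-04,"Air,Rail","(35.0, -31.0),(36.5, -30.2)"\n'
)
for row in csv.DictReader(sample):
    print(parse_date(row["events.confirmed.date"]))          # 2021-03-04
    print(split_list(row["travelHistory.travel.methods"]))   # ['Air', 'Rail']
    print(parse_coordinates(
        row["travelHistory.travel.location.geometry.coordinates"]))
```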
diff --git a/data-serving/scripts/export-data/fields.txt b/data-serving/scripts/export-data/fields.txt
index d067215dd..ced652fac 100644
--- a/data-serving/scripts/export-data/fields.txt
+++ b/data-serving/scripts/export-data/fields.txt
@@ -1,17 +1,17 @@
 _id
-caseReference.additionalSources
 caseReference.sourceId
+caseReference.sourceEntryId
 caseReference.sourceUrl
+caseReference.isGovernmentSource
 caseReference.uploadIds
-caseReference.verificationStatus
+caseReference.additionalSources
 caseStatus
 demographics.ageRange.start
 demographics.ageRange.end
 demographics.ageBuckets
-demographics.ethnicity
 demographics.gender
-demographics.nationalities
 demographics.occupation
+demographics.healthcareWorker
 events.dateEntry
 events.dateReported
 events.dateLastModified
@@ -32,50 +32,39 @@ events.dateIsolation
 events.outcome
 events.dateDeath
 events.dateRecovered
-location.administrativeAreaLevel1
-location.administrativeAreaLevel2
-location.administrativeAreaLevel3
 location.country
+location.countryISO3
 location.geoResolution
+location.location
+location.admin1
+location.admin1WikiId
+location.admin2
+location.admin2WikiId
+location.admin3
+location.admin3WikiId
+location.query
+location.name
 location.geometry.latitude
 location.geometry.longitude
-location.name
-location.place
-location.query
-pathogens
-preexistingConditions.hasPreexistingConditions
-preexistingConditions.values
-revisionMetadata.creationMetadata.curator
-revisionMetadata.creationMetadata.date
-revisionMetadata.creationMetadata.notes
-revisionMetadata.editMetadata.curator
-revisionMetadata.editMetadata.date
-revisionMetadata.editMetadata.notes
-revisionMetadata.revisionNumber
-symptoms.status
-symptoms.values
-transmission.linkedCaseIds
-transmission.places
-transmission.routes
-travelHistory.travel.dateRange.end
-travelHistory.travel.dateRange.start
-travelHistory.travel.location.name
-travelHistory.travel.methods
-travelHistory.travel.purpose
-travelHistory.traveledPrior30Days
-vaccines.0.name
-vaccines.0.batch
-vaccines.0.date
-vaccines.0.sideEffects
-vaccines.1.name
-vaccines.1.batch
-vaccines.1.date
-vaccines.1.sideEffects
-vaccines.2.name
-vaccines.2.batch
-vaccines.2.date
-vaccines.2.sideEffects
-vaccines.3.name
-vaccines.3.batch
-vaccines.3.date
-vaccines.3.sideEffects
+pathogen
+preexistingConditions.previousInfection
+preexistingConditions.coInfection
+preexistingConditions.preexistingCondition
+preexistingConditions.pregnancyStatus
+symptoms
+transmission.contactWithCase
+transmission.contactId
+transmission.contactSetting
+transmission.contactAnimal
+transmission.transmission
+travelHistory.travelHistory
+travelHistory.travelHistoryEntry
+travelHistory.travelHistoryStart
+travelHistory.travelHistoryLocation
+travelHistory.travelHistoryCountry
+genomeSequences.genomicsMetadata
+genomeSequences.accessionNumber
+vaccination.vaccination
+vaccination.vaccineName
+vaccination.vaccineDate
+vaccination.vaccineSideEffects
diff --git a/data-serving/scripts/export-data/full_export.sh b/data-serving/scripts/export-data/full_export.sh
index 4262898e8..d6c7d70c8 100755
--- a/data-serving/scripts/export-data/full_export.sh
+++ b/data-serving/scripts/export-data/full_export.sh
@@ -1,30 +1,20 @@
 #!/bin/bash
 # Full data export
-# Depends: aws-cli
 
 set -xeuo pipefail
 
-source ./common.sh
-require_env "${FULL_EXPORT_BUCKET:-}" "Specify FULL_EXPORT_BUCKET"
-require_env "${CONN:-}" "Specify MongoDB connection string in CONN"
+require_env() {
+    if [ -z "$1" ]; then
+        echo "$2"
+        exit 1
+    fi
+}
 
-SCRATCH="$(mktemp -d)"
-BUCKETS="${SCRATCH}/buckets.json"
-ALL_DATA="${SCRATCH}/all_data.csv"
-trap 'rm -rf "$SCRATCH"' EXIT # Cleanup before exit
+require_env "${FULL_EXPORT_BUCKET:-}" "Specify FULL_EXPORT_BUCKET"
+require_env "${CONN:-}" "Specify MongoDB connection string in CONN"
 
-FORMAT="${FORMAT:-csv,tsv,json}"
-QUERY="{\"curators.verifiedBy\": { \"\$exists\": \"true\"}}"
+mongoexport --uri="$CONN" --queryFile="query.json" --collection=day0cases --fieldFile=fields.txt --type=csv --out full.csv
 
-mongoexport --uri="$CONN" --collection=ageBuckets --type=json --jsonArray -o "${BUCKETS}"
-mongoexport --query="$QUERY" --uri="$CONN" --collection=day0cases \
-    --fieldFile=fields.txt --type=csv -o "${ALL_DATA}"
-python3 transform.py -f "$FORMAT" -b "${BUCKETS}" -i "${ALL_DATA}" "full"
+gzip full.csv
 
-# ignore shellcheck warning on word splitting, as it's actually needed here
-# shellcheck disable=SC2086
-for fmt in ${FORMAT//,/ }
-do
-    test -f "full.${fmt}.gz" && aws s3 cp "full.${fmt}.gz" "s3://${FULL_EXPORT_BUCKET}/${fmt}/"
-    rm "full.${fmt}.gz"
-done
+aws s3 cp full.csv.gz "s3://${FULL_EXPORT_BUCKET}/csv/"
diff --git a/data-serving/scripts/export-data/logger.py b/data-serving/scripts/export-data/logger.py
deleted file mode 100644
index 23672aa90..000000000
--- a/data-serving/scripts/export-data/logger.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import logging
-import sys
-
-
-def setup_logger():
-    h = logging.StreamHandler(sys.stdout)
-    rootLogger = logging.getLogger()
-    rootLogger.addHandler(h)
-    rootLogger.setLevel(logging.DEBUG)
\ No newline at end of file
diff --git a/data-serving/scripts/export-data/query.json b/data-serving/scripts/export-data/query.json
new file mode 100644
index 000000000..98b0ec782
--- /dev/null
+++ b/data-serving/scripts/export-data/query.json
@@ -0,0 +1,8 @@
+{
+    "caseStatus": {
+        "$nin": [ "discarded", "omit_error" ]
+    },
+    "curators.verifiedBy": {
+        "$exists": "true"
+    }
+}
diff --git a/data-serving/scripts/export-data/setup_country_export.sh b/data-serving/scripts/export-data/setup_country_export.sh
deleted file mode 100755
index 18c75211c..000000000
--- a/data-serving/scripts/export-data/setup_country_export.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-# Sets up batch job definitions for the various countries
-# Depends: jq aws-cli curl
-
-set -eou pipefail
-source ./common.sh
-
-require_env "${ENV:-}" "Specify environment in ENV"
-require_env "${CONN:-}" "Specify MongoDB connection string in CONN"
-require_env "${BUCKET:-}" "Specify S3 bucket to output files in BUCKET"
-
-echo "Setting up country export job definitions for environment {ENV}..."
- -CASECOUNT_URL=${CASECOUNT_URL:-https://covid-19-aggregates.s3.amazonaws.com/country/latest.json} -# mongoexport rate in cases/s -# actual rate is higher, but this allows some wiggle room -# in calculation of Batch job timeouts -EXPORT_RATE=400 -IMAGE="${IMAGE:-$ECR/gdh-country-exporter:latest}" -# ingestion role contains necessary permissions to access S3 buckets - -function casecounts { - curl -s -o - "$CASECOUNT_URL" | jq -r 'to_entries[0].value[] | [._id, .casecount] | @tsv' -} - -function containerprops { - # usage: containerprops "" - cat << EOF -{ - "image": "$IMAGE", - "vcpus": 2, - "memory": 4096, - "jobRoleArn": "$JOB_ROLE_ARN", - "environment": [ - {"name": "COUNTRY", "value": "$1" } - , {"name": "CONN", "value": "$CONN" } - , {"name": "BUCKET", "value": "$BUCKET" } - ] -} -EOF -} - -casecounts | \ - while IFS=$'\t' read -r code casecount; do - if [[ "$casecount" == "0" ]]; then - continue - fi - if [[ "$casecount" -lt "$((EXPORT_RATE * 3600))" ]]; then - timeout=3600 # allow minimum of 60 minutes for a job - else - timeout=$((casecount / EXPORT_RATE)) - fi - printf '%s-exporter-%s; casecount=%d; timeout=%d\n' "$ENV" "$code" "$casecount" "$timeout" - containerprops "${code}" - aws batch register-job-definition --job-definition-name "${ENV}-exporter-${code}" \ - --container-properties "$(containerprops "${code}")" \ - --timeout "attemptDurationSeconds=${timeout}" --type container - done diff --git a/data-serving/scripts/export-data/setup_full_export.sh b/data-serving/scripts/export-data/setup_full_export.sh deleted file mode 100755 index 52dfb4b0d..000000000 --- a/data-serving/scripts/export-data/setup_full_export.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Sets up batch job definitions for the full export -# Depends: aws-cli - -set -eou pipefail -source ./common.sh -require_env "${ENV:-}" "Specify environment in ENV" -require_env "${COUNTRY_EXPORT_BUCKET:-}" "Specify COUNTRY_EXPORT_BUCKET" -require_env "${FULL_EXPORT_BUCKET:-}" "Specify FULL_EXPORT_BUCKET" -IMAGE="${IMAGE:-$ECR/gdh-full-exporter:latest}" - -echo "Setting up full expotr job definitions for environment ${ENV}..." 
-echo "Will tar files from ${COUNTRY_EXPORT_BUCKET} -> ${FULL_EXPORT_BUCKET}" - -function containerprops { - # usage: containerprops "" - cat <" - echo " where exporter_ is the job definition name" - exit 1 -fi -aws batch submit-job --job-name "$1" --job-queue export-queue --job-definition "$1" diff --git a/data-serving/scripts/export-data/test_age_buckets.json b/data-serving/scripts/export-data/test_age_buckets.json deleted file mode 100644 index c9a56ed59..000000000 --- a/data-serving/scripts/export-data/test_age_buckets.json +++ /dev/null @@ -1,27 +0,0 @@ -[ - {"_id":"0","start":0,"end":0}, - {"_id":"1-5","start":1,"end":5}, - {"_id":"6-10","start":6,"end":10}, - {"_id":"11-15","start":11,"end":15}, - {"_id":"16-20","start":16,"end":20}, - {"_id":"21-25","start":21,"end":25}, - {"_id":"26-30","start":26,"end":30}, - {"_id":"31-35","start":31,"end":35}, - {"_id":"36-40","start":36,"end":40}, - {"_id":"41-45","start":41,"end":45}, - {"_id":"46-50","start":46,"end":50}, - {"_id":"51-55","start":51,"end":55}, - {"_id":"56-60","start":56,"end":60}, - {"_id":"61-65","start":61,"end":65}, - {"_id":"66-70","start":66,"end":70}, - {"_id":"71-75","start":71,"end":75}, - {"_id":"76-80","start":76,"end":80}, - {"_id":"81-85","start":81,"end":85}, - {"_id":"86-90","start":86,"end":90}, - {"_id":"91-95","start":91,"end":95}, - {"_id":"96-100","start":96,"end":100}, - {"_id":"101-105","start":101,"end":105}, - {"_id":"106-110","start":106,"end":110}, - {"_id":"111-115","start":111,"end":115}, - {"_id":"116-120","start":116,"end":120} -] diff --git a/data-serving/scripts/export-data/test_convert_age.csv b/data-serving/scripts/export-data/test_convert_age.csv deleted file mode 100644 index 76db39ce2..000000000 --- a/data-serving/scripts/export-data/test_convert_age.csv +++ /dev/null @@ -1,10 +0,0 @@ -demographics.ageRange.start,demographics.ageRange.end,demographics.ageBuckets -20,30, -22,22, -,, -22,22,[] -20,30,[] -0,0,[] -,,["0"] -,,"[""26-30"",""31-35""]" -1.5,1.5,[] diff --git a/data-serving/scripts/export-data/test_transform.py b/data-serving/scripts/export-data/test_transform.py deleted file mode 100644 index 98050b494..000000000 --- a/data-serving/scripts/export-data/test_transform.py +++ /dev/null @@ -1,220 +0,0 @@ -import io -import csv -import pytest -import json -from pathlib import Path -from contextlib import redirect_stdout -import transform as T - -_DEEP_GET = [ - ({"x": {"y": {"z": 2}}}, "x.y.z", 2), - ({"x": {"y": [1, 2]}}, "x.y", [1, 2]), -] - -_ADDITIONAL_SOURCES = [ - ("[]", None), - ( - '[{"sourceUrl": "http://foo.bar"}, {"sourceUrl": "http://bar.baz"}]', - "http://foo.bar,http://bar.baz", - ), -] - -_EVENTS = { - "dateEntry": {"$date": "2021-07-21T00:00:00.000Z"}, - "dateReported": {"$date": "2021-07-20T00:00:00.000Z"}, - "dateLastModified": {"$date": "2021-07-19T00:00:00.000Z"}, - "dateOnset": {"$date": "2021-07-18T00:00:00.000Z"}, - "dateConfirmation": {"$date": "2021-07-17T00:00:00.000Z"}, - "confirmationMethod": "last report", - "dateOfFirstConsult": {"$date": "2021-07-16T00:00:00.000Z"}, - "hospitalized": "Y", - "reasonForHospitalization": "monitoring", - "dateHospitalization": {"$date": "2021-07-15T00:00:00.000Z"}, - "dateDischargeHospital": {"$date": "2021-07-14T00:00:00.000Z"}, - "intensiveCare": "Y", - "dateAdmissionICU": {"$date": "2021-07-13T00:00:00.000Z"}, - "dateDischargeICU": {"$date": "2021-07-12T00:00:00.000Z"}, - "homeMonitoring": "Y", - "isolated": "Y", - "dateIsolation": {"$date": "2021-07-11T00:00:00.000Z"}, - "outcome": "death", - "dateDeath": {"$date": 
"2021-07-10T00:00:00.000Z"}, - "dateRecovered": {"$date": "2021-07-09T00:00:00.000Z"}, -} - -_EVENTS_parsed = { - "events.dateEntry": "2021-07-21", - "events.dateReported": "2021-07-20", - "events.dateLastModified": "2021-07-19", - "events.dateOnset": "2021-07-18", - "events.dateConfirmation": "2021-07-17", - "events.confirmationMethod": "last report", - "events.dateOfFirstConsult": "2021-07-16", - "events.hospitalized": "Y", - "events.reasonForHospitalization": "monitoring", - "events.dateHospitalization": "2021-07-15", - "events.dateDischargeHospital": "2021-07-14", - "events.intensiveCare": "Y", - "events.dateAdmissionICU": "2021-07-13", - "events.dateDischargeICU": "2021-07-12", - "events.homeMonitoring": "Y", - "events.isolated": "Y", - "events.dateIsolation": "2021-07-11", - "events.outcome": "death", - "events.dateDeath": "2021-07-10", - "events.dateRecovered": "2021-07-09", -} - - -_TRAVEL = [ - { - "dateRange": {"start": "2021-10-10T00:00:00Z", "end": "2021-10-12T00:00:00Z"}, - "location": { - "geometry": {"latitude": 35, "longitude": -31}, - "administrativeAreaLevel1": "Port", - "country": "Atlantis", - "name": "Port of Atlantis", - }, - "methods": "Ship", - }, - { - "dateRange": {"start": "2021-10-13T00:00:00Z", "end": "2021-10-15T00:00:00Z"}, - "location": { - "geometry": {"latitude": 35, "longitude": -31}, - "administrativeAreaLevel1": "Coast", - "country": "Atlantis", - "name": "Coast", - }, - "methods": "Raft", - }, -] - -_TRAVEL_parsed = { - "travelHistory.travel.dateRange.end": "2021-10-12,2021-10-15", - "travelHistory.travel.dateRange.start": "2021-10-10,2021-10-13", - "travelHistory.travel.location.administrativeAreaLevel1": "Port,Coast", - "travelHistory.travel.location.country": "Atlantis,Atlantis", - "travelHistory.travel.location.geometry.coordinates": "(35, -31),(35, -31)", - "travelHistory.travel.location.name": "Port of Atlantis,Coast", - "travelHistory.travel.methods": "Ship,Raft", -} - -_BUCKETS = [ - { - "_id": "001", - "start": 20, - "end": 24, - }, - { - "_id": "002", - "start": 25, - "end": 29, - } -] - - -def _read_csv(fn): - with open(fn) as f: - c = csv.DictReader(f) - return [row for row in c] - - -@pytest.mark.parametrize("dictionary,key,value", _DEEP_GET) -def test_deep_get(dictionary, key, value): - assert T.deep_get(dictionary, key) == value - - -@pytest.mark.parametrize("sources,expected", _ADDITIONAL_SOURCES) -def test_convert_addl_sources(sources, expected): - assert T.convert_addl_sources(sources) == expected - - -def test_convert_travel(): - assert T.convert_travel(json.dumps(_TRAVEL)) == _TRAVEL_parsed - - -@pytest.mark.parametrize("fmt", ["csv", "tsv", "json"]) -def test_transform_output_match(fmt): - expected = Path(f'test_transform_mongoexport_expected.{fmt}').read_text() - with redirect_stdout(io.StringIO()) as f: - T.transform('test_transform_mongoexport.csv', '-', [fmt], "test_age_buckets.json") - # use str.splitlines to ignore line endings - - expected_lines = expected.splitlines() - actual_lines = f.getvalue().splitlines() - - lines_to_compare = zip(expected_lines, actual_lines) - for line_pair in lines_to_compare: - # whitespaces in tsv file are causing issues with assert - assert "".join(line_pair[0].split()) == "".join(line_pair[1].split()) - - -def test_transform_empty(tmp_path): - output = f"{tmp_path}/empty" - T.transform('test_transform_mongoexport_header.csv', output, ['csv'], "test_age_buckets.json") - assert not Path(f"{output}.csv.gz").exists() - - -def test_transform_creates_output(tmp_path): - formats = ['csv', 'tsv', 
'json'] - output = f"{tmp_path}/output" - T.transform('test_transform_mongoexport.csv', output, formats, "test_age_buckets.json") - for fmt in formats: - assert Path(f"{output}.{fmt}.gz").exists() - - -def test_transform_buckets_age_ranges(): - expected = Path(f'test_transform_mongoexport_bucketed_ages_expected.csv').read_text() - with redirect_stdout(io.StringIO()) as f: - T.transform('test_transform_mongoexport_bucketed_ages.csv', '-', ['csv'], 'test_age_buckets.json') - - expected_lines = expected.splitlines() - actual_lines = f.getvalue().splitlines() - - lines_to_compare = zip(expected_lines, actual_lines) - for line_pair in lines_to_compare: - assert line_pair[0] == line_pair[1] - - -def test_age_bucket_conversion(): - case_buckets_json = "[\"001\", \"002\"]" - (start, end) = T.age_range(case_buckets_json, _BUCKETS) - assert start == 20 - assert end == 29 - - -def test_age_bucket_row_conversion(): - row = { - "_id": "1", - "travelHistory.traveledPrior30Days": "false", - "demographics.ageBuckets": "[\"001\"]" - } - converted_row = T.convert_row(row, _BUCKETS) - assert converted_row["demographics.ageRange.start"] == 20 - assert converted_row["demographics.ageRange.end"] == 24 - -@pytest.fixture -def age_buckets(): - with Path(__file__).with_name("test_age_buckets.json").open() as fp: - return json.load(fp) - - -@pytest.mark.parametrize("source,expected", [((22, 22),(21, 25)), ((58, 62),(56, 65)), ((130, 150), None)]) -def test_get_age_bucket_as_range(source, expected, age_buckets): - assert T.get_age_bucket_as_range(age_buckets, *source) == expected - - -def convert_age_data(): - with Path(__file__).with_name("test_convert_age.csv").open() as fp: - return list(csv.DictReader(fp)) - - -@pytest.mark.parametrize( - "row,expected_age", - zip(convert_age_data(), [ - (20, 30), (21, 25), None, (21, 25), - (20, 30), (0, 0), (0, 0), (26, 35), (1, 5), - ]) -) -def test_convert_age(row, expected_age, age_buckets): - assert T.convert_age(row, age_buckets) == expected_age diff --git a/data-serving/scripts/export-data/test_transform_mongoexport.csv b/data-serving/scripts/export-data/test_transform_mongoexport.csv deleted file mode 100644 index 1929b3cb1..000000000 --- a/data-serving/scripts/export-data/test_transform_mongoexport.csv +++ /dev/null @@ -1,3 +0,0 @@ 
-_id,caseReference.additionalSources,caseReference.sourceEntryId,caseReference.sourceId,caseReference.sourceUrl,caseReference.uploadIds,caseReference.verificationStatus,caseStatus,demographics.ageRange.end,demographics.ageRange.start,demographics.ethnicity,demographics.gender,demographics.nationalities,demographics.occupation,events.dateEntry,events.dateReported,events.dateLastModified,events.dateOnset,events.dateConfirmation,events.confirmationMethod,events.dateOfFirstConsult,events.hospitalized,events.reasonForHospitalization,events.dateHospitalization,events.dateDischargeHospital,events.intensiveCare,events.dateAdmissionICU,events.dateDischargeICU,events.homeMonitoring,events.isolated,events.dateIsolation,events.outcome,events.dateDeath,events.dateRecovered,location.administrativeAreaLevel1,location.administrativeAreaLevel2,location.administrativeAreaLevel3,location.country,location.geoResolution,location.geometry.latitude,location.geometry.longitude,location.name,location.place,location.query,pathogens,preexistingConditions.hasPreexistingConditions,preexistingConditions.values,revisionMetadata.creationMetadata.curator,revisionMetadata.creationMetadata.date,revisionMetadata.creationMetadata.notes,revisionMetadata.editMetadata.curator,revisionMetadata.editMetadata.date,revisionMetadata.editMetadata.notes,revisionMetadata.revisionNumber,symptoms.status,symptoms.values,transmission.linkedCaseIds,transmission.places,transmission.routes,travelHistory.travel.dateRange.end,travelHistory.travel.dateRange.start,travelHistory.travel.location.name,travelHistory.travel.methods,travelHistory.travel.purpose,travelHistory.traveledPrior30Days,vaccines.0.name,vaccines.0.batch,vaccines.0.date,vaccines.0.sideEffects,vaccines.1.name,vaccines.1.batch,vaccines.1.date,vaccines.1.sideEffects,vaccines.2.name,vaccines.2.batch,vaccines.2.date,vaccines.2.sideEffects,vaccines.3.name,vaccines.3.batch,vaccines.3.date,vaccines.3.sideEffects -1,[],,787123878aa90909811aaff1,http://foo/bar.csv,"[""bb12399abbb19230900aa123""]",UNVERIFIED,confirmed,69,60,,Male,[],,2021-10-01T00:00:00.000Z,2021-10-02T00:00:00.000Z,,,,,,,,,,,,,,,,,,,,,,Antarctica,Country,-79.402,0.323,Antarctica,,,,[],,,ingestion@example.com,2021-01-02T13:42:34.991Z,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,, -2,[],,787123878aa90909811aaff1,http://foo/bar.csv,"[""bb12399abbb19230900aa123""]",UNVERIFIED,suspected,29,20,,Female,[],,2021-01-05T00:00:00.000Z,2021-01-06T00:00:00.000Z,,,,,,,,,,,,,,,,,,,,,,Antarctica,Country,-79.402,0.323,Antarctica,,,,[],,,ingestion@example.com,2021-01-02T13:42:34.991Z,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,, diff --git a/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages.csv b/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages.csv deleted file mode 100644 index aa6724eb7..000000000 --- a/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages.csv +++ /dev/null @@ -1,3 +0,0 @@ 
-_id,caseReference.additionalSources,caseReference.sourceId,caseReference.sourceUrl,caseReference.uploadIds,caseReference.verificationStatus,caseStatus,demographics.ageBuckets,demographics.ethnicity,demographics.gender,demographics.nationalities,demographics.occupation,events,location.administrativeAreaLevel1,location.administrativeAreaLevel2,location.administrativeAreaLevel3,location.country,location.geoResolution,location.geometry.latitude,location.geometry.longitude,location.name,location.place,location.query,pathogens,preexistingConditions.hasPreexistingConditions,preexistingConditions.values,revisionMetadata.creationMetadata.curator,revisionMetadata.creationMetadata.date,revisionMetadata.creationMetadata.notes,revisionMetadata.editMetadata.curator,revisionMetadata.editMetadata.date,revisionMetadata.editMetadata.notes,revisionMetadata.revisionNumber,symptoms.status,symptoms.values,transmission.linkedCaseIds,transmission.places,transmission.routes,travelHistory.travel.dateRange.end,travelHistory.travel.dateRange.start,travelHistory.travel.location.name,travelHistory.travel.methods,travelHistory.travel.purpose,travelHistory.traveledPrior30Days,vaccines.0.name,vaccines.0.batch,vaccines.0.date,vaccines.0.sideEffects,vaccines.1.name,vaccines.1.batch,vaccines.1.date,vaccines.1.sideEffects,vaccines.2.name,vaccines.2.batch,vaccines.2.date,vaccines.2.sideEffects,vaccines.3.name,vaccines.3.batch,vaccines.3.date,vaccines.3.sideEffects -ObjectId(6817283abaa89324a90109aa),[],787123878aa90909811aaff1,http://foo/bar.csv,"[""bb12399abbb19230900aa123""]",UNVERIFIED,confirmed,"[""001"", ""002""]",,Male,[],,"[{""name"":""confirmed"",""dateRange"":{""start"":{""$date"":""2021-10-01T00:00:00.000Z""},""end"":{""$date"":""2021-01-01T00:00:00.000Z""}}}]",,,,Antarctica,Country,-79.402,0.323,Antarctica,,,[],,,ingestion@example.com,2021-01-02T13:42:34.991Z,,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,, -ObjectId(798989a98998acc98989a1bb),[],787123878aa90909811aaff1,http://foo/bar.csv,"[""bb12399abbb19230900aa123""]",UNVERIFIED,suspected,"[""003""]",,Female,[],,"[{""name"":""confirmed"",""dateRange"":{""start"":{""$date"":""2021-01-05T00:00:00.000Z""},""end"":{""$date"":""2021-01-05T00:00:00.000Z""}}}]",,,,Antarctica,Country,-79.402,0.323,Antarctica,,,[],,,ingestion@example.com,2021-01-02T13:42:34.991Z,,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,, diff --git a/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages_expected.csv b/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages_expected.csv deleted file mode 100644 index 45f02e98b..000000000 --- a/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages_expected.csv +++ /dev/null @@ -1,3 +0,0 @@ 
-_id,caseReference.additionalSources,caseReference.sourceId,caseReference.sourceUrl,caseReference.uploadIds,caseReference.verificationStatus,caseStatus,demographics.ageRange.end,demographics.ageRange.start,demographics.ethnicity,demographics.gender,demographics.nationalities,demographics.occupation,location.administrativeAreaLevel1,location.administrativeAreaLevel2,location.administrativeAreaLevel3,location.country,location.geometry.latitude,location.geometry.longitude,location.geoResolution,location.name,location.place,pathogens,preexistingConditions.hasPreexistingConditions,preexistingConditions.values,revisionMetadata.creationMetadata.date,revisionMetadata.creationMetadata.notes,revisionMetadata.editMetadata.date,revisionMetadata.editMetadata.notes,revisionMetadata.revisionNumber,symptoms.status,symptoms.values,transmission.linkedCaseIds,transmission.places,transmission.routes,travelHistory.travel.dateRange.end,travelHistory.travel.dateRange.start,travelHistory.travel.location.administrativeAreaLevel1,travelHistory.travel.location.administrativeAreaLevel2,travelHistory.travel.location.administrativeAreaLevel3,travelHistory.travel.location.country,travelHistory.travel.location.geometry.coordinates,travelHistory.travel.location.geoResolution,travelHistory.travel.location.name,travelHistory.travel.location.place,travelHistory.travel.methods,travelHistory.travel.purpose,travelHistory.traveledPrior30Days,vaccines.0.batch,vaccines.0.date,vaccines.0.name,vaccines.0.sideEffects,vaccines.1.batch,vaccines.1.date,vaccines.1.name,vaccines.1.sideEffects,vaccines.2.batch,vaccines.2.date,vaccines.2.name,vaccines.2.sideEffects,vaccines.3.batch,vaccines.3.date,vaccines.3.name,vaccines.3.sideEffects,variantOfConcern -ObjectId(6817283abaa89324a90109aa),,787123878aa90909811aaff1,http://foo/bar.csv,bb12399abbb19230900aa123,UNVERIFIED,confirmed,5,0,,Male,,,2021-01-01,,,,,,,,,,,,,,Antarctica,-79.402,0.323,Country,Antarctica,,,,,2021-01-02T13:42:34.991Z,,,,NA,,,,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,,, -ObjectId(798989a98998acc98989a1bb),,787123878aa90909811aaff1,http://foo/bar.csv,bb12399abbb19230900aa123,UNVERIFIED,suspected,9,6,,Female,,,2021-01-05,,,,,,,,,,,,,,Antarctica,-79.402,0.323,Country,Antarctica,,,,,2021-01-02T13:42:34.991Z,,,,NA,,,,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,,, diff --git a/data-serving/scripts/export-data/test_transform_mongoexport_expected.csv b/data-serving/scripts/export-data/test_transform_mongoexport_expected.csv deleted file mode 100644 index aa6936f5c..000000000 --- a/data-serving/scripts/export-data/test_transform_mongoexport_expected.csv +++ /dev/null @@ -1,3 +0,0 @@ 
-_id,caseReference.additionalSources,caseReference.sourceId,caseReference.sourceUrl,caseReference.uploadIds,caseReference.verificationStatus,caseStatus,demographics.ageRange.end,demographics.ageRange.start,demographics.ethnicity,demographics.gender,demographics.nationalities,demographics.occupation,events.confirmationMethod,events.dateAdmissionICU,events.dateConfirmation,events.dateDeath,events.dateDischargeHospital,events.dateDischargeICU,events.dateEntry,events.dateHospitalization,events.dateIsolation,events.dateLastModified,events.dateOfFirstConsult,events.dateOnset,events.dateRecovered,events.dateReported,events.homeMonitoring,events.hospitalized,events.intensiveCare,events.isolated,events.outcome,events.reasonForHospitalization,location.administrativeAreaLevel1,location.administrativeAreaLevel2,location.administrativeAreaLevel3,location.country,location.geometry.latitude,location.geometry.longitude,location.geoResolution,location.name,location.place,pathogens,preexistingConditions.hasPreexistingConditions,preexistingConditions.values,revisionMetadata.creationMetadata.date,revisionMetadata.creationMetadata.notes,revisionMetadata.editMetadata.date,revisionMetadata.editMetadata.notes,revisionMetadata.revisionNumber,symptoms.status,symptoms.values,transmission.linkedCaseIds,transmission.places,transmission.routes,travelHistory.travel.dateRange.end,travelHistory.travel.dateRange.start,travelHistory.travel.location.administrativeAreaLevel1,travelHistory.travel.location.administrativeAreaLevel2,travelHistory.travel.location.administrativeAreaLevel3,travelHistory.travel.location.country,travelHistory.travel.location.geometry.coordinates,travelHistory.travel.location.geoResolution,travelHistory.travel.location.name,travelHistory.travel.location.place,travelHistory.travel.methods,travelHistory.travel.purpose,travelHistory.traveledPrior30Days,vaccines.0.batch,vaccines.0.date,vaccines.0.name,vaccines.0.sideEffects,vaccines.1.batch,vaccines.1.date,vaccines.1.name,vaccines.1.sideEffects,vaccines.2.batch,vaccines.2.date,vaccines.2.name,vaccines.2.sideEffects,vaccines.3.batch,vaccines.3.date,vaccines.3.name,vaccines.3.sideEffects,variantOfConcern -1,,787123878aa90909811aaff1,http://foo/bar.csv,bb12399abbb19230900aa123,UNVERIFIED,confirmed,69,60,,Male,,,,,,,,,2021-10-01T00:00:00.000Z,,,,,,,2021-10-02T00:00:00.000Z,,,,,,,,,,Antarctica,-79.402,0.323,Country,Antarctica,,,[],,ingestion@example.com,2021-01-02T13:42:34.991Z,,,,,,,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,,, -2,,787123878aa90909811aaff1,http://foo/bar.csv,bb12399abbb19230900aa123,UNVERIFIED,suspected,29,20,,Female,,,,,,,,,2021-01-05T00:00:00.000Z,,,,,,,2021-01-06T00:00:00.000Z,,,,,,,,,,Antarctica,-79.402,0.323,Country,Antarctica,,,[],,ingestion@example.com,2021-01-02T13:42:34.991Z,,,,,,,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,,, diff --git a/data-serving/scripts/export-data/test_transform_mongoexport_expected.json b/data-serving/scripts/export-data/test_transform_mongoexport_expected.json deleted file mode 100644 index 8bb3388bd..000000000 --- a/data-serving/scripts/export-data/test_transform_mongoexport_expected.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - {"_id": "1", "caseReference.additionalSources": null, "caseReference.sourceId": "787123878aa90909811aaff1", "caseReference.sourceUrl": "http://foo/bar.csv", "caseReference.uploadIds": "bb12399abbb19230900aa123", "caseReference.verificationStatus": "UNVERIFIED", "caseStatus": "confirmed", "demographics.ageRange.end": 69, "demographics.ageRange.start": 60, "demographics.ethnicity": "", "demographics.gender": 
"Male", "demographics.nationalities": null, "demographics.occupation": "", "events.confirmationMethod": "", "events.dateAdmissionICU": "", "events.dateConfirmation": "", "events.dateDeath": "", "events.dateDischargeHospital": "", "events.dateDischargeICU": "", "events.dateEntry": "2021-10-01T00:00:00.000Z", "events.dateHospitalization": "", "events.dateIsolation": "", "events.dateLastModified": "", "events.dateOfFirstConsult": "", "events.dateOnset": "", "events.dateRecovered": "", "events.dateReported": "2021-10-02T00:00:00.000Z", "events.homeMonitoring": "", "events.hospitalized": "", "events.intensiveCare": "", "events.isolated": "", "events.outcome": "", "events.reasonForHospitalization": "", "location.administrativeAreaLevel1": "", "location.administrativeAreaLevel2": "", "location.administrativeAreaLevel3": "", "location.country": "Antarctica", "location.geoResolution": "Country", "location.geometry.latitude": "-79.402", "location.geometry.longitude": "0.323", "location.name": "Antarctica", "location.place": "", "pathogens": "", "preexistingConditions.hasPreexistingConditions": "[]", "preexistingConditions.values": "", "revisionMetadata.creationMetadata.date": "ingestion@example.com", "revisionMetadata.creationMetadata.notes": "2021-01-02T13:42:34.991Z", "revisionMetadata.editMetadata.date": "", "revisionMetadata.editMetadata.notes": "", "revisionMetadata.revisionNumber": "", "symptoms.status": "", "symptoms.values": "", "transmission.linkedCaseIds": "", "transmission.places": "", "transmission.routes": "", "travelHistory.travel.dateRange.end": "", "travelHistory.travel.dateRange.start": "", "travelHistory.travel.location.administrativeAreaLevel1": "", "travelHistory.travel.location.administrativeAreaLevel2": "", "travelHistory.travel.location.administrativeAreaLevel3": "", "travelHistory.travel.location.country": "", "travelHistory.travel.location.geoResolution": "", "travelHistory.travel.location.geometry.coordinates": "", "travelHistory.travel.location.name": "", "travelHistory.travel.location.place": "", "travelHistory.travel.methods": "", "travelHistory.travel.purpose": "true", "travelHistory.traveledPrior30Days": "", "vaccines.0.batch": "", "vaccines.0.date": "", "vaccines.0.name": "", "vaccines.0.sideEffects": "", "vaccines.1.batch": "", "vaccines.1.date": "", "vaccines.1.name": "", "vaccines.1.sideEffects": "", "vaccines.2.batch": "", "vaccines.2.date": "", "vaccines.2.name": "", "vaccines.2.sideEffects": "", "vaccines.3.batch": "", "vaccines.3.date": "", "vaccines.3.name": "", "vaccines.3.sideEffects": "", "variantOfConcern": ""} -, {"_id": "2", "caseReference.additionalSources": null, "caseReference.sourceId": "787123878aa90909811aaff1", "caseReference.sourceUrl": "http://foo/bar.csv", "caseReference.uploadIds": "bb12399abbb19230900aa123", "caseReference.verificationStatus": "UNVERIFIED", "caseStatus": "suspected", "demographics.ageRange.end": 29, "demographics.ageRange.start": 20, "demographics.ethnicity": "", "demographics.gender": "Female", "demographics.nationalities": null, "demographics.occupation": "", "events.confirmationMethod": "", "events.dateAdmissionICU": "", "events.dateConfirmation": "", "events.dateDeath": "", "events.dateDischargeHospital": "", "events.dateDischargeICU": "", "events.dateEntry": "2021-01-05T00:00:00.000Z", "events.dateHospitalization": "", "events.dateIsolation": "", "events.dateLastModified": "", "events.dateOfFirstConsult": "", "events.dateOnset": "", "events.dateRecovered": "", "events.dateReported": "2021-01-06T00:00:00.000Z", 
"events.homeMonitoring": "", "events.hospitalized": "", "events.intensiveCare": "", "events.isolated": "", "events.outcome": "", "events.reasonForHospitalization": "", "location.administrativeAreaLevel1": "", "location.administrativeAreaLevel2": "", "location.administrativeAreaLevel3": "", "location.country": "Antarctica", "location.geoResolution": "Country", "location.geometry.latitude": "-79.402", "location.geometry.longitude": "0.323", "location.name": "Antarctica", "location.place": "", "pathogens": "", "preexistingConditions.hasPreexistingConditions": "[]", "preexistingConditions.values": "", "revisionMetadata.creationMetadata.date": "ingestion@example.com", "revisionMetadata.creationMetadata.notes": "2021-01-02T13:42:34.991Z", "revisionMetadata.editMetadata.date": "", "revisionMetadata.editMetadata.notes": "", "revisionMetadata.revisionNumber": "", "symptoms.status": "", "symptoms.values": "", "transmission.linkedCaseIds": "", "transmission.places": "", "transmission.routes": "", "travelHistory.travel.dateRange.end": "", "travelHistory.travel.dateRange.start": "", "travelHistory.travel.location.administrativeAreaLevel1": "", "travelHistory.travel.location.administrativeAreaLevel2": "", "travelHistory.travel.location.administrativeAreaLevel3": "", "travelHistory.travel.location.country": "", "travelHistory.travel.location.geoResolution": "", "travelHistory.travel.location.geometry.coordinates": "", "travelHistory.travel.location.name": "", "travelHistory.travel.location.place": "", "travelHistory.travel.methods": "", "travelHistory.travel.purpose": "true", "travelHistory.traveledPrior30Days": "", "vaccines.0.batch": "", "vaccines.0.date": "", "vaccines.0.name": "", "vaccines.0.sideEffects": "", "vaccines.1.batch": "", "vaccines.1.date": "", "vaccines.1.name": "", "vaccines.1.sideEffects": "", "vaccines.2.batch": "", "vaccines.2.date": "", "vaccines.2.name": "", "vaccines.2.sideEffects": "", "vaccines.3.batch": "", "vaccines.3.date": "", "vaccines.3.name": "", "vaccines.3.sideEffects": "", "variantOfConcern": ""} -] diff --git a/data-serving/scripts/export-data/test_transform_mongoexport_expected.tsv b/data-serving/scripts/export-data/test_transform_mongoexport_expected.tsv deleted file mode 100644 index 8f838ffc7..000000000 --- a/data-serving/scripts/export-data/test_transform_mongoexport_expected.tsv +++ /dev/null @@ -1,3 +0,0 @@ -_id caseReference.additionalSources caseReference.sourceId caseReference.sourceUrl caseReference.uploadIds caseReference.verificationStatus caseStatus demographics.ageRange.end demographics.ageRange.start demographics.ethnicity demographics.gender demographics.nationalities demographics.occupation events.confirmationMethod events.dateAdmissionICU events.dateConfirmation events.dateDeath events.dateDischargeHospital events.dateDischargeICU events.dateEntry events.dateHospitalization events.dateIsolation events.dateLastModified events.dateOfFirstConsult events.dateOnset events.dateRecovered events.dateReported events.homeMonitoringevents.hospitalized events.intensiveCare events.isolated events.outcome events.reasonForHospitalization location.administrativeAreaLevel1 location.administrativeAreaLevel2 location.administrativeAreaLevel3 location.country location.geometry.latitude location.geometry.longitude location.geoResolution location.name location.place pathogens preexistingConditions.hasPreexistingConditions preexistingConditions.values revisionMetadata.creationMetadata.date revisionMetadata.creationMetadata.notes revisionMetadata.editMetadata.date 
revisionMetadata.editMetadata.notes	revisionMetadata.revisionNumber	symptoms.status	symptoms.values	transmission.linkedCaseIds	transmission.places	transmission.routes	travelHistory.travel.dateRange.end	travelHistory.travel.dateRange.start	travelHistory.travel.location.administrativeAreaLevel1	travelHistory.travel.location.administrativeAreaLevel2	travelHistory.travel.location.administrativeAreaLevel3	travelHistory.travel.location.country	travelHistory.travel.location.geometry.coordinates	travelHistory.travel.location.geoResolution	travelHistory.travel.location.name	travelHistory.travel.location.place	travelHistory.travel.methods	travelHistory.travel.purpose	travelHistory.traveledPrior30Days	vaccines.0.batch	vaccines.0.date	vaccines.0.name	vaccines.0.sideEffects	vaccines.1.batch	vaccines.1.date	vaccines.1.name	vaccines.1.sideEffects	vaccines.2.batch	vaccines.2.date	vaccines.2.name	vaccines.2.sideEffects	vaccines.3.batch	vaccines.3.date	vaccines.3.name	vaccines.3.sideEffects	variantOfConcern
-1 787123878aa90909811aaff1 http://foo/bar.csv bb12399abbb19230900aa123 UNVERIFIED confirmed 69 60 Male 2021-10-01T00:00:00.000Z 2021-10-02T00:00:00.000Z Antarctica -79.402 0.323 Country Antarctica [] ingestion@example.com 2021-01-02T13:42:34.991Z true
-2 787123878aa90909811aaff1 http://foo/bar.csv bb12399abbb19230900aa123 UNVERIFIED suspected 29 20 Female 2021-01-05T00:00:00.000Z 2021-01-06T00:00:00.000Z Antarctica -79.402 0.323 Country Antarctica [] ingestion@example.com 2021-01-02T13:42:34.991Z true
diff --git a/data-serving/scripts/export-data/test_transform_mongoexport_header.csv b/data-serving/scripts/export-data/test_transform_mongoexport_header.csv
deleted file mode 100644
index f1e400a6c..000000000
--- a/data-serving/scripts/export-data/test_transform_mongoexport_header.csv
+++ /dev/null
@@ -1 +0,0 @@
-_id,caseStatus,caseReference.additionalSources,caseReference.sourceId,caseReference.sourceUrl,caseReference.uploadIds,caseReference.verificationStatus,demographics.ageRange.end,demographics.ageRange.start,demographics.ethnicity,demographics.gender,demographics.nationalities,demographics.occupation,events,genomeSequences,location.administrativeAreaLevel1,location.administrativeAreaLevel2,location.administrativeAreaLevel3,location.country,location.geoResolution,location.geometry.latitude,location.geometry.longitude,location.name,location.place,location.query,pathogens,preexistingConditions.hasPreexistingConditions,preexistingConditions.values,revisionMetadata.creationMetadata.curator,revisionMetadata.creationMetadata.date,revisionMetadata.creationMetadata.notes,revisionMetadata.editMetadata.curator,revisionMetadata.editMetadata.date,revisionMetadata.editMetadata.notes,revisionMetadata.revisionNumber,symptoms.status,symptoms.values,transmission.linkedCaseIds,transmission.places,transmission.routes,travelHistory.travel.dateRange.end,travelHistory.travel.dateRange.start,travelHistory.travel.location.name,travelHistory.travel.methods,travelHistory.travel.purpose,travelHistory.traveledPrior30Days,vaccines.0.name,vaccines.0.batch,vaccines.0.date,vaccines.0.sideEffects,vaccines.1.name,vaccines.1.batch,vaccines.1.date,vaccines.1.sideEffects,vaccines.2.name,vaccines.2.batch,vaccines.2.date,vaccines.2.sideEffects,vaccines.3.name,vaccines.3.batch,vaccines.3.date,vaccines.3.sideEffects
diff --git a/data-serving/scripts/export-data/transform.py b/data-serving/scripts/export-data/transform.py
deleted file mode 100644
index 67c50c61d..000000000
--- a/data-serving/scripts/export-data/transform.py
+++ /dev/null
@@ -1,375 +0,0 @@
-#!/usr/bin/python3
-# Transforms a CSV dump from mongoexport (see export.sh) into the CSV
-# format usable for country exports
-# The CSV file is read from stdin, with the processed CSV file written
-# to stdout
-
-import io
-import argparse
-from contextlib import contextmanager
-import csv
-from functools import reduce
-import gzip
-import json
-import logging
-from pathlib import Path
-import sys
-from typing import Any, Optional
-
-from logger import setup_logger
-
-MINIMUM_AGE_WINDOW = 5
-
-VALID_FORMATS = ["csv", "tsv", "json"]
-
-__ARRAYS = [
-    "caseReference.uploadIds",
-    "demographics.nationalities",
-    "symptoms.values",
-    "preexistingConditions.values",
-    "transmission.linkedCaseIds",
-    "transmission.places",
-    "transmission.routes",
-    "pathogens",
-]
-
-__OMIT = [
-    "location.query",
-    "revisionMetadata.creationMetadata.curator",
-    "revisionMetadata.editMetadata.curator",
-    "events",
-    "notes",
-    "travelHistory.travel",
-    "caseReference.sourceEntryId"
-]
-
-__TRAVEL = [
-    "travelHistory.travel.dateRange.end",
-    "travelHistory.travel.dateRange.start",
-    "travelHistory.travel.location.administrativeAreaLevel1",
-    "travelHistory.travel.location.administrativeAreaLevel2",
-    "travelHistory.travel.location.administrativeAreaLevel3",
-    "travelHistory.travel.location.country",
-    "travelHistory.travel.location.geoResolution",
-    "travelHistory.travel.location.geometry.coordinates",
-    "travelHistory.travel.location.name",
-    "travelHistory.travel.location.place",
-    "travelHistory.travel.methods",
-    "travelHistory.travel.purpose",
-]
-
-__VARIANT = ["variantOfConcern"]
-
-
-def deep_get(dictionary: dict[str, Any], keys: str, default="") -> Any:
-    """
-    Retrieve values from nested dictionaries
-    """
-    return reduce(
-        lambda d, key: d.get(key, default) if isinstance(d, dict) else default,
-        keys.split("."),
-        dictionary,
-    )
-
-
-def convert_date(date_string: str) -> Optional[str]:
-    if date_string:
-        return date_string[:10]
-
-
-def convert_addl_sources(sources_string: str) -> str:
-    if sources_string == "[]":
-        return None
-    sources_array = json.loads(sources_string)
-    source_urls = [x["sourceUrl"] for x in sources_array]
-    return ",".join(source_urls)
-
-
-def convert_string_list(item: str) -> Optional[str]:
-    """
-    Converts text list into comma-separated string.
-    """
-    try:
-        if (type(item) != str) or (item == "") or (item == "[]"):
-            return None
-        if type(j := json.loads(item)) == list:
-            return ",".join(map(str, j))
-        else:
-            return item
-    except Exception as e:
-        logging.error("Couldn't convert list.")
-        logging.error(e)
-        logging.error(item)
-        return item
-
-
-def convert_travel(travel_array: str) -> dict[str, Any]:
-    if travel_array == "[]":
-        return {k: None for k in __TRAVEL}
-    try:
-        travel_array = json.loads(travel_array)
-        travel_dict = {}
-        logging.info("Processing travel arrays...")
-        for field in set(__TRAVEL) - {
-            "travelHistory.travel." + f
-            for f in [
-                "dateRange.end",
-                "dateRange.start",
-                "location.geometry.coordinates",
-                "methods",
-            ]
-        }:
-            unnest = field.removeprefix("travelHistory.travel.")
-            items = [deep_get(x, unnest) for x in travel_array]
-            if any(i for i in items):
-                travel_dict[field] = ",".join(items)
-        logging.info("Travel arrays processed, processing dates...")
-        for _d in ("start", "end"):
-            if dates := [
-                convert_date(deep_get(x, f"dateRange.{_d}"))
-                for x in travel_array
-                if "dateRange" in x.keys()
-            ]:
-                travel_dict[f"travelHistory.travel.dateRange.{_d}"] = ",".join(dates)
-        logging.info("Travel dates processed, processing methods...")
-        if methods := [str(x["methods"]) for x in travel_array if x.get("methods", [])]:
-            travel_dict["travelHistory.travel.methods"] = ",".join(methods)
-        logging.info("Travel methods processed, processing coordinates...")
-        try:
-            travel_dict[
-                "travelHistory.travel.location.geometry.coordinates"
-            ] = ",".join(
-                [
-                    str(
-                        (
-                            deep_get(x, "location.geometry.latitude"),
-                            deep_get(x, "location.geometry.longitude"),
-                        )
-                    )
-                    for x in travel_array
-                ]
-            )
-        except Exception:
-            logging.error("No coordinates found.")
-        logging.info("Coordinates processed!")
-        return travel_dict
-    except Exception as e:
-        logging.error("Couldn't convert travel.")
-        logging.error(e)
-        logging.error(travel_array)
-        return {k: None for k in __TRAVEL}
-
-
-def get_headers_and_fields(fileobject) -> list[str]:
-    """
-    Add processed event fieldnames to fields.
-    """
-    try:
-        headers = fileobject.readline().strip().split(",")
-    except Exception as e:
-        logging.exception("Error in reading mongoexport header")
-        sys.exit(1)
-    cols_to_add = [
-        "demographics.ageRange.start",
-        "demographics.ageRange.end",
-    ]
-    cols_to_remove = [
-        "demographics.ageBuckets",
-    ]
-    fields = set(headers).union(set(cols_to_add))
-    fields = fields.union(set(__TRAVEL + __VARIANT))
-    fields = fields.difference(cols_to_remove)
-    fields = sorted(list(fields - set(__OMIT)), key=str.casefold)
-    return headers, fields
-
-
-def get_age_bucket_as_range(buckets: list[dict[str]], age_start: int, age_end: int) -> Optional[tuple[int, int]]:
-    "Returns age bucket from demographics.ageRange.start and demographics.ageRange.end"
-
-    def which_bucket(age: int) -> int:
-        for i, bucket in enumerate(buckets):
-            if bucket["start"] <= age <= bucket["end"]:
-                return i
-        return -1
-
-    index_bucket_start = which_bucket(int(age_start))
-    index_bucket_end = which_bucket(int(age_end))
-    if index_bucket_start < 0 or index_bucket_end < 0:
-        return None
-    bounds = [buckets[index_bucket_start]["start"], buckets[index_bucket_end]["start"],
-              buckets[index_bucket_start]["end"], buckets[index_bucket_end]["end"]]
-    return (min(bounds), max(bounds))
-
-
-def age_range(case_buckets: str, buckets: list[dict[str, Any]]) -> tuple[int, int]:
-    bucket_ids = json.loads(case_buckets)
-    matching_buckets = [b for b in buckets if b["_id"] in bucket_ids]
-    min_age = min([b["start"] for b in matching_buckets])
-    max_age = max([b["end"] for b in matching_buckets])
-    return (min_age, max_age)
-
-
-def convert_age(row: dict[str, Any], buckets: list[dict[str, Any]]) -> Optional[tuple[int, int]]:
-    """
-    Converts age information from CSV row to age bounds
-
-    To handle the transition from ageRange to ageBuckets, convert_age()
-    support both, with a preference to ageBuckets if that field exists.
-
-    There are three scenarios that are handled:
-
-    1. demographics.ageBuckets is present. This is the preferred option and
-    the code sets the minimum and maximum of the buckets present as the age
-    range
-
-    2. demographics.ageRange is present and the age window (difference
-    between the maximum and minimum is at least MINIMUM_AGE_WINDOW. In this
-    case, this is passed unchanged for export.
-
-    3. demographics.ageRange has a age window below MINIMUM_AGE_WINDOW. In
-    this case the code matches it to the nearest age bucket(s).
-    """
-    age_buckets = row.get("demographics.ageBuckets")
-    if age_buckets and age_buckets != "[]":
-        return age_range(age_buckets, buckets)
-
-    # ensure demographics.ageRange is present
-    if not (
-        row.get("demographics.ageRange.start") and
-        row.get("demographics.ageRange.end")
-    ):
-        return None
-
-    age_start = int(float(row["demographics.ageRange.start"]))
-    age_end = int(float(row["demographics.ageRange.end"]))
-    age_window = age_end - age_start + 1
-
-    if age_window >= MINIMUM_AGE_WINDOW:
-        return age_start, age_end
-    elif bucketed_age_range := get_age_bucket_as_range(buckets, age_start, age_end):
-        return bucketed_age_range
-    else:
-        return None
-
-
-def convert_row(row: dict[str, Any], buckets: list[dict[str, Any]]) -> Optional[dict[str, Any]]:
-    "Converts row for export using age buckets"
-
-    if not row["_id"].isdigit():
-        return None
-    for arr_field in __ARRAYS:
-        if row.get(arr_field):
-            row[arr_field] = convert_string_list(row[arr_field])
-    if row.get("caseReference.additionalSources"):
-        row["caseReference.additionalSources"] = convert_addl_sources(
-            row["caseReference.additionalSources"]
-        )
-    if row["travelHistory.traveledPrior30Days"] == "true":
-        if "travelHistory.travel" in row:
-            row.update(convert_travel(row["travelHistory.travel"]))
-    if age_bounds := convert_age(row, buckets):
-        row["demographics.ageRange.start"], row["demographics.ageRange.end"] = age_bounds
-    return row
-
-
-class JSONWriter:
-    "JSON Writer class similar to csv.DictWriter"
-    def __init__(self, file: io.TextIOBase, fieldnames: list[str]):
-        self.file = file
-        self.fieldnames = fieldnames
-
-    def writeheader(self):
-        self.file.write("[\n")
-
-    def writerow(self, row: dict[str, Any], row_number: int):
-        row_to_write = {field: row.get(field, "") for field in self.fieldnames}
-        tok = ", " if row_number > 0 else " "
-        self.file.write(tok + json.dumps(row_to_write, sort_keys=True) + "\n")
-
-
-def writerow(
-    formats: list[str], writers: dict[str, Any], row: dict[str, Any], row_number: int
-):
-    for fmt in formats:
-        if row_number == 0:  # first row, write header
-            writers[fmt].writeheader()
-        if fmt == "json":
-            writers[fmt].writerow(row, row_number)
-        else:
-            writers[fmt].writerow(row)
-
-
-@contextmanager
-def open_writers(formats: list[str], fields: list[str], output: str):
-    if unknown_formats := set(formats) - set(VALID_FORMATS):
-        raise ValueError(f"Unknown formats passed: {unknown_formats}")
-    files = {}
-    writers = {}
-    for fmt in formats:
-        files[fmt] = (
-            gzip.open(f"{output}.{fmt}.gz", "wt")
-            if output != "-" else sys.stdout
-        )
-        if fmt == "csv":
-            writers[fmt] = csv.DictWriter(
-                files[fmt], fieldnames=fields, extrasaction="ignore"
-            )
-        if fmt == "tsv":
-            writers[fmt] = csv.DictWriter(
-                files[fmt], fieldnames=fields, extrasaction="ignore", delimiter="\t"
-            )
-        if fmt == "json":
-            writers[fmt] = JSONWriter(files[fmt], fieldnames=fields)
-    try:
-        yield writers
-    except Exception as e:
-        logging.exception(f"Error occurred in open_writers(): {e}")
-    finally:
-        if output == "-":
-            if formats == ["json"]:
-                print("]")
-            return
-        for fmt in formats:
-            if fmt == "json":
-                files[fmt].write("]\n")
-            files[fmt].close()
-
-
-def transform(input: Optional[str], output: str, formats: list[str], bucketpath: str):
-    with (open(input) if input else sys.stdin) as inputfile:
-        with open(bucketpath) as bucketfile:
-            buckets = json.load(bucketfile)
-        headers, fields = get_headers_and_fields(inputfile)
-        reader = csv.DictReader(inputfile, fieldnames=headers)
-        hasrows = False
-        with open_writers(formats, fields, output) as writers:
-            for i, row in enumerate(map(lambda row: convert_row(row, buckets), reader)):
-                hasrows = True
-                writerow(formats, writers, row, i)
-        if output != "-" and not hasrows:  # cleanup empty files
-            cleanup_files = [Path(f"{output}.{fmt}.gz") for fmt in formats]
-            for file in cleanup_files:
-                file.unlink(missing_ok=True)
-
-
-if __name__ == "__main__":
-    setup_logger()
-    parser = argparse.ArgumentParser()
-    parser.add_argument("output", help="Output file stem to use (extension is added), specify - for stdout")
-    parser.add_argument(
-        "-f",
-        "--format",
-        help="Output formats to use, comma separated (default=csv,tsv,json)",
-        default="csv,tsv,json",
-    )
-    parser.add_argument(
-        "-i",
-        "--input",
-        help="Input file to transform instead of stdin"
-    )
-    parser.add_argument(
-        "-b",
-        "--buckets",
-        help="JSON collection of age buckets to determine case age ranges",
-        required=True
-    )
-    args = parser.parse_args()
-    transform(args.input, args.output, args.format.split(","), args.buckets)