Provide a no-brainer docker-compose build #80

Open · wants to merge 10 commits into master
26 changes: 16 additions & 10 deletions Makefile
@@ -1,18 +1,24 @@
MIMIC_SCHEMA=mimic
OMOP_SCHEMA=omop
MIMIC="host=localhost dbname=mimic user=postgres options=--search_path=$(MIMIC_SCHEMA)"
OMOP="host=localhost dbname=mimic user=postgres options=--search_path=$(OMOP_SCHEMA)"
MIMIC="host=$(DB_HOST) dbname=mimic user=postgres options=--search_path=$(MIMIC_SCHEMA),public"
OMOP="host=$(DB_HOST) dbname=mimic user=postgres options=--search_path=$(OMOP_SCHEMA),public"

build: buildmimic buildomop
runetl: sequence concept load
runetlprivate: runetl private

buildomop:
psql $(OMOP) -f omop/build-omop/postgresql/omop_ddl_comments.sql &&\
psql $(OMOP) -f omop/build-omop/postgresql/mimic-omop-add-column.sql &&\
psql $(OMOP) -f omop/build-omop/postgresql/mimic-omop-alter.sql
buildmimic:
cd mimic/build-mimic &&\
psql $(MIMIC) -v mimic_data_dir="$(MIMIC_DATA_DIR)" -f build-$(MIMIC_SCHEMA).sql &&\
psql $(MIMIC) -v mimic_data_dir="$(MIMIC_DATA_DIR)" -f postgres_add_indexes.sql &&\
psql $(MIMIC) -v mimic_data_dir="$(MIMIC_DATA_DIR)" -f analyze.sql

loadvocab:
psql $(OMOP) -f omop/build-omop/postgresql/omop_vocab_load.sql
buildomop:
psql $(OMOP) -f "omop/build-omop/postgresql/OMOP CDM postgresql ddl.txt" &&\
psql $(OMOP) -f omop/build-omop/postgresql/omop_cdm_comments.sql &&\
psql $(OMOP) -f omop/build-omop/postgresql/mimic-omop-alter.sql &&\
psql $(OMOP) -f omop/build-omop/postgresql/omop_vocab_load.sql &&\
psql $(OMOP) -f "omop/build-omop/postgresql/OMOP CDM indexes required - PostgreSQL.sql" &&\
psql $(OMOP) --set=OMOP_SCHEMA="$(OMOP_SCHEMA)" -f "omop/build-omop/postgresql/analyze.sql"

concept:
Rscript --vanilla etl/ConceptTables/loadTables.R $(MIMIC_SCHEMA)
@@ -29,7 +35,7 @@ private:
check:
psql $(MIMIC) --set=OMOP_SCHEMA="$(OMOP_SCHEMA)" -f etl/check_etl.sql

export:
exporter:
psql $(MIMIC) --set=OMOP_SCHEMA="$(OMOP_SCHEMA)" -f export/export_mimic_omop.sql &&\
cp import/import_mimic_omop.sql etl/Result/ &&\
cp omop/build-omop/postgresql/* etl/Result/
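
For reference, this is roughly what the Docker entrypoint (added below) invokes, here sketched by hand against a local PostgreSQL; the concrete values are placeholders, not part of this PR:

```shell
# Hedged sketch: adjust values to your setup; psql and R must be installed locally.
export PGPASSWORD=mypassword    # picked up by psql if the server asks for a password
make build runetl exporter check \
     DB_HOST=localhost \
     MIMIC_SCHEMA=mimicdemo \
     MIMIC_DATA_DIR="$PWD/mimic/data-mimicdemo"
```
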
6 changes: 6 additions & 0 deletions README.md
@@ -18,6 +18,12 @@ Below in the README, we provide two sections. The first section, *OMOP TABLES LO

The second section, *MIMIC TABLES EQUIVALENCE*, lists all the tables in MIMIC-III, and shows where the data now exists in the OMOP CDM. For example, we can see that the MIMIC-III table *patients* was used to populate the OMOP CDM tables *person* and *death*.

INSTALLATION INSTRUCTIONS
=========================

- [Manual](./README-run-etl.md)
- [Docker-compose](./docker/README.md)

OMOP TABLES LOADED
==================

2 changes: 2 additions & 0 deletions docker/.env
@@ -0,0 +1,2 @@
# either mimiciii or mimicdemo
MIMIC_SCHEMA=mimicdemo
5 changes: 5 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,5 @@
FROM debian:bullseye
RUN apt-get update && apt-get install -y postgresql-client postgresql-server-dev-13 r-base wget unzip \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
RUN R -e "install.packages('DBI', repos='http://cran.rstudio.com/')"
RUN R -e "install.packages('RPostgres', repos='http://cran.rstudio.com/')"
7 changes: 7 additions & 0 deletions docker/Dockerfile-pg
@@ -0,0 +1,7 @@
# we cannot use version >= 12 because of the way CTEs are handled
# see https://www.postgresql.org/docs/16/queries-with.html#QUERIES-WITH-CTE-MATERIALIZATION
FROM postgres:11-alpine

RUN apk update && apk add wget unzip make perl patch
RUN wget https://api.pgxn.org/dist/pgtap/1.3.2/pgtap-1.3.2.zip && unzip pgtap-1.3.2.zip
RUN cd pgtap-1.3.2 && make && make install
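
As a quick sanity check on the pin (a sketch; the `db` service name and credentials come from the compose file added below), you can confirm the server version once the stack is up:

```shell
# PostgreSQL 12+ inlines single-reference CTEs unless AS MATERIALIZED is written out,
# which changes the query plans this ETL was written against -- hence the 11-alpine pin.
docker compose -f docker/docker-compose.yml exec db \
  psql -U postgres -d mimic -c 'SHOW server_version;'
```
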
160 changes: 160 additions & 0 deletions docker/README.md
@@ -0,0 +1,160 @@
# How to

Clone the repository

You might want to adapt the `postgres.conf` file to your hardware, in particular `shared_buffers`.

You will have to download either the `mimicdemo` or the `mimiciii` dataset and place it in the `mimic` folder,
under `data-mimicdemo` or `data-mimiciii` respectively. Place the CSV files (gzipped for mimiciii, plain for
mimicdemo) directly in that folder.

You will also need to download the vocabulary from Athena and place it in the `extras/athena` folder.

```
root
├── extras
│   ├── athena
│   │   ├── CONCEPT.csv
│   │   ├── CONCEPT_ANCESTOR.csv
│   │   ├── CONCEPT_CLASS.csv
│   │   ├── CONCEPT_CPT4.csv
│   │   ├── CONCEPT_RELATIONSHIP.csv
│   │   ├── CONCEPT_SYNONYM.csv
│   │   ├── DOMAIN.csv
│   │   ├── DRUG_STRENGTH.csv
│   │   ├── RELATIONSHIP.csv
│   │   ├── VOCABULARY.csv
│   │   ├── athena2023.zip
│   │   ├── cpt.bat
│   │   ├── cpt.sh
│   │   ├── cpt4.jar
│   │   └── readme.txt
├── mimic
│   ├── data-mimicdemo
│   │   ├── ADMISSIONS.csv
│   │   ├── CALLOUT.csv
│   │   ├── CAREGIVERS.csv
│   │   ├── CHARTEVENTS.csv
│   │   ├── CPTEVENTS.csv
│   │   ├── DATETIMEEVENTS.csv
│   │   ├── DIAGNOSES_ICD.csv
│   │   ├── DRGCODES.csv
│   │   ├── D_CPT.csv
│   │   ├── D_ICD_DIAGNOSES.csv
│   │   ├── D_ICD_PROCEDURES.csv
│   │   ├── D_ITEMS.csv
│   │   ├── D_LABITEMS.csv
│   │   ├── ICUSTAYS.csv
│   │   ├── INPUTEVENTS_CV.csv
│   │   ├── INPUTEVENTS_MV.csv
│   │   ├── LABEVENTS.csv
│   │   ├── LICENSE.txt
│   │   ├── MICROBIOLOGYEVENTS.csv
│   │   ├── NOTEEVENTS.csv
│   │   ├── OUTPUTEVENTS.csv
│   │   ├── PATIENTS.csv
│   │   ├── PRESCRIPTIONS.csv
│   │   ├── PROCEDUREEVENTS_MV.csv
│   │   ├── PROCEDURES_ICD.csv
│   │   ├── SERVICES.csv
│   │   ├── SHA256SUMS.txt
│   │   ├── TRANSFERS.csv
│   │   └── index.html
│   ├── data-mimiciii
│   │   ├── ADMISSIONS.csv.gz
│   │   ├── CALLOUT.csv.gz
│   │   ├── CAREGIVERS.csv.gz
│   │   ├── CHARTEVENTS.csv.gz
│   │   ├── CPTEVENTS.csv.gz
│   │   ├── DATETIMEEVENTS.csv.gz
│   │   ├── DIAGNOSES_ICD.csv.gz
│   │   ├── DRGCODES.csv.gz
│   │   ├── D_CPT.csv.gz
│   │   ├── D_ICD_DIAGNOSES.csv.gz
│   │   ├── D_ICD_PROCEDURES.csv.gz
│   │   ├── D_ITEMS.csv.gz
│   │   ├── D_LABITEMS.csv.gz
│   │   ├── ICUSTAYS.csv.gz
│   │   ├── INPUTEVENTS_CV.csv.gz
│   │   ├── INPUTEVENTS_MV.csv.gz
│   │   ├── LABEVENTS.csv.gz
│   │   ├── LICENSE.txt
│   │   ├── MICROBIOLOGYEVENTS.csv.gz
│   │   ├── NOTEEVENTS.csv.gz
│   │   ├── OUTPUTEVENTS.csv.gz
│   │   ├── PATIENTS.csv.gz
│   │   ├── PRESCRIPTIONS.csv.gz
│   │   ├── PROCEDUREEVENTS_MV.csv.gz
│   │   ├── PROCEDURES_ICD.csv.gz
│   │   ├── README.md
│   │   ├── SERVICES.csv.gz
│   │   ├── SHA256SUMS.txt
│   │   ├── TRANSFERS.csv.gz
│   │   └── index.html

```
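
For example, one possible way to fetch the demo dataset and unpack the Athena download into that layout (the PhysioNet URL and the zip file name are assumptions, verify them against your own downloads):

```shell
# Hypothetical fetch commands -- check URLs and file names before running.
mkdir -p mimic/data-mimicdemo extras/athena
wget -r -N -c -np -nd -P mimic/data-mimicdemo \
  https://physionet.org/files/mimiciii-demo/1.4/         # demo CSVs (plain, not gzipped)
unzip -d extras/athena extras/athena/athena2023.zip      # vocabulary bundle downloaded from Athena
```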

Then run in the root folder:
```shell
# edit docker/.env to choose either mimiciii or mimicdemo
docker compose -f docker/docker-compose.yml build
docker compose -f docker/docker-compose.yml up
```
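
If you prefer running the stack in the background and following the ETL progress:

```shell
docker compose -f docker/docker-compose.yml up --build -d
docker compose -f docker/docker-compose.yml logs -f etl   # follow the ETL container's output
```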

Building the database should take close to two hours, and you will find the output gzipped CSVs in the `etl/Result`
folder.

```
root
├── etl
│   ├── Result
│   │   ├── OMOP CDM indexes required - PostgreSQL.sql
│   │   ├── OMOP CDM postgresql ddl.txt
│   │   ├── README.md
│   │   ├── analyze.sql
│   │   ├── attribute_definition.csv.gz
│   │   ├── care_site.csv.gz
│   │   ├── cdm_source.csv.gz
│   │   ├── cohort.csv.gz
│   │   ├── cohort_attribute.csv.gz
│   │   ├── cohort_definition.csv.gz
│   │   ├── concept.csv.gz
│   │   ├── concept_ancestor.csv.gz
│   │   ├── concept_class.csv.gz
│   │   ├── concept_relationship.csv.gz
│   │   ├── concept_synonym.csv.gz
│   │   ├── condition_era.csv.gz
│   │   ├── condition_occurrence.csv.gz
│   │   ├── cost.csv.gz
│   │   ├── death.csv.gz
│   │   ├── device_exposure.csv.gz
│   │   ├── domain.csv.gz
│   │   ├── dose_era.csv.gz
│   │   ├── drug_era.csv.gz
│   │   ├── drug_exposure.csv.gz
│   │   ├── drug_strength.csv.gz
│   │   ├── fact_relationship.csv.gz
│   │   ├── import_mimic_omop.sql
│   │   ├── location.csv.gz
│   │   ├── measurement.csv.gz
│   │   ├── mimic-omop-alter.sql
│   │   ├── mimic-omop-disable-trigger.sql
│   │   ├── mimic-omop-enable-trigger.sql
│   │   ├── mimic-omop-primary.sql
│   │   ├── note.csv.gz
│   │   ├── note_nlp.csv.gz
│   │   ├── observation.csv.gz
│   │   ├── observation_period.csv.gz
│   │   ├── omop_cdm_comments.sql
│   │   ├── omop_vocab_load.sql
│   │   ├── payer_plan_period.csv.gz
│   │   ├── person.csv.gz
│   │   ├── procedure_occurrence.csv.gz
│   │   ├── provider.csv.gz
│   │   ├── relationship.csv.gz
│   │   ├── source_to_concept_map.csv.gz
│   │   ├── specimen.csv.gz
│   │   ├── visit_detail.csv.gz
│   │   ├── visit_occurrence.csv.gz
│   │   └── vocabulary.csv.gz
```
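
To sanity-check the load before picking up the exports, you can query the OMOP schema directly in the running container, for instance:

```shell
# quick row-count check against the loaded OMOP schema (the stack must still be up)
docker compose -f docker/docker-compose.yml exec db \
  psql -U postgres -d mimic -c 'SELECT count(*) FROM omop.person;'
```
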
37 changes: 37 additions & 0 deletions docker/docker-compose.yml
@@ -0,0 +1,37 @@
version: '3.9'
services:
db:
build:
context: .
dockerfile: Dockerfile-pg
environment:
- POSTGRES_DB=mimic
- POSTGRES_USER=postgres
- POSTGRES_PASSWORD=mypassword
ports:
- 5432:5432
volumes:
- $PWD/docker/postgres.conf:/etc/postgresql/postgresql.conf
# - $PWD/db-data/:/var/lib/postgresql/data/
healthcheck:
test: ["CMD-SHELL", "psql -U postgres -d mimic -c 'select 1' | grep -q column"]
interval: 5s
command: postgres -c config_file=/etc/postgresql/postgresql.conf

etl:
build:
context: .
dockerfile: Dockerfile
depends_on:
db:
condition: service_healthy
environment:
- DB_HOST=db
- PGPASSWORD=mypassword
- MIMIC_DATA_DIR=/opt/mimic-omop/mimic/data-${MIMIC_SCHEMA}
volumes:
- "$PWD:/opt/mimic-omop/"
env_file:
- .env
entrypoint: /opt/mimic-omop/docker/entrypoint.sh
# command: ["tail", "-f", "/dev/null"]
6 changes: 6 additions & 0 deletions docker/entrypoint.sh
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
set -e # Exit immediately if a command exits with a non-zero status.

cd /opt/mimic-omop/
echo -e "dbname=mimic\nuser=postgres\nhost=db\nport=5432\npassword=$PGPASSWORD" > /opt/mimic-omop/mimic-omop.cfg
make build runetl exporter check
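
For debugging a single step, one option is to bypass the entrypoint and run the Make targets interactively (a sketch; service names come from docker-compose.yml):

```shell
docker compose -f docker/docker-compose.yml run --rm --entrypoint bash etl
# then, inside the container:
#   cd /opt/mimic-omop && make check
```
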
29 changes: 29 additions & 0 deletions docker/postgres.conf
@@ -0,0 +1,29 @@
# auto-generated by https://pgtune.leopard.in.ua/
# DB Version: 16
# OS Type: linux
# DB Type: dw
# Total Memory (RAM): 64 GB
# CPUs num: 8
# Connections num: 20
# Data Storage: ssd

listen_addresses = '*'
max_connections = 20
shared_buffers = 30GB
effective_cache_size = 16GB
maintenance_work_mem = 2GB
checkpoint_completion_target = 0.9
wal_buffers = 16MB
work_mem = 104857kB
#default_statistics_target = 500
#random_page_cost = 1.1
#effective_io_concurrency = 200
#huge_pages = try
min_wal_size = 4GB
max_wal_size = 16GB
#max_worker_processes = 8
#max_parallel_workers_per_gather = 4
#max_parallel_workers = 8
#max_parallel_maintenance_workers = 4
#track_activity_query_size=1048576
autovacuum=off
2 changes: 2 additions & 0 deletions etl/check_etl.sql
@@ -12,6 +12,8 @@
\set ON_ERROR_STOP false
\set QUIET 1

CREATE EXTENSION IF NOT EXISTS pgtap;

\i etl/StandardizedVocabularies/CONCEPT/check_etl.sql
\i etl/StandardizedVocabularies/COHORT_DEFINITION/check_etl.sql
\i etl/StandardizedVocabularies/ATTRIBUTE_DEFINITION/check_etl.sql
26 changes: 26 additions & 0 deletions mimic/build-mimic/analyze.sql
@@ -0,0 +1,26 @@
analyze admissions ;
analyze callout ;
analyze caregivers ;
analyze chartevents ;
analyze cptevents ;
analyze d_cpt ;
analyze d_icd_diagnoses ;
analyze d_icd_procedures ;
analyze d_items ;
analyze d_labitems ;
analyze datetimeevents ;
analyze diagnoses_icd ;
analyze drgcodes ;
analyze icustays ;
analyze inputevents_cv ;
analyze inputevents_mv ;
analyze labevents ;
analyze microbiologyevents ;
analyze noteevents ;
analyze outputevents ;
analyze patients ;
analyze prescriptions ;
analyze procedureevents_mv ;
analyze procedures_icd ;
analyze services ;
analyze transfers ;