From 509d292110f99729b64acb5fbb899b09c52c6da0 Mon Sep 17 00:00:00 2001 From: Viacheslav Pivovarov Date: Fri, 13 Dec 2024 19:40:57 +0400 Subject: [PATCH] [synthesized-io/tdk#5375] Remove workaraounds for Oracle database (#109) --- .github/workflows/test_oracle.yml | 46 ++++++++ oracle/README.md | 9 +- oracle/config.tdk.yaml | 109 ------------------ oracle/create_flag.sh | 3 + oracle/create_user.sql | 1 + oracle/docker-compose.yaml | 42 ++++--- oracle/ora2pg.conf | 2 +- oracle/soda/checks_for_input_db.yaml | 47 ++++++++ oracle/soda/checks_for_masking.yaml | 47 ++++++++ oracle/soda/configuration.yaml | 11 ++ oracle/soda/film_rating_distribution.yml | 16 +++ .../soda/masked_film_rating_distribution.yml | 16 +++ 12 files changed, 215 insertions(+), 134 deletions(-) create mode 100644 .github/workflows/test_oracle.yml create mode 100644 oracle/create_flag.sh create mode 100644 oracle/soda/checks_for_input_db.yaml create mode 100644 oracle/soda/checks_for_masking.yaml create mode 100644 oracle/soda/configuration.yaml create mode 100644 oracle/soda/film_rating_distribution.yml create mode 100644 oracle/soda/masked_film_rating_distribution.yml diff --git a/.github/workflows/test_oracle.yml b/.github/workflows/test_oracle.yml new file mode 100644 index 0000000..fceecaa --- /dev/null +++ b/.github/workflows/test_oracle.yml @@ -0,0 +1,46 @@ +name: test_oracle + +on: + push: + branches: [ 'main' ] + pull_request: + branches: [ '*' ] + +defaults: + run: + working-directory: ./oracle + +jobs: + + masking: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Build docker compose + run: | + docker compose pull + docker compose build + + - name: Run databases + continue-on-error: true + run: | + docker compose run databases + + - name: Run TDK + env: + SYNTHESIZED_KEY: ${{ secrets.SYNTHESIZED_LICENSE_KEY }} + run: | + docker compose run tdk + + - name: Verify input data + run: > + docker compose run check scan -d input_db + -c /sodacl/configuration.yaml + /sodacl/checks_for_input_db.yaml + + - name: Verify output data + run: > + docker compose run check scan -d output_db + -c /sodacl/configuration.yaml + /sodacl/checks_for_masking.yaml diff --git a/oracle/README.md b/oracle/README.md index 562f564..afaad0e 100644 --- a/oracle/README.md +++ b/oracle/README.md @@ -20,7 +20,7 @@ export SYNTHESIZED_KEY=kDpeQB... Spin up Oracle databases: ```shell -docker compose up oracle_source oracle_target -d +docker compose run databases ``` Run TDK: @@ -39,10 +39,3 @@ Run the PostgreSQL database: ```shell docker compose up postgres_masked -d ``` - - -## Knowing problems - -- get rid of `last_update` columns in the cofiguration file -- truncate shcema with recursive FKs -- mask the `film.description` column diff --git a/oracle/config.tdk.yaml b/oracle/config.tdk.yaml index 66f464f..f714cac 100644 --- a/oracle/config.tdk.yaml +++ b/oracle/config.tdk.yaml @@ -1,115 +1,6 @@ default_config: mode: MASKING -tables: - - table_name_with_schema: "TEST.CITY" - transformations: - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - - table_name_with_schema: "TEST.CUSTOMER" - transformations: - - columns: [ "CREATE_DATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - - table_name_with_schema: "TEST.FILM" - transformations: - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - - table_name_with_schema: "TEST.FILM_ACTOR" - transformations: - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - - table_name_with_schema: "TEST.ACTOR" - transformations: - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - - table_name_with_schema: "TEST.FILM_CATEGORY" - transformations: - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - - table_name_with_schema: "TEST.INVENTORY" - transformations: - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - - table_name_with_schema: "TEST.PAYMENT" - transformations: - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - - table_name_with_schema: "TEST.RENTAL" - transformations: - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - - table_name_with_schema: "TEST.ADDRESS" - transformations: - - columns: [ "LAST_UPDATE" ] - params: - type: "date_generator" - mean: 2022-03-01T12:00:00Z - std: 7776000000 - min: 2022-01-01T12:00:00Z - max: 2022-07-01T12:00:00Z - - cycle_resolution_strategy: DEFER_FOREIGN_KEY table_truncation_mode: TRUNCATE schema_creation_mode: CREATE_IF_NOT_EXISTS diff --git a/oracle/create_flag.sh b/oracle/create_flag.sh new file mode 100644 index 0000000..d56a574 --- /dev/null +++ b/oracle/create_flag.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +touch /opt/oracle/flag diff --git a/oracle/create_user.sql b/oracle/create_user.sql index 5d36bd1..28af92c 100644 --- a/oracle/create_user.sql +++ b/oracle/create_user.sql @@ -1,2 +1,3 @@ CREATE USER test IDENTIFIED BY test; GRANT ALL PRIVILEGES TO test; +GRANT SELECT ON V_$DATABASE TO test; diff --git a/oracle/docker-compose.yaml b/oracle/docker-compose.yaml index e9788d6..0c51dc4 100644 --- a/oracle/docker-compose.yaml +++ b/oracle/docker-compose.yaml @@ -2,9 +2,9 @@ version: '3' services: - oracle_source: + input_db: image: gvenzl/oracle-xe - container_name: oracle_source + container_name: input_db networks: - simple-network ports: @@ -15,16 +15,17 @@ services: - ./create_user.sql:/container-entrypoint-initdb.d/1.sql - ./sakila-schema.sql:/container-entrypoint-initdb.d/2.sql - ./oracle-sakila-insert-data.sql:/container-entrypoint-initdb.d/3.sql + - ./create_flag.sh:/container-entrypoint-initdb.d/4.sh healthcheck: - test: "${ORACLE_BASE}/healthcheck.sh" - interval: "3s" - timeout: "3s" - start_period: "3s" - retries: 30 + test: ["CMD", "sh", "-c", "if [ -e /opt/oracle/flag ]; then exit 0; else exit 1; fi"] + interval: "6s" + timeout: "6s" + start_period: "6s" + retries: 60 - oracle_target: + output_db: image: gvenzl/oracle-xe - container_name: oracle_target + container_name: output_db networks: - simple-network ports: @@ -34,7 +35,7 @@ services: volumes: - ./create_user.sql:/container-entrypoint-initdb.d/1.sql healthcheck: - test: "${ORACLE_BASE}/healthcheck.sh" + test: "/opt/oracle/healthcheck.sh" interval: "3s" timeout: "3s" start_period: "3s" @@ -42,14 +43,14 @@ services: databases: container_name: databases - image: docker/whalesay + image: synthesizedio/whalesay command: > cowsay "Both databases have been started and are ready for TDK exercises!" depends_on: - oracle_source: + input_db: condition: service_healthy - oracle_target: + output_db: condition: service_healthy @@ -60,10 +61,10 @@ services: networks: - simple-network environment: - SYNTHESIZED_INPUT_URL: jdbc:oracle:thin:@oracle_source:1521:xe + SYNTHESIZED_INPUT_URL: jdbc:oracle:thin:@input_db:1521:xe SYNTHESIZED_INPUT_USERNAME: test SYNTHESIZED_INPUT_PASSWORD: test - SYNTHESIZED_OUTPUT_URL: jdbc:oracle:thin:@oracle_target:1521:xe + SYNTHESIZED_OUTPUT_URL: jdbc:oracle:thin:@output_db:1521:xe SYNTHESIZED_OUTPUT_USERNAME: test SYNTHESIZED_OUTPUT_PASSWORD: test SYNTHESIZED_USERCONFIG_FILE: /app/config.yaml @@ -82,7 +83,9 @@ services: volumes: - ./config.tdk.yaml:/app/config.yaml - ./banner.txt:/app/banner.txt - + depends_on: + databases: + condition: service_completed_successfully ora2pg: image: georgmoser/ora2pg @@ -111,6 +114,13 @@ services: - ./postgres_output_data/COPY_TEST_output.sql:/docker-entrypoint-initdb.d/4.sql ports: [ "5432:5432" ] + check: + container_name: check + image: sodadata/soda-core + networks: + - simple-network + volumes: + - ./soda:/sodacl networks: simple-network: diff --git a/oracle/ora2pg.conf b/oracle/ora2pg.conf index 4d5f993..2d3ce5f 100644 --- a/oracle/ora2pg.conf +++ b/oracle/ora2pg.conf @@ -1,6 +1,6 @@ # PG_VERSION 11 -ORACLE_DSN dbi:Oracle:host=oracle_target;sid=xe;port=1521 +ORACLE_DSN dbi:Oracle:host=input_db;sid=xe;port=1521 ORACLE_USER test ORACLE_PWD test diff --git a/oracle/soda/checks_for_input_db.yaml b/oracle/soda/checks_for_input_db.yaml new file mode 100644 index 0000000..42a22b9 --- /dev/null +++ b/oracle/soda/checks_for_input_db.yaml @@ -0,0 +1,47 @@ +checks for actor: + - row_count = 200 + +checks for address: + - row_count = 603 + +checks for category: + - row_count = 16 + +checks for city: + - row_count = 600 + +checks for country: + - row_count = 109 + +checks for customer: + - row_count = 599 + +checks for film: + - row_count = 1000 +# - distribution_difference(rating) < 0.01: +# method: chi_square +# distribution reference file: /sodacl/film_rating_distribution.yml + +checks for film_actor: + - row_count = 5462 + +checks for film_category: + - row_count = 1000 + +checks for inventory: + - row_count = 4581 + +checks for language: + - row_count = 6 + +checks for payment: + - row_count = 16049 + +checks for rental: + - row_count = 16044 + +checks for staff: + - row_count = 2 + +checks for store: + - row_count = 2 diff --git a/oracle/soda/checks_for_masking.yaml b/oracle/soda/checks_for_masking.yaml new file mode 100644 index 0000000..ae8f357 --- /dev/null +++ b/oracle/soda/checks_for_masking.yaml @@ -0,0 +1,47 @@ +checks for staff: + - row_count same as staff in input_db + +checks for film: + - row_count same as film in input_db + - distribution_difference(rating) = 0.0: + method: swd + distribution reference file: /sodacl/masked_film_rating_distribution.yml + +checks for actor: + - row_count same as actor in input_db + +checks for address: + - row_count same as address in input_db + +checks for category: + - row_count same as category in input_db + +checks for city: + - row_count same as city in input_db + +checks for country: + - row_count same as country in input_db + +checks for customer: + - row_count same as customer in input_db + +checks for film_actor: + - row_count same as film_actor in input_db + +checks for film_category: + - row_count same as film_category in input_db + +checks for inventory: + - row_count same as inventory in input_db + +checks for language: + - row_count same as language in input_db + +checks for payment: + - row_count same as payment in input_db + +checks for rental: + - row_count same as rental in input_db + +checks for store: + - row_count same as store in input_db diff --git a/oracle/soda/configuration.yaml b/oracle/soda/configuration.yaml new file mode 100644 index 0000000..c0e4bc3 --- /dev/null +++ b/oracle/soda/configuration.yaml @@ -0,0 +1,11 @@ +data_source input_db: + type: oracle + username: test + password: test + connectstring: input_db:1521/XE + +data_source output_db: + type: oracle + username: test + password: test + connectstring: output_db:1521/XE diff --git a/oracle/soda/film_rating_distribution.yml b/oracle/soda/film_rating_distribution.yml new file mode 100644 index 0000000..2f1885b --- /dev/null +++ b/oracle/soda/film_rating_distribution.yml @@ -0,0 +1,16 @@ +dataset: film +column: rating +distribution_type: categorical +distribution_reference: + weights: + - 0.223 + - 0.21 + - 0.195 + - 0.194 + - 0.178 + bins: + - PG-13 + - NC-17 + - R + - PG + - G diff --git a/oracle/soda/masked_film_rating_distribution.yml b/oracle/soda/masked_film_rating_distribution.yml new file mode 100644 index 0000000..d0f3719 --- /dev/null +++ b/oracle/soda/masked_film_rating_distribution.yml @@ -0,0 +1,16 @@ +dataset: film +column: rating +distribution_type: categorical +distribution_reference: + weights: + - 0.223 + - 0.21 + - 0.195 + - 0.194 + - 0.178 + bins: + - MR-08 + - HB-85 + - E + - MR + - N