Skip to content

Commit

Permalink
CTK: Validate MongoDB Table Loader
Browse files Browse the repository at this point in the history
  • Loading branch information
amotl committed Sep 16, 2024
1 parent fe22d57 commit 30a5b06
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 2 deletions.
11 changes: 9 additions & 2 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,20 @@ updates:
schedule:
interval: "daily"

# Frameworks.
# Applications.

- directory: "/framework/apache-superset"
- directory: "/application/apache-superset"
package-ecosystem: "pip"
schedule:
interval: "daily"

- directory: "/application/cratedb-toolkit"
package-ecosystem: "pip"
schedule:
interval: "daily"

# Frameworks.

- directory: "/framework/gradio"
package-ecosystem: "pip"
schedule:
Expand Down
76 changes: 76 additions & 0 deletions .github/workflows/application-cratedb-toolkit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# CI workflow validating the `application/cratedb-toolkit` example
# against a CrateDB service container.
name: CrateDB Toolkit

on:
  pull_request:
    branches: ~
    paths:
      - '.github/workflows/application-cratedb-toolkit.yml'
      - 'application/cratedb-toolkit/**'
      # Path filters are matched against repository-relative paths, which
      # never start with a slash, so the root file is listed without one.
      - 'requirements.txt'
  push:
    branches: [ main ]
    paths:
      - '.github/workflows/application-cratedb-toolkit.yml'
      - 'application/cratedb-toolkit/**'
      - 'requirements.txt'

  # Allow job to be triggered manually.
  workflow_dispatch:

  # Run job each night after CrateDB nightly has been published.
  schedule:
    - cron: '0 3 * * *'

# Cancel in-progress jobs when pushing to the same branch.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}

jobs:

  test:
    name: "
     Python: ${{ matrix.python-version }}
     CrateDB: ${{ matrix.cratedb-version }}
     on ${{ matrix.os }}"
    runs-on: ${{ matrix.os }}

    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-22.04 ]
        # Versions are quoted so they stay strings rather than floats.
        python-version: [ "3.9", "3.12" ]
        cratedb-version: [ 'nightly' ]

    # CrateDB runs as a service container, published on the runner's
    # HTTP (4200) and PostgreSQL wire protocol (5432) ports.
    services:
      cratedb:
        image: crate/crate:${{ matrix.cratedb-version }}
        ports:
          - 4200:4200
          - 5432:5432
        env:
          CRATE_HEAP_SIZE: 4g

    steps:

      - name: Acquire sources
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          architecture: x64
          cache: "pip"
          # Key the pip cache on the root-level files as well as on the
          # dependency files of the application under test, so the cache
          # is refreshed when the application's pins change.
          cache-dependency-path: |
            pyproject.toml
            requirements.txt
            requirements-test.txt
            application/cratedb-toolkit/pyproject.toml
            application/cratedb-toolkit/requirements.txt
            application/cratedb-toolkit/requirements-test.txt

      - name: Install utilities
        run: |
          pip install -r requirements.txt

      - name: Validate application/cratedb-toolkit
        run: |
          ngr test --accept-no-venv application/cratedb-toolkit
12 changes: 12 additions & 0 deletions application/cratedb-toolkit/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Pytest configuration for the tests in this directory.
[tool.pytest.ini_options]
minversion = "2.0"
# Extra command-line options added to every pytest invocation.
# `--capture=no` disables output capturing, so text printed by the tests
# shows up directly in the test run output.
addopts = """
-rfEXs -p pytester --strict-markers --verbosity=3
--capture=no
"""
# Log at DEBUG level, both for captured logs and for live console logging.
log_level = "DEBUG"
log_cli_level = "DEBUG"
# Look for tests in the Python files of this directory.
testpaths = ["*.py"]
# Treat tests marked `xfail` that unexpectedly pass as failures.
xfail_strict = true
# No custom markers are registered; combined with `--strict-markers`,
# using any unregistered marker is an error.
markers = [
]
4 changes: 4 additions & 0 deletions application/cratedb-toolkit/requirements-test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
gitpython<4
platformdirs<5
pytest<9
requests<3
1 change: 1 addition & 0 deletions application/cratedb-toolkit/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cratedb-toolkit[mongodb]==0.0.23
98 changes: 98 additions & 0 deletions application/cratedb-toolkit/test_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import shlex
import sys

import requests
import logging
import platformdirs
from cratedb_toolkit.util import DatabaseAdapter
from git import Repo, RemoteProgress
import subprocess

# Module-level logger, named after this module.
# NOTE(review): not referenced by the code visible in this file.
logger = logging.getLogger(__name__)


class GitProgressPrinter(RemoteProgress):
    """
    Progress handler for Git operations that writes each update to stderr.
    """

    def update(self, op_code, cur_count, max_count=None, message=""):
        """
        Print a single progress record.

        The record consists of the operation code, the current count, the
        maximum count, the ratio of both, and the accompanying message.
        """
        # When no maximum is reported, divide by 100 instead.
        denominator = max_count or 100.0
        ratio = cur_count / denominator

        # Substitute a placeholder for empty messages.
        text = message or "NO MESSAGE"

        print(op_code, cur_count, max_count, ratio, text, file=sys.stderr)


def test_ctk_load_table_mongodb_json():
    """
    Probe importing data from MongoDB Extended JSON files.

    The test connects to CrateDB at `localhost:4200` and runs these steps:

    1. Drop the target tables in the `from-mongodb` schema.
    2. Clone the dataset repository into the user cache directory, unless
       its `datasets` folder is already present there.
    3. Invoke `ctk load table` on all `*.json` files of the dataset.
    4. Verify the list of tables and the number of records per table.

    Raises `subprocess.CalledProcessError` when the `ctk` command exits
    with a non-zero status.
    """

    # Define table cardinalities used in validation step.
    table_cardinalities = {
        "books": 431,
        "city_inspections": 81047,
        "companies": 2537,
        "countries-big": 21640,
        "countries-small": 248,
        "covers": 5071,
        "grades": 280,
        "products": 11,
        "profiles": 1515,
        "restaurant": 2548,
        "students": 200,
    }

    # Define table names used for testing. They are derived from the
    # cardinality map, in the same order, so both cannot drift apart.
    table_names = list(table_cardinalities)

    db = DatabaseAdapter("crate://localhost:4200/?schema=from-mongodb")

    # Drop tables for blank canvas.
    for table_name in table_names:
        db.drop_table(table_name)

    # Define path to source data.
    mongodb_json_files_path = platformdirs.user_cache_path("cratedb-examples") / "mongodb_json_files"
    datasets_path = mongodb_json_files_path / "datasets"

    # Acquire source data.
    if not datasets_path.exists():
        repository_url = "https://github.com/ozlerhakan/mongodb-json-files"
        print(f"Downloading repository: {repository_url}", file=sys.stderr)
        Repo.clone_from(
            url=repository_url,
            to_path=mongodb_json_files_path,
            progress=GitProgressPrinter(),
        )

    # Invoke data transfer.
    # The trailing backslashes join the lines within the string literal;
    # `shlex.split` then turns the text into an argument list.
    command = f"""
    ctk load table \
        "file+bson://{datasets_path}/*.json?batch-size=2500" \
        --cratedb-sqlalchemy-url="crate://localhost:4200/from-mongodb"
    """
    print(f"Invoking CTK: {command}", file=sys.stderr)
    subprocess.check_call(shlex.split(command))

    # Validate data in database.
    # The comparison is order-sensitive: `SHOW TABLES` must return the
    # tables in the same order as `table_names`.
    results = db.run_sql("SHOW TABLES", records=True)
    results = [item["table_name"] for item in results]
    assert results == table_names

    # Compare the record count of each table with its expected cardinality.
    cardinalities = {
        table_name: db.count_records(table_name)
        for table_name in table_cardinalities
    }
    assert cardinalities == table_cardinalities

0 comments on commit 30a5b06

Please sign in to comment.