Merge pull request #22 from mobiusml/edr/sql_db

Edr/sql db
mobiusml · Dec 15, 2023 · 302e313 · 302e313
2 parents a597569 + 28eb1a9
commit 302e313
Show file tree

Hide file tree

Showing 43 changed files with 2,747 additions and 657 deletions.
diff --git a/.githooks/pre-commit b/.githooks/pre-commit
@@ -2,6 +2,6 @@
 
 set -e  # exit on error
 
-ruff check aana 
-ruff format aana
+poetry run ruff check aana 
+poetry run ruff format aana
 
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -5,10 +5,12 @@
         "editor.formatOnSave": true,
     },
     "python.testing.pytestArgs": [
+        // "--import-mode=importlib",
         "aana"
     ],
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
+    "python.testing.pytestPath": "poetry run pytest",
     "ruff.fixAll": true,
     "ruff.organizeImports": true,
 }
diff --git a/README.md b/README.md
@@ -102,7 +102,7 @@ to look for `/nas` and `/nas2`). You can read more about environment variables f
 
 ## Code Standards
 This project uses Ruff for linting and formatting. If you want to 
-manually run Ruff on the codebase, it's
+manually run Ruff on the codebase, using poetry it's
 
 ```sh
 poetry run ruff check aana
@@ -118,6 +118,7 @@ To run the auto-formatter, it's
 poetry run ruff format aana
 ```
 
+(If you are running code in a non-poetry environment, just leave off `poetry run`.)
 If you want to enable this as a local pre-commit hook, additionally
 run the following:
 
@@ -132,3 +133,27 @@ command is available in your default shell. You can also simply run
 For users of VS Code, the included `settings.json` should ensure
 that Ruff problems appear while you edit, and formatting is applied
 automatically on save.
+
+
+## Databases
+The project uses two databases: a vector database and a traditional SQL database,
+referred to internally as vectorstore and datastore, respectively.
+
+### Vectorstore
+TBD
+
+### Datastore
+The datastore uses SQLAlchemy as an ORM layer and Alembic for migrations. The migrations are run 
+automatically at startup. If changes are made to the SQLAlchemy models, it is necessary to also 
+create an alembic migration that can be run to upgrade the database. 
+The easiest way to do so is as follows:
+
+```bash
+poetry run alembic revision --autogenerate -m "<Short description of changes in sentence form.>"
+```
+
+ORM models referenced in the rest of the code should be imported from `aana.models.db` directly,
+not from the individual model's file, for reasons explained in `aana/models/db/__init__.py`. This also means that 
+if you add a new model class, it should be imported in that `__init__.py` in addition to creating a migration.
+
+Higher level code for interacting with the ORM is available in `aana.repository.data`.
diff --git a/aana/alembic.ini b/aana/alembic.ini
@@ -0,0 +1,116 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = alembic
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
+# for all available tokens
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory.
+prepend_sys_path = .
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python-dateutil library that can be
+# installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to dateutil.tz.gettz()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the
+# "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; This defaults
+# to alembic/versions.  When using multiple version
+# directories, initial revisions must be specified with --version-path.
+# The path separator used here should be the separator specified by "version_path_separator" below.
+# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
+
+# version path separator; As mentioned above, this is the character used to split
+# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
+# Valid values for version_path_separator are:
+#
+# version_path_separator = :
+# version_path_separator = ;
+# version_path_separator = space
+version_path_separator = os  # Use os.pathsep. Default configuration used for new projects.
+
+# set to 'true' to search source files recursively
+# in each "version_locations" directory
+# new in Alembic version 1.10
+# recursive_version_locations = false
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+# sqlalchemy.url = driver://user:pass@localhost/dbname
+
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts.  See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# lint with attempts to fix via the explicit "ruff check" subcommand - use the exec runner, execute a binary
+hooks = ruff
+ruff.type = exec
+ruff.executable = ruff
+ruff.options = check --fix REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/aana/alembic/README b/aana/alembic/README
@@ -0,0 +1 @@
+Generic single-database configuration.
diff --git a/aana/alembic/__init__.py b/aana/alembic/__init__.py
diff --git a/aana/alembic/env.py b/aana/alembic/env.py
@@ -0,0 +1,79 @@
+from logging.config import fileConfig
+
+from alembic import context
+from sqlalchemy import engine_from_config, pool
+
+from aana.configs.db import create_database_engine
+from aana.configs.settings import settings
+from aana.models.db.base import BaseEntity
+
+# The Alembic Config object, which provides access to the
+# values within the .ini file in use.
+config = context.config
+
+# Configure Python logging from the .ini file, when one is in use
+# (nothing is configured if config_file_name is None).
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+# MetaData object passed to context.configure() as target_metadata
+# by both migration modes below. Alembic's template describes this
+# as the hook for 'autogenerate' support; here it is the metadata
+# of the project's BaseEntity (aana.models.db.base).
+
+target_metadata = BaseEntity.metadata
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+    """Emit the migration SQL as script output ('offline' mode).
+
+    The database URL comes from the project's own db config module
+    (``settings.db_config``) rather than from ``alembic.ini``.
+
+    The context is configured with a URL only, not a live connection.
+    """
+    db_url = create_database_engine(settings.db_config).url
+    offline_options = {
+        "url": db_url,
+        "target_metadata": target_metadata,
+        "literal_binds": True,
+        "dialect_opts": {"paramstyle": "named"},
+    }
+    context.configure(**offline_options)
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Apply migrations over a live connection ('online' mode).
+
+    The ini section is used as the engine configuration, with its
+    ``sqlalchemy.url`` overridden by the URL from ``settings.db_config``.
+
+    """
+    ini_options = config.get_section(config.config_ini_section, {})
+    project_engine = create_database_engine(settings.db_config)
+    ini_options["sqlalchemy.url"] = project_engine.url
+    migration_engine = engine_from_config(
+        ini_options, prefix="sqlalchemy.", poolclass=pool.NullPool
+    )
+
+    # Bind the migration context to one open connection, then run
+    # the migrations inside the context's transaction.
+    with migration_engine.connect() as conn:
+        context.configure(connection=conn, target_metadata=target_metadata)
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+
+if not context.is_offline_mode():
+    run_migrations_online()
+else:
+    run_migrations_offline()
diff --git a/aana/alembic/script.py.mako b/aana/alembic/script.py.mako
@@ -0,0 +1,28 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision: str = ${repr(up_revision)}
+down_revision: str | None = ${repr(down_revision)}
+branch_labels: str | Sequence[str] | None = ${repr(branch_labels)}
+depends_on: str | Sequence[str] | None = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+    """Upgrade database to this revision from previous."""
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+    """Downgrade database from this revision to previous."""
+    ${downgrades if downgrades else "pass"}