# 788 alembic (#1033)
### Feature or Bugfix
- Bugfix
- Refactoring

### Detail
1. Alembic migrations are now synchronised with the current code state
2. A `make generate-migrations` command is added. It can be run right after the Docker container with Postgres is started
3. README about migrations

### Relates
[data.all in local environment starts in a broken db state
#788](#788)

### Security
N/A


By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.

---------

Co-authored-by: Sofia Sazonova <[email protected]>
SofiaSazonova and Sofia Sazonova authored Feb 23, 2024
1 parent 3f419a1 commit bd0ac99
Showing 11 changed files with 324 additions and 27 deletions.
6 changes: 6 additions & 0 deletions Makefile
```diff
@@ -85,6 +85,12 @@ upgrade-db: upgrade-pip install-backend
 	export PYTHONPATH=./backend && \
 	alembic -c backend/alembic.ini upgrade head
 
+generate-migrations: upgrade-pip install-backend
+	pip install 'alembic'
+	export PYTHONPATH=./backend && \
+	alembic -c backend/alembic.ini upgrade head
+	alembic -c backend/alembic.ini revision -m "describe_changes_shortly" --autogenerate
+
 clean:
 	@rm -fr cdk_out/
 	@rm -fr dist/
```
3 changes: 3 additions & 0 deletions backend/alembic.ini
```diff
@@ -43,3 +43,6 @@ formatter = generic
 [formatter_generic]
 format = %(levelname)-5.5s [%(name)s] %(message)s
 datefmt = %H:%M:%S
+
+[alembic:exclude]
+tables = user
```
3 changes: 1 addition & 2 deletions backend/dataall/base/db/connection.py
```diff
@@ -122,9 +122,8 @@ def get_engine(envname=ENVNAME):
             'schema': envname,
         }
     else:
-        hostname = 'db' if envname == 'dkrcompose' else 'localhost'
         db_params = {
-            'host': hostname,
+            'host': 'db' if envname == 'dkrcompose' and os.path.exists("/.dockerenv") else 'localhost',
             'db': 'dataall',
             'user': 'postgres',
             'pwd': 'docker',
```
2 changes: 1 addition & 1 deletion backend/dataall/core/environment/db/environment_models.py
```diff
@@ -64,7 +64,7 @@ class EnvironmentParameter(Base):
     __tablename__ = 'environment_parameters'
     environmentUri = Column(String, ForeignKey("environment.environmentUri"), primary_key=True)
     key = Column('paramKey', String, primary_key=True)
-    value = Column('paramValue', String, nullable=True)
+    value = Column('paramValue', String, nullable=False)
 
     def __init__(self, env_uri, key, value):
         super().__init__()
```
2 changes: 1 addition & 1 deletion backend/dataall/modules/datapipelines/db/datapipelines_models.py
```diff
@@ -29,6 +29,6 @@ class DataPipelineEnvironment(Base, Resource):
     pipelineLabel = Column(String, nullable=False)
     stage = Column(String, nullable=False)
     order = Column(Integer, nullable=False)
-    region = Column(String, default='eu-west-1')
+    region = Column(String, default='eu-west-1', nullable=False)
     AwsAccountId = Column(String, nullable=False)
     samlGroupName = Column(String, nullable=False)
```
4 changes: 2 additions & 2 deletions backend/dataall/modules/datasets_base/db/dataset_models.py
```diff
@@ -148,12 +148,12 @@ def uri(cls):
 
 class DatasetBucket(Resource, Base):
     __tablename__ = 'dataset_bucket'
-    datasetUri = Column(String, nullable=False)
+    datasetUri = Column(String, ForeignKey("dataset.datasetUri", ondelete='CASCADE'), nullable=False)
     bucketUri = Column(String, primary_key=True, default=utils.uuid('bucket'))
     AwsAccountId = Column(String, nullable=False)
     S3BucketName = Column(String, nullable=False)
     region = Column(String, default='eu-west-1')
-    partition = Column(String, default='aws')
+    partition = Column(String, default='aws', nullable=False)
     KmsAlias = Column(String, nullable=False)
     imported = Column(Boolean, default=False)
     importedKmsKey = Column(Boolean, default=False)
```
2 changes: 1 addition & 1 deletion backend/dataall/modules/mlstudio/db/mlstudio_models.py
```diff
@@ -11,7 +11,7 @@
 class SagemakerStudioDomain(Resource, Base):
     """Describes ORM model for sagemaker ML Studio domain"""
     __tablename__ = 'sagemaker_studio_domain'
-    environmentUri = Column(String, ForeignKey("environment.environmentUri"))
+    environmentUri = Column(String, ForeignKey("environment.environmentUri"), nullable=False)
     sagemakerStudioUri = Column(
         String, primary_key=True, default=utils.uuid('sagemakerstudio')
     )
```
19 changes: 0 additions & 19 deletions backend/migrations/README

This file was deleted.

121 changes: 121 additions & 0 deletions backend/migrations/README.md
# Managing DB with Alembic locally
In data.all we use [Alembic](https://alembic.sqlalchemy.org/en/latest/), a lightweight database migration tool for usage with the SQLAlchemy Database Toolkit for Python.
Alembic determines the changes to be made by comparing the current state of the database with the desired state specified in your SQLAlchemy models. This process, known as schema diffing, requires an existing database: the migration scripts Alembic generates depend on the actual structure and content of that database.
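For illustration, here is a minimal sketch of what schema diffing produces. The `description` column is hypothetical and only stands in for a model change:

```python
import sqlalchemy as sa
from alembic import op
from sqlalchemy import Column, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Dataset(Base):
    __tablename__ = 'dataset'
    datasetUri = Column(String, primary_key=True)
    description = Column(String, nullable=True)  # hypothetical: new in code, absent in the DB


# `alembic revision --autogenerate` compares the models against the live
# schema and would emit roughly this upgrade step in the new revision file:
def upgrade():
    op.add_column('dataset', sa.Column('description', sa.String(), nullable=True))
```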

To create and test migrations locally, you will need a local database.

## Prerequisites

1. Build and launch Docker containers for the database and GraphQL.
```bash
docker compose build db
docker compose run db
docker compose build graphql
docker compose run graphql
```
These can also be initiated alongside all local testing containers using the following command:
```bash
docker compose up
```
2. Specify the location of database model descriptions. If you are at the project's root, the target folder path is `backend`.
```bash
export PYTHONPATH=backend
```
3. The containers initiated in the first step will default to using the schema named `dkrcompose`. To experiment freely with the database, let's create a new schema `local`. Alembic relies on the environment variable `envname` to determine the schema. Set it to `local` with the following command:
```bash
export envname=local
```
In a real RDS database, `envname` takes the value of the environment (e.g. dev, test).
If you want to apply migrations against the same schema that is used in local data.all deployments, use the schema `dkrcompose`:
```bash
export envname=dkrcompose
```
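To verify which schemas exist in your local database, you can query Postgres directly. The service name `db`, database `dataall`, and user `postgres` below match the local Docker settings visible in `connection.py`; adjust them if your compose setup differs:
```bash
docker compose exec db psql -U postgres -d dataall -c '\dn'
```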

## Managing migrations

When the containers are first created (after `docker compose build` and the first start), the Postgres container has no tables or schemas.

On startup of the GraphQL container, SQLAlchemy's `declarative_base` is used to create all tables via `Base.metadata.create_all(engine.engine)`. **The set of tables created depends on the modules that are enabled in `config.json` (in the root of the project).**
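A minimal sketch of this mechanism; the engine URL and model here are illustrative, not data.all's actual configuration:

```python
from sqlalchemy import Column, String, create_engine
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Example(Base):  # any model registered on Base's metadata
    __tablename__ = 'example'
    uri = Column(String, primary_key=True)


# create_all issues CREATE TABLE for every model known to Base.metadata,
# skipping tables that already exist; no Alembic is involved.
engine = create_engine('postgresql+psycopg2://postgres:docker@localhost/dataall')
Base.metadata.create_all(engine)
```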

As the database is created from scratch, it has no information about the migration state, so first we need to run a database upgrade.
After that, Alembic will be able to generate further migrations locally.

This command applies all migrations and synchronizes the DB state with the local Alembic migration history.
```bash
make upgrade-db
```
or
```bash
alembic -c backend/alembic.ini upgrade head
```

To upgrade the database and generate an Alembic migration during development, use:

```bash
make generate-migrations
```
or
```bash
alembic -c backend/alembic.ini upgrade head
alembic -c backend/alembic.ini revision -m "_describe_changes_shortly" --autogenerate
```
Please replace the auto-generated suffix of the migration filename with a short description of the migration's purpose, and rename the message in the file itself (its first line) to match.
Always review the autogenerated migration file to ensure that the necessary changes are reflected there.
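For reference, an autogenerated revision file looks roughly like the sketch below (the revision IDs and date are made up). The docstring on the first line and the filename suffix are the parts to rename:

```python
"""describe_changes_shortly

Revision ID: 1a2b3c4d5e6f
Revises: 9f8e7d6c5b4a
Create Date: 2024-02-23 10:00:00.000000

"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = '1a2b3c4d5e6f'
down_revision = '9f8e7d6c5b4a'
branch_labels = None
depends_on = None


def upgrade():
    # e.g. a detected nullability change, like paramValue in this PR
    op.alter_column('environment_parameters', 'paramValue', nullable=False)


def downgrade():
    op.alter_column('environment_parameters', 'paramValue', nullable=True)
```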

## What to know about autogenerated migrations

**Autogenerate will detect:**

- Table additions and removals.
- Column additions and removals.
- Change of nullable status on columns.
- Basic changes in indexes and explicitly-named unique constraints.
- Basic changes in foreign key constraints (see the sketch after this list).
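For instance, the foreign key this PR adds to `dataset_bucket` would be detected and rendered roughly as follows (the constraint name is illustrative):

```python
from alembic import op


def upgrade():
    # autogenerate derives the constraint name from the naming scheme
    op.create_foreign_key(
        'fk_dataset_bucket_datasetUri',
        'dataset_bucket', 'dataset',
        ['datasetUri'], ['datasetUri'],
        ondelete='CASCADE',
    )
```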

**Autogenerate can optionally detect:**

- Change of column type. This will occur by default unless the parameter `EnvironmentContext.configure.compare_type` is set to `False`. The default implementation will reliably detect major changes, such as between `Numeric` and `String`, as well as accommodate for the types generated by SQLAlchemy’s “generic” types such as `Boolean`. Arguments that are shared between both types, such as length and precision values, will also be compared. If either the metadata type or database type has additional arguments beyond those of the other type, these are not compared; for example, if one numeric type featured a “scale” and the other type did not, this would be seen as the backing database not supporting the value, or reporting a default that the metadata did not specify.

The type comparison logic is fully extensible as well; see [Comparing Types](https://alembic.sqlalchemy.org/en/latest/autogenerate.html#compare-types) for details.

- Change of server default. This will occur if you set the `EnvironmentContext.configure.compare_server_default` parameter to `True`, or to a custom callable function. This feature works well for simple cases but cannot always produce accurate results. The Postgresql backend will actually invoke the “detected” and “metadata” values against the database to determine equivalence. The feature is off by default so that it can be tested on the target schema first. Like type comparison, it can also be customized by passing a callable; see the function’s documentation for details.
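If you need to tune either behaviour locally, both switches go into the `context.configure()` call in `backend/migrations/env.py`. A sketch under that assumption (data.all currently relies on the defaults):

```python
# inside run_migrations_online() in backend/migrations/env.py
context.configure(
    connection=connection,
    target_metadata=target_metadata,
    compare_type=False,            # opt out of column type comparison
    compare_server_default=True,   # opt in to server default comparison
)
```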

**Autogenerate cannot detect:**

- Changes of table name. These will come out as an add/drop of two different tables, and should be hand-edited into a name change instead.
In this case you should remove the automatically generated scripts and replace them with the following code (e.g. renaming table 'marathon' to 'snickers'):
```python
def upgrade():
    op.rename_table('marathon', 'snickers')
    # don't forget to rename all related entities
    op.execute('ALTER SEQUENCE marathon_id_seq RENAME TO snickers_id_seq')
    op.execute('ALTER INDEX marathon_pkey RENAME TO snickers_pkey')


def downgrade():
    op.rename_table('snickers', 'marathon')
    op.execute('ALTER SEQUENCE snickers_id_seq RENAME TO marathon_id_seq')
    op.execute('ALTER INDEX snickers_pkey RENAME TO marathon_pkey')
```
- Changes of column name. Like table name changes, these are detected as a column add/drop pair, which is not at all the same as a name change.
To keep the data in the column, add this to the `upgrade` function (and don't forget to add the inverse to the `downgrade` function):
```python
with op.batch_alter_table('my_table', schema=None) as batch_op:
    batch_op.alter_column('old_col_name', new_column_name='new_col_name')
```
- Anonymously named constraints. Give your constraints a name, e.g. `UniqueConstraint('col1', 'col2', name="my_name")`. See [The Importance of Naming Constraints](https://alembic.sqlalchemy.org/en/latest/naming.html) for background on how to configure automatic naming schemes for constraints.
- Special SQLAlchemy types such as `Enum` when generated on a backend which doesn’t support ENUM directly. This is because the representation of such a type in the non-supporting database, i.e. a CHAR + CHECK constraint, could be any kind of CHAR + CHECK. For SQLAlchemy to determine that this is actually an ENUM would only be a guess, which is generally a bad idea. To implement your own “guessing” function here, use the `sqlalchemy.events.DDLEvents.column_reflect()` event to detect when a CHAR (or whatever the target type is) is reflected, and change it to an ENUM (or whatever type is desired) if it is known that that’s the intent of the type. The `sqlalchemy.events.DDLEvents.after_parent_attach()` event can be used within the autogenerate process to intercept and un-attach unwanted CHECK constraints.
As an example of handling enums, refer to `ConfidentialityClassification` in migration `97050ec09354_release_3_7_8.py`
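For a generic illustration (table `document`, column `status`, and the enum values are hypothetical; this is not data.all's actual `ConfidentialityClassification` handling), a migration can create the Postgres enum type explicitly before using it:

```python
import sqlalchemy as sa
from alembic import op

# hypothetical enum; the name and values are illustrative only
status_enum = sa.Enum('DRAFT', 'PUBLISHED', name='document_status')


def upgrade():
    # create the Postgres ENUM type explicitly before altering the column
    status_enum.create(op.get_bind(), checkfirst=True)
    op.alter_column(
        'document', 'status',
        type_=status_enum,
        postgresql_using='status::document_status',
    )


def downgrade():
    op.alter_column('document', 'status', type_=sa.String())
    status_enum.drop(op.get_bind(), checkfirst=True)
```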


**Autogenerate can’t currently, but will eventually detect:**
- Some free-standing constraint additions and removals may not be supported, including PRIMARY KEY, EXCLUDE, CHECK; these are not necessarily implemented within the autogenerate detection system and also may not be supported by the supporting SQLAlchemy dialect.
- Sequence additions, removals - not yet implemented.
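Such changes have to be written by hand for now. A minimal sketch, assuming a hypothetical sequence name, of adding and dropping a sequence manually:

```python
import sqlalchemy as sa
from alembic import op


def upgrade():
    op.execute(sa.schema.CreateSequence(sa.Sequence('order_number_seq')))


def downgrade():
    op.execute(sa.schema.DropSequence(sa.Sequence('order_number_seq')))
```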


## Why Alembic didn't add my new models to a migration
For a reason not yet identified, Alembic is 'blind' to some files with model definitions. If your new model is not added to the migration file,
try importing its class explicitly in `backend/migrations/env.py` under the line `# import additional models here`.
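For example, using one of the imports this PR actually adds to `env.py`:

```python
# backend/migrations/env.py

# DO NOT DELETE
# these models are not used directly in env.py, but these imports are important for alembic
# import additional models here
from dataall.modules.feed.db.feed_models import FeedMessage
```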


More details: https://alembic.sqlalchemy.org/en/latest/
26 changes: 25 additions & 1 deletion backend/migrations/env.py
```diff
@@ -1,6 +1,22 @@
 from __future__ import with_statement
 from alembic import context
 from logging.config import fileConfig
+import re
+
+
+# DO NOT DELETE
+# these models are not used directly in env.py, but these imports are important for alembic
+# import additional models here
+
+
+from dataall.modules.catalog.db.glossary_models import GlossaryNode, TermLink
+from dataall.modules.dashboards.db.dashboard_models import DashboardShare, Dashboard
+from dataall.modules.datapipelines import DataPipeline
+from dataall.modules.datapipelines.db.datapipelines_models import DataPipelineEnvironment
+from dataall.modules.feed.db.feed_models import FeedMessage
+from dataall.modules.notifications.db.notification_models import Notification
+from dataall.modules.vote.db.vote_models import Vote
+from dataall.modules.worksheets.db.worksheet_models import WorksheetQueryResult, Worksheet
 
 # this is the Alembic Config object, which provides
 # access to the values within the .ini file in use.
@@ -25,6 +41,13 @@
 # ... etc.
 
 
+exclude_tables = config.get_section('alembic:exclude').get('tables', '').split(',')
+
+
+def include_object(object, name, type_, *args, **kwargs):
+    return not (type_ == 'table' and name in exclude_tables)
+
+
 def run_migrations_offline():
     """Run migrations in 'offline' mode.
@@ -47,6 +70,7 @@ def run_migrations_offline():
         target_metadata=target_metadata,
         version_table_schema=ENVNAME,
         literal_binds=True,
+        include_object=include_object
     )
 
     with context.begin_transaction():
@@ -62,7 +86,7 @@ def run_migrations_online():
     """
 
     with get_engine(ENVNAME).engine.connect() as connection:
-        context.configure(connection=connection, target_metadata=target_metadata)
+        context.configure(connection=connection, target_metadata=target_metadata, include_object=include_object)
 
         with context.begin_transaction():
            context.run_migrations()
```