Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🏗️ Integrate lnschema-core into lamindb #921

Merged
merged 32 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
1560475
🚚 Prepare migration away from lnschema-core
falexwolf Dec 31, 2024
25c4f71
🚚 Rename lnschema_core to lamindb
falexwolf Dec 31, 2024
ee11da1
💚 Fix
falexwolf Dec 31, 2024
acb66ae
♻️ Install bionty from correct branch
falexwolf Dec 31, 2024
522d6ed
💚 Fix installation
falexwolf Dec 31, 2024
63a932f
💚 Fixes
falexwolf Dec 31, 2024
45a33cc
💚 More fixes
falexwolf Dec 31, 2024
f591899
💚 Fix
falexwolf Dec 31, 2024
b75ac9b
💚 Start to migrate existing instances
falexwolf Dec 31, 2024
d83b451
💚 Fix
falexwolf Dec 31, 2024
da81321
💚 Fix
falexwolf Jan 1, 2025
3b03292
💚 Fix
falexwolf Jan 1, 2025
c0b8bc6
💚 Replace laminlabs/lamindata with laminlabs/lamin-site-assets
falexwolf Jan 1, 2025
8108c6d
♻️ Add some logging
falexwolf Jan 1, 2025
b5fd14d
♻️ Organize migration process
falexwolf Jan 1, 2025
e552f44
💚 Fix
falexwolf Jan 1, 2025
e1a1db2
♻️ Leave core within schema serialization because the hub hardcodes s…
falexwolf Jan 1, 2025
b4a71cb
💚 Fix for other schema modules
falexwolf Jan 1, 2025
c08052d
💚 Fixes
falexwolf Jan 1, 2025
f030859
💚 Fix
falexwolf Jan 1, 2025
a8a55b4
♻️ More consistency
falexwolf Jan 1, 2025
8975b5f
💚 More fixes
falexwolf Jan 1, 2025
c3f9f38
💚 Try fixing lamindb install
falexwolf Jan 1, 2025
79c0c3d
💚 Fix noxfile
falexwolf Jan 1, 2025
8ea566b
💚 Remove print statement
falexwolf Jan 1, 2025
1738ba4
💚 Fix install
falexwolf Jan 1, 2025
15d3791
🔥 Remove lnschema-core as a dependency
falexwolf Jan 1, 2025
8761094
💚 Fix
falexwolf Jan 1, 2025
238bb46
♻️ Refactor lamindb
falexwolf Jan 1, 2025
9b65980
💚 Fix import
falexwolf Jan 2, 2025
895b4b2
🔊 More logging
falexwolf Jan 2, 2025
a96cc7c
💚 Fix
falexwolf Jan 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/hub-cloud/01-init-local-instance.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"outputs": [],
"source": [
"from pathlib import Path\n",
"from lnschema_core.models import Storage\n",
"from lamindb.models import Storage\n",
"\n",
"assert ln_setup.settings.instance.storage.type_is_cloud == False\n",
"assert ln_setup.settings.instance.owner == ln_setup.settings.user.handle\n",
Expand Down
2 changes: 1 addition & 1 deletion docs/hub-cloud/03-add-managed-storage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@
"from laminhub_rest.core.instance.collaborator import InstanceCollaboratorHandler\n",
"from laminhub_rest.core.account.user import UserAccountHandler\n",
"from lamindb_setup.core._hub_client import connect_hub_with_auth\n",
"from lnschema_core.models import User\n",
"from lamindb.models import User\n",
"\n",
"admin_hub = connect_hub_with_auth()\n",
"testuser2 = UserAccountHandler(admin_hub).get_by_handle(\"testuser2\")\n",
Expand Down
2 changes: 1 addition & 1 deletion docs/hub-cloud/08-test-multi-session.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
"metadata": {},
"outputs": [],
"source": [
"from lnschema_core.models import User"
"from lamindb.models import User"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion docs/hub-prod/test-connect-anonymously.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"metadata": {},
"outputs": [],
"source": [
"ln_setup.connect(\"laminlabs/lamindata\")"
"ln_setup.connect(\"laminlabs/lamin-site-assets\")"
]
},
{
Expand Down
31 changes: 15 additions & 16 deletions lamindb_setup/_check_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class InstanceNotSetupError(DefaultMessageException):


CURRENT_ISETTINGS: InstanceSettings | None = None
IS_LOADING: bool = False


def _get_current_instance_settings() -> InstanceSettings | None:
Expand Down Expand Up @@ -54,16 +55,11 @@ def _get_current_instance_settings() -> InstanceSettings | None:

# we make this a private function because in all the places it's used,
# users should not see it
def _check_instance_setup(
from_lamindb: bool = False, from_module: str | None = None
) -> bool:
reload_module = from_lamindb or from_module is not None
from ._init_instance import get_schema_module_name, reload_schema_modules

def _check_instance_setup(from_module: str | None = None) -> bool:
if django.IS_SETUP:
# reload logic here because module might not yet have been imported
# upon first setup
if from_module is not None:
if from_module is not None and from_module != "lamindb":
il.reload(il.import_module(from_module))
return True
silence_loggers()
Expand All @@ -75,18 +71,21 @@ def _check_instance_setup(
return True
isettings = _get_current_instance_settings()
if isettings is not None:
if reload_module and settings.auto_connect:
if not django.IS_SETUP:
if (
from_module is not None
and settings.auto_connect
and not django.IS_SETUP
and not IS_LOADING
):
if not from_module == "lamindb":
import lamindb

il.reload(il.import_module(from_module))
else:
django.setup_django(isettings)
if from_module is not None:
# this only reloads `from_module`
il.reload(il.import_module(from_module))
else:
# this bulk reloads all schema modules
reload_schema_modules(isettings)
logger.important(f"connected lamindb: {isettings.slug}")
return django.IS_SETUP
else:
if reload_module and settings.auto_connect:
if from_module is not None and settings.auto_connect:
logger.warning(InstanceNotSetupError.default_message)
return False
65 changes: 32 additions & 33 deletions lamindb_setup/_connect_instance.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import importlib
import os
import sys
from typing import TYPE_CHECKING
from uuid import UUID

Expand Down Expand Up @@ -257,6 +257,14 @@
if _test:
return None
silence_loggers()
# migrate away from lnschema-core
no_lnschema_core_file = (
settings_dir / f"no_lnschema_core-{isettings.slug.replace('/', '--')}"
)
if not no_lnschema_core_file.exists():
migrate_lnschema_core(
isettings, no_lnschema_core_file, write_file=_write_settings
)
check, msg = isettings._load_db()
if not check:
local_db = (
Expand Down Expand Up @@ -292,21 +300,13 @@
# except ProgrammingError:
# pass
load_from_isettings(isettings, user=_user, write_settings=_write_settings)
importlib.reload(importlib.import_module("lamindb"))
except Exception as e:
if isettings is not None:
if _write_settings:
isettings._get_settings_file().unlink(missing_ok=True) # type: ignore
settings._instance_settings = None
raise e
# rename lnschema_bionty to bionty for sql tables
if "bionty" in isettings.schema:
no_lnschema_bionty_file = (
settings_dir / f"no_lnschema_bionty-{isettings.slug.replace('/', '')}"
)
if not no_lnschema_bionty_file.exists():
migrate_lnschema_bionty(
isettings, no_lnschema_bionty_file, write_file=_write_settings
)
return None


Expand All @@ -322,13 +322,10 @@
return result


def migrate_lnschema_bionty(
isettings: InstanceSettings, no_lnschema_bionty_file: Path, write_file: bool = True
def migrate_lnschema_core(
isettings: InstanceSettings, no_lnschema_core_file: Path, write_file: bool = True
):
"""Migrate lnschema_bionty tables to bionty tables if bionty_source doesn't exist.

:param db_uri: str, database URI (e.g., 'sqlite:///path/to/db.sqlite' or 'postgresql://user:password@host:port/dbname')
"""
"""Migrate lnschema_core tables to lamindb tables."""
from urllib.parse import urlparse

parsed_uri = urlparse(isettings.db)
Expand All @@ -348,60 +345,62 @@
cur = conn.cursor()

try:
# check if bionty_source table exists
if db_type == "sqlite":
cur.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='bionty_source'"
"SELECT name FROM sqlite_master WHERE type='table' AND name='lamindb_user'"
)
migrated = cur.fetchone() is not None

# tables that need to be renamed
cur.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'lnschema_bionty_%'"
"SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'lnschema_core_%'"
)
tables_to_rename = [
row[0][len("lnschema_bionty_") :] for row in cur.fetchall()
row[0][len("lnschema_core_") :] for row in cur.fetchall()
]
else: # postgres
cur.execute(
"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'bionty_source')"
"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'lamindb_user')"
)
migrated = cur.fetchone()[0]

# tables that need to be renamed
cur.execute(
"SELECT table_name FROM information_schema.tables WHERE table_name LIKE 'lnschema_bionty_%'"
"SELECT table_name FROM information_schema.tables WHERE table_name LIKE 'lnschema_core_%'"
)
tables_to_rename = [
row[0][len("lnschema_bionty_") :] for row in cur.fetchall()
row[0][len("lnschema_core_") :] for row in cur.fetchall()
]

if migrated:
if write_file:
no_lnschema_bionty_file.touch(exist_ok=True)
no_lnschema_core_file.touch(exist_ok=True)
else:
try:
# rename tables only if bionty_source doesn't exist and there are tables to rename
response = input(

Check warning on line 380 in lamindb_setup/_connect_instance.py

View check run for this annotation

Codecov / codecov/patch

lamindb_setup/_connect_instance.py#L380

Added line #L380 was not covered by tests
f"Do you want to migrate to lamindb 0.78 (integrate lnschema_core into lamindb)? (y/n) -- Will rename {tables_to_rename}"
)
if response != "y":
print("Aborted.")
quit()

Check warning on line 385 in lamindb_setup/_connect_instance.py

View check run for this annotation

Codecov / codecov/patch

lamindb_setup/_connect_instance.py#L383-L385

Added lines #L383 - L385 were not covered by tests
for table in tables_to_rename:
if db_type == "sqlite":
cur.execute(
f"ALTER TABLE lnschema_bionty_{table} RENAME TO bionty_{table}"
f"ALTER TABLE lnschema_core_{table} RENAME TO lamindb_{table}"
)
else: # postgres
cur.execute(
f"ALTER TABLE lnschema_bionty_{table} RENAME TO bionty_{table};"
f"ALTER TABLE lnschema_core_{table} RENAME TO lamindb_{table};"
)

# update django_migrations table
cur.execute(
"UPDATE django_migrations SET app = 'bionty' WHERE app = 'lnschema_bionty'"
"UPDATE django_migrations SET app = 'lamindb' WHERE app = 'lnschema_core'"
)

logger.warning(
"Please uninstall lnschema-bionty via `pip uninstall lnschema-bionty`!"
print(

Check warning on line 399 in lamindb_setup/_connect_instance.py

View check run for this annotation

Codecov / codecov/patch

lamindb_setup/_connect_instance.py#L399

Added line #L399 was not covered by tests
"Renaming tables finished.\nNow, *please* call: lamin migrate deploy"
)
if write_file:
no_lnschema_bionty_file.touch(exist_ok=True)
no_lnschema_core_file.touch(exist_ok=True)

Check warning on line 403 in lamindb_setup/_connect_instance.py

View check run for this annotation

Codecov / codecov/patch

lamindb_setup/_connect_instance.py#L403

Added line #L403 was not covered by tests
except Exception:
# read-only users can't rename tables
pass
Expand Down
12 changes: 6 additions & 6 deletions lamindb_setup/_django.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,24 @@


def django(command: str, package_name: str | None = None, **kwargs):
r"""Manage migrations.
r"""Call Django commands.

Examples:

Reset auto-incrementing primary integer ids after a database import:

>>> import lamindb as ln
>>> ln.setup.django("sqlsequencereset", "lnschema_core")
>>> ln.setup.django("sqlsequencereset", "lamindb")
BEGIN;
SELECT setval(pg_get_serial_sequence('"lnschema_core_user"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "lnschema_core_user"; # noqa
SELECT setval(pg_get_serial_sequence('"lnschema_core_storage"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "lnschema_core_storage"; # noqa
SELECT setval(pg_get_serial_sequence('"lamindb_user"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "lamindb_user"; # noqa
SELECT setval(pg_get_serial_sequence('"lamindb_storage"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "lamindb_storage"; # noqa
COMMIT;

You can then run the SQL output that you'll see like so:

>>> sql = \"\"\"BEGIN;
SELECT setval(pg_get_serial_sequence('"lnschema_core_user"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "lnschema_core_user"; # noqa
SELECT setval(pg_get_serial_sequence('"lnschema_core_storage"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "lnschema_core_storage"; # noqa
SELECT setval(pg_get_serial_sequence('"lamindb_user"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "lamindb_user"; # noqa
SELECT setval(pg_get_serial_sequence('"lamindb_storage"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "lamindb_storage"; # noqa
COMMIT;\"\"\"
>>> from django.db import connection
>>> with connection.cursor() as cursor:
Expand Down
2 changes: 1 addition & 1 deletion lamindb_setup/_exportdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path

MODELS = {
"core": {
"lamindb": {
"Collection": False,
"Artifact": False,
"Transform": False,
Expand Down
50 changes: 12 additions & 38 deletions lamindb_setup/_init_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import importlib
import os
import sys
import uuid
from typing import TYPE_CHECKING, Literal
from uuid import UUID
Expand All @@ -29,6 +28,8 @@
def get_schema_module_name(schema_name, raise_import_error: bool = True) -> str | None:
import importlib.util

if schema_name == "core":
return "lamindb"
name_attempts = [f"lnschema_{schema_name.replace('-', '_')}", schema_name]
for name in name_attempts:
module_spec = importlib.util.find_spec(name)
Expand All @@ -42,8 +43,8 @@


def register_storage_in_instance(ssettings: StorageSettings):
from lnschema_core.models import Storage
from lnschema_core.users import current_user_id
from lamindb.base.users import current_user_id
from lamindb.models import Storage
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am afraid this is much slower, no?

Copy link
Member Author

@falexwolf falexwolf Jan 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would it be much slower?

(1) Do you mean because of a few additional dependencies that lamindb has over lamindb_setup? If that's the reason, I'm pretty sure it should be negligible. If it's not negligible we ought to think through optimizing import time on lamindb.

(2) If something else: 🤔 -- I can only speculate that you have the auto-connect behavior in mind. Here, too, there is no additional slowness, because you need to have Django set up (connection established, if you will) in the case of lnschema_core as well.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean 1), yes. lamindb is much much heavier in terms of imports, and I don’t really think it can be improved. We definitely need to test the import times. In my opinion significant degradation of performance is not justified by more convenient code organization (and I am not even sure about convenience here). lamin-cli is already pretty slow and we shouldn’t make it even slower I think.

In summary I think we need to check the performance carefully before going forward with this.

Copy link
Member Author

@falexwolf falexwolf Jan 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though I don't believe it, let's assume that lamindb is so much heavier that it leads to a noteworthy degradation of import times; but even if noteworthy, it's hard to believe that they'd ever degrade by more than 300 ms.

Now consider that `from lamindb.models import Storage` and all similar statements are dynamic imports in lamindb-setup. They have to be, because Django needs to be set up before they can run through.

All instances in which the dynamic import is being run are cases in which run time is a few seconds. The worst case scenario of 300ms then isn't nice, but is also no big deal.

I've played with this for the past few hours and I don't notice any slowdown. If anything, I notice that several parts of the UX are faster (because I disentangled some recursive imports).

more convenient code organization (and I am not even sure about convenience here)

This is not at all about convenience. It's about:

  1. achieving a mono repo
  2. https://github.com/laminlabs/pfizer-lamin-usage/issues/75
  3. no longer depending on an external Django (https://laminlabs.slack.com/archives/C04FPE8V01W/p1735587801694429)

Points 2. and 3. are enabled through 1. -- I imagine that 1. will enable more fundamental code organization improvements like 2. and 3. beyond easing operational complications.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not that optimistic about performance. I know that it is fine on Linux (and probably macOS), but on Windows it is already inconveniently slow and will probably be even slower with these changes. On Windows it looks like additional imports make a difference.

Re part 2, I don’t entirely get why 1 and 2 are really needed, but we can discuss of course.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

because I disentangled some recursive imports

Maybe this will make a positive difference on Windows as well.

Copy link
Member

@Koncopd Koncopd Jan 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are also systems with pretty slow module imports, like the Helmholtz compute cluster; we need to keep them in mind as well.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not that optimistic about performance. I know that it is fine on Linux (and probably macOS), but on windows it is already inconveniently slow and probably will be even slower with these changes. On windows it looks like additional imports make a difference.

Interesting, I wasn't aware of this at all. What is inconveniently slow? Can you make an issue on the lamindb repo and we discuss there?

Re part 2, I don’t entirely get why 1 and 2 are really needed, but we can discuss of course.

Point 1 is a big discussion, which we should rather have in person. Point 2 is what Andreas wants, and I agree with him, in particular because it'll enable even deeper improvements like point 3.

maybe this will make a positive difference on windows also.

I hope so!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, I wasn't aware of this at all. What is inconveniently slow? Can you make an issue on the lamindb repo and we discuss there?

Actually, I am pretty sure we discussed this once or twice but never did anything. I will make an issue with measured loading times vs. Linux. Maybe also on the Helmholtz cluster (I believe they have a pretty standard setup for compute clusters).

Copy link
Member

@Koncopd Koncopd Jan 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Btw, import times on the Helmholtz cluster are pretty abysmal at times.


from .core.hashing import hash_and_encode_as_b62

Expand Down Expand Up @@ -71,7 +72,7 @@


def register_user(usettings):
from lnschema_core.models import User
from lamindb.models import User

try:
# need to have try except because of integer primary key migration
Expand Down Expand Up @@ -99,35 +100,6 @@
logger.warning(f"instance seems not set up ({error})")


def reload_schema_modules(isettings: InstanceSettings, include_core: bool = True):
schema_names = ["core"] if include_core else []
# schema_names += list(isettings.schema)
schema_module_names = [get_schema_module_name(n) for n in schema_names]

for schema_module_name in schema_module_names:
if schema_module_name in sys.modules:
schema_module = importlib.import_module(schema_module_name)
importlib.reload(schema_module)


def reload_lamindb_itself(isettings) -> bool:
reloaded = False
if "lamindb" in sys.modules:
import lamindb

importlib.reload(lamindb)
reloaded = True
return reloaded


def reload_lamindb(isettings: InstanceSettings):
log_message = settings.auto_connect
if not reload_lamindb_itself(isettings):
log_message = True
if log_message:
logger.important(f"connected lamindb: {isettings.slug}")


ERROR_SQLITE_CACHE = """
Your cached local SQLite file exists, while your cloud SQLite file ({}) doesn't.
Either delete your cache ({}) or add it back to the cloud (if delete was accidental).
Expand Down Expand Up @@ -329,6 +301,8 @@
update_schema_in_hub(access_token=access_token)
if _write_settings:
settings.auto_connect = True
importlib.reload(importlib.import_module("lamindb"))
logger.important(f"initialized lamindb: {isettings.slug}")
except Exception as e:
from ._delete import delete_by_isettings
from .core._hub_core import delete_instance_record, delete_storage_record
Expand All @@ -338,16 +312,17 @@
delete_by_isettings(isettings)
else:
settings._instance_settings = None
if (
user_handle != "anonymous" or access_token is not None
) and isettings.is_on_hub:
delete_instance_record(isettings._id, access_token=access_token)
if (
ssettings is not None
and (user_handle != "anonymous" or access_token is not None)
and ssettings.is_on_hub
):
delete_storage_record(ssettings._uuid, access_token=access_token) # type: ignore
if isettings is not None:
if (
user_handle != "anonymous" or access_token is not None
) and isettings.is_on_hub:
delete_instance_record(isettings._id, access_token=access_token)

Check warning on line 325 in lamindb_setup/_init_instance.py

View check run for this annotation

Codecov / codecov/patch

lamindb_setup/_init_instance.py#L325

Added line #L325 was not covered by tests
raise e
return None

Expand Down Expand Up @@ -378,7 +353,6 @@
if not isettings._get_settings_file().exists():
register_user(user)
isettings._persist(write_to_disk=write_settings)
reload_lamindb(isettings)


def validate_sqlite_state(isettings: InstanceSettings) -> None:
Expand Down
Loading
Loading