Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support migrations between different file ID managers #49

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 200 additions & 32 deletions jupyter_server_fileid/manager.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import datetime
import importlib
import os
import posixpath
import sqlite3
import stat
import time
import uuid
from abc import ABC, ABCMeta, abstractmethod
from typing import Any, Callable, Dict, Optional
from typing import Any, Callable, Dict, Generator, Optional, Tuple

from jupyter_core.paths import jupyter_data_dir
from traitlets import TraitError, Unicode, validate
Expand Down Expand Up @@ -65,6 +67,8 @@ class BaseFileIdManager(ABC, LoggingConfigurable, metaclass=FileIdManagerMeta):
config=True,
)

con: Optional[sqlite3.Connection]
dlqqq marked this conversation as resolved.
Show resolved Hide resolved

@validate("db_path")
def _validate_db_path(self, proposal):
if proposal["value"] is None:
Expand Down Expand Up @@ -95,6 +99,9 @@ def _from_normalized_path(self, path: Optional[str]) -> Optional[str]:
def _move_recursive(self, old_path: str, new_path: str, path_mgr: Any = os.path) -> None:
"""Move all children of a given directory at `old_path` to a new
directory at `new_path`, delimited by `sep`."""
if not hasattr(self, "con") or self.con is None:
dlqqq marked this conversation as resolved.
Show resolved Hide resolved
return

old_path_glob = old_path + path_mgr.sep + "*"
records = self.con.execute(
"SELECT id, path FROM Files WHERE path GLOB ?", (old_path_glob,)
Expand All @@ -108,6 +115,9 @@ def _move_recursive(self, old_path: str, new_path: str, path_mgr: Any = os.path)
def _copy_recursive(self, from_path: str, to_path: str, path_mgr: Any = os.path) -> None:
"""Copy all children of a given directory at `from_path` to a new
directory at `to_path`, delimited by `sep`."""
if not hasattr(self, "con") or self.con is None:
return

from_path_glob = from_path + path_mgr.sep + "*"
records = self.con.execute(
"SELECT path FROM Files WHERE path GLOB ?", (from_path_glob,)
Expand All @@ -122,9 +132,114 @@ def _copy_recursive(self, from_path: str, to_path: str, path_mgr: Any = os.path)

def _delete_recursive(self, path: str, path_mgr: Any = os.path) -> None:
"""Delete all children of a given directory, delimited by `sep`."""
if not hasattr(self, "con") or self.con is None:
return

path_glob = path + path_mgr.sep + "*"
self.con.execute("DELETE FROM Files WHERE path GLOB ?", (path_glob,))

def _write_manifest(self) -> None:
"""Writes a manifest to a database containing the current manager's
import path."""
if not hasattr(self, "con") or self.con is None:
return

manager_module = self.__class__.__module__
manager_classname = self.__class__.__qualname__
self.con.execute(
"CREATE TABLE Manifest("
"manager_module TEXT NOT NULL, "
"manager_classname TEXT NOT NULL"
")"
)
self.con.execute(
"INSERT INTO Manifest (manager_module, manager_classname) VALUES (?, ?)",
(manager_module, manager_classname),
)

def _check_manifest(self) -> bool:
"""
Returns whether the database file's manifest matches the expected
manifest for this class. If one does not exist, writes the manifest and
returns True.
"""
if not hasattr(self, "con") or self.con is None:
return False

manifest = None
manager_module = self.__class__.__module__
manager_classname = self.__class__.__qualname__

try:
manifest = self.con.execute(
"SELECT manager_module, manager_classname FROM Manifest"
).fetchone()
except Exception:
pass

if manifest is None:
self._write_manifest()
return True

manifest_module, manifest_classname = manifest
return manifest_module == manager_module and manifest_classname == manager_classname

def _migrate_database(self) -> None:
    """Migrates the database over to the schema expected by this class.

    Backs up the existing database file to a timestamped sibling file,
    recreates the manifest, tables, and indices in a fresh database, and
    then copies every row from the backup by calling `export_rows()` on
    the previous manager class and passing each yielded row to
    `self.import_row(row)`.

    Only called from `__init__` after `_check_manifest()` reported a
    mismatching manifest, so a manifest row is expected to exist here.
    """
    if not hasattr(self, "con") or self.con is None:
        return

    # look up which manager wrote the existing database
    manifest = self.con.execute(
        "SELECT manager_module, manager_classname FROM Manifest"
    ).fetchone()
    prev_module, prev_classname = manifest

    # first, backup the database at backup_db_path.
    db_path_dir, db_path_basename = os.path.split(self.db_path)
    backup_db_path = os.path.join(
        db_path_dir, datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S-") + db_path_basename
    )
    self.con.close()
    os.rename(self.db_path, backup_db_path)

    # recreate manifest, files table, and indices in a fresh database
    self.con = sqlite3.connect(self.db_path)
    # re-apply WAL journaling: the journal mode is a property of the
    # database *file*, and this brand-new file does not inherit the PRAGMA
    # issued on the old connection in `__init__`.
    self.con.execute("PRAGMA journal_mode = WAL")
    self._write_manifest()
    self._create_tables()

    # finally, migrate the rows from backup_db_path to the current database.
    PrevManager = getattr(importlib.import_module(prev_module), prev_classname)
    for row in PrevManager.export_rows(backup_db_path):
        self.import_row(row)

@staticmethod
def export_rows(db_path: str) -> Generator[Tuple[str, str], None, None]:
dlqqq marked this conversation as resolved.
Show resolved Hide resolved
"""Exports the Files table by returning a generator that yields one row
at a time, encoded as a two-tuple [id, path].
dlqqq marked this conversation as resolved.
Show resolved Hide resolved

Notes
-----
- The base implementation is only for the ArbitraryFileIdManager and
BaseFileIdManager classes. Custom file ID managers should provide their
own custom implementation if necessary.
"""
con = sqlite3.connect(db_path)
cursor = con.execute("SELECT id, path FROM Files")
row = cursor.fetchone()
while row is not None:
yield row
row = cursor.fetchone()

@abstractmethod
def import_row(self, row: Tuple[str, str]) -> None:
    """Imports a row yielded from `export_rows()` into the Files table.

    Parameters
    ----------
    row : Tuple[str, str]
        Two-tuple of (id, path) as produced by `export_rows()`.
    """

dlqqq marked this conversation as resolved.
Show resolved Hide resolved
@abstractmethod
def index(self, path: str) -> Optional[str]:
"""Returns the file ID for the file corresponding to `path`.
Expand Down Expand Up @@ -220,6 +335,8 @@ class ArbitraryFileIdManager(BaseFileIdManager):
Server 2.
"""

con: sqlite3.Connection

@validate("root_dir")
def _validate_root_dir(self, proposal):
# Convert root_dir to an api path, since that's essentially what we persist.
Expand All @@ -229,26 +346,43 @@ def _validate_root_dir(self, proposal):
normalized_content_root = self._normalize_separators(proposal["value"])
return normalized_content_root

def _create_tables(self):
self.con.execute(
"CREATE TABLE IF NOT EXISTS Files("
"id TEXT PRIMARY KEY NOT NULL, "
"path TEXT NOT NULL UNIQUE"
")"
)
self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON Files (path)")

def __init__(self, *args, **kwargs):
# pass args and kwargs to parent Configurable
super().__init__(*args, **kwargs)

# initialize instance attrs
self._update_cursor = False

# initialize connection with db
self.log.info(f"ArbitraryFileIdManager : Configured root dir: {self.root_dir}")
self.log.info(f"ArbitraryFileIdManager : Configured database path: {self.db_path}")

self.con = sqlite3.connect(self.db_path)
self.log.info("ArbitraryFileIdManager : Successfully connected to database file.")
self.log.info("ArbitraryFileIdManager : Creating File ID tables and indices.")
# do not allow reads to block writes. required when using multiple processes
self.con.execute("PRAGMA journal_mode = WAL")
self.con.execute(
"CREATE TABLE IF NOT EXISTS Files("
"id TEXT PRIMARY KEY NOT NULL, "
"path TEXT NOT NULL UNIQUE"
")"
)
self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON Files (path)")

self.log.info("ArbitraryFileIdManager : Successfully connected to database file.")

if self._check_manifest():
# ensure tables and indices are created
self.log.info("ArbitraryFileIdManager : Creating File ID tables and indices.")
self._create_tables()
else:
# migrate database from old manager to current
self.log.info(
"ArbitraryFileIdManager : Database from different File ID manager detected. Migrating tables and indices."
)
self._migrate_database()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this lead to thrashing? For example, I use two classes of FIDs (e.g., LFID and AFID) configured on my system, neither of which have bothered to configure the db path (plausible if the LFID created the Files table since its schema is a superset of AFID), then won't the AFID trigger migration when it runs, and vice versa when the LFID starts?

I really think this kind of functionality should be in the user's face and performed by a CLI tool that also does import (from a source), export (to a destination), and migration using both. With a CLI tool, you can also support any database type or storage type and no need for a Manifest table. In addition, you could support migration between databases of the same type (e.g., SQLite).

If you're too strict about class names in the manifest, you could also run into issues where subclasses are treated as distinct classes, despite the fact that their schemas (and base functionality) are identical.

Copy link
Collaborator Author

@dlqqq dlqqq Nov 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the migration is being done automatically at the present moment.

I really think this kind of functionality should be in the user's face and performed by a CLI tool that also does import (from a source), export (to a destination), and migration using both.

Wait, but what would import and export do? These CLI options don't make sense to me. Importing requires somewhere to export to, and vice versa.

I do agree that migrations should be made explicit and only done via a CLI tool. This allows us to convey the backup database path to the end user more easily. However, I'm wary of the old behavior this would lead to:

  1. start server with ArbitraryFIDM
  2. stop server and restart with LocalFIDM
  3. File ID extension fails to load, but this can get easily lost in the logs
  4. all extensions depending on File ID are now broken and we could lose data since we're no longer able to index files

Do we know of a way to stop the server from starting if the server extension fails? That way, we can force the user to either migrate their database or switch back to their old FIDM.

With a CLI tool, you can also support any database type or storage type and no need for a Manifest table. In addition, you could support migration between databases of the same type (e.g., SQLite). If you're too strict about class names in the manifest, you could also run into issues where subclasses are treated as distinct classes, despite the fact that their schemas (and base functionality) are identical.

Right, but without a manifest table, how would we force the server extension to fail at extension load time rather than at run time? We need some reflection in the database itself to tell users to migrate at load time rather than giving them some unreadable SQLite error at run time.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, but what would import and export do? These CLI options don't make sense to me. Importing requires somewhere to export to, and vice versa.

Correct. Import and Export need sources and destinations and it would be up to the CLI tool to define the types of sources (FIDMs, csv files, jsons, etc.). Migration essentially reverses those, where the source is exported from and the destination is imported to and would minimally support FIDM "source types". I.e., the initial implementation of the tool would take FIDM classes from which instances are instantiated and used. Support for other types could be added as necessary.

Regarding the failure scenario bullets...
Why does the FileID service fail to load when switching to the LFIDM - is it because the schema from the AFIDM is not sufficient and they're both configured to use the same database file (which strikes me as an administration bug)?
Won't every call to the FID service then fail and wouldn't that only go unnoticed if the calling application ignored exceptions? I hope callers are robust enough to recognize exceptions, but I might be missing something here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct. Import and Export need sources and destinations and it would be up to the CLI tool to define the types of sources (FIDMs, csv files, jsons, etc.).

Sorry if I'm not understanding you fully, but it almost sounds like you want two additional CLI options in addition to jupyter server fileid migrate:

  • jupyter server fileid import
  • jupyter server fileid export

My question is what would these options do differently than migrate? It seems importing a file ID database and exporting to a different one is the exact same as migrate.

Why does the FileID service fail to load when switching to the LFIDM - is it because the schema from the AFIDM is not sufficient and they're both configured to use the same database file (which strikes me as an administration bug)?

Yeah, so it'll fail at runtime because the database schemas are different. While this is partially an administration bug, there is a legitimate use case for migrations regardless. Right now, if you want to switch FIDM implementations, you're forced to use a separate DB file, meaning you just lost all the file IDs you previously indexed. Thus you also broke all extensions that depend on file ID.

Won't every call to the FID service then fail and wouldn't that only go unnoticed if the calling application ignored exceptions? I hope callers are robust enough to recognize exceptions, but I might be missing something here.

Right, but there remains the issue of how to communicate to the user that "because you switched file ID managers, the best way to fix these errors is to migrate your database". I would prefer the server to just not start if the file ID database hasn't been migrated rather than have the user try and debug this on their own. That way, we force the user to take one of three paths:

  1. Revert back to the old File ID manager
  2. Perform a migration with the CLI tool
  3. Drop the old database

I don't want people to have to scratch their heads and figure out why file ID isn't working after switching implementations.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry if I'm not understanding you fully, but it almost sounds like you want two additional CLI options

I'm just saying that since migration is the "pipe between export and import" a CLI tool could offer different sources. What we've been primarily talking about are sources of FIDM instances (since they front the database), but I'm saying that you could also support different output destinations where the CLI can store exported data, and different input sources where the CLI tool can get data for importing.

So I could perform an equivalent migration by exporting from FIDM1 to a file exported.json and importing from exported.json to FIDM2. That is, the CLI tool manages the "pipe" since it is the caller of FIDM1.export_rows() and FIDM2.import_row(). This can then provide users the ability to have more choices for how they want to migrate (or backup) their FID databases.

One application for this may be that the user needs to perform some transformations on the exported rows (e.g., exported.json) prior to their import as part of their "migration plan".

We don't necessarily need to expose other sources and split migrate into export and import right away but should have that kind of thing in mind during implementation so that "the pipe" is extensible such that a user could manually intervene between "export" and "import" as part of their migration.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking a bit more on this, the first non-FIDM source type to support would probably want to be csv, since you want the ability to read rows from a file and JSON doesn't really lend itself to that format (that I'm aware of). .csv would also be immediately usable in spreadsheet applications in which the transformations could be performed, etc.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to support import/export to some secondary file format in this PR. Let's limit the scope of this PR for just migrating between SQLite DBs. However, these are great ideas we should definitely think about implementing later down the road.


self.con.commit()

@staticmethod
Expand All @@ -264,7 +398,6 @@ def _normalize_path(self, path):
# use commonprefix instead of commonpath, since root_dir may not be a
# absolute POSIX path.

# norm_root_dir = self._normalize_separators(self.root_dir)
path = self._normalize_separators(path)
if posixpath.commonprefix([self.root_dir, path]) != self.root_dir:
path = posixpath.join(self.root_dir, path)
Expand All @@ -278,20 +411,24 @@ def _from_normalized_path(self, path: Optional[str]) -> Optional[str]:
if path is None:
return None

# Convert root_dir to an api path, since that's essentially what we persist.
# norm_root_dir = self._normalize_separators(self.root_dir)
if posixpath.commonprefix([self.root_dir, path]) != self.root_dir:
return None

relpath = posixpath.relpath(path, self.root_dir)
return relpath

def _create(self, path: str) -> str:
def _create(self, path: str, id: Optional[str] = None) -> str:
path = self._normalize_path(path)
id = self._uuid()
if id is None:
id = self._uuid()
self.con.execute("INSERT INTO Files (id, path) VALUES (?, ?)", (id, path))
return id

def import_row(self, row: Tuple[str, str]) -> None:
    """Import one (id, path) row exported from another file ID manager."""
    file_id, file_path = row
    self._create(self._normalize_path(file_path), file_id)

def index(self, path: str) -> str:
path = self._normalize_path(path)
row = self.con.execute("SELECT id FROM Files WHERE path = ?", (path,)).fetchone()
Expand Down Expand Up @@ -384,6 +521,8 @@ class LocalFileIdManager(BaseFileIdManager):
performed during a method's procedure body.
"""

con: sqlite3.Connection

@validate("root_dir")
def _validate_root_dir(self, proposal):
if proposal["value"] is None:
Expand All @@ -394,20 +533,7 @@ def _validate_root_dir(self, proposal):
)
return proposal["value"]

def __init__(self, *args, **kwargs):
# pass args and kwargs to parent Configurable
super().__init__(*args, **kwargs)
# initialize instance attrs
self._update_cursor = False
self._last_sync = 0.0
# initialize connection with db
self.log.info(f"LocalFileIdManager : Configured root dir: {self.root_dir}")
self.log.info(f"LocalFileIdManager : Configured database path: {self.db_path}")
self.con = sqlite3.connect(self.db_path)
self.log.info("LocalFileIdManager : Successfully connected to database file.")
self.log.info("LocalFileIdManager : Creating File ID tables and indices.")
# do not allow reads to block writes. required when using multiple processes
self.con.execute("PRAGMA journal_mode = WAL")
def _create_tables(self):
self.con.execute(
"CREATE TABLE IF NOT EXISTS Files("
"id TEXT PRIMARY KEY NOT NULL, "
Expand All @@ -420,10 +546,42 @@ def __init__(self, *args, **kwargs):
"is_dir TINYINT NOT NULL"
")"
)
self._index_all()
# no need to index ino as it is autoindexed by sqlite via UNIQUE constraint
self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON Files (path)")
self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_is_dir ON Files (is_dir)")

def __init__(self, *args, **kwargs):
    """Connect to the File ID database, migrating it first if it was
    written by a different file ID manager, then index the content root.

    Order matters here: the WAL pragma is applied immediately after
    connecting, the manifest check (and possible migration) runs before
    any table access, and `_index_all()` runs only once the schema exists.
    """
    # pass args and kwargs to parent Configurable
    super().__init__(*args, **kwargs)

    # initialize instance attrs
    # _update_cursor / _last_sync: internal sync state — presumably used by
    # the out-of-band file indexing logic elsewhere in this class; TODO confirm
    self._update_cursor = False
    self._last_sync = 0.0

    self.log.info(f"LocalFileIdManager : Configured root dir: {self.root_dir}")
    self.log.info(f"LocalFileIdManager : Configured database path: {self.db_path}")

    # initialize connection with db
    self.con = sqlite3.connect(self.db_path)
    # do not allow reads to block writes. required when using multiple processes
    self.con.execute("PRAGMA journal_mode = WAL")

    self.log.info("LocalFileIdManager : Successfully connected to database file.")

    if self._check_manifest():
        # manifest matches (or is new): ensure tables and indices are created
        self.log.info("LocalFileIdManager : Creating File ID tables and indices.")
        self._create_tables()
    else:
        # database was written by a different manager class:
        # migrate database from old manager to current
        self.log.info(
            "LocalFileIdManager : Database from different File ID manager detected. Migrating tables and indices."
        )
        self._migrate_database()

    # index all directories under content root
    self._index_all()

    self.con.commit()

def _normalize_path(self, path):
Expand Down Expand Up @@ -632,7 +790,7 @@ def _stat(self, path):

return self._parse_raw_stat(raw_stat)

def _create(self, path, stat_info):
def _create(self, path, stat_info, id=None):
"""Creates a record given its path and stat info. Returns the new file
ID.

Expand All @@ -642,7 +800,8 @@ def _create(self, path, stat_info):
dangerous and may throw a runtime error if the file is not guaranteed to
have a unique `ino`.
"""
id = self._uuid()
if not id:
id = self._uuid()
self.con.execute(
"INSERT INTO Files (id, path, ino, crtime, mtime, is_dir) VALUES (?, ?, ?, ?, ?, ?)",
(id, path, stat_info.ino, stat_info.crtime, stat_info.mtime, stat_info.is_dir),
Expand Down Expand Up @@ -685,6 +844,15 @@ def _update(self, id, stat_info=None, path=None):
)
return

def import_row(self, row: Tuple[str, str]) -> None:
    """Import one (id, path) row exported from another file ID manager.

    Rows referencing paths that no longer exist on disk are skipped.
    """
    file_id, raw_path = row
    api_path = self._normalize_path(raw_path)
    stat_info = self._stat(api_path)
    # the exporting database may reference files that have since been deleted
    if stat_info is not None:
        self._create(api_path, stat_info, file_id)

def index(self, path, stat_info=None, commit=True):
"""Returns the file ID for the file at `path`, creating a new file ID if
one does not exist. Returns None only if file does not exist at path."""
Expand Down
Loading