Skip to content

Commit

Permalink
Merge pull request #1180 from facebookresearch/data-porter-for-qualif…
Browse files Browse the repository at this point in the history
…ications-only

Added porting of worker qualifications only for Data Porter
  • Loading branch information
meta-paul authored Jun 7, 2024
2 parents eee9834 + f9c2094 commit 980f69d
Show file tree
Hide file tree
Showing 13 changed files with 542 additions and 79 deletions.
11 changes: 10 additions & 1 deletion docs/web/docs/guides/how_to_use/data_porter/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ mephisto db export --export-task-runs-since-date 2024-01-01
mephisto db export --export-task-runs-since-date 2023-01-01T00:00:00
mephisto db export --labels first_dump --labels second_dump
mephisto db export --export-tasks-by-ids 1 --delete-exported-data --randomize-legacy-ids --export-indent 4
mephisto db export --qualification-only
```

Options (all optional):
Expand All @@ -43,11 +44,14 @@ Options (all optional):
- `-del/--delete-exported-data` - after exporting data, delete it from local DB
- `-r/--randomize-legacy-ids` - replace legacy autoincremented ids with
new pseudo-random ids to avoid conflicts during data merging
- `-qo/--qualification-only` - export only data related to worker qualifications (by default it's disabled)
- `-qn/--qualification-names` - if specified with `--qualification-only` option, only qualifications with these names will be exported
- `-i/--export-indent` - make dump easy to read via formatting JSON with indentations (Default 2)
- `-v/--verbosity` - write more informative messages about progress (Default 0. Values: 0, 1)

Note that the following options cannot be used together:
`--export-tasks-by-names`, `--export-tasks-by-ids`, `--export-task-runs-by-ids`, `--export-task-runs-since-date`, `--labels`.
- `--export-tasks-by-names`, `--export-tasks-by-ids`, `--export-task-runs-by-ids`, `--export-task-runs-since-date`, `--labels`
- `-qo/--qualification-only` and `--delete-exported-data`, `--export-tasks-by-names`, `--export-tasks-by-ids`, `--export-task-runs-by-ids`, `--export-task-runs-since-date`, `--labels`, `--randomize-legacy-ids`


## Import
Expand All @@ -62,6 +66,7 @@ mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --verbosity
mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --labels my_first_dump
mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --conflict-resolver MyCustomMergeConflictResolver
mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --keep-import-metadata
mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --qualification-only
```

Options:
Expand All @@ -72,12 +77,16 @@ Options:
- `-l/--labels` - one or more short strings serving as a reference for the ported data (stored in `imported_data` table),
so later you can export the imported data with `--labels` export option
- `-k/--keep-import-metadata` - write data from `imported_data` table of the dump (by default it's not imported)
- `-qo/--qualification-only` - import only data related to worker qualifications (by default it's disabled)
- `-v/--verbosity` - level of logging (default: 0; values: 0, 1)

Note that before every import we create a full snapshot copy of your local data, by
archiving content of your `data` directory. If any data gets corrupted during the import,
you can always return to the original state by replacing your `data` folder with the snapshot.

Note that the following options cannot be used together:
- `-qo/--qualification-only` and `--labels`, `--keep-import-metadata`

## Backup

Creates full backup of all current data (Mephisto DB, provider-specific datastores, and related files) on local machine.
Expand Down
129 changes: 123 additions & 6 deletions mephisto/client/cli_db_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,22 @@ def db_cli():
"to avoid conflicts during data merging"
),
)
@click.option(
"-qo",
"--qualification-only",
type=bool,
default=False,
is_flag=True,
help="export only data related to worker qualifications",
)
@click.option(
"-qn",
"--qualification-names",
type=str,
multiple=True,
default=None,
help="names of related to worker qualifications to export with `--qualification-only` option",
)
@click.option("-v", "--verbosity", type=int, default=VERBOSITY_DEFAULT_VALUE, help=VERBOSITY_HELP)
def export(ctx: click.Context, **options: dict):
"""
Expand All @@ -128,9 +144,11 @@ def export(ctx: click.Context, **options: dict):
export_tasks_by_ids: Optional[List[str]] = options.get("export_tasks_by_ids")
export_task_runs_by_ids: Optional[List[str]] = options.get("export_task_runs_by_ids")
export_task_runs_since_date: Optional[str] = options.get("export_task_runs_since_date")
export_labels: Optional[List[str]] = options.get("export_labels")
export_labels: Optional[List[str]] = options.get("labels")
delete_exported_data: bool = options.get("delete_exported_data", False)
randomize_legacy_ids: bool = options.get("randomize_legacy_ids", False)
qualification_only: bool = options.get("qualification_only", False)
qualification_names: Optional[List[str]] = options.get("qualification_names")
verbosity: int = options.get("verbosity", VERBOSITY_DEFAULT_VALUE)

porter = DBDataPorter()
Expand Down Expand Up @@ -167,6 +185,54 @@ def export(ctx: click.Context, **options: dict):
)
exit()

has_conflicting_qualification_only_options = (
len(
list(
filter(
bool,
[
delete_exported_data,
export_labels,
export_task_runs_by_ids,
export_task_runs_since_date,
export_tasks_by_ids,
export_tasks_by_names,
qualification_only,
randomize_legacy_ids,
],
)
)
)
> 1
)

if qualification_only and has_conflicting_qualification_only_options:
logger.warning(
"[yellow]"
"You cannot use following options together:"
"\n\t--qualification-only"
"\nand"
"\n\t--delete-exported-data"
"\n\t--export-task-runs-by-ids"
"\n\t--export-task-runs-since-date"
"\n\t--export-task-runs-since-date"
"\n\t--export-tasks-by-ids"
"\n\t--export-tasks-by-names"
"\n\t--labels"
"\n\t--randomize-legacy-ids"
"\nUse `--qualification-only` or other options to export data."
"[/yellow]"
)
exit()

if qualification_names and not qualification_only:
logger.warning(
"[yellow]"
"You cannot use option `--qualification-names` without `--qualification-only`."
"[/yellow]"
)
exit()

export_results = porter.export_dump(
json_indent=export_indent,
task_names=export_tasks_by_names,
Expand All @@ -176,6 +242,8 @@ def export(ctx: click.Context, **options: dict):
task_run_labels=export_labels,
delete_exported_data=delete_exported_data,
randomize_legacy_ids=randomize_legacy_ids,
qualification_only=qualification_only,
qualification_names=qualification_names,
metadata_export_options=get_export_options_for_metadata(ctx, options),
verbosity=verbosity,
)
Expand Down Expand Up @@ -236,6 +304,14 @@ def export(ctx: click.Context, **options: dict):
is_flag=True,
help="write data from `imported_data` table of the dump (by default it's not imported)",
)
@click.option(
"-qo",
"--qualification-only",
type=bool,
default=False,
is_flag=True,
help="import only data related to worker qualifications",
)
@click.option("-v", "--verbosity", type=int, default=VERBOSITY_DEFAULT_VALUE, help=VERBOSITY_HELP)
def _import(ctx: click.Context, **options: dict):
"""
Expand All @@ -249,21 +325,62 @@ def _import(ctx: click.Context, **options: dict):
labels: Optional[str] = options.get("labels")
conflict_resolver: Optional[str] = options.get("conflict_resolver", DEFAULT_CONFLICT_RESOLVER)
keep_import_metadata: Optional[bool] = options.get("keep_import_metadata", False)
qualification_only: bool = options.get("qualification_only", False)
verbosity: int = options.get("verbosity", VERBOSITY_DEFAULT_VALUE)

has_conflicting_qualification_only_options = (
len(
list(
filter(
bool,
[
keep_import_metadata,
labels,
qualification_only,
],
)
)
)
> 1
)

if qualification_only and has_conflicting_qualification_only_options:
logger.warning(
"[yellow]"
"You cannot use following options together:"
"\n\t--qualification-only"
"\nand"
"\n\t--labels"
"\n\t--keep-import-metadata"
"\nUse `--qualification-only` or other options to import data."
"[/yellow]"
)
exit()

porter = DBDataPorter()
results = porter.import_dump(
dump_archive_file_name_or_path=file,
conflict_resolver_name=conflict_resolver,
labels=labels,
keep_import_metadata=keep_import_metadata,
qualification_only=qualification_only,
verbosity=verbosity,
)
logger.info(
f"[green]"
f"Finished successfully. Imported {results['imported_task_runs_number']} TaskRuns"
f"[/green]"
)
if qualification_only:
logger.info(
f"[green]"
f"Finished successfully. Imported "
f"{results['workers_number']} Workers, "
f"{results['qualifications_number']} Qualifications, "
f"{results['granted_qualifications_number']} GrantedQualifications"
f"[/green]"
)
else:
logger.info(
f"[green]"
f"Finished successfully. Imported {results['task_runs_number']} TaskRuns"
f"[/green]"
)


# --- BACKUP ---
Expand Down
13 changes: 13 additions & 0 deletions mephisto/tools/db_data_porter/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,15 @@

BACKUP_OUTPUT_DIR = "outputs/backup"
EXPORT_OUTPUT_DIR = "outputs/export"

MEPHISTO_DUMP_KEY = "mephisto"

METADATA_DUMP_KEY = "dump_metadata"
METADATA_MIGRATIONS_KEY = "migrations"
METADATA_EXPORT_OPTIONS_KEY = "export_options"
METADATA_TIMESTAMP_KEY = "timestamp"
METADATA_PK_SUBSTITUTIONS_KEY = "pk_substitutions"

AVAILABLE_PROVIDER_TYPES = [
MEPHISTO_DUMP_KEY,
MOCK_PROVIDER_TYPE,
Expand Down Expand Up @@ -216,3 +223,9 @@
LOCAL_DB_LABEL = "_"

DEFAULT_ARCHIVE_FORMAT = "zip"

TABLE_NAMES_RELATED_TO_QUALIFICATIONS = [
"granted_qualifications",
"qualifications",
"workers",
]
Loading

0 comments on commit 980f69d

Please sign in to comment.