From be39360aa1856fbe300ee22ac2dae10ddd51ee6d Mon Sep 17 00:00:00 2001 From: Guido Petretto Date: Sun, 29 Sep 2024 22:36:18 +0200 Subject: [PATCH] backup documentation --- doc/source/user/backup.rst | 82 ++++++++++++++++++++++++++++++++ doc/source/user/index.rst | 1 + src/jobflow_remote/cli/backup.py | 11 +++-- 3 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 doc/source/user/backup.rst diff --git a/doc/source/user/backup.rst b/doc/source/user/backup.rst new file mode 100644 index 00000000..5d96a8bc --- /dev/null +++ b/doc/source/user/backup.rst @@ -0,0 +1,82 @@ +.. _backup: + +****** +Backup +****** + +As explained in the :ref:`projectconf` section, jobflow-remote uses a MongoDB +database to store the information about the state of Jobs and Flows. This is +defined in the ``queue`` section of the project configuration. +In several circumstances it may be required to perform a backup of this +database. For this reason jobflow-remote offers an option to create a dump +of the relevant collections for a project and restore it if needed. + +.. warning:: + This functionality does **not create a backup of the** ``JobStore`` **containing + the output of the workflows**. Since the output store can be any kind of ``Store`` + and the result may be split in the ``additional_stores``, if a backup is needed + it will be required to do that through the ``JobStore`` or directly with + the storage system. + +There are two options to create and restore a backup. The default relies on the official +MongoDB tools: ``mongodump`` and ``mongorestore``. For this to work the +`MongoDB database tools <https://www.mongodb.com/docs/database-tools/>`_ need to be +installed. The connection details provided in the project configuration will be used +to execute the commands. This is the preferred option, since it is faster and also +dumps and restores all the metadata of the collections. 
However, not all the connection +options defined in the ``queue`` Store may be supported or it may not be possible +to install the tools. For this reason a second option, based on a pure python implementation, +is also available. This can be activated by selecting the ``--python`` option from +the CLI. + +.. warning:: + The python version of the backup and restore will not preserve the metadata of the + collections. After restoring a backup with this option it would be better to + regenerate the standard indexes using the ``jf admin index rebuild`` command. + +.. note:: + It is of course possible to manually create a backup using the MongoDB tools. + This jobflow-remote feature is meant to ease the procedure by automatically + selecting the appropriate collections to back up. + +Create a backup +=============== + +A backup can be created with the command:: + + jf backup create + +As already mentioned, this will use the ``mongodump`` executable, unless the ``--python`` +option is specified. It is possible to specify the destination path of the backup; the +output folder will contain the ``jobs.bson``, ``flows.bson`` and ``jf_auxiliary.bson`` +files. If the ``mongodump`` command is used, the folder will also contain the metadata +files for each collection. It is also possible to request that the backup files +be gzipped, by adding the ``--compress`` option. + +.. note:: + The folder creation follows the convention of the ``mongodump`` executable, so + inside the folder specified in the ``create`` command there will be a subfolder + with the name of the database. + +.. note:: + The names of the files will be the standard ones, even if the names of the collections + defined in the project configuration file are different. + +Restore a backup +================ + +To restore a backup the following command can be used:: + + jf backup restore /path/to/backup/folder + +The path should point to the folder containing the bson files generated during the creation. 
+The code will automatically determine if the files are zipped, based on their extension. + +.. note:: + The names of the target collections are determined by the values defined in the project + settings, not by the names of the files, nor by the names of the collections from + which the backup was created. + +.. note:: + The backup can be restored only in an empty database. The code will raise an error + if the target database already contains jobs and flows. diff --git a/doc/source/user/index.rst b/doc/source/user/index.rst index ba675202..be1e9e86 100644 --- a/doc/source/user/index.rst +++ b/doc/source/user/index.rst @@ -19,6 +19,7 @@ details are found in :ref:`reference`. errors states advancedoptions + backup .. toctree:: :hidden: diff --git a/src/jobflow_remote/cli/backup.py b/src/jobflow_remote/cli/backup.py index 78f34592..7442834d 100644 --- a/src/jobflow_remote/cli/backup.py +++ b/src/jobflow_remote/cli/backup.py @@ -36,10 +36,10 @@ def create( help="Compress the output files", ), ] = False, - mongo_dir: Annotated[ + mongo_path: Annotated[ Optional[str], typer.Option( - "--mongo-dir", + "--mongo-path", "-m", help=( "The path to a folder containing the mongodump executable, if not present in the PATH" @@ -68,7 +68,7 @@ def create( jc = get_job_controller() n_docs = jc.backup_dump( dir_path=backup_dir, - mongo_bin_path=mongo_dir, + mongo_bin_path=mongo_path, compress=compress, python=python, ) @@ -123,3 +123,8 @@ def restore( ) out_console.print("Backup restored") + if python: + out_console.print( + "Python version does not restore indexes in the DB. It is advisable to run " + "'jf admin index rebuild' to create them." + )