diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 50a618e..24c939c 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -9,28 +9,30 @@ jobs: build: runs-on: ubuntu-latest name: Build the Sphinx docs + strategy: + matrix: + python-version: ["3.11"] steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.8 - uses: actions/setup-python@v3 - with: - python-version: 3.8 - - name: Install package dependencies - run: pip install -e .[rdkit] - - name: Install sphinx dependencies - run: pip install -r docs/requirements.txt - - name: Make docs - working-directory: ./docs - run: make html - - name: Upload artifacts - uses: actions/upload-artifact@v3 - with: - name: html-docs - path: docs/build/html/ - - name: Deploy - uses: peaceiris/actions-gh-pages@v3 - if: github.ref == 'refs/heads/main' - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: docs/build/html - + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install package dependencies + run: pip install -e .[rdkit] + - name: Install sphinx dependencies + run: pip install -r docs/requirements.txt + - name: Make docs + working-directory: ./docs + run: make html + - name: Upload artifacts + uses: actions/upload-artifact@v3 + with: + name: html-docs + path: docs/build/html/ + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + if: github.ref == 'refs/heads/main' + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: docs/build/html diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 6553ea9..3cd7159 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -3,26 +3,28 @@ name: Build and publish rxn-onmt-models on PyPI on: push: tags: - - 'v*' + - "v*" jobs: build-and-publish: name: Build and publish rxn-onmt-models on PyPI runs-on: ubuntu-latest - + strategy: + matrix: + python-version: ["3.11"] steps: - - uses: actions/checkout@master - - name: Python setup 3.9 - uses: actions/setup-python@v1 - with: - python-version: 3.9 - - name: Install build package (for packaging) - run: pip install --upgrade build - - name: Build dist - run: python -m build - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.PYPI_TOKEN }} - skip_existing: true + - uses: actions/checkout@master + - name: Python setup ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install build package (for packaging) + run: pip install --upgrade build + - name: Build dist + run: python -m build + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_TOKEN }} + skip_existing: true diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index e10201b..79a5bfa 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -6,23 +6,27 @@ jobs: tests: runs-on: ubuntu-latest name: Style, mypy, pytest + strategy: + matrix: + python-version: ["3.11"] steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.7 - uses: actions/setup-python@v3 - with: - python-version: 3.7 - - name: Install Dependencies - run: pip install -e .[dev,rdkit] - - name: Check black - run: python -m black --check --diff --color . - - name: Check isort - run: python -m isort --check --diff . - - name: Check flake8 - run: python -m flake8 . - - name: Check mypy (on the package) - run: python -m mypy --namespace-packages -p rxn.onmt_models - - name: Check mypy (on the tests) - run: python -m mypy tests - - name: Run pytests - run: python -m pytest + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install Dependencies + run: pip install -e .[dev,rdkit] + - name: Check black + run: python -m black --check --diff --color . + - name: Check isort + run: python -m isort --check --diff . + - name: Check flake8 + run: python -m flake8 . + - name: Check mypy (on the package) + run: python -m mypy --namespace-packages -p rxn.onmt_models + - name: Check mypy (on the tests) + run: python -m mypy tests + - name: Run pytests + run: python -m pytest + diff --git a/pyproject.toml b/pyproject.toml index afd23ab..9cad47e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ module = [ "numpy.*", "pandas.*", "pytest.*", + "yaml.*", ] ignore_missing_imports = true diff --git a/setup.cfg b/setup.cfg index c9f9c7d..d34f092 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,9 +28,10 @@ install_requires = attrs>=21.2.0 click>=8.0 rxn-chem-utils>=1.1.4 - rxn-onmt-utils>=1.0.3 rxn-reaction-preprocessing>=2.0.2 rxn-utils>=1.1.9 + rxn-onmt-utils @ git+https://github.com/rxn4chemistry/rxn-onmt-utils.git@0058c723c7371c6ff3b88647247c9e44cf1ffaa7 #rxn-onmt-utils without rxn-opennmt-py depedency + OpenNMT-py>=3.5.1 # official onmt [options.packages.find] where = src diff --git a/src/rxn/onmt_models/scripts/rxn_onmt_continue_training.py b/src/rxn/onmt_models/scripts/rxn_onmt_continue_training.py index 1004f92..e993557 100644 --- a/src/rxn/onmt_models/scripts/rxn_onmt_continue_training.py +++ b/src/rxn/onmt_models/scripts/rxn_onmt_continue_training.py @@ -1,4 +1,5 @@ import logging +from pathlib import Path from typing import Optional, Tuple import click @@ -20,6 +21,12 @@ logger.addHandler(logging.NullHandler()) +def get_src_tgt_vocab(data: Path) -> Tuple[Path, Path]: + src_vocab = data.parent / (data.name + ".vocab.src") + tgt_vocab = data.parent / (data.name + ".vocab.tgt") + return src_vocab, tgt_vocab + + @click.command(context_settings=dict(show_default=True)) @click.option("--batch_size", default=defaults.BATCH_SIZE) @click.option( @@ -57,6 +64,7 @@ default=100000, help="Number of steps, including steps from the initial training run.", ) +@click.option("--model_task", type=str, required=True) def main( batch_size: int, data_weights: Tuple[int, ...], @@ -66,6 +74,7 @@ def main( preprocess_dir: str, train_from: Optional[str], train_num_steps: int, + model_task: str, ) -> None: """Continue training for an OpenNMT model. @@ -100,9 +109,15 @@ def main( dropout = get_model_dropout(train_from) seed = get_model_seed(train_from) + src_vocab, tgt_vocab = get_src_tgt_vocab( + data=onmt_preprocessed_files.preprocess_prefix + ) + train_cmd = OnmtTrainCommand.continue_training( batch_size=batch_size, data=onmt_preprocessed_files.preprocess_prefix, + src_vocab=src_vocab, + tgt_vocab=tgt_vocab, keep_checkpoint=keep_checkpoint, dropout=dropout, save_model=model_files.model_prefix, @@ -111,11 +126,11 @@ def main( train_steps=train_num_steps, no_gpu=no_gpu, data_weights=data_weights, + model_task=model_task, ) # Write config file - command_and_args = train_cmd.save_to_config_cmd(config_file) - run_command(command_and_args) + train_cmd.save_to_config_cmd(config_file) # Actual training config file command_and_args = train_cmd.execute_from_config_cmd(config_file) diff --git a/src/rxn/onmt_models/scripts/rxn_onmt_finetune.py b/src/rxn/onmt_models/scripts/rxn_onmt_finetune.py index c30fed8..4137238 100644 --- a/src/rxn/onmt_models/scripts/rxn_onmt_finetune.py +++ b/src/rxn/onmt_models/scripts/rxn_onmt_finetune.py @@ -54,6 +54,7 @@ @click.option("--warmup_steps", default=defaults.WARMUP_STEPS) @click.option("--report_every", default=1000) @click.option("--save_checkpoint_steps", default=5000) +@click.option("--model_task", type=str, required=True) def main( batch_size: int, data_weights: Tuple[int, ...], @@ -69,6 +70,7 @@ def main( warmup_steps: int, report_every: int, save_checkpoint_steps: int, + model_task: str, ) -> None: """Finetune an OpenNMT model.""" @@ -112,7 +114,7 @@ def main( dropout=dropout, keep_checkpoint=keep_checkpoint, learning_rate=learning_rate, - rnn_size=rnn_size, + hidden_size=rnn_size, save_model=model_files.model_prefix, seed=seed, train_from=train_from, @@ -122,11 +124,11 @@ def main( data_weights=data_weights, report_every=report_every, save_checkpoint_steps=save_checkpoint_steps, + model_task=model_task, ) # Write config file - command_and_args = train_cmd.save_to_config_cmd(config_file) - run_command(command_and_args) + train_cmd.save_to_config_cmd(config_file) # Actual training config file command_and_args = train_cmd.execute_from_config_cmd(config_file) diff --git a/src/rxn/onmt_models/scripts/rxn_onmt_preprocess.py b/src/rxn/onmt_models/scripts/rxn_onmt_preprocess.py index b2afc1c..7105a3a 100644 --- a/src/rxn/onmt_models/scripts/rxn_onmt_preprocess.py +++ b/src/rxn/onmt_models/scripts/rxn_onmt_preprocess.py @@ -1,9 +1,10 @@ import logging import random from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import click +import yaml from rxn.chemutils.tokenization import ensure_tokenized_file from rxn.onmt_utils import __version__ as onmt_utils_version from rxn.onmt_utils.train_command import preprocessed_id_names @@ -51,6 +52,89 @@ def determine_train_dataset( return src, tgt +def get_build_vocab_config_file( + train_srcs: List[PathLike], + train_tgts: List[PathLike], + valid_src: PathLike, + valid_tgt: PathLike, + save_data: Path, + share_vocab: bool = True, + overwrite: bool = True, + src_seq_length: int = 3000, + tgt_seq_length: int = 3000, + src_vocab_size: int = 3000, + tgt_vocab_size: int = 3000, +) -> Path: + """Wrapper function of the legacy cli `onmt_preprocessed` arguments. + The goal is to make them compatible with ONMT v.3.5.1 cli `onmt_build_vocab`. + The function takes the arguments of former onmt_preprocessed cli and dumps + them into a `config.yaml` file with a specific structure compatible with `onmt_build_vocab`. + The upgraded `onmt_build_vocab` takes them as `onmt_build_vocab -config config.yaml`. + + Args: + train_srcs (List[PathLike]): List of train source data files + train_tgts (List[PathLike]): List of train target data files + valid_src (List[PathLike]): List of validation source data files + valid_tgt (List[PathLike]): List of validation target data files + save_data (PathLike): Save vocabulary data directory + share_vocab (bool, optional): Share vocab. Defaults to True. + overwrite (bool, optional): Overwrite output directory. Defaults to True. + src_seq_length (int, optional): src_seq_length. Defaults to 3000. + tgt_seq_length (int, optional): tgt_seq_length. Defaults to 3000. + src_vocab_size (int, optional): src_vocab_size. Defaults to 3000. + tgt_vocab_size (int, optional): tgt_vocab_size. Defaults to 3000. + + Returns: + PathLike: Path of the config.yaml which is in directory `save_data` + """ + + # Build dictionary with build vocab config content + # See structure https://opennmt.net/OpenNMT-py/quickstart.html (Step 1: Prepare the data) + build_vocab_config: Dict[str, Any] = {} + + # Arguments save data + build_vocab_config["save_data"] = str(save_data.parent) + build_vocab_config["src_vocab"] = str( + save_data.parent / (save_data.name + ".vocab.src") + ) + build_vocab_config["tgt_vocab"] = str( + save_data.parent / (save_data.name + ".vocab.tgt") + ) + + # Other arguments + build_vocab_config["overwrite"] = str(overwrite) + build_vocab_config["share_vocab"] = str(share_vocab) + build_vocab_config["src_seq_length"] = str(src_seq_length) + build_vocab_config["tgt_seq_length"] = str(tgt_seq_length) + build_vocab_config["src_vocab_size"] = str(src_vocab_size) + build_vocab_config["tgt_vocab_size"] = str(tgt_vocab_size) + + # Arguments data paths (train) + build_vocab_config["data"] = {} + # TODO: raise error if lengths: train_srcs, train_tgts, valid_src, valid_tgt are different + number_corpus = len(train_srcs) + for i in range(number_corpus): + build_vocab_config["data"][f"corpus_{i+1}"] = { + "path_src": str(train_srcs[i]), + "path_tgt": str(train_tgts[i]), + } + + # Arguments data paths (valid) + build_vocab_config["data"]["valid"] = { + "path_src": str(valid_src), + "path_tgt": str(valid_tgt), + } + + # Path to same yaml file + config_file_path = save_data.parent / (save_data.name + "_build_vocab_config.yaml") + + # Save file that will be -config argument of onmt_build_vocab + with open(config_file_path, "w+") as file: + yaml.dump(build_vocab_config, file) + + return config_file_path + + @click.command() @click.option( "--input_dir", @@ -180,21 +264,28 @@ def main( valid_src = ensure_tokenized_file(valid_src) valid_tgt = ensure_tokenized_file(valid_tgt) + # Create config file for onmt_build_vocab for OpenNMT v.3.5.1 + # Dump train_srcs, train_tgts, valid_src, valid_tgt etc and return path + config_file_path = get_build_vocab_config_file( + train_srcs=train_srcs, + train_tgts=train_tgts, + valid_src=valid_src, + valid_tgt=valid_tgt, + save_data=onmt_preprocessed_files.preprocess_prefix, + share_vocab=True, + overwrite=True, + src_seq_length=3000, + tgt_seq_length=3000, + src_vocab_size=3000, + tgt_vocab_size=3000, + ) + # yapf: disable command_and_args = [ str(e) for e in [ - 'onmt_preprocess', - '-train_src', *train_srcs, - '-train_tgt', *train_tgts, - '-valid_src', valid_src, - '-valid_tgt', valid_tgt, - '-save_data', onmt_preprocessed_files.preprocess_prefix, - '-src_seq_length', 3000, - '-tgt_seq_length', 3000, - '-src_vocab_size', 3000, - '-tgt_vocab_size', 3000, - '-share_vocab', - '-overwrite', + 'onmt_build_vocab', + '-config', config_file_path, + '-n_sample', -1, ] ] # yapf: enable diff --git a/src/rxn/onmt_models/scripts/rxn_onmt_train.py b/src/rxn/onmt_models/scripts/rxn_onmt_train.py index dc33a28..f29fda5 100644 --- a/src/rxn/onmt_models/scripts/rxn_onmt_train.py +++ b/src/rxn/onmt_models/scripts/rxn_onmt_train.py @@ -1,4 +1,6 @@ import logging +import warnings +from pathlib import Path from typing import Tuple import click @@ -15,6 +17,30 @@ logger.addHandler(logging.NullHandler()) +def get_src_tgt_vocab(data: Path) -> Tuple[Path, Path]: + src_vocab = data.parent / (data.name + ".vocab.src") + tgt_vocab = data.parent / (data.name + ".vocab.tgt") + return src_vocab, tgt_vocab + + +def check_rnn_vs_hidden_size(hidden_size: int, rnn_size: int) -> int: + """ + Helper function that checks wether hidden_size and rnn_size are given, decides which one to use and raises warnings. + rnn_size always has a default defaults.RNN_SIZE, if no hidden_size is given, rnn_size will be used. + If hidden_size is given, hidden size will be used. + """ + if hidden_size is None: + warnings.warn( + f"Argument hidden_size is not given, rnn_size with value {rnn_size} will be used" + ) + return rnn_size + if hidden_size is not None: + warnings.warn( + f"Argument hidden_size was given with value {hidden_size}, rnn_size argument will be overwritten." + ) + return hidden_size + + @click.command(context_settings=dict(show_default=True)) @click.option("--batch_size", default=defaults.BATCH_SIZE) @click.option( @@ -44,11 +70,13 @@ help="Directory with OpenNMT-preprocessed files", ) @click.option("--rnn_size", default=defaults.RNN_SIZE) +@click.option("--hidden_size") @click.option("--seed", default=defaults.SEED) @click.option("--train_num_steps", default=100000) @click.option("--transformer_ff", default=defaults.TRANSFORMER_FF) @click.option("--warmup_steps", default=defaults.WARMUP_STEPS) @click.option("--word_vec_size", default=defaults.WORD_VEC_SIZE) +@click.option("--model_task", type=str, required=True) def main( batch_size: int, data_weights: Tuple[int, ...], @@ -61,11 +89,13 @@ def main( no_gpu: bool, preprocess_dir: str, rnn_size: int, + hidden_size: int, seed: int, train_num_steps: int, transformer_ff: int, warmup_steps: int, word_vec_size: int, + model_task: str, ) -> None: """Train an OpenNMT model. @@ -73,6 +103,9 @@ def main( `data_weights` parameters are given (Note: needs to be consistent with the rxn-onmt-preprocess command executed before training. """ + # Check rnn_size or hidden_size given, not both + # NOTE: rnn_size argument is kept for compatibility + hidden_size = check_rnn_vs_hidden_size(hidden_size=hidden_size, rnn_size=rnn_size) # set up paths model_files = ModelFiles(model_output_dir) @@ -88,15 +121,22 @@ def main( config_file = model_files.next_config_file() + src_vocab, tgt_vocab = get_src_tgt_vocab( + data=onmt_preprocessed_files.preprocess_prefix + ) + + # Init train_cmd = OnmtTrainCommand.train( batch_size=batch_size, data=onmt_preprocessed_files.preprocess_prefix, + src_vocab=src_vocab, + tgt_vocab=tgt_vocab, dropout=dropout, heads=heads, keep_checkpoint=keep_checkpoint, layers=layers, learning_rate=learning_rate, - rnn_size=rnn_size, + hidden_size=hidden_size, save_model=model_files.model_prefix, seed=seed, train_steps=train_num_steps, @@ -105,11 +145,11 @@ def main( word_vec_size=word_vec_size, no_gpu=no_gpu, data_weights=data_weights, + model_task=model_task, ) # Write config file - command_and_args = train_cmd.save_to_config_cmd(config_file) - run_command(command_and_args) + train_cmd.save_to_config_cmd(config_file) # Actual training config file command_and_args = train_cmd.execute_from_config_cmd(config_file) diff --git a/src/rxn/onmt_models/training_files.py b/src/rxn/onmt_models/training_files.py index 1157480..9eb5954 100644 --- a/src/rxn/onmt_models/training_files.py +++ b/src/rxn/onmt_models/training_files.py @@ -94,7 +94,7 @@ def preprocess_prefix(self) -> Path: @property def vocab_file(self) -> Path: - return self.preprocess_prefix.with_suffix(".vocab.pt") + return self.preprocess_prefix.with_suffix(".vocab.src") class RxnPreprocessingFiles: