Merge branch 'main' into save-in-outputs
frostedoyster authored Oct 26, 2024
2 parents c508d9d + 2c0a748 commit 5f6630a
Showing 13 changed files with 622 additions and 11 deletions.
12 changes: 12 additions & 0 deletions docs/src/advanced-concepts/auto-restarting.rst
@@ -0,0 +1,12 @@
Automatic restarting
====================

When training an expensive model, or when running on an HPC cluster with
short time limits, it is often necessary to restart a training run several
times. In these cases, it is convenient to be able to resubmit the same
command for every run.

In ``metatrain``, this functionality is provided via the ``--continue auto``
(or ``-c auto``) flag of ``mtt train``, for example as
``mtt train options.yaml --continue auto``. With this flag, training
automatically resumes from the latest checkpoint found in the ``outputs/``
directory of the current working directory. If no checkpoint is found,
training starts from scratch.
1 change: 1 addition & 0 deletions docs/src/advanced-concepts/index.rst
@@ -11,3 +11,4 @@ such as output naming, auxiliary outputs, and wrapper models.
   output-naming
   auxiliary-outputs
   multi-gpu
   auto-restarting
1 change: 1 addition & 0 deletions examples/programmatic/llpr_forces/ethanol_reduced_100.xyz
229 changes: 229 additions & 0 deletions examples/programmatic/llpr_forces/force_llpr.py
@@ -0,0 +1,229 @@
import matplotlib.pyplot as plt
import numpy as np
import torch
from metatensor.torch.atomistic import (
    MetatensorAtomisticModel,
    ModelEvaluationOptions,
    ModelMetadata,
    ModelOutput,
    load_atomistic_model,
)

from metatrain.utils.data import Dataset, collate_fn, read_systems, read_targets
from metatrain.utils.llpr import LLPRUncertaintyModel
from metatrain.utils.loss import TensorMapDictLoss
from metatrain.utils.neighbor_lists import get_system_with_neighbor_lists


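# Load the model exported by `mtt train`, together with its TorchScript
# extensions, and move it to the GPU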
model = load_atomistic_model("model.pt", extensions_directory="extensions/")
model = model.to("cuda")

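# Read the training systems and their targets (energy, forces, stress) back
# from the same file used for training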
train_systems = read_systems("train.xyz")
train_target_config = {
"energy": {
"quantity": "energy",
"read_from": "train.xyz",
"file_format": ".xyz",
"reader": "ase",
"key": "energy",
"unit": "kcal/mol",
"forces": {
"read_from": "train.xyz",
"file_format": ".xyz",
"reader": "ase",
"key": "forces",
},
"stress": {
"read_from": "train.xyz",
"file_format": ".xyz",
"reader": "ase",
"key": "stress",
},
"virial": False,
},
}
train_targets, _ = read_targets(train_target_config)

valid_systems = read_systems("valid.xyz")
valid_target_config = {
"energy": {
"quantity": "energy",
"read_from": "valid.xyz",
"file_format": ".xyz",
"reader": "ase",
"key": "energy",
"unit": "kcal/mol",
"forces": {
"read_from": "valid.xyz",
"file_format": ".xyz",
"reader": "ase",
"key": "forces",
},
"stress": {
"read_from": "valid.xyz",
"file_format": ".xyz",
"reader": "ase",
"key": "stress",
},
"virial": False,
},
}
valid_targets, _ = read_targets(valid_target_config)

test_systems = read_systems("test.xyz")
test_target_config = {
"energy": {
"quantity": "energy",
"read_from": "test.xyz",
"file_format": ".xyz",
"reader": "ase",
"key": "energy",
"unit": "kcal/mol",
"forces": {
"read_from": "test.xyz",
"file_format": ".xyz",
"reader": "ase",
"key": "forces",
},
"stress": {
"read_from": "test.xyz",
"file_format": ".xyz",
"reader": "ase",
"key": "stress",
},
"virial": False,
},
}
test_targets, target_info = read_targets(test_target_config)

requested_neighbor_lists = model.requested_neighbor_lists()
train_systems = [
    get_system_with_neighbor_lists(system, requested_neighbor_lists)
    for system in train_systems
]
train_dataset = Dataset({"system": train_systems, **train_targets})
valid_systems = [
    get_system_with_neighbor_lists(system, requested_neighbor_lists)
    for system in valid_systems
]
valid_dataset = Dataset({"system": valid_systems, **valid_targets})
test_systems = [
    get_system_with_neighbor_lists(system, requested_neighbor_lists)
    for system in test_systems
]
test_dataset = Dataset({"system": test_systems, **test_targets})

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn,
)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn,
)

loss_weight_dict = {
"energy": 1.0,
"energy_positions_grad": 1.0,
"energy_grain_grad": 1.0,
}
loss_fn = TensorMapDictLoss(loss_weight_dict)

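# Wrap the trained model in the LLPR (last-layer prediction rigidity)
# uncertainty model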
llpr_model = LLPRUncertaintyModel(model)

print("Last layer parameters:")
parameters = []
for name, param in llpr_model.named_parameters():
if "last_layers" in name:
parameters.append(param)
print(name)

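# Build the covariance of the last-layer features as a pseudo-Hessian of the
# training loss, invert it, and calibrate the resulting uncertainties on the
# validation set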
llpr_model.compute_covariance_as_pseudo_hessian(
    train_dataloader, target_info, loss_fn, parameters
)
llpr_model.compute_inverse_covariance()
llpr_model.calibrate(valid_dataloader)

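# Wrap the LLPR model into an exportable metatensor atomistic model, so that
# the auxiliary uncertainty outputs can be requested at evaluation time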
exported_model = MetatensorAtomisticModel(
    llpr_model.eval(),
    ModelMetadata(),
    llpr_model.capabilities,
)

evaluation_options = ModelEvaluationOptions(
length_unit="angstrom",
outputs={
"mtt::aux::last_layer_features": ModelOutput(per_atom=False),
"mtt::aux::energy_uncertainty": ModelOutput(per_atom=False),
"energy": ModelOutput(per_atom=False),
},
selected_atoms=None,
)

force_errors = []
force_uncertainties = []

for batch in test_dataloader:
    systems, targets = batch
    systems = [system.to("cuda", torch.float64) for system in systems]
    for system in systems:
        system.positions.requires_grad = True
    targets = {name: tmap.to("cuda", torch.float64) for name, tmap in targets.items()}

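    # Forward pass with consistency checks; the total energy is then
    # backpropagated so that the predicted forces can be read off the
    # position gradients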
    outputs = exported_model(systems, evaluation_options, check_consistency=True)
    energy = outputs["energy"].block().values
    energy_sum = torch.sum(energy)
    energy_sum.backward(retain_graph=True)

    predicted_forces = -torch.concatenate(
        [system.positions.grad.flatten() for system in systems]
    )
    true_forces = -targets["energy"].block().gradient("positions").values.flatten()

    force_error = (predicted_forces - true_forces) ** 2
    force_errors.append(force_error.detach().clone().cpu().numpy())

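    # Differentiate each last-layer feature with respect to all atomic
    # positions, to propagate the last-layer uncertainty to the forces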
    last_layer_features = outputs["mtt::aux::last_layer_features"].block().values
    last_layer_features = torch.sum(last_layer_features, dim=0)
    ll_feature_grads = []
    for ll_feature in last_layer_features.reshape((-1,)):
        ll_feature_grad = torch.autograd.grad(
            ll_feature.reshape(()),
            [system.positions for system in systems],
            retain_graph=True,
        )
        ll_feature_grad = torch.concatenate(
            [ll_feature_g.flatten() for ll_feature_g in ll_feature_grad]
        )
        ll_feature_grads.append(ll_feature_grad)
    ll_feature_grads = torch.stack(ll_feature_grads, dim=1)

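    # LLPR variance for each force component i: g_i^T C^{-1} g_i, where g_i is
    # the gradient of the last-layer features for component i and C^{-1} the
    # inverse covariance computed above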
    force_uncertainty = torch.einsum(
        "if, fg, ig -> i",
        ll_feature_grads,
        exported_model._module.inv_covariance,
        ll_feature_grads,
    )
    force_uncertainties.append(force_uncertainty.detach().clone().cpu().numpy())

force_errors = np.concatenate(force_errors)
force_uncertainties = np.concatenate(force_uncertainties)


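# Parity plot of the predicted force variance against the actual squared error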
plt.scatter(force_uncertainties, force_errors, s=1)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Predicted variance")
plt.ylabel("Squared error")

plt.savefig("figure.pdf")
35 changes: 35 additions & 0 deletions examples/programmatic/llpr_forces/options.yaml
@@ -0,0 +1,35 @@
seed: 42

architecture:
  name: experimental.soap_bpnn
  training:
    batch_size: 8
    num_epochs: 100
    log_interval: 1

training_set:
  systems:
    read_from: train.xyz
    length_unit: angstrom
  targets:
    energy:
      key: energy
      unit: eV

validation_set:
  systems:
    read_from: valid.xyz
    length_unit: angstrom
  targets:
    energy:
      key: energy
      unit: eV

test_set:
  systems:
    read_from: test.xyz
    length_unit: angstrom
  targets:
    energy:
      key: energy
      unit: eV
4 changes: 4 additions & 0 deletions examples/programmatic/llpr_forces/readme.txt
@@ -0,0 +1,4 @@
This is a small example of how to calculate force uncertainties with the LLPR.
To run it, first split the ethanol dataset with `python split.py`, then train
a model with `mtt train options.yaml`, and finally run the example with
`python force_llpr.py`.
13 changes: 13 additions & 0 deletions examples/programmatic/llpr_forces/split.py
@@ -0,0 +1,13 @@
import ase.io
import numpy as np


structures = ase.io.read("ethanol_reduced_100.xyz", ":")
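# random 50/10/40 train/validation/test split; the shuffle is not seeded, so
# the split differs between runs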
np.random.shuffle(structures)
train = structures[:50]
valid = structures[50:60]
test = structures[60:]

ase.io.write("train.xyz", train)
ase.io.write("valid.xyz", valid)
ase.io.write("test.xyz", test)
4 changes: 3 additions & 1 deletion src/metatrain/__main__.py
@@ -16,6 +16,7 @@
    export_model,
)
from .cli.train import _add_train_model_parser, _prepare_train_model_args, train_model
from .utils.distributed.logging import is_main_process
from .utils.logging import get_cli_input, setup_logging


@@ -81,7 +82,8 @@ def main():
    if callable == "train_model":
        # define and create `checkpoint_dir` based on current directory, date and time
        checkpoint_dir = _datetime_output_path(now=datetime.now())
-       os.makedirs(checkpoint_dir, exist_ok=True)  # exist_ok=True for distributed
+       if is_main_process():
+           os.makedirs(checkpoint_dir)
        args.checkpoint_dir = checkpoint_dir

        log_file = checkpoint_dir / "train.log"
36 changes: 35 additions & 1 deletion src/metatrain/cli/train.py
@@ -5,6 +5,7 @@
import os
import random
import shutil
import time
from pathlib import Path
from typing import Dict, Optional, Union

@@ -75,7 +76,7 @@ def _add_train_model_parser(subparser: argparse._SubParsersAction) -> None:
        "-c",
        "--continue",
        dest="continue_from",
-       type=str,
+       type=_process_continue_from,
        required=False,
        help="File to continue training from.",
    )
@@ -99,6 +100,39 @@ def _prepare_train_model_args(args: argparse.Namespace) -> None:
    args.options = OmegaConf.merge(args.options, override_options)


def _process_continue_from(continue_from: str) -> Optional[str]:
    # covers the case where `continue_from` is `auto`
    if continue_from == "auto":
        # try to find the `outputs` directory; if it doesn't exist
        # then we are not continuing from a previous run
        if Path("outputs/").exists():
            # take the latest year-month-day directory
            dir = sorted(Path("outputs/").iterdir())[-1]
            # take the latest hour-minute-second directory
            dir = sorted(dir.iterdir())[-1]
            # take the latest checkpoint. This cannot be done with
            # `sorted` because some checkpoint files are named with
            # the epoch number (e.g. `epoch_10.ckpt` would be before
            # `epoch_8.ckpt`). We therefore sort by file creation time.
            new_continue_from = str(
                sorted(dir.glob("*.ckpt"), key=lambda f: f.stat().st_ctime)[-1]
            )
            logger.info(f"Auto-continuing from `{new_continue_from}`")
        else:
            new_continue_from = None
            logger.info(
                "Auto-continuation did not find any previous runs, "
                "training from scratch"
            )
        # sleep for a few seconds to allow all processes to catch up. This is
        # necessary because the `outputs` directory is created by the main
        # process and the other processes might detect it by mistake if they're
        # still executing this function
        time.sleep(3)
    else:
        # an explicit checkpoint path was given; use it as-is
        new_continue_from = continue_from

    return new_continue_from


def train_model(
    options: Union[DictConfig, Dict],
    output: str = "model.pt",
