Commit a777fcb

Merge branch 'main' into generic-readers

frostedoyster authored Nov 13, 2024
2 parents 77f275b + 36de384
Showing 14 changed files with 342 additions and 52 deletions.
18 changes: 13 additions & 5 deletions docs/src/architectures/alchemical-model.rst
@@ -59,11 +59,19 @@ hyperparameters to tune are (in decreasing order of importance):
This hyperparameter controls the size and depth of the descriptors and the neural
network. In general, increasing this might lead to better accuracy,
especially on larger datasets, at the cost of increased training and evaluation time.
- ``loss_weights``: This controls the weighting of different contributions to the loss
(e.g., energy, forces, virial, etc.). The default values work well for most datasets,
but they might need to be adjusted. For example, to set a weight of 1.0 for the energy
and 0.1 for the forces, you can set the following in the ``options.yaml`` file:
``loss_weights: {"energy": 1.0, "forces": 0.1}``.
- ``loss``: This section describes the loss function to be used, and it has three
  subsections (a complete example is sketched below). 1. ``weights``: controls the
  weighting of the different contributions to the loss (e.g., energy, forces, virial,
  etc.). The default weight of 1.0 for all targets works well for most datasets, but
  it might need to be adjusted. For example, to set a weight of 1.0 for the energy and
  0.1 for the forces, set the following in the ``options.yaml`` file under ``loss``:
  ``weights: {"energy": 1.0, "forces": 0.1}``. 2. ``type``: controls the type of loss
  to be used. The default value is ``mse``; the other options are ``mae`` and
  ``huber``. ``huber`` is a subsection of its own and requires the user to specify its
  ``deltas`` parameter in the same way the ``weights`` are specified (e.g.,
  ``deltas: {"energy": 0.1, "forces": 0.01}``). 3. ``reduction``: controls how the
  loss is reduced over batches. The default value is ``sum``; the other allowed option
  is ``mean``.
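
As an illustrative sketch, a full ``loss`` section in ``options.yaml`` might look as
follows (the nesting under the architecture's ``training`` options is an assumption
based on the layout of the default hypers):

    architecture:
      training:
        loss:
          type: mse                                # mse (default), mae, or huber
          weights: {"energy": 1.0, "forces": 0.1}  # per-target loss weights
          reduction: sum                           # sum (default) or mean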


Architecture Hyperparameters
18 changes: 13 additions & 5 deletions docs/src/architectures/soap-bpnn.rst
@@ -55,14 +55,22 @@ hyperparameters to tune are (in decreasing order of importance):
- ``radial_scaling`` hyperparameters: These hyperparameters control the radial scaling
of the SOAP descriptor. In general, the default values should work well, but they
might need to be adjusted for specific datasets.
- ``loss_weights``: This controls the weighting of different contributions to the loss
(e.g., energy, forces, virial, etc.). The default values work well for most datasets,
but they might need to be adjusted. For example, to set a weight of 1.0 for the energy
and 0.1 for the forces, you can set the following in the ``options.yaml`` file:
``loss_weights: {"energy": 1.0, "forces": 0.1}``.
- ``layernorm``: Whether to use layer normalization before the neural network. Setting
this hyperparameter to ``false`` will lead to slower convergence of training, but
might lead to better generalization outside of the training set distribution.
- ``loss``: This section describes the loss function to be used, and it has three
  subsections (a sketch of the ``huber`` variant follows below). 1. ``weights``:
  controls the weighting of the different contributions to the loss (e.g., energy,
  forces, virial, etc.). The default weight of 1.0 for all targets works well for most
  datasets, but it might need to be adjusted. For example, to set a weight of 1.0 for
  the energy and 0.1 for the forces, set the following in the ``options.yaml`` file
  under ``loss``: ``weights: {"energy": 1.0, "forces": 0.1}``. 2. ``type``: controls
  the type of loss to be used. The default value is ``mse``; the other options are
  ``mae`` and ``huber``. ``huber`` is a subsection of its own and requires the user to
  specify its ``deltas`` parameter in the same way the ``weights`` are specified
  (e.g., ``deltas: {"energy": 0.1, "forces": 0.01}``). 3. ``reduction``: controls how
  the loss is reduced over batches. The default value is ``sum``; the other allowed
  option is ``mean``.
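
As a sketch of the ``huber`` variant (illustrative delta and weight values; the
nesting under ``training`` is assumed from the default hypers):

    architecture:
      training:
        loss:
          # huber is a subsection of its own, carrying its deltas
          type:
            huber:
              deltas: {"energy": 0.1, "forces": 0.01}
          weights: {"energy": 1.0, "forces": 0.1}
          reduction: mean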


All Hyperparameters
src/metatrain/experimental/alchemical_model/default-hypers.yaml
@@ -26,5 +26,8 @@ architecture:
log_interval: 5
checkpoint_interval: 25
per_structure_targets: []
loss_weights: {}
log_mae: False
loss:
type: mse
weights: {}
reduction: sum
2 changes: 1 addition & 1 deletion src/metatrain/experimental/alchemical_model/model.py
@@ -149,7 +149,7 @@ def forward(
def load_checkpoint(cls, path: Union[str, Path]) -> "AlchemicalModel":

# Load the checkpoint
checkpoint = torch.load(path, weights_only=False)
checkpoint = torch.load(path, weights_only=False, map_location="cpu")
model_hypers = checkpoint["model_hypers"]
model_state_dict = checkpoint["model_state_dict"]

54 changes: 47 additions & 7 deletions src/metatrain/experimental/alchemical_model/schema-hypers.json
@@ -93,17 +93,57 @@
"type": "string"
}
},
"loss_weights": {
"log_mae": {
"type": "boolean"
},
"loss": {
"type": "object",
"patternProperties": {
".*": {
"type": "number"
"properties": {
"weights": {
"type": "object",
"patternProperties": {
".*": {
"type": "number"
}
},
"additionalProperties": false
},
"reduction": {
"type": "string",
"enum": ["sum", "mean", "none"]
},
"type": {
"oneOf": [
{
"type": "string",
"enum": ["mse", "mae"]
},
{
"type": "object",
"properties": {
"huber": {
"type": "object",
"properties": {
"deltas": {
"type": "object",
"patternProperties": {
".*": {
"type": "number"
}
},
"additionalProperties": false
}
},
"required": ["deltas"],
"additionalProperties": false
}
},
"additionalProperties": false
}
]
}
},
"additionalProperties": false
},
"log_mae": {
"type": "boolean"
}
},
"additionalProperties": false
14 changes: 10 additions & 4 deletions src/metatrain/experimental/alchemical_model/trainer.py
@@ -1,3 +1,4 @@
import copy
import logging
from pathlib import Path
from typing import List, Union
@@ -175,21 +176,26 @@ def train(
loss_weights_dict = {}
for output_name in outputs_list:
loss_weights_dict[output_name] = (
self.hypers["loss_weights"][
self.hypers["loss"]["weights"][
to_external_name(output_name, model.outputs)
]
if to_external_name(output_name, model.outputs)
in self.hypers["loss_weights"]
in self.hypers["loss"]["weights"]
else 1.0
)
loss_weights_dict_external = {
to_external_name(key, model.outputs): value
for key, value in loss_weights_dict.items()
}
# Update the loss weights in the hypers:
loss_hypers = copy.deepcopy(self.hypers["loss"])
loss_hypers["weights"] = loss_weights_dict
logging.info(f"Training with loss weights: {loss_weights_dict_external}")

# Create a loss function:
loss_fn = TensorMapDictLoss(loss_weights_dict)
loss_fn = TensorMapDictLoss(
**loss_hypers,
)

# Create an optimizer:
optimizer = torch.optim.Adam(
@@ -384,7 +390,7 @@ def save_checkpoint(self, model, path: Union[str, Path]):
def load_checkpoint(cls, path: Union[str, Path], train_hypers) -> "Trainer":

# Load the checkpoint
checkpoint = torch.load(path, weights_only=False)
checkpoint = torch.load(path, weights_only=False, map_location="cpu")
model_hypers = checkpoint["model_hypers"]
model_state_dict = checkpoint["model_state_dict"]
epoch = checkpoint["epoch"]
2 changes: 1 addition & 1 deletion src/metatrain/experimental/pet/model.py
@@ -138,7 +138,7 @@ def forward(
@classmethod
def load_checkpoint(cls, path: Union[str, Path]) -> "PET":

checkpoint = torch.load(path, weights_only=False)
checkpoint = torch.load(path, weights_only=False, map_location="cpu")
hypers = checkpoint["hypers"]
dataset_info = checkpoint["dataset_info"]
model = cls(
4 changes: 2 additions & 2 deletions src/metatrain/experimental/pet/trainer.py
@@ -731,7 +731,7 @@ def save_checkpoint(self, model, path: Union[str, Path]):
# together with the hypers inside a file that will act as a metatrain
# checkpoint
checkpoint_path = self.pet_dir / "checkpoint" # type: ignore
checkpoint = torch.load(checkpoint_path, weights_only=False)
checkpoint = torch.load(checkpoint_path, weights_only=False, map_location="cpu")
torch.save(
{
"checkpoint": checkpoint,
@@ -749,7 +749,7 @@ def load_checkpoint(cls, path: Union[str, Path], train_hypers) -> "Trainer":
# This function loads a metatrain PET checkpoint and returns a Trainer
# instance with the hypers, while also saving the checkpoint in the
# class
checkpoint = torch.load(path, weights_only=False)
checkpoint = torch.load(path, weights_only=False, map_location="cpu")
trainer = cls(train_hypers)
trainer.pet_checkpoint = checkpoint["checkpoint"]
return trainer
5 changes: 4 additions & 1 deletion src/metatrain/experimental/soap_bpnn/default-hypers.yaml
@@ -35,5 +35,8 @@ architecture:
checkpoint_interval: 25
fixed_composition_weights: {}
per_structure_targets: []
loss_weights: {}
log_mae: False
loss:
type: mse
weights: {}
reduction: sum
2 changes: 1 addition & 1 deletion src/metatrain/experimental/soap_bpnn/model.py
@@ -303,7 +303,7 @@ def forward(
def load_checkpoint(cls, path: Union[str, Path]) -> "SoapBpnn":

# Load the checkpoint
checkpoint = torch.load(path, weights_only=False)
checkpoint = torch.load(path, weights_only=False, map_location="cpu")
model_hypers = checkpoint["model_hypers"]
model_state_dict = checkpoint["model_state_dict"]

54 changes: 47 additions & 7 deletions src/metatrain/experimental/soap_bpnn/schema-hypers.json
@@ -141,17 +141,57 @@
"type": "string"
}
},
"loss_weights": {
"log_mae": {
"type": "boolean"
},
"loss": {
"type": "object",
"patternProperties": {
".*": {
"type": "number"
"properties": {
"weights": {
"type": "object",
"patternProperties": {
".*": {
"type": "number"
}
},
"additionalProperties": false
},
"reduction": {
"type": "string",
"enum": ["sum", "mean", "none"]
},
"type": {
"oneOf": [
{
"type": "string",
"enum": ["mse", "mae"]
},
{
"type": "object",
"properties": {
"huber": {
"type": "object",
"properties": {
"deltas": {
"type": "object",
"patternProperties": {
".*": {
"type": "number"
}
},
"additionalProperties": false
}
},
"required": ["deltas"],
"additionalProperties": false
}
},
"additionalProperties": false
}
]
}
},
"additionalProperties": false
},
"log_mae": {
"type": "boolean"
}
},
"additionalProperties": false
13 changes: 9 additions & 4 deletions src/metatrain/experimental/soap_bpnn/trainer.py
@@ -1,3 +1,4 @@
import copy
import logging
import warnings
from pathlib import Path
@@ -191,21 +192,25 @@ def train(
loss_weights_dict = {}
for output_name in outputs_list:
loss_weights_dict[output_name] = (
self.hypers["loss_weights"][
self.hypers["loss"]["weights"][
to_external_name(output_name, train_targets)
]
if to_external_name(output_name, train_targets)
in self.hypers["loss_weights"]
in self.hypers["loss"]["weights"]
else 1.0
)
loss_weights_dict_external = {
to_external_name(key, train_targets): value
for key, value in loss_weights_dict.items()
}
loss_hypers = copy.deepcopy(self.hypers["loss"])
loss_hypers["weights"] = loss_weights_dict
logging.info(f"Training with loss weights: {loss_weights_dict_external}")

# Create a loss function:
loss_fn = TensorMapDictLoss(loss_weights_dict)
loss_fn = TensorMapDictLoss(
**loss_hypers,
)

# Create an optimizer:
optimizer = torch.optim.Adam(
@@ -425,7 +430,7 @@ def save_checkpoint(self, model, path: Union[str, Path]):
def load_checkpoint(cls, path: Union[str, Path], train_hypers) -> "Trainer":

# Load the checkpoint
checkpoint = torch.load(path, weights_only=False)
checkpoint = torch.load(path, weights_only=False, map_location="cpu")
model_hypers = checkpoint["model_hypers"]
model_state_dict = checkpoint["model_state_dict"]
epoch = checkpoint["epoch"]