diff --git a/docs/src/architectures/alchemical-model.rst b/docs/src/architectures/alchemical-model.rst
index 8576db608..4b37d3514 100644
--- a/docs/src/architectures/alchemical-model.rst
+++ b/docs/src/architectures/alchemical-model.rst
@@ -59,11 +59,19 @@ hyperparameters to tune are (in decreasing order of importance):
   This hyperparameter controls the size and depth of the descriptors and the neural
   network. In general, increasing this might lead to better accuracy, especially on
   larger datasets, at the cost of increased training and evaluation time.
-- ``loss_weights``: This controls the weighting of different contributions to the loss
-  (e.g., energy, forces, virial, etc.). The default values work well for most datasets,
-  but they might need to be adjusted. For example, to set a weight of 1.0 for the energy
-  and 0.1 for the forces, you can set the following in the ``options.yaml`` file:
-  ``loss_weights: {"energy": 1.0, "forces": 0.1}``.
+- ``loss``: This section describes the loss function to be used. It has three
+  subsections. 1. ``weights``. This controls the weighting of different contributions
+  to the loss (e.g., energy, forces, virial). The default values of 1.0 for all
+  targets work well for most datasets, but they might need to be adjusted. For
+  example, to set a weight of 1.0 for the energy and 0.1 for the forces, you can set
+  the following in the ``options.yaml`` file under ``loss``:
+  ``weights: {"energy": 1.0, "forces": 0.1}``. 2. ``type``. This controls the type of
+  loss to be used. The default value is ``mse``, and other options are ``mae`` and
+  ``huber``. ``huber`` is a subsection of its own, in which the user must specify the
+  ``deltas`` parameter in the same way as the ``weights`` (e.g.,
+  ``deltas: {"energy": 0.1, "forces": 0.01}``). 3. ``reduction``. This controls how
+  the loss is reduced over the samples in each batch. The default value is ``sum``;
+  the other allowed option is ``mean``.
 
 Architecture Hyperparameters
 ----------------------------
diff --git a/docs/src/architectures/soap-bpnn.rst b/docs/src/architectures/soap-bpnn.rst
index f0715081b..f8e1ab607 100644
--- a/docs/src/architectures/soap-bpnn.rst
+++ b/docs/src/architectures/soap-bpnn.rst
@@ -55,14 +55,22 @@ hyperparameters to tune are (in decreasing order of importance):
 - ``radial_scaling`` hyperparameters: These hyperparameters control the radial scaling
   of the SOAP descriptor. In general, the default values should work well, but they
   might need to be adjusted for specific datasets.
-- ``loss_weights``: This controls the weighting of different contributions to the loss
-  (e.g., energy, forces, virial, etc.). The default values work well for most datasets,
-  but they might need to be adjusted. For example, to set a weight of 1.0 for the energy
-  and 0.1 for the forces, you can set the following in the ``options.yaml`` file:
-  ``loss_weights: {"energy": 1.0, "forces": 0.1}``.
 - ``layernorm``: Whether to use layer normalization before the neural network. Setting
   this hyperparameter to ``false`` will lead to slower convergence of training, but
   might lead to better generalization outside of the training set distribution.
+- ``loss``: This section describes the loss function to be used. It has three
+  subsections. 1. ``weights``. This controls the weighting of different contributions
+  to the loss (e.g., energy, forces, virial). The default values of 1.0 for all
+  targets work well for most datasets, but they might need to be adjusted. For
+  example, to set a weight of 1.0 for the energy and 0.1 for the forces, you can set
+  the following in the ``options.yaml`` file under ``loss``:
+  ``weights: {"energy": 1.0, "forces": 0.1}``. 2. ``type``. This controls the type of
+  loss to be used. The default value is ``mse``, and other options are ``mae`` and
+  ``huber``. ``huber`` is a subsection of its own, in which the user must specify the
+  ``deltas`` parameter in the same way as the ``weights`` (e.g.,
+  ``deltas: {"energy": 0.1, "forces": 0.01}``). 3. ``reduction``. This controls how
+  the loss is reduced over the samples in each batch. The default value is ``sum``;
+  the other allowed option is ``mean``.
 
 All Hyperparameters
 -------------------
diff --git a/src/metatrain/experimental/alchemical_model/default-hypers.yaml b/src/metatrain/experimental/alchemical_model/default-hypers.yaml
index 778510902..7bceeac34 100644
--- a/src/metatrain/experimental/alchemical_model/default-hypers.yaml
+++ b/src/metatrain/experimental/alchemical_model/default-hypers.yaml
@@ -26,5 +26,8 @@ architecture:
     log_interval: 5
     checkpoint_interval: 25
     per_structure_targets: []
-    loss_weights: {}
     log_mae: False
+    loss:
+      type: mse
+      weights: {}
+      reduction: sum
diff --git a/src/metatrain/experimental/alchemical_model/model.py b/src/metatrain/experimental/alchemical_model/model.py
index 06c81cdbc..767897f05 100644
--- a/src/metatrain/experimental/alchemical_model/model.py
+++ b/src/metatrain/experimental/alchemical_model/model.py
@@ -149,7 +149,7 @@ def forward(
     def load_checkpoint(cls, path: Union[str, Path]) -> "AlchemicalModel":
 
         # Load the checkpoint
-        checkpoint = torch.load(path, weights_only=False)
+        checkpoint = torch.load(path, weights_only=False, map_location="cpu")
         model_hypers = checkpoint["model_hypers"]
         model_state_dict = checkpoint["model_state_dict"]
diff --git a/src/metatrain/experimental/alchemical_model/schema-hypers.json b/src/metatrain/experimental/alchemical_model/schema-hypers.json
index 997bfa010..957044786 100644
--- a/src/metatrain/experimental/alchemical_model/schema-hypers.json
+++ b/src/metatrain/experimental/alchemical_model/schema-hypers.json
@@ -93,17 +93,57 @@
           "type": "string"
         }
       },
-      "loss_weights": {
+      "log_mae": {
+        "type": "boolean"
+      },
+      "loss": {
         "type": "object",
-        "patternProperties": {
-          ".*": {
-            "type": "number"
+        "properties": {
+          "weights": {
+            "type": "object",
+            "patternProperties": {
+              ".*": {
+                "type": "number"
+              }
+            },
+            "additionalProperties": false
+          },
+          "reduction": {
+            "type": "string",
+            "enum": ["sum", "mean", "none"]
+          },
+          "type": {
+            "oneOf": [
+              {
+                "type": "string",
+                "enum": ["mse", "mae"]
+              },
+              {
+                "type": "object",
+                "properties": {
+                  "huber": {
+                    "type": "object",
+                    "properties": {
+                      "deltas": {
+                        "type": "object",
+                        "patternProperties": {
+                          ".*": {
+                            "type": "number"
+                          }
+                        },
+                        "additionalProperties": false
+                      }
+                    },
+                    "required": ["deltas"],
+                    "additionalProperties": false
+                  }
+                },
+                "additionalProperties": false
+              }
+            ]
           }
         },
         "additionalProperties": false
-      },
-      "log_mae": {
-        "type": "boolean"
       }
     },
     "additionalProperties": false
diff --git a/src/metatrain/experimental/alchemical_model/trainer.py b/src/metatrain/experimental/alchemical_model/trainer.py
index 4127e7036..611c1c2e4 100644
--- a/src/metatrain/experimental/alchemical_model/trainer.py
+++ b/src/metatrain/experimental/alchemical_model/trainer.py
@@ -1,3 +1,4 @@
+import copy
 import logging
 from pathlib import Path
 from typing import List, Union
@@ -175,21 +176,26 @@ def train(
         loss_weights_dict = {}
         for output_name in outputs_list:
             loss_weights_dict[output_name] = (
-                self.hypers["loss_weights"][
+                self.hypers["loss"]["weights"][
                     to_external_name(output_name, model.outputs)
                 ]
                 if to_external_name(output_name, model.outputs)
-                in self.hypers["loss_weights"]
+                in self.hypers["loss"]["weights"]
                 else 1.0
             )
         loss_weights_dict_external = {
             to_external_name(key, model.outputs): value
             for key, value in loss_weights_dict.items()
         }
+        # Update the loss weights in the hypers:
+        loss_hypers = copy.deepcopy(self.hypers["loss"])
+        loss_hypers["weights"] = loss_weights_dict
         logging.info(f"Training with loss weights: {loss_weights_dict_external}")
 
         # Create a loss function:
-        loss_fn = TensorMapDictLoss(loss_weights_dict)
+        loss_fn = TensorMapDictLoss(
+            **loss_hypers,
+        )
 
         # Create an optimizer:
         optimizer = torch.optim.Adam(
@@ -384,7 +390,7 @@ def save_checkpoint(self, model, path: Union[str, Path]):
     def load_checkpoint(cls, path: Union[str, Path], train_hypers) -> "Trainer":
 
         # Load the checkpoint
-        checkpoint = torch.load(path, weights_only=False)
+        checkpoint = torch.load(path, weights_only=False, map_location="cpu")
         model_hypers = checkpoint["model_hypers"]
         model_state_dict = checkpoint["model_state_dict"]
         epoch = checkpoint["epoch"]
diff --git a/src/metatrain/experimental/pet/model.py b/src/metatrain/experimental/pet/model.py
index ff4e8af31..4bd63ece7 100644
--- a/src/metatrain/experimental/pet/model.py
+++ b/src/metatrain/experimental/pet/model.py
@@ -138,7 +138,7 @@ def forward(
 
     @classmethod
     def load_checkpoint(cls, path: Union[str, Path]) -> "PET":
-        checkpoint = torch.load(path, weights_only=False)
+        checkpoint = torch.load(path, weights_only=False, map_location="cpu")
         hypers = checkpoint["hypers"]
         dataset_info = checkpoint["dataset_info"]
         model = cls(
diff --git a/src/metatrain/experimental/pet/trainer.py b/src/metatrain/experimental/pet/trainer.py
index 317775343..afe8c09dd 100644
--- a/src/metatrain/experimental/pet/trainer.py
+++ b/src/metatrain/experimental/pet/trainer.py
@@ -731,7 +731,7 @@ def save_checkpoint(self, model, path: Union[str, Path]):
         # together with the hypers inside a file that will act as a metatrain
         # checkpoint
         checkpoint_path = self.pet_dir / "checkpoint"  # type: ignore
-        checkpoint = torch.load(checkpoint_path, weights_only=False)
+        checkpoint = torch.load(checkpoint_path, weights_only=False, map_location="cpu")
         torch.save(
             {
                 "checkpoint": checkpoint,
@@ -749,7 +749,7 @@ def load_checkpoint(cls, path: Union[str, Path], train_hypers) -> "Trainer":
         # This function loads a metatrain PET checkpoint and returns a Trainer
         # instance with the hypers, while also saving the checkpoint in the
         # class
-        checkpoint = torch.load(path, weights_only=False)
+        checkpoint = torch.load(path, weights_only=False, map_location="cpu")
         trainer = cls(train_hypers)
         trainer.pet_checkpoint = checkpoint["checkpoint"]
         return trainer
diff --git a/src/metatrain/experimental/soap_bpnn/default-hypers.yaml b/src/metatrain/experimental/soap_bpnn/default-hypers.yaml
index 6d9721b46..d7ec47cc6 100644
--- a/src/metatrain/experimental/soap_bpnn/default-hypers.yaml
+++ b/src/metatrain/experimental/soap_bpnn/default-hypers.yaml
@@ -35,5 +35,8 @@ architecture:
     checkpoint_interval: 25
     fixed_composition_weights: {}
     per_structure_targets: []
-    loss_weights: {}
     log_mae: False
+    loss:
+      type: mse
+      weights: {}
+      reduction: sum
diff --git a/src/metatrain/experimental/soap_bpnn/model.py b/src/metatrain/experimental/soap_bpnn/model.py
index d81b35450..093570723 100644
--- a/src/metatrain/experimental/soap_bpnn/model.py
+++ b/src/metatrain/experimental/soap_bpnn/model.py
@@ -303,7 +303,7 @@ def forward(
     def load_checkpoint(cls, path: Union[str, Path]) -> "SoapBpnn":
 
         # Load the checkpoint
-        checkpoint = torch.load(path, weights_only=False)
+        checkpoint = torch.load(path, weights_only=False, map_location="cpu")
         model_hypers = checkpoint["model_hypers"]
         model_state_dict = checkpoint["model_state_dict"]
diff --git a/src/metatrain/experimental/soap_bpnn/schema-hypers.json b/src/metatrain/experimental/soap_bpnn/schema-hypers.json
index 32d46da0d..cc7bdbb3b 100644
--- a/src/metatrain/experimental/soap_bpnn/schema-hypers.json
+++ b/src/metatrain/experimental/soap_bpnn/schema-hypers.json
@@ -141,17 +141,57 @@
           "type": "string"
        }
      },
-      "loss_weights": {
+      "log_mae": {
+        "type": "boolean"
+      },
+      "loss": {
         "type": "object",
-        "patternProperties": {
-          ".*": {
-            "type": "number"
+        "properties": {
+          "weights": {
+            "type": "object",
+            "patternProperties": {
+              ".*": {
+                "type": "number"
+              }
+            },
+            "additionalProperties": false
+          },
+          "reduction": {
+            "type": "string",
+            "enum": ["sum", "mean", "none"]
+          },
+          "type": {
+            "oneOf": [
+              {
+                "type": "string",
+                "enum": ["mse", "mae"]
+              },
+              {
+                "type": "object",
+                "properties": {
+                  "huber": {
+                    "type": "object",
+                    "properties": {
+                      "deltas": {
+                        "type": "object",
+                        "patternProperties": {
+                          ".*": {
+                            "type": "number"
+                          }
+                        },
+                        "additionalProperties": false
+                      }
+                    },
+                    "required": ["deltas"],
+                    "additionalProperties": false
+                  }
+                },
+                "additionalProperties": false
+              }
+            ]
          }
        },
        "additionalProperties": false
-      },
-      "log_mae": {
-        "type": "boolean"
      }
    },
    "additionalProperties": false
diff --git a/src/metatrain/experimental/soap_bpnn/trainer.py b/src/metatrain/experimental/soap_bpnn/trainer.py
index 810d9e471..563fc00d6 100644
--- a/src/metatrain/experimental/soap_bpnn/trainer.py
+++ b/src/metatrain/experimental/soap_bpnn/trainer.py
@@ -1,3 +1,4 @@
+import copy
 import logging
 import warnings
 from pathlib import Path
@@ -191,21 +192,25 @@ def train(
         loss_weights_dict = {}
         for output_name in outputs_list:
             loss_weights_dict[output_name] = (
-                self.hypers["loss_weights"][
+                self.hypers["loss"]["weights"][
                     to_external_name(output_name, train_targets)
                 ]
                 if to_external_name(output_name, train_targets)
-                in self.hypers["loss_weights"]
+                in self.hypers["loss"]["weights"]
                 else 1.0
             )
         loss_weights_dict_external = {
             to_external_name(key, train_targets): value
             for key, value in loss_weights_dict.items()
         }
+        loss_hypers = copy.deepcopy(self.hypers["loss"])
+        loss_hypers["weights"] = loss_weights_dict
         logging.info(f"Training with loss weights: {loss_weights_dict_external}")
 
         # Create a loss function:
-        loss_fn = TensorMapDictLoss(loss_weights_dict)
+        loss_fn = TensorMapDictLoss(
+            **loss_hypers,
+        )
 
         # Create an optimizer:
         optimizer = torch.optim.Adam(
@@ -425,7 +430,7 @@ def save_checkpoint(self, model, path: Union[str, Path]):
     def load_checkpoint(cls, path: Union[str, Path], train_hypers) -> "Trainer":
 
         # Load the checkpoint
-        checkpoint = torch.load(path, weights_only=False)
+        checkpoint = torch.load(path, weights_only=False, map_location="cpu")
         model_hypers = checkpoint["model_hypers"]
         model_state_dict = checkpoint["model_state_dict"]
         epoch = checkpoint["epoch"]
diff --git a/src/metatrain/utils/loss.py b/src/metatrain/utils/loss.py
index b2267113d..2ab67daee 100644
--- a/src/metatrain/utils/loss.py
+++ b/src/metatrain/utils/loss.py
@@ -1,7 +1,10 @@
-from typing import Dict, Optional, Tuple
+from typing import Dict, Optional, Tuple, Union
 
 import torch
 from metatensor.torch import TensorMap
+from omegaconf import DictConfig
+
+from metatrain.utils.external_naming import to_internal_name
 
 
 # This file defines losses for metatensor models.
@@ -30,10 +33,34 @@ def __init__(
         reduction: str = "sum",
         weight: float = 1.0,
         gradient_weights: Optional[Dict[str, float]] = None,
+        type: Union[str, dict] = "mse",
     ):
-        self.loss = torch.nn.MSELoss(reduction=reduction)
+        if gradient_weights is None:
+            gradient_weights = {}
+
+        losses = {}
+        if type == "mse":
+            losses["values"] = torch.nn.MSELoss(reduction=reduction)
+            for key in gradient_weights.keys():
+                losses[key] = torch.nn.MSELoss(reduction=reduction)
+        elif type == "mae":
+            losses["values"] = torch.nn.L1Loss(reduction=reduction)
+            for key in gradient_weights.keys():
+                losses[key] = torch.nn.L1Loss(reduction=reduction)
+        elif isinstance(type, dict) and "huber" in type:
+            # Huber loss
+            deltas = type["huber"]["deltas"]
+            losses["values"] = torch.nn.HuberLoss(
+                reduction=reduction, delta=deltas["values"]
+            )
+            for key in gradient_weights.keys():
+                losses[key] = torch.nn.HuberLoss(reduction=reduction, delta=deltas[key])
+        else:
+            raise ValueError(f"Unknown loss type: {type}")
+
+        self.losses = losses
         self.weight = weight
-        self.gradient_weights = {} if gradient_weights is None else gradient_weights
+        self.gradient_weights = gradient_weights
 
     def __call__(
         self, tensor_map_1: TensorMap, tensor_map_2: TensorMap
@@ -97,12 +124,12 @@ def __call__(
 
         values_1 = tensor_map_1.block().values
         values_2 = tensor_map_2.block().values
-        loss += self.weight * self.loss(values_1, values_2)
+        loss += self.weight * self.losses["values"](values_1, values_2)
 
         for gradient_name, gradient_weight in self.gradient_weights.items():
             values_1 = tensor_map_1.block().gradient(gradient_name).values
             values_2 = tensor_map_2.block().gradient(gradient_name).values
-            loss += gradient_weight * self.loss(values_1, values_2)
+            loss += gradient_weight * self.losses[gradient_name](values_1, values_2)
 
         return loss
 
@@ -129,6 +156,7 @@ def __init__(
         self,
         weights: Dict[str, float],
         reduction: str = "sum",
+        type: Union[str, dict] = "mse",
     ):
         outputs = [key for key in weights.keys() if "gradients" not in key]
         self.losses = {}
@@ -141,10 +169,12 @@ def __init__(
                     "_gradients", ""
                 )
                 gradient_weights[gradient_name] = weight
+            type_output = _process_type(type, output)
             self.losses[output] = TensorMapLoss(
                 reduction=reduction,
                 weight=value_weight,
                 gradient_weights=gradient_weights,
+                type=type_output,
             )
 
     def __call__(
@@ -167,3 +197,27 @@ def __call__(
             loss += target_loss
 
         return loss
+
+
+def _process_type(type: Union[str, DictConfig], output: str) -> Union[str, dict]:
+    if not isinstance(type, str):
+        assert "huber" in type
+        # we process the Huber loss delta dict to make it similar to the
+        # `weights` dict
+        type_output = {"huber": {"deltas": {}}}  # type: ignore
+        for key, delta in type["huber"]["deltas"].items():
+            key_internal = to_internal_name(key)
+            if key_internal == output:
+                type_output["huber"]["deltas"]["values"] = delta
+            elif key_internal.startswith(output) and key_internal.endswith(
+                "_gradients"
+            ):
+                gradient_name = key_internal.replace(f"{output}_", "").replace(
+                    "_gradients", ""
+                )
+                type_output["huber"]["deltas"][gradient_name] = delta
+            else:
+                pass
+    else:
+        type_output = type  # type: ignore
+    return type_output
diff --git a/tests/utils/test_loss.py b/tests/utils/test_loss.py
index 035480a32..6cf9ac522 100644
--- a/tests/utils/test_loss.py
+++ b/tests/utils/test_loss.py
@@ -94,9 +94,10 @@ def tensor_map_with_grad_4():
     return tensor_map
 
 
-def test_tmap_loss_no_gradients():
+@pytest.mark.parametrize("type", ["mse", {"huber": {"deltas": {"values": 3.0}}}])
+def test_tmap_loss_no_gradients(type):
     """Test that the loss is computed correctly when there are no gradients."""
-    loss = TensorMapLoss()
+    loss = TensorMapLoss(type=type)
 
     tensor_map_1 = TensorMap(
         keys=Labels.single(),
@@ -126,12 +127,18 @@ def test_tmap_loss_no_gradients():
 
     # Expected result: 1.0
     loss_value = loss(tensor_map_1, tensor_map_2)
-    torch.testing.assert_close(loss_value, torch.tensor(1.0))
+    # Huber loss is scaled by 0.5 due to torch implementation
+    torch.testing.assert_close(
+        loss_value, (1.0 if type == "mse" else 0.5) * torch.tensor(1.0)
+    )
 
 
-def test_tmap_loss_with_gradients(tensor_map_with_grad_1, tensor_map_with_grad_2):
+@pytest.mark.parametrize(
+    "type", ["mse", {"huber": {"deltas": {"values": 3.0, "gradient": 3.0}}}]
+)
+def test_tmap_loss_with_gradients(tensor_map_with_grad_1, tensor_map_with_grad_2, type):
     """Test that the loss is computed correctly when there are gradients."""
-    loss = TensorMapLoss(gradient_weights={"gradient": 0.5})
+    loss = TensorMapLoss(type=type, gradient_weights={"gradient": 0.5})
 
     loss_value = loss(tensor_map_with_grad_1, tensor_map_with_grad_1)
     torch.testing.assert_close(loss_value, torch.tensor(0.0))
@@ -140,7 +147,8 @@ def test_tmap_loss_with_gradients(tensor_map_with_grad_1, tensor_map_with_grad_2
     loss_value = loss(tensor_map_with_grad_1, tensor_map_with_grad_2)
     torch.testing.assert_close(
         loss_value,
-        torch.tensor(1.0 + 0.5 * 4.0),
+        # Huber loss is scaled by 0.5 due to torch implementation
+        (1.0 if type == "mse" else 0.5) * torch.tensor(1.0 + 0.5 * 4.0),
     )
 
 
@@ -152,13 +160,31 @@ def test_tmap_dict_loss(
 ):
     """Test that the dict loss is computed correctly."""
 
-    loss = TensorMapDictLoss(
+    loss_mse = TensorMapDictLoss(
         weights={
             "output_1": 0.6,
             "output_2": 1.0,
             "output_1_gradient_gradients": 0.5,
             "output_2_gradient_gradients": 0.5,
         }
     )
+    loss_huber = TensorMapDictLoss(
+        weights={
+            "output_1": 0.6,
+            "output_2": 1.0,
+            "output_1_gradient_gradients": 0.5,
+            "output_2_gradient_gradients": 0.5,
+        },
+        type={
+            "huber": {
+                "deltas": {
+                    "output_1": 0.1,
+                    "output_2": 0.1,
+                    "output_1_gradient_gradients": 0.1,
+                    "output_2_gradient_gradients": 0.1,
+                }
+            }
+        },
+    )
 
     output_dict = {
         "output_1": tensor_map_with_grad_1,
@@ -202,9 +228,15 @@
         .sum()
     )
 
-    loss_value = loss(output_dict, target_dict)
+    loss_value = loss_mse(output_dict, target_dict)
     torch.testing.assert_close(loss_value, expected_result)
 
+    # Huber loss should be lower than MSE
+    # (scaled by 0.5 due to torch implementation of Huber)
+    assert loss_huber(output_dict, target_dict) < 0.5 * loss_mse(
+        output_dict, target_dict
+    )
+
 
 def test_tmap_dict_loss_subset(tensor_map_with_grad_1, tensor_map_with_grad_3):
     """Test that the dict loss is computed correctly when only a subset
@@ -246,3 +278,94 @@ def test_tmap_dict_loss_subset(tensor_map_with_grad_1, tensor_map_with_grad_3):
 
     loss_value = loss(output_dict, target_dict)
     torch.testing.assert_close(loss_value, expected_result)
+
+
+def test_tmap_loss_mae():
+    """Test that the MAE loss is computed correctly."""
+    loss = TensorMapLoss(type="mae", reduction="mean")
+
+    tensor_map_1 = TensorMap(
+        keys=Labels.single(),
+        blocks=[
+            TensorBlock(
+                values=torch.tensor([[2.0], [2.0], [3.0]]),
+                samples=Labels.range("samples", 3),
+                components=[],
+                properties=Labels("energy", torch.tensor([[0]])),
+            )
+        ],
+    )
+    tensor_map_2 = TensorMap(
+        keys=Labels.single(),
+        blocks=[
+            TensorBlock(
+                values=torch.tensor([[0.0], [3.0], [3.0]]),
+                samples=Labels.range("samples", 3),
+                components=[],
+                properties=Labels("energy", torch.tensor([[0]])),
+            )
+        ],
+    )
+
+    loss_value = loss(tensor_map_1, tensor_map_1)
+    torch.testing.assert_close(loss_value, torch.tensor(0.0))
+
+    # Expected result: 1.0
+    loss_value = loss(tensor_map_1, tensor_map_2)
+    torch.testing.assert_close(loss_value, torch.tensor(1.0))
+
+
+def test_tmap_loss_huber():
+    """Test that the Huber loss is computed correctly."""
+    loss_mse = TensorMapLoss(type="mse", reduction="mean")
+    loss_huber = TensorMapLoss(
+        type={"huber": {"deltas": {"values": 3.0}}}, reduction="mean"
+    )
+
+    tensor_map_1 = TensorMap(
+        keys=Labels.single(),
+        blocks=[
+            TensorBlock(
+                values=torch.tensor([[2.0], [2.0], [3.0]]),
+                samples=Labels.range("samples", 3),
+                components=[],
+                properties=Labels("energy", torch.tensor([[0]])),
+            )
+        ],
+    )
+    tensor_map_2 = TensorMap(
+        keys=Labels.single(),
+        blocks=[
+            TensorBlock(
+                values=torch.tensor([[0.0], [3.0], [3.0]]),
+                samples=Labels.range("samples", 3),
+                components=[],
+                properties=Labels("energy", torch.tensor([[0]])),
+            )
+        ],
+    )
+
+    loss_value = loss_huber(tensor_map_1, tensor_map_1)
+    torch.testing.assert_close(loss_value, torch.tensor(0.0))
+
+    # No outliers, should be equal to MSE (scaled by 0.5 due to torch implementation)
+    loss_value_huber = loss_huber(tensor_map_1, tensor_map_2)
+    loss_value_mse = loss_mse(tensor_map_1, tensor_map_2)
+    torch.testing.assert_close(loss_value_huber, 0.5 * loss_value_mse)
+
+    tensor_map_with_outlier = TensorMap(
+        keys=Labels.single(),
+        blocks=[
+            TensorBlock(
+                values=torch.tensor([[0.0], [100.0], [3.0]]),
+                samples=Labels.range("samples", 3),
+                components=[],
+                properties=Labels("energy", torch.tensor([[0]])),
+            )
+        ],
+    )
+
+    loss_value_huber = loss_huber(tensor_map_1, tensor_map_with_outlier)
+    loss_value_mse = loss_mse(tensor_map_1, tensor_map_with_outlier)
+    # Huber loss is lower due to the outlier
+    assert loss_value_huber < 0.5 * loss_value_mse
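
To make the new hyperparameters concrete, here is a minimal sketch of what the
``loss`` section of an ``options.yaml`` file could look like with this patch applied,
combining the three subsections documented above. The target names (``energy``,
``forces``) and all numeric values are illustrative, taken from the examples in the
docs rather than being prescribed defaults::

    architecture:
      training:
        loss:
          weights: {"energy": 1.0, "forces": 0.1}
          type:
            huber:
              deltas: {"energy": 0.1, "forces": 0.01}
          reduction: sum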
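On the Python side, the updated trainers simply unpack such a dictionary into
``TensorMapDictLoss``. The sketch below mimics that wiring using the hypothetical
target names from the tests above (``output_1`` plus its ``gradient`` gradients); it
assumes the ``<output>_<gradient>_gradients`` key convention used in this patch and
is an illustration of the API, not code from the diff itself::

    from metatrain.utils.loss import TensorMapDictLoss

    # Loss hypers as they would arrive from options.yaml (names are hypothetical):
    loss_hypers = {
        "weights": {
            "output_1": 1.0,  # weight on the values of output_1
            "output_1_gradient_gradients": 0.5,  # weight on its "gradient" gradients
        },
        # One Huber delta per entry, keyed like the weights; a plain "mse" or "mae"
        # string would also be accepted here.
        "type": {
            "huber": {
                "deltas": {"output_1": 0.1, "output_1_gradient_gradients": 0.1}
            }
        },
        "reduction": "sum",
    }

    loss_fn = TensorMapDictLoss(**loss_hypers)

    # loss_fn then maps two Dict[str, TensorMap] (predictions and targets, with
    # matching keys and gradient blocks) to a single scalar torch.Tensor:
    # loss_value = loss_fn(predictions, targets)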