diff --git a/README.md b/README.md index 60d0bd28d..37272db17 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,12 @@ dependencies with: pip install chemiscope[explore] ``` +To use `chemiscope.metatensor_featurizer` for providing your trained model +to get the features for `chemiscope.explore`, install the dependencies with: +```bash +pip install chemiscope[metatensor] +``` + ## sphinx and sphinx-gallery integration Chemiscope provides also extensions for `sphinx` and `sphinx-gallery` to diff --git a/docs/src/python/reference.rst b/docs/src/python/reference.rst index 896bb2596..a65851ea3 100644 --- a/docs/src/python/reference.rst +++ b/docs/src/python/reference.rst @@ -26,3 +26,5 @@ .. autofunction:: chemiscope.ase_tensors_to_ellipsoids .. autofunction:: chemiscope.explore + +.. autofunction:: chemiscope.metatensor_featurizer diff --git a/pyproject.toml b/pyproject.toml index 4273534c7..ad3633332 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,3 +75,8 @@ explore = [ "dscribe", "scikit-learn", ] + +metatensor = [ + "metatensor", + "metatensor[torch]" +] diff --git a/python/chemiscope/__init__.py b/python/chemiscope/__init__.py index ddefa6805..9723959c1 100644 --- a/python/chemiscope/__init__.py +++ b/python/chemiscope/__init__.py @@ -11,7 +11,7 @@ extract_properties, librascal_atomic_environments, ) -from .explore import explore # noqa: F401 +from .explore import explore, metatensor_featurizer # noqa: F401 from .version import __version__ # noqa: F401 from .jupyter import show, show_input # noqa diff --git a/python/chemiscope/explore.py b/python/chemiscope/explore/__init__.py similarity index 60% rename from python/chemiscope/explore.py rename to python/chemiscope/explore/__init__.py index 5394fa934..e83612821 100644 --- a/python/chemiscope/explore.py +++ b/python/chemiscope/explore/__init__.py @@ -1,6 +1,8 @@ -import os +from ..jupyter import show +from ._soap_pca import soap_pca_featurize +from ._metatensor import metatensor_featurizer -from .jupyter import show +__all__ = ["explore", "metatensor_featurizer"] def explore(frames, featurize=None, properties=None, environments=None, mode="default"): @@ -116,96 +118,9 @@ def soap_kpca_featurize(frames, environments): # Add dimensionality reduction results to properties properties["features"] = X_reduced - # Return chemiscope widget return show( - frames=frames, properties=properties, mode=mode, environments=environments + frames=frames, + properties=properties, + environments=environments, + mode=mode, ) - - -def soap_pca_featurize(frames, environments=None): - """ - Computes SOAP features for a given set of atomic structures and performs - dimensionality reduction using PCA. Custom featurize functions should - have the same signature. - - Note: - - The SOAP descriptor parameters are pre-defined. - - We use all available CPU cores for parallel computation of SOAP descriptors. - """ - - # Check if dependencies were installed - try: - from dscribe.descriptors import SOAP - from sklearn.decomposition import PCA - except ImportError as e: - raise ImportError( - f"Required package not found: {str(e)}. Please install dependency " - + "using 'pip install chemiscope[explore]'." - ) - centers = None - - # Get the atom indexes from the environments and pick related frames - if environments is not None: - centers = _extract_environment_indices(environments) - - # Pick frames and properties related to the environments if provided - if environments is not None: - # Sort environments by structure id and atom id - environments = sorted(environments, key=lambda x: (x[0], x[1])) - - # Check structure indexes - unique_structures = list({env[0] for env in environments}) - if any(index >= len(frames) for index in unique_structures): - raise IndexError( - "Some structure indices in 'environments' are larger than the number of" - "frames" - ) - - if len(unique_structures) != len(frames): - # only include frames that are present in the user-provided environments - frames = [frames[index] for index in unique_structures] - - # Get global species - species = set() - for frame in frames: - species.update(frame.get_chemical_symbols()) - species = list(species) - - # Check if periodic - is_periodic = all(all(frame.get_pbc()) for frame in frames) - - # Initialize calculator - soap = SOAP( - species=species, - r_cut=4.5, - n_max=8, - l_max=6, - sigma=0.2, - rbf="gto", - average="outer", - periodic=is_periodic, - weighting={"function": "pow", "c": 1, "m": 5, "d": 1, "r0": 3.5}, - compression={"mode": "mu1nu1"}, - ) - - # Calculate descriptors - n_jobs = min(len(frames), os.cpu_count()) - feats = soap.create(frames, centers=centers, n_jobs=n_jobs) - - # Compute pca - pca = PCA(n_components=2) - return pca.fit_transform(feats) - - -def _extract_environment_indices(envs): - """ - Convert from chemiscope's environements to DScribe's centers selection - - :param: list envs: each element is a list of [env_index, atom_index, cutoff] - """ - grouped_envs = {} - for [env_index, atom_index, _cutoff] in envs: - if env_index not in grouped_envs: - grouped_envs[env_index] = [] - grouped_envs[env_index].append(atom_index) - return list(grouped_envs.values()) diff --git a/python/chemiscope/explore/_metatensor.py b/python/chemiscope/explore/_metatensor.py new file mode 100644 index 000000000..85c170f5f --- /dev/null +++ b/python/chemiscope/explore/_metatensor.py @@ -0,0 +1,147 @@ +import numpy as np + + +def metatensor_featurizer( + model, + extensions_directory=None, + check_consistency=False, + device=None, +): + """ + Create a featurizer function using a `metatensor`_ model to obtain the features from + structures. The model must be able to create a ``"feature"`` output. + + :param model: model to use for the calculation. It can be a file path, a Python + instance of :py:class:`metatensor.torch.atomistic.MetatensorAtomisticModel`, or + the output of :py:func:`torch.jit.script` on + :py:class:`metatensor.torch.atomistic.MetatensorAtomisticModel`. + :param extensions_directory: a directory where model extensions are located + :param check_consistency: should we check the model for consistency when running, + defaults to False. + :param device: a torch device to use for the calculation. If ``None``, the function + will use the options in model's ``supported_device`` attribute. + + :returns: a function that takes a list of frames and returns the features. + + To use this function, additional dependencies are required. They can be installed + with the following command: + + .. code:: bash + + pip install chemiscope[metatensor] + + Here is an example using a pre-trained `metatensor`_ model, stored as a ``model.pt`` + file with the compiled extensions stored in the ``extensions/`` directory. To obtain + the details on how to get it, see metatensor `tutorial + `_. The + frames are obtained by reading structures from a file that `ase `_ can + read. + + .. code-block:: python + + import chemiscope import ase.io + + # Read the structures from the dataset frames = + ase.io.read("data/explore_c-gap-20u.xyz", ":") + + # Provide model file ("model.pt") to `metatensor_featurizer` featurizer = + chemiscope.metatensor_featurizer( + "model.pt", extensions_directory="extensions" + ) + + chemiscope.explore(frames, featurize=featurizer) + + For more examples, see the related :ref:`documentation + `. + + .. _metatensor: https://docs.metatensor.org/latest/index.html + .. _chemiscope-explore-metatensor: + https://chemiscope.org/docs/examples/7-explore-advanced.html#example-with-metatensor-model + """ + + # Check if dependencies were installed + try: + from metatensor.torch.atomistic import ModelOutput + from metatensor.torch.atomistic.ase_calculator import MetatensorCalculator + except ImportError as e: + raise ImportError( + f"Required package not found: {e}. Please install the dependency using " + "'pip install chemiscope[metatensor]'." + ) + + # Initialize metatensor calculator + CALCULATOR = MetatensorCalculator( + model=model, + extensions_directory=extensions_directory, + check_consistency=check_consistency, + device=device, + ) + + def get_features(atoms, environments): + """Run the model on a single atomic structure and extract the features""" + outputs = {"features": ModelOutput(per_atom=environments is not None)} + selected_atoms = _create_selected_atoms(environments) + output = CALCULATOR.run_model(atoms, outputs, selected_atoms) + + return output["features"].block().values.detach().cpu().numpy() + + def featurize(frames, environments): + if isinstance(frames, list): + envs_per_frame = _get_environments_per_frame(environments, len(frames)) + + outputs = [ + get_features(frame, envs) for frame, envs in zip(frames, envs_per_frame) + ] + return np.vstack(outputs) + else: + return get_features(frames, environments) + + return featurize + + +def _get_environments_per_frame(environments, num_frames): + """ + Organize the environments for each frame + + :param list environments: a list of atomic environments + :param int num_frames: total number of frames + """ + envs_per_frame = [None] * num_frames + + if environments is None: + return envs_per_frame + + frames_dict = {} + + # Group environments by structure_id + for env in environments: + structure_id = env[0] + if structure_id not in frames_dict: + frames_dict[structure_id] = [] + frames_dict[structure_id].append(env) + + # Insert environments to the frame indices + for structure_id, envs in frames_dict.items(): + if structure_id < num_frames: + envs_per_frame[structure_id] = envs + + return envs_per_frame + + +def _create_selected_atoms(environments): + """ + Convert environments into ``Labels`` object, to be used as ``selected_atoms`` + + :param environments: a list of atom-centered environments + """ + import torch + from metatensor.torch import Labels + + if environments is None: + return None + + # Extract system and atom indices from environments, overriding the structure id to + # be 0 (since we will only give a single frame to the calculator at the same time). + values = torch.tensor([(0, atom_id) for _, atom_id, _ in environments]) + + return Labels(names=["system", "atom"], values=values) diff --git a/python/chemiscope/explore/_soap_pca.py b/python/chemiscope/explore/_soap_pca.py new file mode 100644 index 000000000..6411c8732 --- /dev/null +++ b/python/chemiscope/explore/_soap_pca.py @@ -0,0 +1,90 @@ +import os + + +def soap_pca_featurize(frames, environments=None): + """ + Computes SOAP features for a given set of atomic structures and performs + dimensionality reduction using PCA. Custom featurize functions should + have the same signature. + + Note: + - The SOAP descriptor parameters are pre-defined. + - We use all available CPU cores for parallel computation of SOAP descriptors. + """ + + # Check if dependencies were installed + try: + from dscribe.descriptors import SOAP + from sklearn.decomposition import PCA + except ImportError as e: + raise ImportError( + f"Required package not found: {str(e)}. Please install dependency " + + "using 'pip install chemiscope[explore]'." + ) + centers = None + + # Get the atom indexes from the environments and pick related frames + if environments is not None: + centers = _extract_environment_indices(environments) + + # Pick frames and properties related to the environments if provided + if environments is not None: + # Sort environments by structure id and atom id + environments = sorted(environments, key=lambda x: (x[0], x[1])) + + # Check structure indexes + unique_structures = list({env[0] for env in environments}) + if any(index >= len(frames) for index in unique_structures): + raise IndexError( + "Some structure indices in 'environments' are larger than the number of" + "frames" + ) + + if len(unique_structures) != len(frames): + # only include frames that are present in the user-provided environments + frames = [frames[index] for index in unique_structures] + + # Get global species + species = set() + for frame in frames: + species.update(frame.get_chemical_symbols()) + species = list(species) + + # Check if periodic + is_periodic = all(all(frame.get_pbc()) for frame in frames) + + # Initialize calculator + soap = SOAP( + species=species, + r_cut=4.5, + n_max=8, + l_max=6, + sigma=0.2, + rbf="gto", + average="outer", + periodic=is_periodic, + weighting={"function": "pow", "c": 1, "m": 5, "d": 1, "r0": 3.5}, + compression={"mode": "mu1nu1"}, + ) + + # Calculate descriptors + n_jobs = min(len(frames), os.cpu_count()) + feats = soap.create(frames, centers=centers, n_jobs=n_jobs) + + # Compute pca + pca = PCA(n_components=2) + return pca.fit_transform(feats) + + +def _extract_environment_indices(environments): + """ + Convert from chemiscope's environments to DScribe's centers selection + + :param: list environments: each element is a list of [env_index, atom_index, cutoff] + """ + grouped_envs = {} + for [env_index, atom_index, _cutoff] in environments: + if env_index not in grouped_envs: + grouped_envs[env_index] = [] + grouped_envs[env_index].append(atom_index) + return list(grouped_envs.values()) diff --git a/python/examples/7-explore-advanced.py b/python/examples/7-explore-advanced.py index a57250c58..4d77360f9 100644 --- a/python/examples/7-explore-advanced.py +++ b/python/examples/7-explore-advanced.py @@ -186,6 +186,7 @@ def mace_mp0_tsne(frames, environments): fetch_dataset("mace-mp-tsne-m3cd.json.gz") chemiscope.show_input("data/mace-mp-tsne-m3cd.json.gz") + # %% # # Example with SOAP, t-SNE and environments diff --git a/python/examples/8-explore-with-metatensor.py b/python/examples/8-explore-with-metatensor.py new file mode 100644 index 000000000..135744110 --- /dev/null +++ b/python/examples/8-explore-with-metatensor.py @@ -0,0 +1,223 @@ +""" +.. _chemiscope-explore-metatensor: + +Using `metatensor`_ models for dataset exploration +================================================== + +In this example, we demonstrate how to create and use a `metatensor`_ model with +:py:func:`chemiscope.metatensor_featurizer` to extract features from the model, which +are then displayed using a chemiscope widget. To use this function, some additional +dependencies are required. You can install them with the following command: + +.. code:: bash + + pip install chemiscope[metatensor] + +.. _metatensor: https://docs.metatensor.org/latest/index.html +""" + +# %% +# +# Firstly, we import necessary packages and read structures from the dataset. + +from typing import Dict, List, Optional + +import ase.io +import torch +from metatensor.torch import Labels, TensorBlock, TensorMap +from metatensor.torch.atomistic import ( + MetatensorAtomisticModel, + ModelCapabilities, + ModelMetadata, + ModelOutput, + NeighborListOptions, + System, +) + +import chemiscope + +frames = ase.io.read("data/explore_c-gap-20u.xyz", ":") + +# %% +# +# Using pre-trained models +# ------------------------ +# +# Most commonly, you will have an already existing model in metatensor format that +# you'll want to use for dataset exploration. In this case, you'll have to create a +# ``featurizer`` function using :py:func:`chemiscope.metatensor_featurizer`. +# +# ``metatensor_featurizer`` takes an existing model as input. It can be either a +# ``MetatensorAtomisticModel`` instance or a path to a pre-trained model file (here +# ``"model.pt"``) + +featurizer = chemiscope.metatensor_featurizer(model="model.pt") + +# %% +# +# From here, you can use :py:func:`chemiscope.explore` to visualize the features +# computed from the structures. For this, we are passing the frames, the ``featurizer`` +# function, and — as the model computes per-atom properties — environments. + +chemiscope.explore( + frames=frames, + featurize=featurizer, + environments=chemiscope.all_atomic_environments(frames), +) + +# %% +# +# Defining a custom model +# ----------------------- +# +# Let's now move on and see how one can define a fully custom model to use through the +# metatensor interface. +# +# Here we will use an atom-centered representation, where each atomic environment is +# represented with the moments of the positions of the neighbors up to a maximal order. +# +# The model computes moments up to a specified maximum order :math:`k_{\text{max}}`, +# computing a representation :math:`F_i^k` +# +# .. math:: +# +# F_i^k = \sum_{j} \frac{r_{ij}}{r_c}^k +# +# where :math:`r_{ij}` is the distance between atom :math:`i` and its neighbor +# :math:`j`, :math:`k` is the moment order and :math:`r_c` is the cutoff radius. +# +# And then, the model will take a PCA of the above features to extract the three most +# relevant dimensions. + + +class FeatureModel(torch.nn.Module): + def __init__(self, cutoff: float, max_k: int): + super().__init__() + self.cutoff = cutoff + self.max_k = max_k + + self._neighbors_options = NeighborListOptions(cutoff=cutoff, full_list=True) + + def requested_neighbor_lists(self) -> List[NeighborListOptions]: + # our model requires a neighbor list, that will be computed and provided to it + # automatically. + return [self._neighbors_options] + + def forward( + self, + systems: List[System], + outputs: Dict[str, ModelOutput], + selected_atoms: Optional[Labels] = None, + ) -> Dict[str, TensorMap]: + if list(outputs.keys()) != ["features"]: + raise ValueError( + "this model can only compute 'features', but outputs contains other " + f"keys: {', '.join(outputs.keys())}" + ) + + if not outputs["features"].per_atom: + raise NotImplementedError("per structure features are not implemented") + + all_features = [] + all_samples = [] + + for system_i, system in enumerate(systems): + dtype = system.positions.dtype + device = system.positions.device + n_atoms = len(system.positions) + + # Initialize a tensor to store features for each atom + features = torch.zeros((n_atoms, self.max_k), dtype=dtype, device=device) + + # get the neighbor list for this system + neighbors = system.get_neighbor_list(self._neighbors_options) + i = neighbors.samples.column("first_atom") + + r_ij = torch.linalg.vector_norm(neighbors.values.reshape(-1, 3), dim=1) + r_ij /= self.cutoff + + for k in range(self.max_k): + features[i, k] += torch.pow(r_ij, k) + + all_features.append(features) + + # Create labels for each atom in the system + system_atom_labels = torch.tensor( + [[system_i, atom_i] for atom_i in range(n_atoms)] + ) + all_samples.append(system_atom_labels) + + # Concatenate features and labels across all systems + features_tensor = torch.cat(all_features, dim=0) + samples_tensor = torch.cat(all_samples, dim=0) + + # Take the PCA of the features + _, _, V = torch.linalg.svd(features_tensor - features_tensor.mean()) + features_pca = features_tensor @ V[:3].T + + # Add metadata to the output + block = TensorBlock( + values=features_pca, + samples=Labels(names=["system", "atom"], values=samples_tensor), + components=[], + properties=Labels( + names=["feature"], + values=torch.tensor([[0], [1], [2]]), + ), + ) + return { + "features": TensorMap( + keys=Labels(names=["_"], values=torch.tensor([[0]])), blocks=[block] + ) + } + + +# %% +# +# With the class defined, we can now create an instance of the model, giving ``cutoff`` +# and ``max_k`` as a maximal moment to compute. We don’t need to train this model since +# there are no trainable parameters inside. + +model = FeatureModel(cutoff=4.5, max_k=6) + + +# %% +# +# Next, we set up the model metadata and capabilities: + +metadata = ModelMetadata( + name="Example moment model", + description=( + "A model that computes atom-centered features based on the distances of " + "neighboring atoms" + ), +) + +capabilities = ModelCapabilities( + outputs={ + "features": ModelOutput(per_atom=True), + }, + atomic_types=[6], + interaction_range=0.0, + supported_devices=["cpu"], + dtype="float64", +) + +model = MetatensorAtomisticModel(model.eval(), metadata, capabilities) + +# %% +# +# For a more detailed example of exporting a model, please check the related +# documentation `page +# `_ +# in `metatensor`_. +# +# Once the model is fully defined, we can use it with +# :py:func:`chemiscope.metatensor_featurizer`: + +featurizer = chemiscope.metatensor_featurizer(model, check_consistency=True) +chemiscope.explore( + frames=frames, + featurize=featurizer, + environments=chemiscope.all_atomic_environments(frames), +) diff --git a/python/examples/model.pt b/python/examples/model.pt new file mode 100644 index 000000000..8739fb63c Binary files /dev/null and b/python/examples/model.pt differ diff --git a/tox.ini b/tox.ini index eab9f2354..86749e9e1 100644 --- a/tox.ini +++ b/tox.ini @@ -64,6 +64,10 @@ commands = [testenv:docs] description = Build chemiscope documentation +extras = + explore + metatensor + deps = -r docs/requirements.txt @@ -72,7 +76,6 @@ allowlist_externals = commands = npm run api-docs sphinx-build --doctree-dir docs/build/doctrees -W --builder html docs/src docs/build/html -extras = explore [testenv:generate-standalone] description = Generate standalone HTML for chemiscope