diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index abe27298..9399db6f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [3.9]
+ python-version: ['3.10']
steps:
- uses: actions/checkout@v2
@@ -27,15 +27,16 @@ jobs:
pip install flake8 pytest pycodestyle pydocstyle
pycodestyle --ignore E203,W503 --exclude=tests alignn
pydocstyle --match-dir=core --match-dir=io --match-dir=io --match-dir=ai --match-dir=analysis --match-dir=db --match-dir=tasks --count alignn
- flake8 --ignore E203,W503 --exclude=tests --statistics --count --exit-zero alignn
+ flake8 --ignore E203,W503 --exclude=tests,scripts --statistics --count --exit-zero alignn
- name: Test with pytest
run: |
export DGLBACKEND=pytorch
export CUDA_VISIBLE_DEVICES="-1"
#pip install dgl-cu111
- pip install flake8 pytest pycodestyle pydocstyle codecov pytest-cov coverage
+ pip install phonopy flake8 pytest pycodestyle pydocstyle codecov pytest-cov coverage
#pip uninstall -y torch nvidia-cublas-cu11 nvidia-cuda-nvrtc-cu11 nvidia-cuda-runtime-cu11 nvidia-cudnn-cu11
#conda install -y pytorch-cpu
+ pip install torch==2.0.0
#pip install attrs==22.1.0 certifi==2022.9.24 charset-normalizer==2.1.1 codecov==2.1.12 contourpy==1.0.5 coverage==6.5.0 cycler==0.11.0 dgl==0.9.1 flake8==5.0.4 fonttools==4.38.0 idna==3.4 iniconfig==1.1.1 jarvis-tools==2022.9.16 joblib==1.2.0 kiwisolver==1.4.4 matplotlib==3.6.1 mccabe==0.7.0 networkx==3.0b1 numpy==1.23.4 packaging==21.3 pandas==1.5.1 Pillow==9.2.0 pluggy==1.0.0 psutil==5.9.3 py==1.11.0 pycodestyle==2.9.1 pydantic==1.10.2 pydocstyle==6.1.1 pyflakes==2.5.0 pyparsing==2.4.7 pytest==7.1.3 pytest-cov==4.0.0 python-dateutil==2.8.2 pytorch-ignite==0.5.0.dev20221024 pytz==2022.5 requests==2.28.1 scikit-learn==1.1.2 scipy==1.9.3 six==1.16.0 snowballstemmer==2.2.0 spglib==2.0.1 threadpoolctl==3.1.0 tomli==2.0.1 toolz==0.12.0 torch==1.12.1 tqdm==4.64.1 typing_extensions==4.4.0 urllib3==1.26.12 xmltodict==0.13.0
echo 'PIP freeze'
pip freeze
@@ -43,8 +44,8 @@ jobs:
coverage report -m -i
codecov
codecov --token="85bd9c5d-9e55-4f6d-bd69-350ee5e3bb41"
- echo 'Train folder'
- train_folder.py -h
+ echo 'Train alignn'
+ train_alignn.py -h
echo 'Pre-trained models'
pretrained.py -h
#train_folder.py --root_dir "alignn/examples/sample_data" --config "alignn/examples/sample_data/config_example.json" --output_dir=temp
diff --git a/README.md b/README.md
index f9cf9a49..f6905423 100644
--- a/README.md
+++ b/README.md
@@ -109,8 +109,10 @@ pip install dgl==1.0.1+cu117 -f https://data.dgl.ai/wheels/cu117/repo.html
Examples
---------
-#### Dataset
-The main script to train model is `train_folder.py`. A user needs at least the following info to train a model: 1) `id_prop.csv` with name of the file and corresponding value, 2) `config_example.json` a config file with training and hyperparameters.
+Here, we provide examples for property prediction tasks, development of machine-learning force-fields (MLFF), usage of pre-trained property predictors, MLFFs, webapps, etc.
+
+#### Dataset preparation for property prediction tasks
+The main script to train a model is `train_alignn.py`. A user needs at least the following to train a model: 1) `id_prop.csv` with the name of each structure file and the corresponding target value, 2) `config_example.json`, a config file with training and model hyperparameters.
Users can keep their structure files in `POSCAR`, `.cif`, `.xyz` or `.pdb` files in a directory. In the examples below we will use POSCAR format files. In the same directory, there should be an `id_prop.csv` file.
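+
+A minimal illustrative `id_prop.csv` (hypothetical file names and target values; each row lists a structure file and its property value):
+
+```
+POSCAR-JVASP-1002.vasp,1.234
+POSCAR-JVASP-107.vasp,0.567
+POSCAR-JVASP-39.vasp,2.891
+```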
@@ -123,31 +125,35 @@ The dataset in split in 80:10:10 as training-validation-test set (controlled by
A brief help guide (`-h`) can be obtained as follows.
```
-train_folder.py -h
+train_alignn.py -h
```
#### Regression example
Now, the model is trained as follows. Please increase the `batch_size` parameter to something like 32 or 64 in `config_example.json` for general trainings.
```
-train_folder.py --root_dir "alignn/examples/sample_data" --config "alignn/examples/sample_data/config_example.json" --output_dir=temp
+train_alignn.py --root_dir "alignn/examples/sample_data" --config "alignn/examples/sample_data/config_example.json" --output_dir=temp
```
#### Classification example
While the above example is for regression, the follwoing example shows a classification task for metal/non-metal based on the above bandgap values. We transform the dataset
into 1 or 0 based on a threshold of 0.01 eV (controlled by the parameter, `classification_threshold`) and train a similar classification model. Currently, the script allows binary classification tasks only.
```
-train_folder.py --root_dir "alignn/examples/sample_data" --classification_threshold 0.01 --config "alignn/examples/sample_data/config_example.json" --output_dir=temp
+train_alignn.py --root_dir "alignn/examples/sample_data" --classification_threshold 0.01 --config "alignn/examples/sample_data/config_example.json" --output_dir=temp
```
#### Multi-output model example
While the above example regression was for single-output values, we can train multi-output regression models as well.
An example is given below for training formation energy per atom, bandgap and total energy per atom simulataneously. The script to generate the example data is provided in the script folder of the sample_data_multi_prop. Another example of training electron and phonon density of states is provided also.
```
-train_folder.py --root_dir "alignn/examples/sample_data_multi_prop" --config "alignn/examples/sample_data/config_example.json" --output_dir=temp
+train_alignn.py --root_dir "alignn/examples/sample_data_multi_prop" --config "alignn/examples/sample_data/config_example.json" --output_dir=temp
```
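+
+For multi-output training, each row of `id_prop.csv` lists the structure file followed by several comma-separated target values, e.g. (a sketch with hypothetical values):
+
+```
+POSCAR-JVASP-1002.vasp,-0.42,1.11,-5.63
+POSCAR-JVASP-107.vasp,-0.18,0.73,-4.95
+```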
#### Automated model training
Users can try training using multiple example scripts to run multiple dataset (such as JARVIS-DFT, Materials project, QM9_JCTC etc.). Look into the [alignn/scripts/train_*.py](https://github.com/usnistgov/alignn/tree/main/alignn/scripts) folder. This is done primarily to make the trainings more automated rather than making folder/ csv files etc.
These scripts automatically download datasets from [Databases in jarvis-tools](https://jarvis-tools.readthedocs.io/en/master/databases.html) and train several models. Make sure you specify your specific queuing system details in the scripts.
+#### Other examples
+
+Additional example trainings are available for [2D exfoliation energy](https://colab.research.google.com/github/knc6/jarvis-tools-notebooks/blob/master/jarvis-tools-notebooks/alignn_jarvis_leaderboard.ipynb) and [superconductor transition temperature](https://colab.research.google.com/github/knc6/jarvis-tools-notebooks/blob/master/jarvis-tools-notebooks/ALIGNN_Sc.ipynb).
+
Using pre-trained models
-------------------------
@@ -177,6 +183,8 @@ The following [notebook](https://colab.research.google.com/github/knc6/jarvis-to
The following [notebook](https://colab.research.google.com/github/knc6/jarvis-tools-notebooks/blob/master/jarvis-tools-notebooks/Train_ALIGNNFF_Mlearn.ipynb) provides an example of ALIGNN-FF model.
+For additional notebooks, check out [JARVIS-Tools-Notebooks](https://github.com/JARVIS-Materials-Design/jarvis-tools-notebooks?tab=readme-ov-file#artificial-intelligencemachine-learning).
+
Web-app
------------
@@ -191,6 +199,8 @@ A basic web-app is for direct-prediction available at [JARVIS-ALIGNN app](https:
ALIGNN-FF
-------------------------
+The atomistic line graph neural network-based force field (ALIGNN-FF) can be used to model both structurally and chemically diverse systems with any combination of 89 elements from the periodic table. To train the ALIGNN-FF model, we have used the JARVIS-DFT dataset, which contains around 75000 materials and 4 million energy-force entries, out of which 307113 are used in the training. These models can be further finetuned, or new models can be developed from scratch on a new dataset.
+
[ASE calculator](https://wiki.fysik.dtu.dk/ase/ase/calculators/calculators.html) provides interface to various codes. An example for ALIGNN-FF is give below. Note that there are multiple pretrained ALIGNN-FF models available, here we use the deafult_path model. As more accurate models are developed, they will be made available as well:
```
@@ -226,14 +236,21 @@ plt.ylabel('Total energy (eV)')
plt.show()
```
-To train ALIGNN-FF use `train_folder_ff.py` script which uses `atomwise_alignn` model:
+To train ALIGNN-FF, use the `train_alignn.py` script, which uses the `atomwise_alignn` model:
+
+The atomwise prediction example uses a similar setup as before, but instead of `id_prop.csv` it requires an `id_prop.json` file (see the example in the sample_data_ff directory). An example of compiling vasprun.xml files into an `id_prop.json` is available [here](https://colab.research.google.com/gist/knc6/5513b21f5fd83a7943509ffdf5c3608b/make_id_prop.ipynb). Note that ALIGNN-FF requires energy to be stored as energy per atom:
-AtomWise prediction example which looks for similar setup as before but unstead of `id_prop.csv`, it requires `id_prop.json` file (see example in the sample_data_ff directory). Note ALIGNN-FF requires energy stored as energy per atom:
```
-train_folder_ff.py --root_dir "alignn/examples/sample_data_ff" --config "alignn/examples/sample_data_ff/config_example_atomwise.json" --output_dir=temp
+train_alignn.py --root_dir "alignn/examples/sample_data_ff" --config "alignn/examples/sample_data_ff/config_example_atomwise.json" --output_dir=temp
```
+
+To finetune a model, also pass the `--restart_model_path` tag in the above command with the path of a pretrained ALIGNN-FF model with the same model configurations.
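+
+For example (with a hypothetical path to the pretrained checkpoint):
+
+```
+train_alignn.py --root_dir "alignn/examples/sample_data_ff" --config "alignn/examples/sample_data_ff/config_example_atomwise.json" --restart_model_path /path/to/pretrained/best_model.pt --output_dir=temp
+```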
+
+An example of training an MLFF for silicon is provided [here](https://colab.research.google.com/github/knc6/jarvis-tools-notebooks/blob/master/jarvis-tools-notebooks/Train_ALIGNNFF_Mlearn.ipynb). It is highly recommended to get familiar with this example before developing a new model. Note: newer model config options such as `lg_on_fly` and `add_reverse_forces` should be set to True for newer versions. For MD runs, `use_cutoff_function` is recommended.
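+
+A minimal sketch of these options in the atomwise config (assuming they are set in the `model` section, as in `config_example_atomwise.json`):
+
+```
+"model": {
+  "name": "alignn_atomwise",
+  "lg_on_fly": true,
+  "add_reverse_forces": true,
+  "use_cutoff_function": true
+}
+```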
+
+
A pretrained ALIGNN-FF (under active development right now) can be used for predicting several properties, such as:
```
@@ -248,7 +265,7 @@ To know about other tasks, type.
run_alignn_ff.py -h
```
-
+Several supporting scripts for structure optimization, equation of state, phonon, and related calculations are provided in the repo as well. If you need further assistance with a particular task, feel free to raise a GitHub issue.
@@ -386,9 +403,10 @@ Useful notes (based on some of the queries we received)
1) If you are using GPUs, make sure you have a compatible dgl-cuda version installed, for example: dgl-cu101 or dgl-cu111, so e.g. `pip install dgl-cu111` .
2) While comnventional '.cif' and '.pdb' files can be read using jarvis-tools, for complex files you might have to install `cif2cell` and `pytraj` respectively i.e.`pip install cif2cell==2.0.0a3` and `conda install -c ambermd pytraj`.
3) Make sure you use `batch_size` as 32 or 64 for large datasets, and not 2 as given in the example config file, else it will take much longer to train, and performnce might drop a lot.
-4) Note that `train_folder.py` and `pretrained.py` in alignn folder are actually python executable scripts. So, even if you don't provide absolute path of these scripts, they should work.
+4) Note that `train_alignn.py` and `pretrained.py` in the alignn folder are actually executable Python scripts. So, even if you don't provide the absolute path of these scripts, they should work.
5) Learn about the issue with QM9 results here: https://github.com/usnistgov/alignn/issues/54
-6) Make sure you have `pandas` version as 1.2.3.
+6) Make sure you have `pandas` version >1.2.3.
+7) Starting March 2024, the pytorch-ignite dependency will be removed to enable the conda-forge build.
diff --git a/alignn/__init__.py b/alignn/__init__.py
index 0527edcc..bd0e2a01 100644
--- a/alignn/__init__.py
+++ b/alignn/__init__.py
@@ -1,2 +1,3 @@
"""Version number."""
-__version__ = "2024.2.4"
+
+__version__ = "2024.3.4"
diff --git a/alignn/config.py b/alignn/config.py
index d755a9a9..c8a21279 100644
--- a/alignn/config.py
+++ b/alignn/config.py
@@ -3,18 +3,19 @@
import subprocess
from typing import Optional, Union
import os
-from pydantic import root_validator
-from pydantic.typing import Literal
+from typing import Literal
from alignn.utils import BaseSettings
-from alignn.models.modified_cgcnn import CGCNNConfig
-from alignn.models.icgcnn import ICGCNNConfig
-from alignn.models.gcn import SimpleGCNConfig
-from alignn.models.densegcn import DenseGCNConfig
from alignn.models.alignn import ALIGNNConfig
from alignn.models.alignn_atomwise import ALIGNNAtomWiseConfig
-from alignn.models.dense_alignn import DenseALIGNNConfig
-from alignn.models.alignn_cgcnn import ACGCNNConfig
-from alignn.models.alignn_layernorm import ALIGNNConfig as ALIGNN_LN_Config
+
+# from alignn.models.modified_cgcnn import CGCNNConfig
+# from alignn.models.icgcnn import ICGCNNConfig
+# from alignn.models.gcn import SimpleGCNConfig
+# from alignn.models.densegcn import DenseGCNConfig
+# from pydantic import model_validator
+# from alignn.models.dense_alignn import DenseALIGNNConfig
+# from alignn.models.alignn_cgcnn import ACGCNNConfig
+# from alignn.models.alignn_layernorm import ALIGNNConfig as ALIGNN_LN_Config
# from typing import List
@@ -159,11 +160,11 @@ class TrainingConfig(BaseSettings):
"tinnet_O",
"tinnet_N",
] = "dft_3d"
- target: TARGET_ENUM = "formation_energy_peratom"
+ target: TARGET_ENUM = "exfoliation_energy"
atom_features: Literal["basic", "atomic_number", "cfid", "cgcnn"] = "cgcnn"
- neighbor_strategy: Literal[
- "k-nearest", "voronoi", "radius_graph"
- ] = "k-nearest"
+ neighbor_strategy: Literal["k-nearest", "voronoi", "radius_graph"] = (
+ "k-nearest"
+ )
id_tag: Literal["jid", "id", "_oqmd_entry_id"] = "jid"
# logging configuration
@@ -216,26 +217,26 @@ class TrainingConfig(BaseSettings):
# model configuration
model: Union[
- CGCNNConfig,
- ICGCNNConfig,
- SimpleGCNConfig,
- DenseGCNConfig,
ALIGNNConfig,
ALIGNNAtomWiseConfig,
- ALIGNN_LN_Config,
- DenseALIGNNConfig,
- ACGCNNConfig,
- ] = ALIGNNConfig(name="alignn")
- # ] = CGCNNConfig(name="cgcnn")
-
- @root_validator()
- def set_input_size(cls, values):
- """Automatically configure node feature dimensionality."""
- values["model"].atom_input_features = FEATURESET_SIZE[
- values["atom_features"]
- ]
-
- return values
+ # CGCNNConfig,
+ # ICGCNNConfig,
+ # SimpleGCNConfig,
+ # DenseGCNConfig,
+ # ALIGNN_LN_Config,
+ # DenseALIGNNConfig,
+ # ACGCNNConfig,
+ ] = ALIGNNAtomWiseConfig(name="alignn_atomwise")
+
+ # @root_validator()
+ # @model_validator(mode='before')
+ # def set_input_size(cls, values):
+ # """Automatically configure node feature dimensionality."""
+ # values["model"].atom_input_features = FEATURESET_SIZE[
+ # values["atom_features"]
+ # ]
+
+ # return values
# @property
# def atom_input_features(self):
diff --git a/alignn/examples/sample_data/config_example.json b/alignn/examples/sample_data/config_example.json
index 9d42c70a..dee460c2 100644
--- a/alignn/examples/sample_data/config_example.json
+++ b/alignn/examples/sample_data/config_example.json
@@ -37,7 +37,7 @@
"max_neighbors": 12,
"keep_data_order": true,
"model": {
- "name": "alignn",
+ "name": "alignn_atomwise",
"alignn_layers": 4,
"gcn_layers": 4,
"atom_input_features": 92,
diff --git a/alignn/graphs.py b/alignn/graphs.py
index ae47a50b..25ca9aaf 100644
--- a/alignn/graphs.py
+++ b/alignn/graphs.py
@@ -1,4 +1,5 @@
"""Module to generate networkx graphs."""
+
from jarvis.core.atoms import get_supercell_dims
from jarvis.core.specie import Specie
from jarvis.core.utils import random_colors
@@ -861,7 +862,7 @@ def __getitem__(self, idx):
"""Get StructureDataset sample."""
g = self.graphs[idx]
label = self.labels[idx]
-
+ # id = self.ids[idx]
if self.transform:
g = self.transform(g)
diff --git a/alignn/models/alignn.py b/alignn/models/alignn.py
index 971754cb..aafd0361 100644
--- a/alignn/models/alignn.py
+++ b/alignn/models/alignn.py
@@ -2,6 +2,7 @@
A prototype crystal line graph network dgl implementation.
"""
+
from typing import Tuple, Union
import dgl
@@ -11,7 +12,7 @@
from dgl.nn import AvgPooling
# from dgl.nn.functional import edge_softmax
-from pydantic.typing import Literal
+from typing import Literal
from torch import nn
from torch.nn import functional as F
diff --git a/alignn/models/alignn_atomwise.py b/alignn/models/alignn_atomwise.py
index eb1c6b75..228688b5 100644
--- a/alignn/models/alignn_atomwise.py
+++ b/alignn/models/alignn_atomwise.py
@@ -2,6 +2,7 @@
A prototype crystal line graph network dgl implementation.
"""
+
from typing import Tuple, Union
from torch.autograd import grad
import dgl
@@ -11,7 +12,7 @@
import torch
# from dgl.nn.functional import edge_softmax
-from pydantic.typing import Literal
+from typing import Literal
from torch import nn
from torch.nn import functional as F
from alignn.models.utils import RBFExpansion
@@ -333,8 +334,9 @@ def __init__(
)
if self.classification:
- self.fc = nn.Linear(config.hidden_features, 2)
- self.softmax = nn.LogSoftmax(dim=1)
+ self.fc = nn.Linear(config.hidden_features, 1)
+ self.softmax = nn.Sigmoid()
+ # self.softmax = nn.LogSoftmax(dim=1)
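+ # note: a single sigmoid output gives a probability in [0, 1] for binary classification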
else:
self.fc = nn.Linear(config.hidden_features, config.output_features)
self.link = None
@@ -543,6 +545,7 @@ def forward(
out = self.link(out)
if self.classification:
+ # out = torch.max(out,dim=1)
out = self.softmax(out)
result["out"] = out
result["grad"] = forces
diff --git a/alignn/models/alignn_cgcnn.py b/alignn/models/alignn_cgcnn.py
deleted file mode 100644
index f3d60816..00000000
--- a/alignn/models/alignn_cgcnn.py
+++ /dev/null
@@ -1,313 +0,0 @@
-"""CGCNN: dgl implementation."""
-
-from typing import Tuple
-
-import dgl
-import dgl.function as fn
-import numpy as np
-import torch
-import torch.nn.functional as F
-from dgl.nn import AvgPooling
-from pydantic.typing import Literal
-from torch import nn
-
-# import torch
-from alignn.models.utils import RBFExpansion
-from alignn.utils import BaseSettings
-
-
-def compute_bond_cosines(edges):
- """Compute bond angle cosines from bond displacement vectors."""
- # line graph edge: (a, b), (b, c)
- # `a -> b -> c`
- # use law of cosines to compute angles cosines
- # negate src bond so displacements are like `a <- b -> c`
- # cos(theta) = ba \dot bc / (||ba|| ||bc||)
- r1 = -edges.src["r"]
- r2 = edges.dst["r"]
- bond_cosine = torch.sum(r1 * r2, dim=1) / (
- torch.norm(r1, dim=1) * torch.norm(r2, dim=1)
- )
- bond_cosine = torch.clamp(bond_cosine, -1, 1)
- return {"h": bond_cosine}
-
-
-class ACGCNNConfig(BaseSettings):
- """Hyperparameter schema for jarvisdgl.models.cgcnn."""
-
- name: Literal["alignn_cgcnn"]
- conv_layers: int = 3
- atom_input_features: int = 92
- edge_features: int = 40
- node_features: int = 92
- fc_layers: int = 1
- fc_features: int = 256
- output_features: int = 1
- alignn_layers: int = 3
- # if link == log, apply `exp` to final outputs
- # to constrain predictions to be positive
- link: Literal["identity", "log", "logit"] = "identity"
- zero_inflated: bool = False
- classification: bool = False
-
- class Config:
- """Configure model settings behavior."""
-
- env_prefix = "jv_model"
-
-
-class ACGCNNConv(nn.Module):
- """Xie and Grossman graph convolution function.
-
- 10.1103/PhysRevLett.120.145301
- """
-
- def __init__(
- self,
- node_features: int = 64,
- edge_features: int = 32,
- return_messages: bool = False,
- ):
- """Initialize torch modules for CGCNNConv layer."""
- super().__init__()
- self.node_features = node_features
- self.edge_features = edge_features
- self.return_messages = return_messages
-
- # CGCNN-Conv operates on augmented edge features
- # z_ij = cat(v_i, v_j, u_ij)
- # m_ij = σ(z_ij W_f + b_f) ⊙ g_s(z_ij W_s + b_s)
- # coalesce parameters for W_f and W_s
- # but -- split them up along feature dimension
- self.linear_src = nn.Linear(node_features, 2 * node_features)
- self.linear_dst = nn.Linear(node_features, 2 * node_features)
- self.linear_edge = nn.Linear(edge_features, 2 * node_features)
- self.bn_message = nn.BatchNorm1d(2 * node_features)
-
- # final batchnorm
- self.bn = nn.BatchNorm1d(node_features)
-
- def forward(
- self,
- g: dgl.DGLGraph,
- node_feats: torch.Tensor,
- edge_feats: torch.Tensor,
- ) -> torch.Tensor:
- """CGCNN convolution defined in Eq 5.
-
- 10.1103/PhysRevLett.120.14530
- """
- g = g.local_var()
-
- # instead of concatenating (u || v || e) and applying one weight matrix
- # split the weight matrix into three, apply, then sum
- # see https://docs.dgl.ai/guide/message-efficient.html
- # compute edge messages -- coalesce W_f and W_s from the paper
- # but split them on feature dimensions to update u, v, e separately
- # m = BatchNorm(Linear(cat(u, v, e)))
- g.ndata["h_src"] = self.linear_src(node_feats)
- g.ndata["h_dst"] = self.linear_dst(node_feats)
- g.apply_edges(fn.u_add_v("h_src", "h_dst", "h_nodes"))
- m = g.edata.pop("h_nodes") + self.linear_edge(edge_feats)
- m = self.bn_message(m)
-
- # split messages into W_f and W_s terms
- # multiply output of atom interaction net and edge attention net
- # i.e. compute the term inside the summation in eq 5
- # σ(z_ij W_f + b_f) ⊙ g_s(z_ij W_s + b_s)
- h_f, h_s = torch.chunk(m, 2, dim=1)
- m = torch.sigmoid(h_f) * F.softplus(h_s)
- g.edata["m"] = m
-
- # apply the convolution term in eq. 5 (without residual connection)
- # storing the results in edge features `h`
- g.update_all(
- message_func=fn.copy_e("m", "z"), reduce_func=fn.sum("z", "h"),
- )
-
- # final batchnorm
- h = self.bn(g.ndata.pop("h"))
-
- # residual connection plus nonlinearity
- out = F.softplus(node_feats + h)
-
- if self.return_messages:
- return out, m
-
- return out
-
-
-class ACGCNN(nn.Module):
- """CGCNN dgl implementation."""
-
- def __init__(
- self, config: ACGCNNConfig = ACGCNNConfig(name="alignn_cgcnn")
- ):
- """Set up CGCNN modules."""
- super().__init__()
-
- self.rbf = RBFExpansion(vmin=0, vmax=8.0, bins=config.edge_features)
- self.abf = RBFExpansion(
- vmin=-np.pi / 2, vmax=np.pi / 2, bins=config.edge_features
- )
- # self.abf = RBFExpansion(vmin=-1, vmax=1, bins=config.edge_features)
- self.atom_embedding = nn.Linear(
- config.atom_input_features, config.node_features
- )
- self.classification = config.classification
- self.conv_layers1 = nn.ModuleList(
- [
- ACGCNNConv(config.node_features, config.edge_features)
- for _ in range(config.conv_layers)
- ]
- )
-
- self.conv_layers2 = nn.ModuleList(
- [
- ACGCNNConv(config.edge_features, config.edge_features)
- for _ in range(config.conv_layers)
- ]
- )
- self.readout = AvgPooling()
-
- self.fc = nn.Sequential(
- nn.Linear(config.node_features, config.fc_features), nn.Softplus()
- )
-
- if config.zero_inflated:
- # add latent Bernoulli variable model to zero out
- # predictions in non-negative regression model
- self.zero_inflated = True
- self.fc_nonzero = nn.Linear(config.fc_features, 1)
- self.fc_scale = nn.Linear(config.fc_features, 1)
- # self.fc_shape = nn.Linear(config.fc_features, 1)
- self.fc_scale.bias.data = torch.tensor(
- # np.log(2.1), dtype=torch.float
- 2.1,
- dtype=torch.float,
- )
- if self.classification:
- raise ValueError(
- "Classification not implemented with ZIG loss."
- )
- else:
- self.zero_inflated = False
- if self.classification:
- self.fc_out = nn.Linear(config.fc_features, 2)
- self.softmax = nn.LogSoftmax(dim=1)
- else:
- self.fc_out = nn.Linear(
- config.fc_features, config.output_features
- )
- self.link = None
- self.link_name = config.link
- if config.link == "identity":
- self.link = lambda x: x
- elif config.link == "log":
- self.link = torch.exp
- avg_gap = 0.7 # magic number -- average bandgap in dft_3d
- if not self.zero_inflated:
- self.fc_out.bias.data = torch.tensor(
- np.log(avg_gap), dtype=torch.float
- )
- elif config.link == "logit":
- self.link = torch.sigmoid
-
- def forward(self, g) -> torch.Tensor:
- """CGCNN function mapping graph to outputs."""
- g, lg = g
- g = g.local_var()
- # lg = g.line_graph(shared=True)
- # lg.apply_edges(compute_bond_cosines)
- angle_features = self.abf(lg.edata.pop("h"))
- # fixed edge features: RBF-expanded bondlengths
- bondlength = torch.norm(g.edata.pop("r"), dim=1)
- edge_features = self.rbf(bondlength)
-
- # initial node features: atom feature network...
- v = g.ndata.pop("atom_features")
- node_features = self.atom_embedding(v)
-
- # CGCNN-Conv block: update node features
- for conv_layer1, conv_layer2 in zip(
- self.conv_layers1, self.conv_layers2
- ):
- node_features = conv_layer1(g, node_features, edge_features)
- edge_features = conv_layer2(lg, edge_features, angle_features)
-
- # crystal-level readout
- features = self.readout(g, node_features)
- features = F.softplus(features)
- features = self.fc(features)
- features = F.softplus(features)
-
- if self.zero_inflated:
- logit_p = self.fc_nonzero(features)
- log_scale = self.fc_scale(features)
- # log_shape = self.fc_shape(features)
-
- # pred = (torch.sigmoid(logit_p)
- # * torch.exp(log_scale)
- # * torch.exp(log_shape))
- # out = torch.where(p < 0.5, torch.zeros_like(out), out)
- return (
- torch.squeeze(logit_p),
- torch.squeeze(log_scale),
- # torch.squeeze(log_shape),
- )
-
- else:
- out = self.fc_out(features)
- if self.link:
- out = self.link(out)
- if self.classification:
- # out = torch.round(torch.sigmoid(out))
- out = self.softmax(out)
- return torch.squeeze(out)
-
-
-class ZeroInflatedGammaLoss(nn.modules.loss._Loss):
- """Zero inflated Gamma regression loss."""
-
- def predict(self, inputs: Tuple[torch.Tensor, torch.Tensor]):
- """Combine ZIG multi-part outputs to yield real-valued predictions."""
- # logit_p, log_scale, log_shape = inputs
- logit_p, log_scale = inputs
- return (
- torch.sigmoid(logit_p)
- * F.softplus(log_scale)
- # * torch.exp(log_scale)
- # * (1 + torch.exp(log_shape))
- )
-
- def forward(
- self, inputs: Tuple[torch.Tensor, torch.Tensor], target: torch.Tensor,
- ) -> torch.Tensor:
- """Zero-inflated Gamma loss.
-
- binary crossentropy loss combined with Gamma negative log likelihood
- """
- # logit_p, log_scale, log_shape = inputs
- logit_p, log_scale = inputs
-
- bce_loss = F.binary_cross_entropy_with_logits(
- logit_p, target, reduction="sum"
- )
-
- indicator = target > 0
- # g_loss = F.mse_loss(
- # log_scale[indicator],
- # torch.log(target[indicator]), reduction="sum"
- # )
- # g_loss = F.mse_loss(
- # torch.exp(log_scale[indicator]),
- # target[indicator], reduction="sum"
- # )
- g_loss = F.mse_loss(
- F.softplus(log_scale[indicator]),
- target[indicator],
- reduction="sum",
- )
-
- return (bce_loss + g_loss) / target.numel()
- # return bce_loss + torch.tensor(2.0) * g_loss.sum() / indicator.sum()
diff --git a/alignn/models/alignn_layernorm.py b/alignn/models/alignn_layernorm.py
deleted file mode 100644
index c45328ac..00000000
--- a/alignn/models/alignn_layernorm.py
+++ /dev/null
@@ -1,300 +0,0 @@
-"""Atomistic LIne Graph Neural Network.
-
-A prototype crystal line graph network dgl implementation.
-"""
-from typing import Tuple, Union
-
-import dgl
-import dgl.function as fn
-import numpy as np
-import torch
-from dgl.nn import AvgPooling
-
-# from dgl.nn.functional import edge_softmax
-from pydantic.typing import Literal
-from torch import nn
-from torch.nn import functional as F
-
-from alignn.models.utils import RBFExpansion
-from alignn.utils import BaseSettings
-
-
-class ALIGNNConfig(BaseSettings):
- """Hyperparameter schema for jarvisdgl.models.alignn."""
-
- name: Literal["alignn_layernorm"]
- alignn_layers: int = 2
- gcn_layers: int = 3
- atom_input_features: int = 92
- edge_input_features: int = 16
- triplet_input_features: int = 40
- embedding_features: int = 112
- hidden_features: int = 256
- # fc_layers: int = 1
- # fc_features: int = 64
- output_features: int = 1
-
- # if link == log, apply `exp` to final outputs
- # to constrain predictions to be positive
- link: Literal["identity", "log", "logit"] = "identity"
- zero_inflated: bool = False
- classification: bool = False
-
- class Config:
- """Configure model settings behavior."""
-
- env_prefix = "jv_model"
-
-
-class EdgeGatedGraphConv(nn.Module):
- """Edge gated graph convolution from arxiv:1711.07553.
-
- see also arxiv:2003.0098.
-
- This is similar to CGCNN, but edge features only go into
- the soft attention / edge gating function, and the primary
- node update function is W cat(u, v) + b
- """
-
- def __init__(
- self, input_features: int, output_features: int, residual: bool = True
- ):
- """Initialize parameters for ALIGNN update."""
- super().__init__()
- self.residual = residual
- # CGCNN-Conv operates on augmented edge features
- # z_ij = cat(v_i, v_j, u_ij)
- # m_ij = σ(z_ij W_f + b_f) ⊙ g_s(z_ij W_s + b_s)
- # coalesce parameters for W_f and W_s
- # but -- split them up along feature dimension
- self.src_gate = nn.Linear(input_features, output_features)
- self.dst_gate = nn.Linear(input_features, output_features)
- self.edge_gate = nn.Linear(input_features, output_features)
- self.bn_edges = nn.LayerNorm(output_features)
- # self.bn_edges = nn.BatchNorm1d(output_features)
-
- self.src_update = nn.Linear(input_features, output_features)
- self.dst_update = nn.Linear(input_features, output_features)
- self.bn_nodes = nn.LayerNorm(output_features)
- # self.bn_nodes = nn.BatchNorm1d(output_features)
-
- def forward(
- self,
- g: dgl.DGLGraph,
- node_feats: torch.Tensor,
- edge_feats: torch.Tensor,
- ) -> torch.Tensor:
- """Edge-gated graph convolution.
-
- h_i^l+1 = ReLU(U h_i + sum_{j->i} eta_{ij} ⊙ V h_j)
- """
- g = g.local_var()
-
- # instead of concatenating (u || v || e) and applying one weight matrix
- # split the weight matrix into three, apply, then sum
- # see https://docs.dgl.ai/guide/message-efficient.html
- # but split them on feature dimensions to update u, v, e separately
- # m = BatchNorm(Linear(cat(u, v, e)))
-
- # compute edge updates, equivalent to:
- # Softplus(Linear(u || v || e))
- g.ndata["e_src"] = self.src_gate(node_feats)
- g.ndata["e_dst"] = self.dst_gate(node_feats)
- g.apply_edges(fn.u_add_v("e_src", "e_dst", "e_nodes"))
- m = g.edata.pop("e_nodes") + self.edge_gate(edge_feats)
-
- g.edata["sigma"] = torch.sigmoid(m)
- g.ndata["Bh"] = self.dst_update(node_feats)
- g.update_all(
- fn.u_mul_e("Bh", "sigma", "m"), fn.sum("m", "sum_sigma_h")
- )
- g.update_all(fn.copy_e("sigma", "m"), fn.sum("m", "sum_sigma"))
- g.ndata["h"] = g.ndata["sum_sigma_h"] / (g.ndata["sum_sigma"] + 1e-6)
- x = self.src_update(node_feats) + g.ndata.pop("h")
-
- # softmax version seems to perform slightly worse
- # that the sigmoid-gated version
- # compute node updates
- # Linear(u) + edge_gates ⊙ Linear(v)
- # g.edata["gate"] = edge_softmax(g, y)
- # g.ndata["h_dst"] = self.dst_update(node_feats)
- # g.update_all(fn.u_mul_e("h_dst", "gate", "m"), fn.sum("m", "h"))
- # x = self.src_update(node_feats) + g.ndata.pop("h")
-
- # node and edge updates
- x = F.silu(self.bn_nodes(x))
- y = F.silu(self.bn_edges(m))
-
- if self.residual:
- x = node_feats + x
- y = edge_feats + y
-
- return x, y
-
-
-class ALIGNNConv(nn.Module):
- """Line graph update."""
-
- def __init__(
- self, in_features: int, out_features: int,
- ):
- """Set up ALIGNN parameters."""
- super().__init__()
- self.node_update = EdgeGatedGraphConv(in_features, out_features)
- self.edge_update = EdgeGatedGraphConv(out_features, out_features)
-
- def forward(
- self,
- g: dgl.DGLGraph,
- lg: dgl.DGLGraph,
- x: torch.Tensor,
- y: torch.Tensor,
- z: torch.Tensor,
- ):
- """Node and Edge updates for ALIGNN layer.
-
- x: node input features
- y: edge input features
- z: edge pair input features
- """
- g = g.local_var()
- lg = lg.local_var()
- # Edge-gated graph convolution update on crystal graph
- x, m = self.node_update(g, x, y)
-
- # Edge-gated graph convolution update on crystal graph
- y, z = self.edge_update(lg, m, z)
-
- return x, y, z
-
-
-class MLPLayer(nn.Module):
- """Multilayer perceptron layer helper."""
-
- def __init__(self, in_features: int, out_features: int):
- """Linear, Batchnorm, SiLU layer."""
- super().__init__()
- self.layer = nn.Sequential(
- nn.Linear(in_features, out_features),
- nn.LayerNorm(out_features),
- # nn.BatchNorm1d(out_features),
- nn.SiLU(),
- )
-
- def forward(self, x):
- """Linear, Batchnorm, silu layer."""
- return self.layer(x)
-
-
-class ALIGNN(nn.Module):
- """Atomistic Line graph network.
-
- Chain alternating gated graph convolution updates on crystal graph
- and atomistic line graph.
- """
-
- def __init__(
- self, config: ALIGNNConfig = ALIGNNConfig(name="alignn_layernorm")
- ):
- """Initialize class with number of input features, conv layers."""
- super().__init__()
- print(config)
- self.classification = config.classification
-
- self.atom_embedding = MLPLayer(
- config.atom_input_features, config.hidden_features
- )
-
- self.edge_embedding = nn.Sequential(
- RBFExpansion(vmin=0, vmax=8.0, bins=config.edge_input_features,),
- MLPLayer(config.edge_input_features, config.embedding_features),
- MLPLayer(config.embedding_features, config.hidden_features),
- )
- self.angle_embedding = nn.Sequential(
- RBFExpansion(
- vmin=-1, vmax=1.0, bins=config.triplet_input_features,
- ),
- MLPLayer(config.triplet_input_features, config.embedding_features),
- MLPLayer(config.embedding_features, config.hidden_features),
- )
-
- self.alignn_layers = nn.ModuleList(
- [
- ALIGNNConv(config.hidden_features, config.hidden_features,)
- for idx in range(config.alignn_layers)
- ]
- )
- self.gcn_layers = nn.ModuleList(
- [
- EdgeGatedGraphConv(
- config.hidden_features, config.hidden_features
- )
- for idx in range(config.gcn_layers)
- ]
- )
-
- self.readout = AvgPooling()
-
- if self.classification:
- self.fc = nn.Linear(config.hidden_features, 2)
- self.softmax = nn.LogSoftmax(dim=1)
- else:
- self.fc = nn.Linear(config.hidden_features, config.output_features)
- self.link = None
- self.link_name = config.link
- if config.link == "identity":
- self.link = lambda x: x
- elif config.link == "log":
- self.link = torch.exp
- avg_gap = 0.7 # magic number -- average bandgap in dft_3d
- self.fc.bias.data = torch.tensor(
- np.log(avg_gap), dtype=torch.float
- )
- elif config.link == "logit":
- self.link = torch.sigmoid
-
- def forward(
- self, g: Union[Tuple[dgl.DGLGraph, dgl.DGLGraph], dgl.DGLGraph]
- ):
- """ALIGNN : start with `atom_features`.
-
- x: atom features (g.ndata)
- y: bond features (g.edata and lg.ndata)
- z: angle features (lg.edata)
- """
- if len(self.alignn_layers) > 0:
- g, lg = g
- lg = lg.local_var()
-
- # angle features (fixed)
- z = self.angle_embedding(lg.edata.pop("h"))
-
- g = g.local_var()
-
- # initial node features: atom feature network...
- x = g.ndata.pop("atom_features")
- x = self.atom_embedding(x)
-
- # initial bond features
- bondlength = torch.norm(g.edata.pop("r"), dim=1)
- y = self.edge_embedding(bondlength)
-
- # ALIGNN updates: update node, edge, triplet features
- for alignn_layer in self.alignn_layers:
- x, y, z = alignn_layer(g, lg, x, y, z)
-
- # gated GCN updates: update node, edge features
- for gcn_layer in self.gcn_layers:
- x, y = gcn_layer(g, x, y)
-
- # norm-activation-pool-classify
- h = self.readout(g, x)
- out = self.fc(h)
-
- if self.link:
- out = self.link(out)
-
- if self.classification:
- # out = torch.round(torch.sigmoid(out))
- out = self.softmax(out)
- return torch.squeeze(out)
diff --git a/alignn/models/dense_alignn.py b/alignn/models/dense_alignn.py
deleted file mode 100644
index 273bce9e..00000000
--- a/alignn/models/dense_alignn.py
+++ /dev/null
@@ -1,509 +0,0 @@
-"""Atomistic LIne Graph Neural Network.
-
-A prototype crystal line graph network dgl implementation.
-"""
-from typing import Tuple, Union
-
-# from typing import List, Optional, Tuple, Union
-import dgl
-import dgl.function as fn
-import numpy as np
-import torch
-from dgl.nn import AvgPooling
-from pydantic import root_validator
-from pydantic.typing import Literal
-from torch import nn
-from torch.nn import functional as F
-
-from alignn.models.utils import RBFExpansion
-from alignn.utils import BaseSettings
-
-
-class DenseALIGNNConfig(BaseSettings):
- """Hyperparameter schema for jarvisdgl.models.dense_alignn."""
-
- name: Literal["dense_alignn"]
- alignn_layers: int = 3
- gcn_layers: int = 3
- atom_input_features: int = 92
- edge_input_features: int = 81
- triplet_input_features: int = 40
- embedding_features: int = 92
- initial_features: int = 92
- bottleneck_features: int = 92
- residual: bool = True
- growth_rate: int = 64
- # fc_layers: int = 1
- # fc_features: int = 64
- output_features: int = 1
- norm: Literal["batchnorm", "layernorm"] = "layernorm"
-
- # if link == log, apply `exp` to final outputs
- # to constrain predictions to be positive
- link: Literal["identity", "log", "logit"] = "identity"
- zero_inflated: bool = False
- classification: bool = False
-
- @root_validator()
- def ensure_residual_dimensions_match(cls, values):
- """Check that residual connections are allowed."""
- initial_features = values.get("initial_features")
- bottleneck_features = values.get("bottleneck_features")
- residual = values.get("residual")
- if residual:
- if initial_features != bottleneck_features:
- raise ValueError(
- "input and bottleneck dims must match to use residuals."
- )
-
- return values
-
- class Config:
- """Configure model settings behavior."""
-
- env_prefix = "jv_model"
-
-
-class EdgeGatedGraphConv(nn.Module):
- """Edge gated graph convolution from arxiv:1711.07553.
-
- see also arxiv:2003.0098.
-
- This is similar to CGCNN, but edge features only go into
- the soft attention / edge gating function, and the primary
- node update function is W cat(u, v) + b
- """
-
- def __init__(
- self,
- node_input_features: int,
- edge_input_features: int,
- output_features: int,
- residual: bool = True,
- norm=nn.BatchNorm1d,
- ):
- """Initialize parameters for ALIGNN update."""
- super().__init__()
- self.residual = residual
-
- # CGCNN-Conv operates on augmented edge features
- # z_ij = cat(v_i, v_j, u_ij)
- # m_ij = σ(z_ij W_f + b_f) ⊙ g_s(z_ij W_s + b_s)
- # coalesce parameters for W_f and W_s
- # but -- split them up along feature dimension
- self.norm_edges = norm(edge_input_features)
- self.src_gate = nn.Linear(
- node_input_features, output_features, bias=False
- )
- self.dst_gate = nn.Linear(
- node_input_features, output_features, bias=False
- )
- self.edge_gate = nn.Linear(
- edge_input_features, output_features, bias=False
- )
-
- self.norm_nodes = norm(node_input_features)
- self.src_update = nn.Linear(
- node_input_features, output_features, bias=False
- )
- self.dst_update = nn.Linear(
- node_input_features, output_features, bias=False
- )
-
- def forward(
- self,
- g: dgl.DGLGraph,
- node_feats: torch.Tensor,
- edge_feats: torch.Tensor,
- ) -> torch.Tensor:
- """Edge-gated graph convolution.
-
- h_i^l+1 = ReLU(U h_i + sum_{j->i} eta_{ij} ⊙ V h_j)
- """
- g = g.local_var()
-
- # pre-normalization, pre-activation
- # node and edge updates
- x = F.silu(self.norm_nodes(node_feats))
- y = F.silu(self.norm_edges(edge_feats))
-
- # instead of concatenating (u || v || e) and applying one weight matrix
- # split the weight matrix into three, apply, then sum
- # see https://docs.dgl.ai/guide/message-efficient.html
- # but split them on feature dimensions to update u, v, e separately
- # m = BatchNorm(Linear(cat(u, v, e)))
-
- # compute edge updates, equivalent to:
- # Softplus(Linear(u || v || e))
- g.ndata["e_src"] = self.src_gate(x)
- g.ndata["e_dst"] = self.dst_gate(x)
- g.apply_edges(fn.u_add_v("e_src", "e_dst", "e_nodes"))
- y = g.edata.pop("e_nodes") + self.edge_gate(y)
-
- g.edata["sigma"] = torch.sigmoid(y)
- g.ndata["Bh"] = self.dst_update(x)
- g.update_all(
- fn.u_mul_e("Bh", "sigma", "m"), fn.sum("m", "sum_sigma_h")
- )
- g.update_all(fn.copy_e("sigma", "m"), fn.sum("m", "sum_sigma"))
- g.ndata["h"] = g.ndata["sum_sigma_h"] / (g.ndata["sum_sigma"] + 1e-6)
- x = self.src_update(x) + g.ndata.pop("h")
-
- if self.residual:
- x = node_feats + x
- y = edge_feats + y
-
- return x, y
-
-
-class ALIGNNConv(nn.Module):
- """Line graph update."""
-
- def __init__(
- self,
- in_features: int,
- out_features: int,
- residual: bool = False,
- norm=nn.BatchNorm1d,
- ):
- """Set up ALIGNN parameters."""
- super().__init__()
- self.residual = residual
- self.node_update = EdgeGatedGraphConv(
- in_features, in_features, out_features, residual, norm
- )
- # y: out_features
- # z: in_features
- self.edge_update = EdgeGatedGraphConv(
- out_features, in_features, out_features, residual, norm
- )
-
- def forward(
- self,
- g: dgl.DGLGraph,
- lg: dgl.DGLGraph,
- x: torch.Tensor,
- y: torch.Tensor,
- z: torch.Tensor,
- ):
- """Node and Edge updates for ALIGNN layer.
-
- x: node input features
- y: edge input features
- z: edge pair input features
- """
- g = g.local_var()
- lg = lg.local_var()
- # y_initial = y
-
- # Edge-gated graph convolution update on crystal graph
- # x, y are concatenated feature maps
- x, y = self.node_update(g, x, y)
-
- # Edge-gated graph convolution update on crystal graph
- # y: growth_rate
- # z: concatenated feature map size
- y, z = self.edge_update(lg, y, z)
-
- # # residual edge connection around line graph convolution
- # y = y_initial + y
-
- return x, y, z
-
-
-class MLPLayer(nn.Module):
- """Multilayer perceptron layer helper."""
-
- def __init__(
- self, in_features: int, out_features: int, norm=nn.BatchNorm1d
- ):
- """Linear, Batchnorm, SiLU layer."""
- super().__init__()
- self.layer = nn.ModuleDict(
- {
- "linear": nn.Linear(in_features, out_features),
- "norm": norm(out_features),
- "activation": nn.SiLU(),
- }
- )
-
- def forward(self, x):
- """Linear, Batchnorm, silu layer."""
- for name, cpt in self.layer.items():
- x = cpt(x)
- return x
-
-
-class DenseGCNBlock(nn.Module):
- """Dense block of gated graph convolution layers."""
-
- def __init__(
- self,
- n_layers: int = 3,
- input_features: int = 32,
- growth_rate: int = 32,
- output_features: int = 32,
- residual: bool = True,
- norm=nn.BatchNorm1d,
- ):
- """Densely-connected gated graph convolution layers."""
- super().__init__()
- self.residual = residual
- self.bottleneck_inputs = input_features + n_layers * growth_rate
- self.layers = nn.ModuleList()
-
- for idx in range(n_layers):
- in_features = input_features + idx * growth_rate
- self.layers.append(
- EdgeGatedGraphConv(
- in_features,
- in_features,
- growth_rate,
- residual=False,
- norm=norm,
- )
- )
-
- self.bottleneck_x = nn.Sequential(
- norm(self.bottleneck_inputs),
- nn.SiLU(),
- nn.Linear(self.bottleneck_inputs, output_features, bias=False),
- )
- self.bottleneck_y = nn.Sequential(
- norm(self.bottleneck_inputs),
- nn.SiLU(),
- nn.Linear(self.bottleneck_inputs, output_features, bias=False),
- )
-
- def forward(self, g, x, y):
- """Gated GCN updates: update node, edge features."""
- x_identity = x
- y_identity = y
- xs, ys = [x], [y]
- for gcn_layer in self.layers:
- new_x, new_y = gcn_layer(g, torch.cat(xs, 1), torch.cat(ys, 1))
- xs.append(new_x)
- ys.append(new_y)
-
- x = self.bottleneck_x(torch.cat(xs, 1))
- y = self.bottleneck_y(torch.cat(ys, 1))
-
- if self.residual:
- x = x_identity + x
- y = y_identity + y
-
- return x, y
-
-
-class DenseALIGNNBlock(nn.Module):
- """Dense block of ALIGNN updates."""
-
- def __init__(
- self,
- n_layers: int = 3,
- input_features: int = 32,
- growth_rate: int = 32,
- output_features: int = 32,
- residual: bool = True,
- norm=nn.BatchNorm1d,
- ):
- """Dense block of ALIGNN updates."""
- super().__init__()
- self.residual = residual
- self.bottleneck_inputs = input_features + n_layers * growth_rate
-
- self.layers = nn.ModuleList()
- for idx in range(n_layers):
- in_features = input_features + idx * growth_rate
- self.layers.append(
- ALIGNNConv(in_features, growth_rate, residual=False, norm=norm)
- )
-
- self.bottleneck_x = nn.Sequential(
- norm(self.bottleneck_inputs),
- nn.SiLU(),
- nn.Linear(self.bottleneck_inputs, output_features, bias=False),
- )
- self.bottleneck_y = nn.Sequential(
- norm(self.bottleneck_inputs),
- nn.SiLU(),
- nn.Linear(self.bottleneck_inputs, output_features, bias=False),
- )
-
- def forward(self, g, lg, x, y, z):
- """ALIGNN updates: update node, edge, triplet features.
-
- DenseNet style updates:
- maintain a list of x, y, z features
- and concatenate all previous feature maps
- to form input for each layer
- """
- x_identity = x
- xs = [x]
- y_identity = y
- ys = [y]
- # z_identity = z
- zs = [z]
-
- for alignn_layer in self.layers:
- new_x, new_y, new_z = alignn_layer(
- g, lg, torch.cat(xs, 1), torch.cat(ys, 1), torch.cat(zs, 1)
- )
- xs.append(new_x)
- ys.append(new_y)
- zs.append(new_z)
-
- x = self.bottleneck_x(torch.cat(xs, 1))
- y = self.bottleneck_y(torch.cat(ys, 1))
-
- # residual connections around graph dense graph convolution block
- if self.residual:
- x = x_identity + x
- y = y_identity + y
-
- return x, y
-
-
-class DenseALIGNN(nn.Module):
- """Atomistic Line graph network.
-
- Chain alternating gated graph convolution updates on crystal graph
- and atomistic line graph.
- """
-
- def __init__(
- self,
- config: DenseALIGNNConfig = DenseALIGNNConfig(name="dense_alignn"),
- ):
- """Initialize class with number of input features, conv layers."""
- super().__init__()
- print(config)
- self.classification = config.classification
- norm = {"batchnorm": nn.BatchNorm1d, "layernorm": nn.LayerNorm}[
- config.norm
- ]
-
- self.atom_embedding = MLPLayer(
- config.atom_input_features, config.initial_features, norm
- )
-
- self.edge_embedding = nn.Sequential(
- RBFExpansion(
- vmin=0,
- vmax=8.0,
- bins=config.edge_input_features,
- lengthscale=0.5,
- ),
- MLPLayer(
- config.edge_input_features, config.embedding_features, norm
- ),
- MLPLayer(config.embedding_features, config.initial_features, norm),
- )
- self.angle_embedding = nn.Sequential(
- RBFExpansion(
- vmin=-np.pi, vmax=np.pi, bins=config.triplet_input_features,
- ),
- MLPLayer(
- config.triplet_input_features, config.embedding_features, norm
- ),
- MLPLayer(config.embedding_features, config.initial_features, norm),
- )
-
- if config.alignn_layers > 0:
- self.dense_alignn_block = DenseALIGNNBlock(
- n_layers=config.alignn_layers,
- input_features=config.initial_features,
- growth_rate=config.growth_rate,
- output_features=config.bottleneck_features,
- residual=config.residual,
- norm=norm,
- )
- else:
- self.dense_alignn_block = None
-
- initial_features = config.initial_features
- self.dense_gcn_block = DenseGCNBlock(
- n_layers=config.gcn_layers,
- input_features=initial_features,
- growth_rate=config.growth_rate,
- output_features=config.bottleneck_features,
- residual=config.residual,
- norm=norm,
- )
-
- self.readout = AvgPooling()
-
- if self.classification:
- self.fc = nn.Linear(config.bottleneck_features, 2)
- self.softmax = nn.LogSoftmax(dim=1)
- else:
- self.fc = nn.Linear(
- config.bottleneck_features, config.output_features
- )
-
- self.link = None
- self.link_name = config.link
- if config.link == "identity":
- self.link = lambda x: x
- elif config.link == "log":
- self.link = torch.exp
- avg_gap = 0.7 # magic number -- average bandgap in dft_3d
- self.fc.bias.data = torch.tensor(
- np.log(avg_gap), dtype=torch.float
- )
- elif config.link == "logit":
- self.link = torch.sigmoid
-
- # Kaiming initialization not working out
- # stick with default Glorot
- # self.apply(self.reset_parameters)
-
- @staticmethod
- def reset_parameters(m):
- """He initialization."""
- if isinstance(m, nn.Linear):
- nn.init.kaiming_normal_(
- m.weight, mode="fan_out", nonlinearity="relu"
- )
- # nn.init.constant_(m.bias, 0)
-
- def forward(
- self, g: Union[Tuple[dgl.DGLGraph, dgl.DGLGraph], dgl.DGLGraph]
- ):
- """ALIGNN : start with `atom_features`.
-
- x: atom features (g.ndata)
- y: bond features (g.edata and lg.ndata)
- z: angle features (lg.edata)
- """
- if self.dense_alignn_block is not None:
- g, lg = g
- lg = lg.local_var()
-
- # angle features (fixed)
- z = self.angle_embedding(lg.edata.pop("h"))
-
- g = g.local_var()
-
- # initial node features: atom feature network...
- x = g.ndata.pop("atom_features")
- x = self.atom_embedding(x)
-
- # initial bond features
- bondlength = torch.norm(g.edata.pop("r"), dim=1)
- y = self.edge_embedding(bondlength)
-
- x, y = self.dense_alignn_block(g, lg, x, y, z)
- x, y = self.dense_gcn_block(g, x, y)
-
- # norm-activation-pool-classify
- h = self.readout(g, x)
- out = self.fc(h)
-
- if self.link:
- out = self.link(out)
- if self.classification:
- # out = torch.round(torch.sigmoid(out))
- out = self.softmax(out)
-
- return torch.squeeze(out)
diff --git a/alignn/models/densegcn.py b/alignn/models/densegcn.py
deleted file mode 100644
index faba7474..00000000
--- a/alignn/models/densegcn.py
+++ /dev/null
@@ -1,137 +0,0 @@
-"""A baseline graph convolution network dgl implementation."""
-from typing import List, Optional
-
-import dgl
-import torch
-from dgl.nn import AvgPooling, GraphConv
-from pydantic.typing import Literal
-from torch import nn
-from torch.nn import functional as F
-
-from alignn.utils import BaseSettings
-
-
-class DenseGCNConfig(BaseSettings):
- """Hyperparameter schema for jarvisdgl.models.densegcn."""
-
- name: Literal["densegcn"]
- atom_input_features: int = 1
- edge_lengthscale: float = 4.0
- weight_edges: bool = True
- conv_layers: int = 4
- node_features: int = 32
- growth_rate: int = 32
- output_features: int = 1
- classification: bool = False
-
- class Config:
- """Configure model settings behavior."""
-
- env_prefix = "jv_model"
-
-
-class _DenseLayer(nn.Module):
- """BatchNorm-ReLU-GraphConv Dense layer."""
-
- def __init__(self, in_features: int, growth_rate: int):
- super().__init__()
- self.bn = nn.BatchNorm1d(in_features)
- self.conv = GraphConv(in_features, growth_rate)
-
- def forward(
- self,
- g: dgl.DGLGraph,
- input: List[torch.Tensor],
- edge_weight: Optional[torch.Tensor],
- ):
-
- prev_features = F.relu(self.bn(torch.cat(input, 1)))
- new_features = self.conv(g, prev_features, edge_weight=edge_weight)
-
- return new_features
-
-
-class _DenseBlock(nn.ModuleDict):
- """Block of densely-connected bn-ReLU-conv layers."""
-
- def __init__(self, n_layers: int, in_features: int, growth_rate: int):
- super().__init__()
- for id_layer in range(n_layers):
- layer = _DenseLayer(
- in_features + id_layer * growth_rate, growth_rate
- )
- self.add_module(f"denselayer{1+id_layer}", layer)
-
- def forward(
- self,
- g: dgl.DGLGraph,
- node_features: torch.Tensor,
- edge_weight: Optional[torch.Tensor] = None,
- ):
- features = [node_features]
- for name, layer in self.items():
- new_features = layer(g, features, edge_weight=edge_weight)
- features.append(new_features)
- return torch.cat(features, 1)
-
-
-class DenseGCN(nn.Module):
- """GraphConv GCN with DenseNet-style connections."""
-
- def __init__(
- self, config: DenseGCNConfig = DenseGCNConfig(name="densegcn")
- ):
- """Initialize class with number of input features, conv layers."""
- super().__init__()
- print(config)
- self.edge_lengthscale = config.edge_lengthscale
- self.weight_edges = config.weight_edges
-
- self.atom_embedding = nn.Linear(
- config.atom_input_features, config.node_features
- )
-
- self.bn = nn.BatchNorm1d(config.node_features)
-
- # bn-relu-conv
- self.dense_layers = _DenseBlock(
- config.conv_layers, config.node_features, config.growth_rate
- )
-
- final_size = (
- config.node_features + config.conv_layers * config.growth_rate
- )
-
- self.bn_final = nn.BatchNorm1d(final_size)
-
- self.readout = AvgPooling()
-
- self.fc = nn.Linear(final_size, config.output_features)
-
- def forward(self, g):
- """Baseline SimpleGCN : start with `atom_features`."""
- g = g.local_var()
-
- if self.weight_edges:
- r = torch.norm(g.edata["r"], dim=1)
- edge_weights = torch.exp(-(r ** 2) / self.edge_lengthscale ** 2)
- else:
- edge_weights = None
-
- # initial node features: atom feature network...
- # conv-bn-relu
- v = g.ndata.pop("atom_features")
- node_features = self.atom_embedding(v)
- node_features = F.relu(self.bn(node_features))
-
- # bn-relu-conv
- h = self.dense_layers(g, node_features, edge_weight=edge_weights)
-
- # norm-relu-pool-classify
- h = F.relu(self.bn_final(h))
-
- h = self.readout(g, h)
-
- out = self.fc(h)
-
- return torch.squeeze(out)
diff --git a/alignn/models/gcn.py b/alignn/models/gcn.py
deleted file mode 100644
index 2ef91072..00000000
--- a/alignn/models/gcn.py
+++ /dev/null
@@ -1,64 +0,0 @@
-"""A baseline graph convolution network dgl implementation."""
-# import dgl
-import torch
-from dgl.nn import AvgPooling, GraphConv
-from pydantic.typing import Literal
-from torch import nn
-from torch.nn import functional as F
-
-from alignn.utils import BaseSettings
-
-
-class SimpleGCNConfig(BaseSettings):
- """Hyperparameter schema for jarvisdgl.models.gcn."""
-
- name: Literal["simplegcn"]
- atom_input_features: int = 1
- weight_edges: bool = True
- width: int = 64
- output_features: int = 1
-
- class Config:
- """Configure model settings behavior."""
-
- env_prefix = "jv_model"
-
-
-class SimpleGCN(nn.Module):
- """GraphConv GCN with DenseNet-style connections."""
-
- def __init__(
- self, config: SimpleGCNConfig = SimpleGCNConfig(name="simplegcn")
- ):
- """Initialize class with number of input features, conv layers."""
- super().__init__()
- self.edge_lengthscale = config.edge_lengthscale
- self.weight_edges = config.weight_edges
-
- self.atom_embedding = nn.Linear(
- config.atom_input_features, config.width
- )
-
- self.layer1 = GraphConv(config.width, config.width)
- self.layer2 = GraphConv(config.width, config.output_features)
- self.readout = AvgPooling()
-
- def forward(self, g):
- """Baseline SimpleGCN : start with `atom_features`."""
- g = g.local_var()
-
- if self.weight_edges:
- r = torch.norm(g.edata["bondlength"], dim=1)
- edge_weights = torch.exp(-(r ** 2) / self.edge_lengthscale ** 2)
- else:
- edge_weights = None
-
- # initial node features: atom feature network...
- v = g.ndata.pop("atom_features")
- node_features = self.atom_embedding(v)
-
- x = F.relu(self.layer1(g, node_features, edge_weight=edge_weights))
- x = self.layer2(g, x, edge_weight=edge_weights)
- x = self.readout(g, x)
-
- return torch.squeeze(x)
diff --git a/alignn/models/icgcnn.py b/alignn/models/icgcnn.py
deleted file mode 100644
index 025f9a01..00000000
--- a/alignn/models/icgcnn.py
+++ /dev/null
@@ -1,299 +0,0 @@
-"""CGCNN: dgl implementation."""
-
-from typing import Tuple
-import dgl
-import dgl.function as fn
-
-# import numpy as np
-import torch
-import torch.nn.functional as F
-from dgl.nn import AvgPooling
-from pydantic.typing import Literal
-from torch import nn
-
-from alignn.models.utils import RBFExpansion
-from alignn.utils import BaseSettings
-
-
-class ICGCNNConfig(BaseSettings):
- """Hyperparameter schema for jarvisdgl.models.icgcnn."""
-
- name: Literal["icgcnn"]
- conv_layers: int = 3
- atom_input_features: int = 1
- edge_features: int = 16
- node_features: int = 64
- fc_layers: int = 1
- fc_features: int = 64
- output_features: int = 1
-
- # if logscale is set, apply `exp` to final outputs
- # to constrain predictions to be positive
- logscale: bool = False
- hurdle: bool = False
- classification: bool = False
-
- class Config:
- """Configure model settings behavior."""
-
- env_prefix = "jv_model"
-
-
-class CGCNNUpdate(nn.Module):
- """Helper nn.Module for CGCNN-style updates."""
-
- def __init__(self, in_features: int, out_features: int):
- """Set up CGCNN internal parameters."""
- super().__init__()
-
- # edge interaction model (W_f / W_1)
- self.conv = nn.Sequential(
- nn.Linear(in_features, out_features),
- nn.BatchNorm1d(out_features),
- nn.Sigmoid(),
- )
-
- # edge attention model (W_s / W_2)
- self.screen = nn.Sequential(
- nn.Linear(in_features, out_features),
- nn.BatchNorm1d(out_features),
- nn.Softplus(),
- )
-
- def forward(self, x: torch.Tensor):
- """Apply CGCNNConv-style update."""
- return self.conv(x) * self.screen(x)
-
-
-class iCGCNNConv(nn.Module):
- """Park and Wolverton iCGCNN convolution.
-
- 10.1103/PhysRevMaterials.4.063801
-
- In the papers, nodes are v_i, v_j, edges are u_ij
- In DGL, nodes are u (src) and v (dst), edges are e
- """
-
- def __init__(self, node_features: int = 64, edge_features: int = 32):
- """Initialize torch modules for iCGCNNConv layer."""
- super().__init__()
- self.node_features = node_features
- self.edge_features = edge_features
-
- # iCGCNNConv has a node update and an edge update
- # each update has a pairwise and triplet interaction term
-
- # pairwise features:
- # z_ij = cat(v_i, v_j, u_ij)
- pair_sz = 2 * self.node_features + self.edge_features
-
- # triplet features:
- # z_ijl = cat(v_i, v_j, v_l, u_ij, u_il)
- triple_sz = 3 * self.node_features + 2 * self.edge_features
-
- # node update functions
- self.node_pair_update = CGCNNUpdate(pair_sz, self.node_features)
- self.node_triple_update = CGCNNUpdate(triple_sz, self.node_features)
-
- # edge update functions
- self.edge_pair_update = CGCNNUpdate(pair_sz, self.edge_features)
- self.edge_triple_update = CGCNNUpdate(triple_sz, self.edge_features)
-
- # final batchnorm
- self.node_bn = nn.BatchNorm1d(self.node_features)
- self.edge_bn = nn.BatchNorm1d(self.edge_features)
-
- def combine_edge_features(self, edges):
- """Edge update for iCGCNNConv.
-
- concatenate source and destination node features with edge features
- then apply the edge update modulated by the edge interaction model
- """
- # form augmented edge features z_ij = [v_i, v_j, u_ij]
- z = torch.cat((edges.src["h"], edges.dst["h"], edges.data["h"]), dim=1)
-
- return {"z_pair": z}
-
- def combine_triplet_features(self, edges):
- """Line graph edge update for iCGCNNConv."""
- z_ijl = torch.cat(
- (
- edges.src["src_h"],
- edges.src["dst_h"],
- edges.dst["dst_h"],
- edges.src["h"],
- edges.dst["h"],
- ),
- dim=1,
- )
- return {"z_triple": z_ijl}
-
- def forward(
- self,
- g: dgl.DGLGraph,
- node_feats: torch.Tensor,
- edge_feats: torch.Tensor,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
- """CGCNN convolution defined in Eq 1, 2, and 3.
-
- 10.1103/PhysRevMaterials.4.063801
-
- This convolution function forms z_ij and z_ijl tensors
- and performs two aggregrations each:
- one to update node features and one to update edge features
- """
- g = g.local_var()
-
- g.ndata["h"] = node_feats
- g.edata["h"] = edge_feats
-
- # propagate node features to line graph nodes
- g.apply_edges(
- func=lambda edges: {
- "src_h": edges.src["h"],
- "dst_h": edges.dst["h"],
- }
- )
-
- # line graph edge == pairs of bonds (u,v), (v,w)
- # z_ijl = cat(src[src], src[dst], dst[dst])
- lg = dgl.line_graph(g, shared=True)
-
- # both node and edge updates need both z_ij and z_ijl
- # compute these separately with apply_edges
- # apply multiple linear maps within that message function
- # then propagate them with separate update_all call each
-
- # compute z_ij (n_pairs, pair_sz)
- g.apply_edges(self.combine_edge_features)
- z_pair = g.edata.pop("z_pair")
-
- # compute z_ijl_kk' (n_triples, triple_sz)
- lg.apply_edges(self.combine_triplet_features)
- z_triple = lg.edata.pop("z_triple")
-
- # node update: eqs 1 and 2
- # eq 1 (pre-reduction) (n_edges, node_sz)
- # reduces to (n_nodes, node_sz)
- h_node_pair = self.node_pair_update(z_pair)
-
- # eq 2 (pre-reduction) (n_triples, node_sz)
- # reduces to (n_nodes, node_sz)
- h_node_triple = self.node_triple_update(z_triple)
-
- # edge update: eq 3
- # eq 3 term 1 (n_edges, edge_sz)
- # no reduction needed
- h_edge_pair = self.edge_pair_update(z_pair)
-
- # eq 3 term 2 (pre-reduction) (n_triples, edge_sz)
- # reduces to (n_edges, edge_sz)
- h_edge_triple = self.edge_triple_update(z_triple)
-
- # aggregate triple features to edges, then edges to nodes
- lg.edata["h_node_triple"] = h_node_triple
- lg.edata["h_edge_triple"] = h_edge_triple
-
- # triple -> edge aggregation (i.e. LG edges to LG nodes)
- # partial summation in Eq 2 (sum over l, k')
- lg.update_all(
- fn.copy_e("h_node_triple", "h_node_triple"),
- fn.sum("h_node_triple", "h_node_triple"),
- )
- # sum over l, k' in Eq 3
- lg.update_all(
- fn.copy_e("h_edge_triple", "h_edge_triple"),
- fn.sum("h_edge_triple", "h_edge_triple"),
- )
-
- # further aggregate triplet features to nodes
- # complete summation in eq 2 (sum over j, k)
- g.edata["h_node_triple"] = lg.ndata.pop("h_node_triple")
- g.update_all(
- fn.copy_e("h_node_triple", "h_node_triple"),
- fn.sum("h_node_triple", "h_node_triple"),
- )
-
- # edge-wise reduction in eq 1 (sum over j,k)
- g.edata["h_node_pair"] = h_node_pair
- g.update_all(
- message_func=fn.copy_e("h_node_pair", "h_node_pair"),
- reduce_func=fn.sum("h_node_pair", "h_node_pair"),
- )
-
- # final batchnorm
- h_node = g.ndata.pop("h_node_pair") + g.ndata.pop("h_node_triple")
- h_node = self.node_bn(h_node)
-
- h_edge = h_edge_pair + lg.ndata.pop("h_edge_triple")
- h_edge = self.edge_bn(h_edge)
-
- # residual connection plus nonlinearity
- return F.softplus(node_feats + h_node), F.softplus(edge_feats + h_edge)
-
-
-class iCGCNN(nn.Module):
- """iCGCNN dgl implementation."""
-
- def __init__(self, config: ICGCNNConfig = ICGCNNConfig(name="icgcnn")):
- """Set up CGCNN modules."""
- super().__init__()
-
- self.rbf = RBFExpansion(vmin=0, vmax=8.0, bins=config.edge_features)
- self.atom_embedding = nn.Linear(
- config.atom_input_features, config.node_features
- )
- self.classification = config.classification
- self.conv_layers = nn.ModuleList(
- [
- iCGCNNConv(config.node_features, config.edge_features)
- for _ in range(config.conv_layers)
- ]
- )
-
- self.readout = AvgPooling()
-
- self.fc = nn.Sequential(
- nn.Linear(config.node_features, config.fc_features), nn.Softplus()
- )
-
- if self.classification:
- self.fc_out = nn.Linear(config.fc_features, 2)
- self.softmax = nn.LogSoftmax(dim=1)
- else:
- self.fc_out = nn.Linear(config.fc_features, config.output_features)
-
- self.logscale = config.logscale
-
- def forward(self, g) -> torch.Tensor:
- """CGCNN function mapping graph to outputs."""
- g, lg = g
- g = g.local_var()
-
- # fixed edge features: RBF-expanded bondlengths
- bondlength = torch.norm(g.edata.pop("r"), dim=1)
- h_edge = self.rbf(bondlength)
-
- # initial node features: atom feature network...
- v = g.ndata.pop("atom_features")
- h_node = self.atom_embedding(v)
-
- # CGCNN-Conv block: update node features
- for conv_layer in self.conv_layers:
- h_node, h_edge = conv_layer(g, h_node, h_edge)
-
- # crystal-level readout
- features = self.readout(g, h_node)
- features = F.softplus(features)
- features = self.fc(features)
- features = F.softplus(features)
-
- out = self.fc_out(features)
-
- if self.logscale:
- out = torch.exp(out)
- if self.classification:
- # out = torch.round(torch.sigmoid(out))
- out = self.softmax(out)
-
- return torch.squeeze(out)
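Both removed CGCNN variants share the same gated update: a sigmoid "filter" branch screens a softplus "core" branch channel by channel. A minimal sketch of that pattern, with the batchnorm layers omitted for brevity and all sizes chosen arbitrarily:

```python
import torch
from torch import nn


class GatedUpdate(nn.Module):
    """CGCNN-style gated update: sigmoid gate times softplus activation."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.gate = nn.Sequential(nn.Linear(in_features, out_features), nn.Sigmoid())
        self.core = nn.Sequential(nn.Linear(in_features, out_features), nn.Softplus())

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        # z is the concatenated pair (or triplet) feature vector per edge
        return self.gate(z) * self.core(z)


z = torch.randn(8, 160)  # e.g. 2 * 64 node features + 32 edge features
update = GatedUpdate(160, 64)
print(update(z).shape)  # torch.Size([8, 64])
```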
diff --git a/alignn/models/modified_cgcnn.py b/alignn/models/modified_cgcnn.py
deleted file mode 100644
index fcd89ac9..00000000
--- a/alignn/models/modified_cgcnn.py
+++ /dev/null
@@ -1,357 +0,0 @@
-"""CGCNN: dgl implementation."""
-
-from typing import Tuple
-
-import dgl
-import dgl.function as fn
-import numpy as np
-import torch
-import torch.nn.functional as F
-from dgl.nn import AvgPooling
-from pydantic.typing import Literal
-from torch import nn
-
-from alignn.models.utils import RBFExpansion
-from alignn.utils import BaseSettings
-
-
-class CGCNNConfig(BaseSettings):
- """Hyperparameter schema for jarvisdgl.models.cgcnn."""
-
- name: Literal["cgcnn"]
- conv_layers: int = 3
- atom_input_features: int = 92
- edge_features: int = 41
- node_features: int = 64
- fc_layers: int = 1
- fc_features: int = 128
- output_features: int = 1
-
- # if link == log, apply `exp` to final outputs
- # to constrain predictions to be positive
- link: Literal["identity", "log", "logit"] = "identity"
- zero_inflated: bool = False
- classification: bool = False
-
- class Config:
- """Configure model settings behavior."""
-
- env_prefix = "jv_model"
-
-
-class CGCNNConvFull(nn.Module):
- """Xie and Grossman graph convolution function.
-
- 10.1103/PhysRevLett.120.145301
- """
-
- def __init__(self, node_features: int = 64, edge_features: int = 32):
- """Initialize torch modules for CGCNNConv layer."""
- super().__init__()
- self.node_features = node_features
- self.edge_features = edge_features
-
- # CGCNN-Conv operates on augmented edge features
- # z_ij = cat(v_i, v_j, u_ij)
- in_feats = 2 * self.node_features + self.edge_features
-
- # edge interaction model (W_f)
- self.edge_interaction = nn.Sequential(
- nn.Linear(in_feats, self.node_features),
- nn.BatchNorm1d(self.node_features),
- nn.Sigmoid(),
- )
-
- # edge attention model (W_s)
- self.edge_update = nn.Sequential(
- nn.Linear(in_feats, self.node_features),
- nn.BatchNorm1d(self.node_features),
- nn.Softplus(),
- )
-
- # final batchnorm
- self.bn = nn.BatchNorm1d(self.node_features)
-
- def combine_edge_features(self, edges):
- """Edge update for CGCNNConv.
-
- concatenate source and destination node features with edge features
- then apply the edge update modulated by the edge interaction model
- """
- # form augmented edge features z_ij = [v_i, v_j, u_ij]
- z = torch.cat((edges.src["h"], edges.dst["h"], edges.data["h"]), dim=1)
-
- # multiply output of atom interaction net and edge attention net
- # i.e. compute the term inside the summation in eq 5
- # σ(z_ij W_f + b_f) ⊙ g_s(z_ij W_s + b_s)
- return {"z": self.edge_interaction(z) * self.edge_update(z)}
-
- def forward(
- self,
- g: dgl.DGLGraph,
- node_feats: torch.Tensor,
- edge_feats: torch.Tensor,
- ) -> torch.Tensor:
- """CGCNN convolution defined in Eq 5.
-
- 10.1103/PhysRevLett.120.14530
- """
- g = g.local_var()
-
- g.ndata["h"] = node_feats
- g.edata["h"] = edge_feats
-
- # apply the convolution term in eq. 5 (without residual connection)
- # storing the results in edge features `h`
- g.update_all(
- message_func=self.combine_edge_features,
- reduce_func=fn.sum("z", "h"),
- )
-
- # final batchnorm
- h = self.bn(g.ndata.pop("h"))
-
- # residual connection plus nonlinearity
- return F.softplus(node_feats + h)
-
-
-class CGCNNConv(nn.Module):
- """Xie and Grossman graph convolution function.
-
- 10.1103/PhysRevLett.120.145301
- """
-
- def __init__(
- self,
- node_features: int = 64,
- edge_features: int = 32,
- return_messages: bool = False,
- ):
- """Initialize torch modules for CGCNNConv layer."""
- super().__init__()
- self.node_features = node_features
- self.edge_features = edge_features
- self.return_messages = return_messages
-
- # CGCNN-Conv operates on augmented edge features
- # z_ij = cat(v_i, v_j, u_ij)
- # m_ij = σ(z_ij W_f + b_f) ⊙ g_s(z_ij W_s + b_s)
- # coalesce parameters for W_f and W_s
- # but -- split them up along feature dimension
- self.linear_src = nn.Linear(node_features, 2 * node_features)
- self.linear_dst = nn.Linear(node_features, 2 * node_features)
- self.linear_edge = nn.Linear(edge_features, 2 * node_features)
- self.bn_message = nn.BatchNorm1d(2 * node_features)
-
- # final batchnorm
- self.bn = nn.BatchNorm1d(node_features)
-
- def forward(
- self,
- g: dgl.DGLGraph,
- node_feats: torch.Tensor,
- edge_feats: torch.Tensor,
- ) -> torch.Tensor:
- """CGCNN convolution defined in Eq 5.
-
- 10.1103/PhysRevLett.120.14530
- """
- g = g.local_var()
-
- # instead of concatenating (u || v || e) and applying one weight matrix
- # split the weight matrix into three, apply, then sum
- # see https://docs.dgl.ai/guide/message-efficient.html
- # compute edge messages -- coalesce W_f and W_s from the paper
- # but split them on feature dimensions to update u, v, e separately
- # m = BatchNorm(Linear(cat(u, v, e)))
- g.ndata["h_src"] = self.linear_src(node_feats)
- g.ndata["h_dst"] = self.linear_dst(node_feats)
- g.apply_edges(fn.u_add_v("h_src", "h_dst", "h_nodes"))
- m = g.edata.pop("h_nodes") + self.linear_edge(edge_feats)
- m = self.bn_message(m)
-
- # split messages into W_f and W_s terms
- # multiply output of atom interaction net and edge attention net
- # i.e. compute the term inside the summation in eq 5
- # σ(z_ij W_f + b_f) ⊙ g_s(z_ij W_s + b_s)
- h_f, h_s = torch.chunk(m, 2, dim=1)
- m = torch.sigmoid(h_f) * F.softplus(h_s)
- g.edata["m"] = m
-
- # apply the convolution term in eq. 5 (without residual connection)
- # storing the results in edge features `h`
- g.update_all(
- message_func=fn.copy_e("m", "z"), reduce_func=fn.sum("z", "h"),
- )
-
- # final batchnorm
- h = self.bn(g.ndata.pop("h"))
-
- # residual connection plus nonlinearity
- out = F.softplus(node_feats + h)
-
- if self.return_messages:
- return out, m
-
- return out
-
-
-class CGCNN(nn.Module):
- """CGCNN dgl implementation."""
-
- def __init__(self, config: CGCNNConfig = CGCNNConfig(name="cgcnn")):
- """Set up CGCNN modules."""
- super().__init__()
- self.classification = config.classification
- self.rbf = RBFExpansion(vmin=0, vmax=8.0, bins=config.edge_features)
- self.atom_embedding = nn.Linear(
- config.atom_input_features, config.node_features
- )
-
- self.conv_layers = nn.ModuleList(
- [
- CGCNNConv(config.node_features, config.edge_features)
- for _ in range(config.conv_layers)
- ]
- )
-
- self.readout = AvgPooling()
-
- self.fc = nn.Sequential(
- nn.Linear(config.node_features, config.fc_features), nn.Softplus()
- )
-
- if config.zero_inflated:
- # add latent Bernoulli variable model to zero out
- # predictions in non-negative regression model
- self.zero_inflated = True
- self.fc_nonzero = nn.Linear(config.fc_features, 1)
- self.fc_scale = nn.Linear(config.fc_features, 1)
- # self.fc_shape = nn.Linear(config.fc_features, 1)
- self.fc_scale.bias.data = torch.tensor(
- # np.log(2.1), dtype=torch.float
- 2.1,
- dtype=torch.float,
- )
- if self.classification:
- raise ValueError(
- "Classification not implemented for zero_inflated"
- )
- else:
- self.zero_inflated = False
- if self.classification:
- self.fc_out = nn.Linear(config.fc_features, 2)
- self.softmax = nn.LogSoftmax(dim=1)
- else:
- self.fc_out = nn.Linear(
- config.fc_features, config.output_features
- )
-
- self.link = None
- self.link_name = config.link
- if config.link == "identity":
- self.link = lambda x: x
- elif config.link == "log":
- self.link = torch.exp
- avg_gap = 0.7 # magic number -- average bandgap in dft_3d
- if not self.zero_inflated:
- self.fc_out.bias.data = torch.tensor(
- np.log(avg_gap), dtype=torch.float
- )
- elif config.link == "logit":
- self.link = torch.sigmoid
-
- def forward(self, g) -> torch.Tensor:
- """CGCNN function mapping graph to outputs."""
- g, lg = g
- g = g.local_var()
-
- # fixed edge features: RBF-expanded bondlengths
- bondlength = torch.norm(g.edata.pop("r"), dim=1)
- edge_features = self.rbf(bondlength)
-
- # initial node features: atom feature network...
- v = g.ndata.pop("atom_features")
- node_features = self.atom_embedding(v)
-
- # CGCNN-Conv block: update node features
- for conv_layer in self.conv_layers:
- node_features = conv_layer(g, node_features, edge_features)
-
- # crystal-level readout
- features = self.readout(g, node_features)
- features = F.softplus(features)
- features = self.fc(features)
- features = F.softplus(features)
-
- if self.zero_inflated:
- logit_p = self.fc_nonzero(features)
- log_scale = self.fc_scale(features)
- # log_shape = self.fc_shape(features)
-
- # pred = (torch.sigmoid(logit_p)
- # * torch.exp(log_scale)
- # * torch.exp(log_shape))
- # out = torch.where(p < 0.5, torch.zeros_like(out), out)
- return (
- torch.squeeze(logit_p),
- torch.squeeze(log_scale),
- # torch.squeeze(log_shape),
- )
-
- else:
- out = self.fc_out(features)
- if self.link:
- out = self.link(out)
- if self.classification:
- # out = torch.round(torch.sigmoid(out))
- out = self.softmax(out)
-
- return torch.squeeze(out)
-
-
-class ZeroInflatedGammaLoss(nn.modules.loss._Loss):
- """Zero inflated Gamma regression loss."""
-
- def predict(self, inputs: Tuple[torch.Tensor, torch.Tensor]):
- """Combine ZIG multi-part outputs to yield real-valued predictions."""
- # logit_p, log_scale, log_shape = inputs
- logit_p, log_scale = inputs
- return (
- torch.sigmoid(logit_p)
- * F.softplus(log_scale)
- # * torch.exp(log_scale)
- # * (1 + torch.exp(log_shape))
- )
-
- def forward(
- self, inputs: Tuple[torch.Tensor, torch.Tensor], target: torch.Tensor,
- ) -> torch.Tensor:
- """Zero-inflated Gamma loss.
-
- binary crossentropy loss combined with Gamma negative log likelihood
- """
- # logit_p, log_scale, log_shape = inputs
- logit_p, log_scale = inputs
-
- bce_loss = F.binary_cross_entropy_with_logits(
- logit_p, target, reduction="sum"
- )
-
- indicator = target > 0
- # g_loss = F.mse_loss(
- # log_scale[indicator],
- # torch.log(target[indicator]), reduction="sum"
- # )
- # g_loss = F.mse_loss(
- # torch.exp(log_scale[indicator]),
- # target[indicator], reduction="sum"
- # )
- g_loss = F.mse_loss(
- F.softplus(log_scale[indicator]),
- target[indicator],
- reduction="sum",
- )
-
- return (bce_loss + g_loss) / target.numel()
- # return bce_loss + torch.tensor(2.0) * g_loss.sum() / indicator.sum()
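The removed ZeroInflatedGammaLoss pairs a binary "is the target nonzero" term with a regression term on the positive targets, and its point prediction multiplies the nonzero probability by a positive scale. A self-contained sketch with made-up values; note the original passes the raw target to the BCE term, while this sketch uses the zero/nonzero indicator to keep the example well defined:

```python
import torch
import torch.nn.functional as F

# Hypothetical two-headed outputs: a nonzero-probability logit and a (soft) scale.
logit_p = torch.tensor([2.0, -1.5, 0.3])
log_scale = torch.tensor([0.8, 0.1, 1.2])
target = torch.tensor([1.1, 0.0, 2.0])

# Point prediction: P(nonzero) times a positive scale.
pred = torch.sigmoid(logit_p) * F.softplus(log_scale)

# Loss: BCE on the zero/nonzero indicator plus MSE on the nonzero targets.
indicator = target > 0
bce = F.binary_cross_entropy_with_logits(logit_p, indicator.float(), reduction="sum")
mse = F.mse_loss(F.softplus(log_scale[indicator]), target[indicator], reduction="sum")
loss = (bce + mse) / target.numel()
print(pred, loss)
```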
diff --git a/alignn/tests/test_alignn_ff.py b/alignn/tests/test_alignn_ff.py
index 5f4dbc23..740dbabc 100644
--- a/alignn/tests/test_alignn_ff.py
+++ b/alignn/tests/test_alignn_ff.py
@@ -9,6 +9,18 @@
get_interface_energy,
)
from alignn.graphs import Graph
+from alignn.ff.ff import phonons
+from jarvis.core.atoms import ase_to_atoms
+from jarvis.db.figshare import get_jid_data
+from jarvis.core.atoms import Atoms
+from alignn.ff.ff import (
+ AlignnAtomwiseCalculator,
+ default_path,
+ wt10_path,
+ alignnff_fmult,
+ fd_path,
+ ForceField,
+)
def test_alignnff():
@@ -24,7 +36,6 @@ def test_alignnff():
print("atoms", atoms)
# atoms = atoms.make_supercell_matrix([2, 2, 2])
# atoms=atoms.strain_atoms(.05)
- # print(atoms)
ev = ev_curve(atoms=atoms, model_path=model_path)
# surf = surface_energy(atoms=atoms, model_path=model_path)
# print('surf',surf)
@@ -51,13 +62,21 @@ def test_alignnff():
get_jid_data(dataset="dft_3d", jid="JVASP-32")["atoms"]
)
intf = get_interface_energy(
- film_atoms=atoms_al,
- subs_atoms=atoms_al,
- model_path=model_path,
- film_thickness=10,
- subs_thickness=10
- # film_atoms=atoms_al, subs_atoms=atoms_al2o3, model_path=model_path
+ film_atoms=atoms_al,
+ subs_atoms=atoms_al,
+ model_path=model_path,
+ film_thickness=10,
+ subs_thickness=10,
+ # film_atoms=atoms_al, subs_atoms=atoms_al2o3, model_path=model_path
+ )
+
+
+def test_phonons():
+ atoms = Atoms.from_dict(
+ get_jid_data(jid="JVASP-816", dataset="dft_3d")["atoms"]
)
+ ph_path = fd_path()
+ ph = phonons(model_path=ph_path, atoms=(atoms))
# test_alignnff()
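The new test_phonons exercises the finite-difference phonon workflow from alignn.ff. A minimal usage sketch mirroring the test; it assumes network access for get_jid_data, a downloadable checkpoint behind fd_path(), and that phonopy is installed:

```python
from jarvis.core.atoms import Atoms
from jarvis.db.figshare import get_jid_data
from alignn.ff.ff import fd_path, phonons

# Silicon (JVASP-816) from the JARVIS-DFT dataset.
atoms = Atoms.from_dict(get_jid_data(jid="JVASP-816", dataset="dft_3d")["atoms"])

# Finite-difference phonons with the ALIGNN-FF checkpoint returned by fd_path().
ph = phonons(model_path=fd_path(), atoms=atoms)
```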
diff --git a/alignn/tests/test_prop.py b/alignn/tests/test_prop.py
index 307897db..aee4b638 100644
--- a/alignn/tests/test_prop.py
+++ b/alignn/tests/test_prop.py
@@ -1,4 +1,5 @@
"""Training script test suite."""
+
import time
import matplotlib.pyplot as plt
import numpy as np
@@ -8,8 +9,7 @@
from sklearn.metrics import mean_absolute_error
import os
from jarvis.core.atoms import Atoms
-from alignn.train_folder import train_for_folder
-from alignn.train_folder_ff import train_for_folder as train_for_folder_ff
+from alignn.train_alignn import train_for_folder
from jarvis.db.figshare import get_jid_data
from alignn.ff.ff import AlignnAtomwiseCalculator, default_path, revised_path
@@ -49,152 +49,37 @@
# os.system(cmd3)
-def test_minor_configs():
- tmp = config
- # tmp["log_tensorboard"] = True
- tmp["n_early_stopping"] = 2
- tmp["model"]["name"] = "alignn"
- config["write_predictions"] = True
- result = train_dgl(tmp)
+# def test_minor_configs():
+# tmp = config
+# # tmp["log_tensorboard"] = True
+# tmp["n_early_stopping"] = 2
+# tmp["model"]["name"] = "alignn"
+# config["write_predictions"] = True
+# result = train_dgl(tmp)
def test_models():
- """Test CGCNN end to end training."""
- config["model"]["name"] = "dense_alignn"
- t1 = time.time()
- result = train_dgl(config)
- t2 = time.time()
- print("Toal time:", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
- print()
- print()
- print()
config["write_predictions"] = True
- config["model"]["name"] = "alignn"
- t1 = time.time()
- result = train_dgl(config)
- t2 = time.time()
- print("Total time", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
- print()
- print()
- print()
-
- config["model"]["name"] = "alignn_layernorm"
- t1 = time.time()
- result = train_dgl(config)
- t2 = time.time()
- print("Total time", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
- print()
- print()
- print()
-
- config["model"]["name"] = "cgcnn"
- config["write_predictions"] = False
- config["save_dataloader"] = False
- t1 = time.time()
- result = train_dgl(config)
- t2 = time.time()
- print("Total time", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
- print()
- print()
- print()
-
- config["model"]["name"] = "densegcn"
- config["write_predictions"] = False
- config["save_dataloader"] = False
- t1 = time.time()
- result = train_dgl(config)
- t2 = time.time()
- print("Total time", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
- print()
- print()
- print()
-
- config["model"]["name"] = "icgcnn"
- config["write_predictions"] = False
- config["save_dataloader"] = False
- t1 = time.time()
- result = train_dgl(config)
- t2 = time.time()
- print("Total time", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
- print()
- print()
- print()
-
- config["model"]["name"] = "alignn_cgcnn"
- config["write_predictions"] = False
- config["save_dataloader"] = False
+ config["model"]["name"] = "alignn_atomwise"
t1 = time.time()
result = train_dgl(config)
t2 = time.time()
print("Total time", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
- print()
- print()
- print()
-
- # Classification
- config["model"]["name"] = "dense_alignn"
- config["classification_threshold"] = 0.0
- t1 = time.time()
- result = train_dgl(config)
- t2 = time.time()
- print("Toal time:", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
+ # print("train=", result["train"])
+ # print("validation=", result["validation"])
print()
print()
print()
- config["model"]["name"] = "alignn"
+ config["model"]["name"] = "alignn_atomwise"
config["classification_threshold"] = 0.0
t1 = time.time()
result = train_dgl(config)
t2 = time.time()
print("Total time", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
- print()
- print()
- print()
-
- config["model"]["name"] = "cgcnn"
- config["write_predictions"] = False
- config["save_dataloader"] = False
- config["classification_threshold"] = 0.0
- t1 = time.time()
- result = train_dgl(config)
- t2 = time.time()
- print("Total time", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
- print()
- print()
- print()
-
- config["model"]["name"] = "alignn_cgcnn"
- config["write_predictions"] = False
- config["save_dataloader"] = True
- config["classification_threshold"] = 0.0
- t1 = time.time()
- result = train_dgl(config)
- t2 = time.time()
- print("Total time", t2 - t1)
- print("train=", result["train"])
- print("validation=", result["validation"])
+ # print("train=", result["train"])
+ # print("validation=", result["validation"])
print()
print()
print()
@@ -289,7 +174,7 @@ def test_alignn_train():
"../examples/sample_data_ff/config_example_atomwise.json",
)
)
- train_for_folder_ff(root_dir=root_dir, config_name=config)
+ train_for_folder(root_dir=root_dir, config_name=config)
def test_calculator():
@@ -311,6 +196,46 @@ def test_calculator():
# assert round(max(stress.flatten()),2)==round(0.002801671050217803,2)
+def test_del_files():
+ fnames = [
+ "temp",
+ "ase_nve.traj",
+ "ase_nvt_langevin.traj",
+ "ase_nvt_andersen.traj",
+ "opt.log",
+ "opt.traj",
+ "alignn_ff.log",
+ "dataset_data_range",
+ "pred_data.json",
+ "prediction_results_train_set.csv",
+ "multi_out_predictions.json",
+ "checkpoint_2.pt",
+ "checkpoint_3.pt",
+ "prediction_results_test_set.csv",
+ "mad",
+ "ids_train_val_test.json",
+ "train_data_data_range",
+ "val_data_data_range",
+ "test_data_data_range",
+ "config.json",
+ "history_train.json",
+ "current_model.pt",
+ "best_model.pt",
+ "Train_results.json",
+ "Val_results.json",
+ "history_val.json",
+ "Test_results.json",
+ "Test_results.json",
+ "last_model.pt",
+ "temp",
+ "alignn/jv_formation_energy_peratom_alignn.zip",
+ "alignn/jv_optb88vdw_total_energy_alignn.zip",
+ ]
+ for i in fnames:
+ cmd = "rm -r " + i
+ os.system(cmd)
+
+
# test_minor_configs()
# test_pretrained()
# test_runtime_training()
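test_del_files cleans up with `rm -r` through os.system, which assumes a POSIX shell and silently swallows errors. A portable alternative sketch (not what the test does) that skips missing paths explicitly:

```python
import os
import shutil


def cleanup(paths):
    """Remove files and directories if they exist; skip anything missing."""
    for p in paths:
        if os.path.isdir(p):
            shutil.rmtree(p, ignore_errors=True)
        elif os.path.isfile(p):
            os.remove(p)


cleanup(["temp", "opt.log", "opt.traj"])  # subset of the names used in test_del_files
```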
diff --git a/alignn/train.py b/alignn/train.py
index 1b002796..63950d73 100644
--- a/alignn/train.py
+++ b/alignn/train.py
@@ -6,69 +6,26 @@
"""
from functools import partial
-
-# from pathlib import Path
from typing import Any, Dict, Union
-import ignite
import torch
import random
-from ignite.contrib.handlers import TensorboardLogger
from sklearn.metrics import mean_absolute_error
-
-try:
- from ignite.contrib.handlers.stores import EpochOutputStore
-
- # For different version of pytorch-ignite
-except Exception:
- from ignite.handlers.stores import EpochOutputStore
-
- pass
-from ignite.handlers import EarlyStopping
-from ignite.contrib.handlers.tensorboard_logger import (
- global_step_from_engine,
-)
-from ignite.contrib.handlers.tqdm_logger import ProgressBar
-from ignite.engine import (
- Events,
- create_supervised_evaluator,
- create_supervised_trainer,
-)
-from ignite.contrib.metrics import ROC_AUC, RocCurve
-from ignite.metrics import (
- Accuracy,
- Precision,
- Recall,
- ConfusionMatrix,
-)
+from sklearn.metrics import log_loss
import pickle as pk
import numpy as np
-from ignite.handlers import Checkpoint, DiskSaver, TerminateOnNan
-from ignite.metrics import Loss, MeanAbsoluteError
from torch import nn
-from alignn import models
from alignn.data import get_train_val_loaders
from alignn.config import TrainingConfig
-from alignn.models.alignn import ALIGNN
from alignn.models.alignn_atomwise import ALIGNNAtomWise
-from alignn.models.alignn_layernorm import ALIGNN as ALIGNN_LN
-from alignn.models.modified_cgcnn import CGCNN
-from alignn.models.dense_alignn import DenseALIGNN
-from alignn.models.densegcn import DenseGCN
-from alignn.models.icgcnn import iCGCNN
-from alignn.models.alignn_cgcnn import ACGCNN
from jarvis.db.jsonutils import dumpjson
import json
import pprint
-
-# from accelerate import Accelerator
import os
import warnings
+import time
+from sklearn.metrics import roc_auc_score
warnings.filterwarnings("ignore", category=RuntimeWarning)
-# from sklearn.decomposition import PCA, KernelPCA
-# from sklearn.preprocessing import StandardScaler
-
-# torch config
torch.set_default_dtype(torch.float32)
@@ -91,7 +48,6 @@ def make_standard_scalar_and_pca(output):
# pc = pk.load(open("pca.pkl", "rb"))
# y_pred = torch.tensor(pc.transform(y_pred), device=device)
# y = torch.tensor(pc.transform(y), device=device)
-
# y_pred = torch.tensor(pca_sc.inverse_transform(y_pred),device=device)
# y = torch.tensor(pca_sc.inverse_transform(y),device=device)
# print (y.shape,y_pred.shape)
@@ -159,7 +115,6 @@ def train_dgl(
config = TrainingConfig(**config)
except Exception as exp:
print("Check", exp)
- import os
if not os.path.exists(config.output_dir):
os.makedirs(config.output_dir)
@@ -176,26 +131,10 @@ def train_dgl(
pprint.pprint(tmp) # , sort_dicts=False)
if config.classification_threshold is not None:
classification = True
- if config.random_seed is not None:
- deterministic = True
- ignite.utils.manual_seed(config.random_seed)
line_graph = False
- alignn_models = {
- "alignn",
- "dense_alignn",
- "alignn_cgcnn",
- "alignn_layernorm",
- }
- if config.model.name == "clgn":
- line_graph = True
- if config.model.name == "cgcnn":
+ if config.model.alignn_layers > 0:
line_graph = True
- if config.model.name == "icgcnn":
- line_graph = True
- if config.model.name in alignn_models and config.model.alignn_layers > 0:
- line_graph = True
- # print ('output_dir train', config.output_dir)
if not train_val_test_loaders:
# use input standardization for all real-valued feature sets
# print("config.neighbor_strategy",config.neighbor_strategy)
@@ -207,7 +146,6 @@ def train_dgl(
test_loader,
prepare_batch,
) = get_train_val_loaders(
- # ) = data.get_train_val_loaders(
dataset=config.dataset,
target=config.target,
n_train=config.n_train,
@@ -244,27 +182,11 @@ def train_dgl(
device = "cpu"
if torch.cuda.is_available():
device = torch.device("cuda")
- if config.distributed:
- print(
- "Using Accelerator, currently experimental, use at your own risk."
- )
- from accelerate import Accelerator
-
- accelerator = Accelerator()
- device = accelerator.device
prepare_batch = partial(prepare_batch, device=device)
if classification:
config.model.classification = True
- # define network, optimizer, scheduler
_model = {
- "cgcnn": CGCNN,
- "icgcnn": iCGCNN,
- "densegcn": DenseGCN,
- "alignn": ALIGNN,
"alignn_atomwise": ALIGNNAtomWise,
- "dense_alignn": DenseALIGNN,
- "alignn_cgcnn": ACGCNN,
- "alignn_layernorm": ALIGNN_LN,
}
if config.random_seed is not None:
random.seed(config.random_seed)
@@ -286,10 +208,12 @@ def train_dgl(
net = _model.get(config.model.name)(config.model)
else:
net = model
+ net.to(device)
if config.data_parallel and torch.cuda.device_count() > 1:
+ # For multi-GPU training, set "data_parallel": true in the config.json file
+ device_ids = [cid for cid in range(torch.cuda.device_count())]
print("Let's use", torch.cuda.device_count(), "GPUs!")
- net = torch.nn.DataParallel(net)
- net.to(device)
+ net = torch.nn.DataParallel(net, device_ids=device_ids).cuda()
# group parameters to skip weight decay for bias and batchnorm
params = group_decay(net)
optimizer = setup_optimizer(params, config)
@@ -369,7 +293,10 @@ def get_batch_errors(dat=[]):
pred_out = np.array(pred_out)
# print('target_out',target_out,target_out.shape)
# print('pred_out',pred_out,pred_out.shape)
- mean_out = mean_absolute_error(target_out, pred_out)
+ if classification:
+ mean_out = log_loss(target_out, pred_out)
+ else:
+ mean_out = mean_absolute_error(target_out, pred_out)
if "target_stress" in i:
# if i["target_stress"]:
mean_stress = np.array(stress).mean()
@@ -387,32 +314,22 @@ def get_batch_errors(dat=[]):
best_loss = np.inf
criterion = nn.L1Loss()
- # criterion = nn.MSELoss()
+ if classification:
+ criterion = nn.NLLLoss()
params = group_decay(net)
optimizer = setup_optimizer(params, config)
# optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
- if config.distributed:
- from accelerate import Accelerator
-
- accelerator = Accelerator()
- print("Using Accelerator device", accelerator.device)
-
- device = accelerator.device
- # print('model',net)
- net.to(device)
- train_loader, val_loader, model, optimizer = accelerator.prepare(
- train_loader, val_loader, net, optimizer
- )
history_train = []
history_val = []
for e in range(config.epochs):
# optimizer.zero_grad()
+ train_init_time = time.time()
running_loss = 0
train_result = []
# for dats in train_loader:
for dats, jid in zip(train_loader, train_loader.dataset.ids):
info = {}
- info["id"] = jid
+ # info["id"] = jid
optimizer.zero_grad()
result = net([dats[0].to(device), dats[1].to(device)])
# info = {}
@@ -430,6 +347,8 @@ def get_batch_errors(dat=[]):
loss3 = 0 # Such as forces
loss4 = 0 # Such as stresses
if config.model.output_features is not None:
+ # print('result["out"]',result["out"])
+ # print('dats[2]',dats[2])
loss1 = config.model.graphwise_weight * criterion(
result["out"], dats[2].to(device)
)
@@ -522,13 +441,7 @@ def get_batch_errors(dat=[]):
# print("pred_stress", info["pred_stress"][0])
train_result.append(info)
loss = loss1 + loss2 + loss3 + loss4
- if config.distributed:
- from accelerate import Accelerator
-
- accelerator = Accelerator()
- accelerator.backward(loss)
- else:
- loss.backward()
+ loss.backward()
optimizer.step()
# optimizer.zero_grad() #never
running_loss += loss.item()
@@ -537,42 +450,25 @@ def get_batch_errors(dat=[]):
)
# dumpjson(filename="Train_results.json", data=train_result)
scheduler.step()
- if config.distributed:
- from accelerate import Accelerator
-
- accelerator = Accelerator()
- accelerator.print(
- "TrainLoss",
- "Epoch",
- e,
- "total",
- running_loss,
- "out",
- mean_out,
- "atom",
- mean_atom,
- "grad",
- mean_grad,
- "stress",
- mean_stress,
- )
-
- else:
- print(
- "TrainLoss",
- "Epoch",
- e,
- "total",
- running_loss,
- "out",
- mean_out,
- "atom",
- mean_atom,
- "grad",
- mean_grad,
- "stress",
- mean_stress,
- )
+ train_final_time = time.time()
+ train_ep_time = train_final_time - train_init_time
+ print(
+ "TrainLoss",
+ "Epoch",
+ e,
+ "total",
+ running_loss,
+ "out",
+ mean_out,
+ "atom",
+ mean_atom,
+ "grad",
+ mean_grad,
+ "stress",
+ mean_stress,
+ "time",
+ train_ep_time,
+ )
history_train.append([mean_out, mean_atom, mean_grad, mean_stress])
dumpjson(
filename=os.path.join(config.output_dir, "history_train.json"),
@@ -668,6 +564,7 @@ def get_batch_errors(dat=[]):
net.state_dict(),
os.path.join(config.output_dir, current_model_name),
)
+ saving_msg = ""
if val_loss < best_loss:
best_loss = val_loss
best_model_name = "best_model.pt"
@@ -675,13 +572,8 @@ def get_batch_errors(dat=[]):
net.state_dict(),
os.path.join(config.output_dir, best_model_name),
)
- if config.distributed:
- from accelerate import Accelerator
-
- accelerator = Accelerator()
- accelerator.print("Saving data for epoch:", e)
- else:
- print("Saving data for epoch:", e)
+ # print("Saving data for epoch:", e)
+ saving_msg = "Saving model"
dumpjson(
filename=os.path.join(
config.output_dir, "Train_results.json"
@@ -694,41 +586,23 @@ def get_batch_errors(dat=[]):
),
data=val_result,
)
- if config.distributed:
- from accelerate import Accelerator
-
- accelerator = Accelerator()
- accelerator.print(
- "ValLoss",
- "Epoch",
- e,
- "total",
- val_loss,
- "out",
- mean_out,
- "atom",
- mean_atom,
- "grad",
- mean_grad,
- "stress",
- mean_stress,
- )
- else:
- print(
- "ValLoss",
- "Epoch",
- e,
- "total",
- val_loss,
- "out",
- mean_out,
- "atom",
- mean_atom,
- "grad",
- mean_grad,
- "stress",
- mean_stress,
- )
+ best_model = net
+ print(
+ "ValLoss",
+ "Epoch",
+ e,
+ "total",
+ val_loss,
+ "out",
+ mean_out,
+ "atom",
+ mean_atom,
+ "grad",
+ mean_grad,
+ "stress",
+ mean_stress,
+ saving_msg,
+ )
history_val.append([mean_out, mean_atom, mean_grad, mean_stress])
dumpjson(
filename=os.path.join(config.output_dir, "history_val.json"),
@@ -750,7 +624,9 @@ def get_batch_errors(dat=[]):
loss2 = 0 # Such as bader charges
loss3 = 0 # Such as forces
loss4 = 0 # Such as stresses
- if config.model.output_features is not None:
+ if config.model.output_features is not None and not classification:
+ # print('result["out"]',result["out"])
+ # print('dats[2]',dats[2])
loss1 = config.model.graphwise_weight * criterion(
result["out"], dats[2].to(device)
)
@@ -807,14 +683,9 @@ def get_batch_errors(dat=[]):
)
test_result.append(info)
loss = loss1 + loss2 + loss3 + loss4
- test_loss += loss.item()
- if config.distributed:
- from accelerate import Accelerator
-
- accelerator = Accelerator()
- accelerator.print("TestLoss", e, test_loss)
- else:
- print("TestLoss", e, test_loss)
+ if not classification:
+ test_loss += loss.item()
+ print("TestLoss", e, test_loss)
dumpjson(
filename=os.path.join(config.output_dir, "Test_results.json"),
data=test_result,
@@ -824,315 +695,11 @@ def get_batch_errors(dat=[]):
net.state_dict(),
os.path.join(config.output_dir, last_model_name),
)
- return test_result
-
- if config.distributed:
- import torch.distributed as dist
- import os
-
- print()
- print()
- print()
- gpus = torch.cuda.device_count()
- print("Using DistributedDataParallel !!!", gpus)
-
- def setup(rank, world_size):
- os.environ["MASTER_ADDR"] = "localhost"
- os.environ["MASTER_PORT"] = "12355"
-
- # initialize the process group
- dist.init_process_group("gloo", rank=rank, world_size=world_size)
-
- def cleanup():
- dist.destroy_process_group()
-
- setup(2, 2)
- local_rank = [0, 1]
- # net=torch.nn.parallel.DataParallel(net
- # ,device_ids=[local_rank, ],output_device=local_rank)
- net = torch.nn.parallel.DistributedDataParallel(
- net,
- device_ids=[
- local_rank,
- ],
- output_device=local_rank,
- )
- print()
- print()
- print()
- # ) # ,device_ids=[local_rank, ],output_device=local_rank)
- """
- # group parameters to skip weight decay for bias and batchnorm
- params = group_decay(net)
- optimizer = setup_optimizer(params, config)
-
- if config.scheduler == "none":
- # always return multiplier of 1 (i.e. do nothing)
- scheduler = torch.optim.lr_scheduler.LambdaLR(
- optimizer, lambda epoch: 1.0
- )
-
- elif config.scheduler == "onecycle":
- steps_per_epoch = len(train_loader)
- # pct_start = config.warmup_steps / (config.epochs * steps_per_epoch)
- scheduler = torch.optim.lr_scheduler.OneCycleLR(
- optimizer,
- max_lr=config.learning_rate,
- epochs=config.epochs,
- steps_per_epoch=steps_per_epoch,
- # pct_start=pct_start,
- pct_start=0.3,
- )
- elif config.scheduler == "step":
- # pct_start = config.warmup_steps / (config.epochs * steps_per_epoch)
- scheduler = torch.optim.lr_scheduler.StepLR(
- optimizer,
- )
-
- """
- # select configured loss function
- criteria = {
- "mse": nn.MSELoss(),
- "l1": nn.L1Loss(),
- "poisson": nn.PoissonNLLLoss(log_input=False, full=True),
- "zig": models.modified_cgcnn.ZeroInflatedGammaLoss(),
- }
- criterion = criteria[config.criterion]
-
- # set up training engine and evaluators
- metrics = {"loss": Loss(criterion), "mae": MeanAbsoluteError()}
- if config.model.output_features > 1 and config.standard_scalar_and_pca:
- # metrics = {"loss": Loss(criterion), "mae": MeanAbsoluteError()}
- metrics = {
- "loss": Loss(
- criterion, output_transform=make_standard_scalar_and_pca
- ),
- "mae": MeanAbsoluteError(
- output_transform=make_standard_scalar_and_pca
- ),
- }
-
- if config.criterion == "zig":
+ # return test_result
- def zig_prediction_transform(x):
- output, y = x
- return criterion.predict(output), y
-
- metrics = {
- "loss": Loss(criterion),
- "mae": MeanAbsoluteError(
- output_transform=zig_prediction_transform
- ),
- }
-
- if classification:
- criterion = nn.NLLLoss()
-
- metrics = {
- "accuracy": Accuracy(
- output_transform=thresholded_output_transform
- ),
- "precision": Precision(
- output_transform=thresholded_output_transform
- ),
- "recall": Recall(output_transform=thresholded_output_transform),
- "rocauc": ROC_AUC(output_transform=activated_output_transform),
- "roccurve": RocCurve(output_transform=activated_output_transform),
- "confmat": ConfusionMatrix(
- output_transform=thresholded_output_transform, num_classes=2
- ),
- }
- trainer = create_supervised_trainer(
- net,
- optimizer,
- criterion,
- prepare_batch=prepare_batch,
- device=device,
- deterministic=deterministic,
- # output_transform=make_standard_scalar_and_pca,
- )
-
- evaluator = create_supervised_evaluator(
- net,
- metrics=metrics,
- prepare_batch=prepare_batch,
- device=device,
- # output_transform=make_standard_scalar_and_pca,
- )
-
- train_evaluator = create_supervised_evaluator(
- net,
- metrics=metrics,
- prepare_batch=prepare_batch,
- device=device,
- # output_transform=make_standard_scalar_and_pca,
- )
-
- # ignite event handlers:
- trainer.add_event_handler(Events.EPOCH_COMPLETED, TerminateOnNan())
-
- # apply learning rate scheduler
- trainer.add_event_handler(
- Events.ITERATION_COMPLETED, lambda engine: scheduler.step()
- )
-
- if config.write_checkpoint:
- # model checkpointing
- to_save = {
- "model": net,
- "optimizer": optimizer,
- "lr_scheduler": scheduler,
- "trainer": trainer,
- }
- if classification:
-
- def cp_score(engine):
- """Higher accuracy is better."""
- return engine.state.metrics["accuracy"]
-
- else:
-
- def cp_score(engine):
- """Lower MAE is better."""
- return -engine.state.metrics["mae"]
-
- # save last two epochs
- evaluator.add_event_handler(
- Events.EPOCH_COMPLETED,
- Checkpoint(
- to_save,
- DiskSaver(
- checkpoint_dir, create_dir=True, require_empty=False
- ),
- n_saved=2,
- global_step_transform=lambda *_: trainer.state.epoch,
- ),
- )
- # save best model
- evaluator.add_event_handler(
- Events.EPOCH_COMPLETED,
- Checkpoint(
- to_save,
- DiskSaver(
- checkpoint_dir, create_dir=True, require_empty=False
- ),
- filename_pattern="best_model.{ext}",
- n_saved=1,
- global_step_transform=lambda *_: trainer.state.epoch,
- score_function=cp_score,
- ),
- )
- if config.progress:
- pbar = ProgressBar()
- pbar.attach(trainer, output_transform=lambda x: {"loss": x})
- # pbar.attach(evaluator,output_transform=lambda x: {"mae": x})
-
- history = {
- "train": {m: [] for m in metrics.keys()},
- "validation": {m: [] for m in metrics.keys()},
- }
-
- if config.store_outputs:
- # log_results handler will save epoch output
- # in history["EOS"]
- eos = EpochOutputStore()
- eos.attach(evaluator)
- train_eos = EpochOutputStore()
- train_eos.attach(train_evaluator)
-
- # collect evaluation performance
- @trainer.on(Events.EPOCH_COMPLETED)
- def log_results(engine):
- """Print training and validation metrics to console."""
- train_evaluator.run(train_loader)
- evaluator.run(val_loader)
-
- tmetrics = train_evaluator.state.metrics
- vmetrics = evaluator.state.metrics
- for metric in metrics.keys():
- tm = tmetrics[metric]
- vm = vmetrics[metric]
- if metric == "roccurve":
- tm = [k.tolist() for k in tm]
- vm = [k.tolist() for k in vm]
- if isinstance(tm, torch.Tensor):
- tm = tm.cpu().numpy().tolist()
- vm = vm.cpu().numpy().tolist()
-
- history["train"][metric].append(tm)
- history["validation"][metric].append(vm)
-
- # for metric in metrics.keys():
- # history["train"][metric].append(tmetrics[metric])
- # history["validation"][metric].append(vmetrics[metric])
-
- if config.store_outputs:
- history["EOS"] = eos.data
- history["trainEOS"] = train_eos.data
- dumpjson(
- filename=os.path.join(config.output_dir, "history_val.json"),
- data=history["validation"],
- )
- dumpjson(
- filename=os.path.join(config.output_dir, "history_train.json"),
- data=history["train"],
- )
- if config.progress:
- pbar = ProgressBar()
- if not classification:
- pbar.log_message(f"Val_MAE: {vmetrics['mae']:.4f}")
- pbar.log_message(f"Train_MAE: {tmetrics['mae']:.4f}")
- else:
- pbar.log_message(f"Train ROC AUC: {tmetrics['rocauc']:.4f}")
- pbar.log_message(f"Val ROC AUC: {vmetrics['rocauc']:.4f}")
-
- if config.n_early_stopping is not None:
- # early stopping if no improvement (improvement = higher score)
- if classification:
-
- def es_score(engine):
- """Higher accuracy is better."""
- return engine.state.metrics["accuracy"]
-
- else:
-
- def es_score(engine):
- """Lower MAE is better."""
- return -engine.state.metrics["mae"]
-
- es_handler = EarlyStopping(
- patience=config.n_early_stopping,
- score_function=es_score,
- trainer=trainer,
- )
- evaluator.add_event_handler(Events.EPOCH_COMPLETED, es_handler)
-
- # optionally log results to tensorboard
- if config.log_tensorboard:
- tb_logger = TensorboardLogger(
- log_dir=os.path.join(config.output_dir, "tb_logs", "test")
- )
- for tag, evaluator in [
- ("training", train_evaluator),
- ("validation", evaluator),
- ]:
- tb_logger.attach_output_handler(
- evaluator,
- event_name=Events.EPOCH_COMPLETED,
- tag=tag,
- metric_names=["loss", "mae"],
- global_step_transform=global_step_from_engine(trainer),
- )
-
- # train the model!
- trainer.run(train_loader, max_epochs=config.epochs)
-
- if config.log_tensorboard:
- test_loss = evaluator.state.metrics["loss"]
- tb_logger.writer.add_hparams(config, {"hparam/test_loss": test_loss})
- tb_logger.close()
if config.write_predictions and classification:
- net.eval()
+ best_model.eval()
+ # net.eval()
f = open(
os.path.join(config.output_dir, "prediction_results_test_set.csv"),
"w",
@@ -1144,8 +711,11 @@ def es_score(engine):
ids = test_loader.dataset.ids # [test_loader.dataset.indices]
for dat, id in zip(test_loader, ids):
g, lg, target = dat
- out_data = net([g.to(device), lg.to(device)])
+ out_data = best_model([g.to(device), lg.to(device)])["out"]
+ # out_data = net([g.to(device), lg.to(device)])["out"]
# out_data = torch.exp(out_data.cpu())
+ # print('target',target)
+ # print('out_data',out_data)
top_p, top_class = torch.topk(torch.exp(out_data), k=1)
target = int(target.cpu().numpy().flatten().tolist()[0])
@@ -1155,7 +725,6 @@ def es_score(engine):
top_class.cpu().numpy().flatten().tolist()[0]
)
f.close()
- from sklearn.metrics import roc_auc_score
print("predictions", predictions)
print("targets", targets)
@@ -1169,13 +738,15 @@ def es_score(engine):
and not classification
and config.model.output_features > 1
):
- net.eval()
+ best_model.eval()
+ # net.eval()
mem = []
with torch.no_grad():
ids = test_loader.dataset.ids # [test_loader.dataset.indices]
for dat, id in zip(test_loader, ids):
g, lg, target = dat
- out_data = net([g.to(device), lg.to(device)])
+ out_data = best_model([g.to(device), lg.to(device)])["out"]
+ # out_data = net([g.to(device), lg.to(device)])["out"]
out_data = out_data.cpu().numpy().tolist()
if config.standard_scalar_and_pca:
sc = pk.load(open("sc.pkl", "rb"))
@@ -1198,8 +769,10 @@ def es_score(engine):
config.write_predictions
and not classification
and config.model.output_features == 1
+ and config.model.gradwise_weight == 0
):
- net.eval()
+ best_model.eval()
+ # net.eval()
f = open(
os.path.join(config.output_dir, "prediction_results_test_set.csv"),
"w",
@@ -1211,7 +784,8 @@ def es_score(engine):
ids = test_loader.dataset.ids # [test_loader.dataset.indices]
for dat, id in zip(test_loader, ids):
g, lg, target = dat
- out_data = net([g.to(device), lg.to(device)])
+ out_data = best_model([g.to(device), lg.to(device)])["out"]
+ # out_data = net([g.to(device), lg.to(device)])["out"]
out_data = out_data.cpu().numpy().tolist()
if config.standard_scalar_and_pca:
sc = pk.load(
@@ -1232,59 +806,45 @@ def es_score(engine):
"Test MAE:",
mean_absolute_error(np.array(targets), np.array(predictions)),
)
- if config.store_outputs and not classification:
- # save training targets and predictions here
- # TODO: Add IDs
- resultsfile = os.path.join(
+ best_model.eval()
+ # net.eval()
+ f = open(
+ os.path.join(
config.output_dir, "prediction_results_train_set.csv"
- )
-
- target_vals, predictions = [], []
-
- for tgt, pred in history["trainEOS"]:
- target_vals.append(tgt.cpu().numpy().tolist())
- predictions.append(pred.cpu().numpy().tolist())
-
- target_vals = np.array(target_vals, dtype="float").flatten()
- predictions = np.array(predictions, dtype="float").flatten()
-
- with open(resultsfile, "w") as f:
- print("target,prediction", file=f)
- for target_val, predicted_val in zip(target_vals, predictions):
- print(f"{target_val}, {predicted_val}", file=f)
-
- # TODO: Fix IDs for train loader
- """
- if config.write_train_predictions:
- net.eval()
- f = open("train_prediction_results.csv", "w")
- f.write("id,target,prediction\n")
+ ),
+ "w",
+ )
+ f.write("target,prediction\n")
+ targets = []
+ predictions = []
with torch.no_grad():
- ids = train_loader.dataset.dataset.ids[
- train_loader.dataset.indices
- ]
- print("lens", len(ids), len(train_loader.dataset.dataset))
- x = []
- y = []
-
+ ids = train_loader.dataset.ids # [test_loader.dataset.indices]
for dat, id in zip(train_loader, ids):
g, lg, target = dat
- out_data = net([g.to(device), lg.to(device)])
+ out_data = best_model([g.to(device), lg.to(device)])["out"]
+ # out_data = net([g.to(device), lg.to(device)])["out"]
out_data = out_data.cpu().numpy().tolist()
+ if config.standard_scalar_and_pca:
+ sc = pk.load(
+ open(os.path.join(tmp_output_dir, "sc.pkl"), "rb")
+ )
+ out_data = sc.transform(np.array(out_data).reshape(-1, 1))[
+ 0
+ ][0]
target = target.cpu().numpy().flatten().tolist()
- for i, j in zip(out_data, target):
- x.append(i)
- y.append(j)
- for i, j, k in zip(ids, x, y):
- f.write("%s, %6f, %6f\n" % (i, j, k))
+ # if len(target) == 1:
+ # target = target[0]
+ # if len(out_data) == 1:
+ # out_data = out_data[0]
+ for ii, jj in zip(target, out_data):
+ f.write("%6f, %6f\n" % (ii, jj))
+ targets.append(ii)
+ predictions.append(jj)
f.close()
- """
- return history
-
if __name__ == "__main__":
config = TrainingConfig(
random_seed=123, epochs=10, n_train=32, n_val=32, batch_size=16
)
- history = train_dgl(config, progress=True)
+ history = train_dgl(config)
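With the ignite engines removed, train_dgl now runs a plain PyTorch loop and writes its outputs (history_train.json, history_val.json, model checkpoints, prediction CSVs) into config.output_dir. A minimal smoke-run sketch mirroring the __main__ guard above; the small epoch and split sizes are chosen only to keep it quick, and all other fields fall back to the TrainingConfig defaults:

```python
from alignn.config import TrainingConfig
from alignn.train import train_dgl

# Tiny configuration for a quick smoke run; remaining fields use TrainingConfig defaults.
config = TrainingConfig(
    random_seed=123, epochs=2, n_train=32, n_val=32, batch_size=16
)
train_dgl(config)  # results land in config.output_dir (history_*.json, *.pt, CSVs)
```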
diff --git a/alignn/train_folder_ff.py b/alignn/train_alignn.py
similarity index 77%
rename from alignn/train_folder_ff.py
rename to alignn/train_alignn.py
index 5070114c..2a47052e 100644
--- a/alignn/train_folder_ff.py
+++ b/alignn/train_alignn.py
@@ -2,19 +2,19 @@
"""Module to train for a folder with formatted dataset."""
import os
-
-# import numpy as np
+import csv
import sys
+import json
+import zipfile
from alignn.data import get_train_val_loaders
from alignn.train import train_dgl
from alignn.config import TrainingConfig
from jarvis.db.jsonutils import loadjson
import argparse
from alignn.models.alignn_atomwise import ALIGNNAtomWise, ALIGNNAtomWiseConfig
-
-# from alignn.models.alignn import ALIGNN, ALIGNNConfig
import torch
import time
+from jarvis.core.atoms import Atoms
device = "cpu"
if torch.cuda.is_available():
@@ -133,7 +133,26 @@ def train_for_folder(
output_dir=None,
):
"""Train for a folder."""
- dat = loadjson(os.path.join(root_dir, "id_prop.json"))
+ id_prop_json = os.path.join(root_dir, "id_prop.json")
+ id_prop_json_zip = os.path.join(root_dir, "id_prop.json.zip")
+ id_prop_csv = os.path.join(root_dir, "id_prop.csv")
+ id_prop_csv_file = False
+ multioutput = False
+ # lists_length_equal = True
+ if os.path.exists(id_prop_json_zip):
+ dat = json.loads(
+ zipfile.ZipFile(id_prop_json_zip).read("id_prop.json")
+ )
+ elif os.path.exists(id_prop_json):
+ dat = loadjson(os.path.join(root_dir, "id_prop.json"))
+ elif os.path.exists(id_prop_csv):
+ id_prop_csv_file = True
+ with open(id_prop_csv, "r") as f:
+ reader = csv.reader(f)
+ dat = [row for row in reader]
+ print("id_prop_csv_file exists", id_prop_csv_file)
+ else:
+ raise FileNotFoundError("Check dataset file: no id_prop.json, id_prop.json.zip or id_prop.csv found.")
config_dict = loadjson(config_name)
config = TrainingConfig(**config_dict)
if type(config) is dict:
@@ -154,30 +173,68 @@ def train_for_folder(
train_grad = False
train_stress = False
- if config.model.gradwise_weight != 0:
+ train_atom = False
+ if config.model.calculate_gradient and config.model.gradwise_weight != 0:
train_grad = True
- if config.model.stresswise_weight != 0:
+ else:
+ train_grad = False
+ if config.model.calculate_gradient and config.model.stresswise_weight != 0:
train_stress = True
- train_atom = False
+ else:
+ train_stress = False
if config.model.atomwise_weight != 0:
train_atom = True
-
- if config.model.atomwise_weight == 0:
+ else:
train_atom = False
- if config.model.gradwise_weight == 0:
- train_grad = False
- if config.model.stresswise_weight == 0:
- train_stress = False
+
+ # if config.model.atomwise_weight == 0:
+ # train_atom = False
+ # if config.model.gradwise_weight == 0:
+ # train_grad = False
+ # if config.model.stresswise_weight == 0:
+ # train_stress = False
target_atomwise = None # "atomwise_target"
target_grad = None # "atomwise_grad"
target_stress = None # "stresses"
# mem = []
# enp = []
+ n_outputs = []
dataset = []
for i in dat:
info = {}
- info["target"] = i[target_key]
+ if id_prop_csv_file:
+ file_name = i[0]
+ tmp = [float(j) for j in i[1:]] # float(i[1])
+ info["jid"] = file_name
+
+ if len(tmp) == 1:
+ tmp = tmp[0]
+ else:
+ multioutput = True
+ n_outputs.append(tmp)
+ info["target"] = tmp
+ file_path = os.path.join(root_dir, file_name)
+ if file_format == "poscar":
+ atoms = Atoms.from_poscar(file_path)
+ elif file_format == "cif":
+ atoms = Atoms.from_cif(file_path)
+ elif file_format == "xyz":
+ atoms = Atoms.from_xyz(file_path, box_size=500)
+ elif file_format == "pdb":
+ # Note: using a 500 angstrom box size
+ # Recommended: install pytraj
+ # conda install -c ambermd pytraj
+ atoms = Atoms.from_pdb(file_path, max_lat=500)
+ else:
+ raise NotImplementedError(
+ "File format not implemented", file_format
+ )
+ info["atoms"] = atoms.to_dict()
+ else:
+ info["target"] = i[target_key]
+ info["atoms"] = i["atoms"]
+ info["jid"] = i[id_key]
if train_atom:
target_atomwise = "atomwise_target"
info["atomwise_target"] = i[atomwise_key] # such as charges
@@ -189,35 +246,32 @@ def train_for_folder(
target_stress = "stresses"
if "extra_features" in i:
info["extra_features"] = i["extra_features"]
- info["atoms"] = i["atoms"]
- info["jid"] = i[id_key]
dataset.append(info)
print("len dataset", len(dataset))
- n_outputs = []
- multioutput = False
+ del dat
+ # multioutput = False
lists_length_equal = True
line_graph = False
alignn_models = {
- "alignn",
+ # "alignn",
# "alignn_layernorm",
"alignn_atomwise",
}
- if config.model.name == "clgn":
- line_graph = True
- if config.model.name == "cgcnn":
- line_graph = True
- if config.model.name == "icgcnn":
- line_graph = True
- if config.model.name in alignn_models and config.model.alignn_layers > 0:
+ if config.model.alignn_layers > 0:
line_graph = True
if multioutput:
+ print("multioutput", multioutput)
lists_length_equal = False not in [
len(i) == len(n_outputs[0]) for i in n_outputs
]
- print("lists_length_equal", lists_length_equal)
+ print("lists_length_equal", lists_length_equal, len(n_outputs[0]))
+ if lists_length_equal:
+ config.model.output_features = len(n_outputs[0])
+ else:
+ raise ValueError("Make sure the outputs are of same size.")
model = None
if restart_model_path is not None:
# Should be best_model.pt file
@@ -316,7 +370,7 @@ def train_for_folder(
keep_data_order=config.keep_data_order,
output_dir=config.output_dir,
)
-
+ # print("dataset", dataset[0])
t1 = time.time()
train_dgl(
config,
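train_alignn.py now reads id_prop.csv directly when no id_prop.json(.zip) is present: each row is a structure file name followed by one or more target values, and more than one value switches on multi-output training. A small stand-alone sketch of that row parsing with a made-up two-row file:

```python
import csv
import io

# Hypothetical id_prop.csv contents: file name, then one or more target values.
id_prop = io.StringIO("POSCAR-JVASP-1002.vasp,1.23\nPOSCAR-JVASP-816.vasp,0.45\n")

for row in csv.reader(id_prop):
    file_name = row[0]
    targets = [float(x) for x in row[1:]]
    # A single value stays a scalar; more than one value flags multi-output training.
    target = targets[0] if len(targets) == 1 else targets
    print(file_name, target)
```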
diff --git a/alignn/train_folder.py b/alignn/train_folder.py
deleted file mode 100644
index f59870b3..00000000
--- a/alignn/train_folder.py
+++ /dev/null
@@ -1,244 +0,0 @@
-#!/usr/bin/env python
-
-"""Module to train for a folder with formatted dataset."""
-import csv
-import os
-import sys
-import time
-from jarvis.core.atoms import Atoms
-from alignn.data import get_train_val_loaders
-from alignn.train import train_dgl
-from alignn.config import TrainingConfig
-from jarvis.db.jsonutils import loadjson
-import argparse
-import glob
-import torch
-
-device = "cpu"
-if torch.cuda.is_available():
- device = torch.device("cuda")
-
-
-parser = argparse.ArgumentParser(
- description="Atomistic Line Graph Neural Network"
-)
-parser.add_argument(
- "--root_dir",
- default="./",
- help="Folder with id_props.csv, structure files",
-)
-parser.add_argument(
- "--config_name",
- default="alignn/examples/sample_data/config_example.json",
- help="Name of the config file",
-)
-
-parser.add_argument(
- "--file_format", default="poscar", help="poscar/cif/xyz/pdb file format."
-)
-
-# parser.add_argument(
-# "--keep_data_order",
-# default=True,
-# help="Whether to randomly shuffle samples, True/False",
-# )
-
-parser.add_argument(
- "--classification_threshold",
- default=None,
- help="Floating point threshold for converting into 0/1 class"
- + ", use only for classification tasks",
-)
-
-parser.add_argument(
- "--batch_size", default=None, help="Batch size, generally 64"
-)
-
-parser.add_argument(
- "--epochs", default=None, help="Number of epochs, generally 300"
-)
-
-parser.add_argument(
- "--output_dir",
- default="./",
- help="Folder to save outputs",
-)
-
-parser.add_argument(
- "--device",
- default=None,
- help="set device for training the model [e.g. cpu, cuda, cuda:2]",
-)
-
-parser.add_argument(
- "--restart_model_path",
- default=None,
- help="Checkpoint file path for model",
-)
-
-
-def train_for_folder(
- root_dir="examples/sample_data",
- config_name="config.json",
- # keep_data_order=False,
- classification_threshold=None,
- batch_size=None,
- epochs=None,
- restart_model_path=None,
- file_format="poscar",
- output_dir=None,
-):
- """Train for a folder."""
- # config_dat=os.path.join(root_dir,config_name)
- id_prop_dat = os.path.join(root_dir, "id_prop.csv")
- config = loadjson(config_name)
- if type(config) is dict:
- try:
- config = TrainingConfig(**config)
- except Exception as exp:
- print("Check", exp)
-
- # config.keep_data_order = keep_data_order
- if classification_threshold is not None:
- config.classification_threshold = float(classification_threshold)
- if output_dir is not None:
- config.output_dir = output_dir
- if batch_size is not None:
- config.batch_size = int(batch_size)
- if epochs is not None:
- config.epochs = int(epochs)
- if restart_model_path is not None:
- print("Restarting model from:", restart_model_path)
- from alignn.models.alignn import ALIGNN, ALIGNNConfig
-
- rest_config = loadjson(os.path.join(restart_model_path, "config.json"))
- print("rest_config", rest_config)
- model = ALIGNN(ALIGNNConfig(**rest_config["model"]))
- chk_glob = os.path.join(restart_model_path, "*.pt")
- tmp = "na"
- for i in glob.glob(chk_glob):
- tmp = i
- print("Checkpoint file", tmp)
- model.load_state_dict(torch.load(tmp, map_location=device)["model"])
- model.to(device)
- else:
- model = None
- with open(id_prop_dat, "r") as f:
- reader = csv.reader(f)
- data = [row for row in reader]
-
- dataset = []
- n_outputs = []
- multioutput = False
- lists_length_equal = True
- for i in data:
- info = {}
- file_name = i[0]
- file_path = os.path.join(root_dir, file_name)
- if file_format == "poscar":
- atoms = Atoms.from_poscar(file_path)
- elif file_format == "cif":
- atoms = Atoms.from_cif(file_path)
- elif file_format == "xyz":
- # Note using 500 angstrom as box size
- atoms = Atoms.from_xyz(file_path, box_size=500)
- elif file_format == "pdb":
- # Note using 500 angstrom as box size
- # Recommended install pytraj
- # conda install -c ambermd pytraj
- atoms = Atoms.from_pdb(file_path, max_lat=500)
- else:
- raise NotImplementedError(
- "File format not implemented", file_format
- )
-
- info["atoms"] = atoms.to_dict()
- info["jid"] = file_name
-
- tmp = [float(j) for j in i[1:]] # float(i[1])
- if len(tmp) == 1:
- tmp = tmp[0]
- else:
- multioutput = True
- info["target"] = tmp # float(i[1])
- n_outputs.append(info["target"])
- dataset.append(info)
- if multioutput:
- lists_length_equal = False not in [
- len(i) == len(n_outputs[0]) for i in n_outputs
- ]
-
- # print ('n_outputs',n_outputs[0])
- if multioutput and classification_threshold is not None:
- raise ValueError("Classification for multi-output not implemented.")
- if multioutput and lists_length_equal:
- config.model.output_features = len(n_outputs[0])
- else:
- # TODO: Pad with NaN
- if not lists_length_equal:
- raise ValueError("Make sure the outputs are of same size.")
- else:
- config.model.output_features = 1
- (
- train_loader,
- val_loader,
- test_loader,
- prepare_batch,
- ) = get_train_val_loaders(
- dataset_array=dataset,
- target=config.target,
- n_train=config.n_train,
- n_val=config.n_val,
- n_test=config.n_test,
- train_ratio=config.train_ratio,
- val_ratio=config.val_ratio,
- test_ratio=config.test_ratio,
- batch_size=config.batch_size,
- atom_features=config.atom_features,
- neighbor_strategy=config.neighbor_strategy,
- standardize=config.atom_features != "cgcnn",
- id_tag=config.id_tag,
- pin_memory=config.pin_memory,
- workers=config.num_workers,
- save_dataloader=config.save_dataloader,
- use_canonize=config.use_canonize,
- filename=config.filename,
- cutoff=config.cutoff,
- max_neighbors=config.max_neighbors,
- output_features=config.model.output_features,
- classification_threshold=config.classification_threshold,
- target_multiplication_factor=config.target_multiplication_factor,
- standard_scalar_and_pca=config.standard_scalar_and_pca,
- keep_data_order=config.keep_data_order,
- output_dir=config.output_dir,
- )
- t1 = time.time()
- train_dgl(
- config,
- model,
- train_val_test_loaders=[
- train_loader,
- val_loader,
- test_loader,
- prepare_batch,
- ],
- )
- t2 = time.time()
- print("Time taken (s):", t2 - t1)
-
- # train_data = get_torch_dataset(
-
-
-if __name__ == "__main__":
- args = parser.parse_args(sys.argv[1:])
- train_for_folder(
- root_dir=args.root_dir,
- config_name=args.config_name,
- # keep_data_order=args.keep_data_order,
- classification_threshold=args.classification_threshold,
- output_dir=args.output_dir,
- batch_size=(args.batch_size),
- epochs=(args.epochs),
- file_format=(args.file_format),
- restart_model_path=(args.restart_model_path),
- )
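The block removed above is a legacy folder-training command-line entry point (train_for_folder plus its argparse wrapper); the setup.py hunk later in this patch registers alignn/train_alignn.py in place of the removed training scripts. A minimal usage sketch, assuming the renamed module keeps a train_for_folder function with the same keyword arguments as the code deleted here (an assumption, not something this hunk shows):

    # Sketch only: alignn.train_alignn is assumed to expose train_for_folder
    # with the same signature as the removed folder-training script.
    from alignn.train_alignn import train_for_folder

    train_for_folder(
        root_dir="alignn/examples/sample_data",
        config_name="alignn/examples/sample_data/config_example.json",
        file_format="poscar",
        output_dir="./temp_out",  # hypothetical output folder
    )
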
diff --git a/alignn/utils.py b/alignn/utils.py
index 5125135c..cc6e57b9 100644
--- a/alignn/utils.py
+++ b/alignn/utils.py
@@ -1,10 +1,11 @@
"""Shared pydantic settings configuration."""
+
import json
from pathlib import Path
from typing import Union
import matplotlib.pyplot as plt
-from pydantic import BaseSettings as PydanticBaseSettings
+from pydantic_settings import BaseSettings as PydanticBaseSettings
class BaseSettings(PydanticBaseSettings):
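The alignn/utils.py change above imports BaseSettings from pydantic_settings, which is where pydantic v2 relocated it. For code that may still be run against a pydantic v1 install, a small import fallback is a common compatibility pattern; the following is a sketch and not part of this patch:

    # Sketch: prefer the pydantic v2 location, fall back to the v1 import.
    try:
        from pydantic_settings import BaseSettings
    except ImportError:  # pydantic < 2 environments
        from pydantic import BaseSettings
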
diff --git a/setup.py b/setup.py
index 13ca5263..b4027ff5 100644
--- a/setup.py
+++ b/setup.py
@@ -10,44 +10,36 @@
setuptools.setup(
name="alignn",
- version="2024.2.4",
+ version="2024.3.4",
author="Kamal Choudhary, Brian DeCost",
author_email="kamal.choudhary@nist.gov",
description="alignn",
install_requires=[
- "numpy>=1.19.5",
+ "numpy>=1.19.5,<2.0.0",
"scipy>=1.6.1",
"jarvis-tools>=2021.07.19",
"torch<=2.0.0",
+ "mpmath<=1.3.0",
"dgl>=0.6.0",
"spglib<=2.0.2",
"scikit-learn>=0.22.2",
"matplotlib>=3.4.1",
"tqdm>=4.60.0",
"pandas>=1.2.3",
- "pytorch-ignite>=0.5.0.dev20221024",
- "pydantic==1.8.1",
+ "pydantic>=1.8.1",
+ "pydantic-settings",
"flake8>=3.9.1",
"pycodestyle>=2.7.0",
"pydocstyle>=6.0.0",
"pyparsing>=2.2.1,<3",
"ase",
- "accelerate>=0.20.3",
+ # "pytorch-ignite>=0.5.0.dev20221024",
+ # "accelerate>=0.20.3",
# "dgl-cu101>=0.6.0",
],
- # package_data={
- # "alignn.ff.alignnff_wt10": ["best_model.pt", "config.json"],
- # "alignn.ff.alignnff_wt1": ["best_model.pt", "config.json"],
- # "alignn.ff.alignnff_wt01": ["best_model.pt", "config.json"],
- # "alignn.ff.revised": ["best_model.pt", "config.json"],
- # "alignn.ff.fmult_mlearn_only": ["best_model.pt", "config.json"],
- # "alignn.ff.alignnff_fd": ["best_model.pt", "config.json"],
- # "alignn.ff.alignnff_fmult": ["best_model.pt", "config.json"],
- # },
scripts=[
"alignn/pretrained.py",
- "alignn/train_folder.py",
- "alignn/train_folder_ff.py",
+ "alignn/train_alignn.py",
"alignn/run_alignn_ff.py",
],
long_description=long_description,
@@ -59,5 +51,5 @@
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
- python_requires=">=3.7",
+ python_requires=">=3.9",
)
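
Taken together, the setup.py changes tighten the dependency pins (numpy<2.0.0, mpmath<=1.3.0), comment out pytorch-ignite and accelerate, raise python_requires to >=3.9, and register alignn/train_alignn.py in place of the two removed training scripts. A quick environment check for two of the new pins, as a sketch (it assumes the packaging library is available, which this patch does not declare):

    # Sketch: verify two of the tightened pins in the active environment.
    from importlib.metadata import version  # stdlib on Python >= 3.8
    from packaging.version import Version   # assumed installed; not pinned by this patch

    print("numpy<2.0.0 satisfied:", Version(version("numpy")) < Version("2.0.0"))
    print("pydantic>=1.8.1 satisfied:", Version(version("pydantic")) >= Version("1.8.1"))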