Add FigShare datasets, DOS in QE, Uncertainty.

usnistgov · Aug 11, 2023 · 4477c4a · 4477c4a
1 parent 30eae92
commit 4477c4a
Show file tree

Hide file tree

Showing 6 changed files with 338 additions and 18 deletions.
diff --git a/jarvis/__init__.py b/jarvis/__init__.py
@@ -1,5 +1,5 @@
 """Version number."""
-__version__ = "2023.05.26"
+__version__ = "2023.08.01"
 
 import os
 

diff --git a/jarvis/ai/uncertainty/gaussian_process_uncertainty.py b/jarvis/ai/uncertainty/gaussian_process_uncertainty.py
@@ -0,0 +1,193 @@
+"""
+Code to predict properties and their uncertainty.
+
+ML model used: lgbm
+"""
+# Ref: https://doi.org/10.1021/acsomega.1c03752
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from jarvis.ai.pkgs.utils import regr_scores
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF
+from sklearn.gaussian_process.kernels import WhiteKernel
+from sklearn.gaussian_process.kernels import RationalQuadratic
+
+# from sklearn.gaussian_process.kernels import ExpSineSquared
+# from sklearn.gaussian_process.kernels import ConstantKernel as C
+# import joblib
+from joblib import dump
+from collections import OrderedDict
+
+
+def GaussianProcesses(
+    x,
+    y,
+    jid,
+    cv=2,
+    n_jobs=-1,
+    n_iter=10,
+    random_state=508842607,
+    scoring="neg_mean_absolute_error",
+    prop="exfoliation_energy",
+    write_files=True,
+):
+    """
+    Perform Gaussian Processes and determine prediction intervals.
+
+    NOTE: it can deal with ~50 descriptors AT MAX ==> reduction of
+          the CFID descriptors is needed
+    """
+    # TODO: Make writing file in proper python format
+
+    # STEP-2: Splitting the data
+    # ***************************
+    # 90-10% split for train test
+
+    X_train, X_test, y_train, y_test, jid_train, jid_test = train_test_split(
+        x, y, jid, random_state=1, test_size=0.1
+    )
+    # print ('lenx len y',len(x[0]),len(y))
+
+    # STEP-3: Use a specific ML modeli: GP in this case
+    # *************************************************
+    #
+    # Model definition
+    # ================
+    # The Kernel can be chanded depending on the property under examination.
+    # In this example we use a composite kernel that includes long range
+    # effects (RBF), medium term irregularities (RationalQuadratic), and
+    # white noise
+    # Periodic trends can be added using ExpSineSquared
+    # Note: RBF has one parameter for each descriptor used ==> if using a
+    #       different number of descriptors, the number of parameters has
+    #       to be changed
+    RQ1 = 316**2 * RationalQuadratic(
+        alpha=0.067,
+        length_scale=43.5,
+        length_scale_bounds=(1e0, 1e2),
+        alpha_bounds=(1e-3, 1e1),
+    )
+    RQ2 = 316**2 * RationalQuadratic(
+        alpha=0.00473,
+        length_scale=0.0796,
+        length_scale_bounds=(1e-2, 1e0),
+        alpha_bounds=(1e-3, 1e1),
+    )
+    RBF1 = 316**2 * RBF(
+        length_scale=[
+            16,
+            1.48e03,
+            9.92e04,
+            0.858,
+            1.02,
+            2.58,
+            2.72e03,
+            2.22e03,
+            0.839,
+            0.5,
+            6.15,
+            130,
+            3.9e03,
+            3.1,
+            0.675,
+            6.04,
+            1.18e03,
+            2.9,
+            4.56e03,
+            333,
+            0.0167,
+            797,
+            2.37e03,
+            1.81e03,
+        ]
+    )
+    White = WhiteKernel(noise_level=0.7, noise_level_bounds=(0.05, 0.8))
+    kernel = RQ1 + RQ2 + RBF1 + White
+
+    mid_model = GaussianProcessRegressor(
+        kernel=kernel,
+        alpha=0.0001,
+        n_restarts_optimizer=3,
+        normalize_y=True,
+        random_state=int(774057201),
+    )
+
+    # Model fitting
+    # =============
+    scaler = StandardScaler().fit(X_train)
+    scaler.transform(X_train)
+    scaler.transform(X_test)
+
+    mid_model.fit(X_train, y_train)
+
+    dump(mid_model, "model.joblib")
+    print("")
+    print("Params:")
+    print(mid_model.get_params(deep=True))
+    print("Initial: ", kernel)
+    print("Optimum: ", mid_model.kernel_)
+    print(
+        "Log-Marginal-Likelihood: ",
+        mid_model.log_marginal_likelihood(mid_model.kernel_.theta),
+    )
+    print("")
+    print("")
+
+    # PREDICTIONS and UQ
+    # ==================
+    pred, sigma = mid_model.predict(X_test, return_std=True)
+    print("Model       mae     rmse")
+    reg_sc = regr_scores(y_test, pred)
+    info = OrderedDict()
+    info["MAE_Mid"] = reg_sc["mae"]
+    actual = y_test
+    print("Mid:", round(reg_sc["mae"], 3), round(reg_sc["rmse"], 3))
+
+    fout2 = open("Intervals1.dat", "w")
+    line0 = "#    Jid      Observed       pred_Lower       pred_Mid    "
+    line1 = line0 + "RealErr(Mid)     "
+    line = line1 + "pred_Upper    PredError    Pred_inBounds\n"
+    fout2.write(line)
+    sum = 0.0
+    count = 0
+    MAE_err = 0.0
+    for ii in range(len(y_test)):
+        ss = float(sigma[ii])
+        pp = float(pred[ii])
+        sum = sum + ss
+        lower = pp - ss
+        upper = pp + ss
+        err = ss
+        real_err = float(abs(y_test[ii] - pp))
+        err_err = abs(real_err - err)
+        MAE_err = MAE_err + err_err
+        if (pp >= lower) and (pp <= upper):
+            in_bounds = "True"
+            count = count + 1
+        else:
+            in_bounds = "False"
+        line2_0 = str(ii) + " " + jid[ii] + " " + str(y_test[ii]) + " "
+        line2_1 = line2_0 + str(lower) + " " + str(pp) + " " + str(real_err)
+        line2 = (
+            line2_1
+            + " "
+            + str(upper)
+            + " "
+            + str(ss)
+            + " "
+            + str(in_bounds)
+            + "\n"
+        )
+        fout2.write(line2)
+    print("")
+    print("Number of test materials= " + str(len(actual)))
+    print(
+        "Percentage of in-bound results= "
+        + str((float(count) / (len(actual))) * 100)
+        + "%"
+    )
+    print(" ")
+    MAE_error = float(MAE_err) / (len(actual))
+    print("MAE predicted error = " + str(MAE_error))
+    info["MAE_Error"] = MAE_error
+    return info
diff --git a/jarvis/ai/uncertainty/lgbm_quantile_uncertainty.py b/jarvis/ai/uncertainty/lgbm_quantile_uncertainty.py
@@ -3,7 +3,7 @@
 
 ML model used: lgbm
 """
-
+# Ref: https://doi.org/10.1021/acsomega.1c03752
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
 import pickle

diff --git a/jarvis/db/figshare.py b/jarvis/db/figshare.py
@@ -357,6 +357,41 @@ def get_db_info():
             "Obtaining TinNet OH dataset 748...",
             "https://github.com/hlxin/tinnet",
         ],
+        # https://doi.org/10.6084/m9.figshare.23909478
+        "AGRA_O": [
+            "https://figshare.com/ndownloader/files/41923284",
+            "AGRA_O.json",
+            "Obtaining AGRA Oxygen dataset 1000...",
+            "https://github.com/Feugmo-Group/AGRA",
+        ],
+        # https://doi.org/10.6084/m9.figshare.23909478
+        "AGRA_OH": [
+            "https://figshare.com/ndownloader/files/41923287",
+            "AGRA_OH.json",
+            "Obtaining AGRA OH dataset 875...",
+            "https://github.com/Feugmo-Group/AGRA",
+        ],
+        # https://doi.org/10.6084/m9.figshare.23909478
+        "AGRA_CO": [
+            "https://figshare.com/ndownloader/files/41923278",
+            "AGRA_CO.json",
+            "Obtaining AGRA CO dataset 193...",
+            "https://github.com/Feugmo-Group/AGRA",
+        ],
+        # https://doi.org/10.6084/m9.figshare.23909478
+        "AGRA_CHO": [
+            "https://figshare.com/ndownloader/files/41923275",
+            "AGRA_CHO.json",
+            "Obtaining AGRA Oxygen dataset 214...",
+            "https://github.com/Feugmo-Group/AGRA",
+        ],
+        # https://doi.org/10.6084/m9.figshare.23909478
+        "AGRA_COOH": [
+            "https://figshare.com/ndownloader/files/41923281",
+            "AGRA_COOH.json",
+            "Obtaining AGRA COOH dataset 280...",
+            "https://github.com/Feugmo-Group/AGRA",
+        ],
         # https://doi.org/10.6084/m9.figshare.21370572
         "supercon_3d": [
             "https://figshare.com/ndownloader/files/38307921",
@@ -610,7 +645,7 @@ def get_hk_tb(k=np.array([0, 0, 0]), w=[]):
     hk = np.zeros((w.nwan, w.nwan), dtype=complex)
     kmat = np.tile(k, (nr, 1))
     exp_ikr = np.exp(1.0j * 2 * np.pi * np.sum(kmat * w.R, 1))
-    temp = np.zeros(w.nwan ** 2, dtype=complex)
+    temp = np.zeros(w.nwan**2, dtype=complex)
     for i in range(nr):
         temp += exp_ikr[i] * w.HR[i, :]
     hk = np.reshape(temp, (w.nwan, w.nwan))