upload tool and update samples for cv
k-yoshimi committed Sep 20, 2021
1 parent 101b19b commit eb5327d
Showing 6 changed files with 460 additions and 4 deletions.
15 changes: 15 additions & 0 deletions samples/tool/cv/base/param.in
@@ -0,0 +1,15 @@
# INPUT/OUTPUT
statistics="fermion"
beta=100
filein_G="G_train.in"
column=1
fileout_spec="spectrum.dat"
# OMEGA
Nomega=1001
omegamin=-4
omegamax=4
# ADMM
lambdalogbegin=0
lambdalogend=-6
tolerance=1e-10
maxiteration=10000
4 changes: 2 additions & 2 deletions samples/tool/cv/input_10fold.toml
@@ -5,6 +5,6 @@ nsamples = 20
 k_fold = 10
 [file]
 work_dir = "input"
-path_to_job = "base"
 [job]
-cmd = ["SpM.out"]
+path_to_job = ["./SpM.out"]
+cmd = ["./SpM.out", "param.in"]
4 changes: 2 additions & 2 deletions samples/tool/cv/input_LOO.toml
@@ -5,6 +5,6 @@ nsamples = 20
 k_fold = 2
 [file]
 work_dir = "input"
-path_to_job = "base"
 [job]
-cmd = ["SpM.out"]
+path_to_job = ["./SpM.out"]
+cmd = ["./SpM.out", "param.in"]
87 changes: 87 additions & 0 deletions tool/README.md
@@ -0,0 +1,87 @@
# Wrapper & score

## About This Directory

The structure of this directory is shown below.

```
.
├── wrapper_cv_Gtau.py # Wrapper for generating data sets for CV
└── calc_cv_score.py # Calculating scores
```

## Usage

### Making datasets for Cross-Validation

`wrapper_cv_Gtau.py` generates input files for CV.

To use this script, first prepare the data set of imaginary-time Green's functions.
The files containing the imaginary-time Green's functions must be named `Gtau_xx.in`, where `xx` is an integer.
`wrapper_cv_Gtau.py` then generates training and validation files for CV by averaging the imaginary-time Green's functions.
You can also run the SpM calculation by setting the `[job]` section in the input file of `wrapper_cv_Gtau.py`.
Two CV methods are available for CV: leave-one-out (LOO) and K-fold. The splitting idea is sketched below.
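The averaging can be pictured with a short sketch. This is one plausible reading of the procedure, not the actual implementation of `wrapper_cv_Gtau.py` (which is not shown in this commit view); the function name and the in-memory representation are hypothetical, and file headers and I/O are omitted.

```python
import numpy as np

def kfold_split_average(samples, k):
    """Split samples (an nsamples x ntau array) in order into k folds;
    for each fold, average the held-out part and the rest."""
    folds = np.array_split(np.asarray(samples), k)  # consecutive blocks, from the head
    for i in range(k):
        g_test = folds[i].mean(axis=0)   # average of the held-out fold -> G_test.in
        rest = np.concatenate([f for j, f in enumerate(folds) if j != i])
        g_train = rest.mean(axis=0)      # average of the remaining samples -> G_train.in
        yield g_train, g_test
```

For LOO, `k` equals the number of samples, so each held-out fold contains a single sample.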

### Input file

The parameters of the input file for `wrapper_cv_Gtau.py` are explained below.

- \[cond\] section
  - cv_type
    - type: str
    - Description: Cross-validation method: `LOO` or `Kfold`.
  - nsamples
    - type: int
    - Description: The total number of samples.
- \[param\] section
  - k_fold
    - type: int
    - Description: The number of folds used in the Kfold method.
- \[file\] section
  - work_dir
    - type: str
    - Description: The name of the working directory.
- \[job\] section
  - cmd
    - type: list
    - Description: Command for executing SpM in the working directory.
      e.g. \["./SpM.out"\], \["sh", "./spm.sh"\], \["qsub", "./spm.sh"\]
  - path_to_job
    - type: str
    - Description: Path to the base script or executable file.
      This file is copied to each working directory.

If `cmd` in the \[job\] section is not defined, only the input files are generated. A minimal example of the input file follows.
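For reference, here is the LOO sample input from `samples/tool/cv` (the `[cond]` and `[param]` lines are reconstructed from the parameter table above, since the diff earlier on this page shows only the tail of the file):

```
[cond]
cv_type = "LOO"
nsamples = 20
[param]
k_fold = 2
[file]
work_dir = "input"
[job]
path_to_job = ["./SpM.out"]
cmd = ["./SpM.out", "param.in"]
```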

### Example

Sample files (`*.toml` and `input/Gtau_xx.in`) are available at `samples/tool/cv`.

1. Leave one out (LOO)

In the LOO method, one sample is held out from the full set in turn, producing one data set for each possible combination.
Type as follows if the total number of samples is 20.

$ python wrapper_cv_Gtau.py -i input_LOO.toml

If successful, data sets including `G_test.in` and `G_train.in` are created in the `input/20samples_leave_one_out/` directory; the resulting layout is sketched below.
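The layout, inferred from the paths used by `calc_cv_score.py`, looks roughly like this (fold indices run from 0):

```
input/20samples_leave_one_out/
├── 0/
│   ├── G_train.in
│   ├── G_test.in
│   └── output/lambda/lambda_<value>/y_tw.dat   # present after the SpM run
├── 1/
│   ...
└── 19/
```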

2. Kfold

In the K-fold method, all samples are divided in order from the head into K parts, and each part is held out in turn as a unit, creating K combinations of data sets.
Type as follows if the total number of samples is 20 and K = 10.

$ python wrapper_cv_Gtau.py -i input_10fold.toml

If successful, data sets are created in the `input/20samples_10fold/` directory.

## Score

### Calculating score
The script `calc_cv_score.py` calculates the scores.
It takes the same input file as the wrapper used above to generate the CV data sets.
For example, for LOO, you can obtain the scores by typing

$ python calc_cv_score.py -i input_LOO.toml

If successful, `input/20samples_leave_one_out/score.dat` is created; the score and the file format are explained below.
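As implemented in `calc_cv_score.py`, the score for each value of lambda is the coefficient of determination of the reconstructed Green's function against the held-out data, score(lambda) = 1 - MSE(G_test, G_opt) / MSE(G_test, mean(G_test)), averaged over folds. Each row of `score.dat` contains the lambda value, the mean score over folds, and its standard deviation. A quick way to select the lambda with the best mean score (a hypothetical post-processing snippet, not part of the distributed tools):

```python
import numpy as np

# columns of score.dat: lambda, mean score over folds, standard deviation
data = np.atleast_2d(np.loadtxt("input/20samples_leave_one_out/score.dat"))
best = data[np.argmax(data[:, 1])]
print("best lambda = {}, score = {} +/- {}".format(best[0], best[1], best[2]))
```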
111 changes: 111 additions & 0 deletions tool/calc_cv_score.py
@@ -0,0 +1,111 @@
import argparse
import glob
import os
import sys

import numpy as np
import toml
from sklearn.metrics import mean_squared_error


def read_Gtau_test(path_to_file):
    """Read an imaginary-time Green's function file: 5 header lines, then tau Re Im."""
    file_name = os.path.join(path_to_file)
    with open(file_name, "r") as fr:
        lines = fr.readlines()
    header = lines[:5]
    num = len(lines[5:])
    g_tau = np.zeros(num, dtype=np.complex128)
    tau = np.zeros(num)
    for idx, line in enumerate(lines[5:]):
        values = line.split()
        tau[idx] = values[0]
        g_tau[idx] = float(values[1]) + 1j * float(values[2])
    return header, tau, g_tau


def read_ytw(path_to_file):
    """Read the reconstructed Green's function from y_tw.dat (columns 3 and 4 are Re, Im)."""
    with open(os.path.join(path_to_file, "y_tw.dat"), "r") as fr:
        lines = fr.readlines()
    num = len(lines)
    g_tau = np.zeros(num, dtype=np.complex128)
    for idx, line in enumerate(lines):
        values = line.split()
        g_tau[idx] = float(values[2]) + 1j * float(values[3])
    return g_tau


def get_cv_type_name(cv_type, nsamples, kfold):
    """Return the CV directory name and the number of folds to iterate over."""
    if cv_type == "Kfold":
        cv_type_name = "{}samples_{}fold".format(nsamples, kfold)
        num_idx = kfold
    elif cv_type == "LOO":
        cv_type_name = "{}samples_leave_one_out".format(nsamples)
        num_idx = nsamples
    else:
        print("Error: cv_type {} is incorrect.".format(cv_type))
        sys.exit(1)
    return cv_type_name, num_idx


parser = argparse.ArgumentParser(
    description="Calculating score functions", add_help=True
)
parser.add_argument(
    "-i", dest="input_file_toml", default="input.toml", type=str, help="toml file",
)
args = parser.parse_args()
config = toml.load(args.input_file_toml)

input_parent_dir = os.path.join(os.getcwd(), config["file"]["work_dir"])
cv_type_name, num_idx = get_cv_type_name(
    config["cond"]["cv_type"], config["cond"]["nsamples"], config["param"].get("k_fold", 1)
)

path_to_fold = os.path.join(input_parent_dir, cv_type_name)
# Collect the lambda values from the output directories of fold 0
# (directory names look like "lambda_<value>").
lambda_list = [
    os.path.basename(p)[7:]
    for p in glob.glob(os.path.join(path_to_fold, str(0), "output", "lambda", "*"))
]
if lambda_list == []:
    print(
        "Error: output directory does not exist in {}.".format(
            os.path.join(path_to_fold, "0")
        )
    )
    sys.exit(1)
score_list = np.zeros((len(lambda_list), 3))

with open(os.path.join(path_to_fold, "score.dat"), "w") as fw:
    for l_idx, dlambda in enumerate(lambda_list):
        print(" lambda: {}/{}".format(l_idx + 1, len(lambda_list)))
        lambda_dir = "lambda_{}".format(dlambda)
        score_list_lambda = np.zeros(num_idx)
        for idx in range(num_idx):
            print(" fold: {}/{}".format(idx + 1, num_idx))
            path_to_test = os.path.join(path_to_fold, str(idx), "G_test.in")
            header, tau, g_tau_test = read_Gtau_test(path_to_test)
            path_to_train = os.path.join(path_to_fold, str(idx), "G_train.in")
            # The training data is read for reference; only the test data
            # enters the score below.
            header, tau, g_tau_train = read_Gtau_test(path_to_train)
            path_to_lambda = os.path.join(
                path_to_fold, str(idx), "output", "lambda", lambda_dir
            )
            # Flip the sign of y_tw.dat to match the convention of G_test.in.
            g_train_opt = -read_ytw(path_to_lambda)
            # Coefficient of determination (R^2) of the reconstruction
            # against the held-out data.
            score_list_lambda[idx] = 1.0 - mean_squared_error(
                g_tau_test.real, g_train_opt.real
            ) / mean_squared_error(
                g_tau_test.real, np.full(g_tau_test.real.shape, g_tau_test.mean().real)
            )
        score_list[l_idx][0] = dlambda
        score_list[l_idx][1] = np.mean(score_list_lambda)
        score_list[l_idx][2] = np.std(score_list_lambda)
        fw.write(
            "{} {} {}\n".format(
                score_list[l_idx][0], score_list[l_idx][1], score_list[l_idx][2]
            )
        )
        fw.flush()
tool/wrapper_cv_Gtau.py (diff not loaded)
