From c370c7b8a72d28a5cb68716fb23c0e25dfe1b2a7 Mon Sep 17 00:00:00 2001
From: Narges Rezaie
Date: Mon, 18 Dec 2023 01:57:09 -0800
Subject: [PATCH] add save train as h5

---
 Topyfic/train.py          | 52 ++++++++++++++++++++++++------
 Topyfic/utilsMakeModel.py | 67 +++++++++++++++++++++++++++++++--------
 2 files changed, 96 insertions(+), 23 deletions(-)

diff --git a/Topyfic/train.py b/Topyfic/train.py
index 95e00c4..405ef4c 100644
--- a/Topyfic/train.py
+++ b/Topyfic/train.py
@@ -10,6 +10,7 @@ from itertools import repeat
 import pickle
 
 from sklearn.decomposition import LatentDirichletAllocation
+import h5py
 
 from Topyfic.topModel import TopModel
 
@@ -195,20 +196,51 @@ def make_LDA_models_attributes(self):
 
         return all_components, all_exp_dirichlet_component, all_others
 
-    def save_train(self, name=None, save_path=""):
+    def save_train(self, name=None, save_path="", file_format='pickle'):
         """
-        save Train class as a pickle file
+        save Train class as a pickle file
 
-        :param name: name of the pickle file (default is train_Train.name)
-        :type name: str
-        :param save_path: directory you want to use to save pickle file (default is saving near script)
-        :type save_path: str
+        :param name: name of the pickle file (default is train_Train.name)
+        :type name: str
+        :param save_path: directory you want to use to save pickle file (default is saving near script)
+        :type save_path: str
         """
+        if file_format not in ['pickle', 'HDF5']:
+            sys.exit(f"{file_format} is not correct! It should be 'pickle' or 'HDF5'.")
+
         if name is None:
             name = f"train_{self.name}"
 
-        print(f"Saving train class as {name}.p")
+        if file_format == "pickle":
+            print(f"Saving train as {name}.p")
+
+            picklefile = open(f"{save_path}{name}.p", "wb")
+            pickle.dump(self, picklefile)
+            picklefile.close()
+
+        if file_format == "HDF5":
+            print(f"Saving train as {name}.h5")
+
+            f = h5py.File(f"{name}.h5", "w")
+
+            # models
+            models = f.create_group("models")
+            for i in range(len(self.top_models)):
+                model = models.create_group(str(i))
+
+                self.top_models[i].model = self.top_models[i].rLDA
+
+                model['components_'] = self.top_models[i].model.components_
+                model['exp_dirichlet_component_'] = self.top_models[i].model.exp_dirichlet_component_
+                model['n_batch_iter_'] = np.int_(self.top_models[i].model.n_batch_iter_)
+                model['n_features_in_'] = self.top_models[i].model.n_features_in_
+                model['n_iter_'] = np.int_(self.top_models[i].model.n_iter_)
+                model['bound_'] = np.float_(self.top_models[i].model.bound_)
+                model['doc_topic_prior_'] = np.float_(self.top_models[i].model.doc_topic_prior_)
+                model['topic_word_prior_'] = np.float_(self.top_models[i].model.topic_word_prior_)
+
+            f['name'] = np.string_(self.name)
+            f['k'] = np.int_(self.k)
+            f['n_runs'] = np.int_(self.n_runs)
+            f['random_state_range'] = np.array(list(self.random_state_range))
 
-        picklefile = open(f"{save_path}{name}.p", "wb")
-        pickle.dump(self, picklefile)
-        picklefile.close()
+        f.close()
diff --git a/Topyfic/utilsMakeModel.py b/Topyfic/utilsMakeModel.py
index f25f5f2..da33d4c 100644
--- a/Topyfic/utilsMakeModel.py
+++ b/Topyfic/utilsMakeModel.py
@@ -495,10 +495,51 @@ def read_train(file):
     :rtype: Train class
     """
     if not os.path.isfile(file):
-        raise ValueError('Train object not found at given path!')
+        raise ValueError('Train file not found at given path!')
+    if not file.endswith('.p') and not file.endswith('.h5'):
+        raise ValueError('Train file type is not correct!')
 
-    picklefile = open(file, 'rb')
-    train = pickle.load(picklefile)
+    if file.endswith('.p'):
+        picklefile = open(file, 'rb')
+        train = pickle.load(picklefile)
+
+    if file.endswith('.h5'):
+        f = h5py.File(file, 'r')
+
+        name = np.string_(f['name']).decode('ascii')
+        k = np.int_(f['k'])
+        n_runs = np.int_(f['n_runs'])
+        random_state_range = list(f['random_state_range'])
+
+        # models
+        top_models = []
+        for random_state in random_state_range:
+            components = pd.DataFrame(np.array(f[f"models/{random_state}/components_"]))
+            exp_dirichlet_component = pd.DataFrame(np.array(f[f"models/{random_state}/exp_dirichlet_component_"]))
+
+            others = pd.DataFrame()
+            others.loc[0, 'n_batch_iter'] = np.int_(f[f"models/{random_state}/n_batch_iter_"])
+            others.loc[0, 'n_features_in'] = np.array(f[f"models/{random_state}/n_features_in_"])
+            others.loc[0, 'n_iter'] = np.int_(f[f"models/{random_state}/n_iter_"])
+            others.loc[0, 'bound'] = np.float_(f[f"models/{random_state}/bound_"])
+            others.loc[0, 'doc_topic_prior'] = np.array(f[f"models/{random_state}/doc_topic_prior_"])
+            others.loc[0, 'topic_word_prior'] = np.array(f[f"models/{random_state}/topic_word_prior_"])
+
+            model = initialize_lda_model(components, exp_dirichlet_component, others)
+
+            top_model = TopModel(name=f"{name}_{random_state}",
+                                 N=k,
+                                 gene_weights=components,
+                                 model=model)
+            top_models.append(top_model)
+
+        train = Train(name=name,
+                      k=k,
+                      n_runs=n_runs,
+                      random_state_range=random_state_range)
+        train.top_models = top_models
+
+        f.close()
 
     print(f"Reading Train done!")
     return train
@@ -553,11 +594,11 @@ def read_topModel(file):
             gene_weights.index = gene_information.index.tolist()
             gene_weights.columns = topic_information.index.tolist()
 
-            topic = Topyfic.Topic(topic_id=topic_id,
-                                  topic_name=topic_name,
-                                  topic_gene_weights=gene_weights,
-                                  gene_information=gene_information,
-                                  topic_information=topic_information)
+            topic = Topic(topic_id=topic_id,
+                          topic_name=topic_name,
+                          topic_gene_weights=gene_weights,
+                          gene_information=gene_information,
+                          topic_information=topic_information)
             topics[topic_id] = topic
 
     # model
@@ -572,12 +613,12 @@
     others.loc[0, 'doc_topic_prior'] = np.array(f['model']['doc_topic_prior_'])
     others.loc[0, 'topic_word_prior'] = np.array(f['model']['topic_word_prior_'])
 
-    model = Topyfic.initialize_lda_model(components, exp_dirichlet_component, others)
+    model = initialize_lda_model(components, exp_dirichlet_component, others)
 
-    top_model = Topyfic.TopModel(name=name,
-                                 N=N,
-                                 topics=topics,
-                                 model=model)
+    top_model = TopModel(name=name,
+                         N=N,
+                         topics=topics,
+                         model=model)
 
     f.close()