From 994554ef4eaacff59e0fb1c4f0b5c183cdfe0093 Mon Sep 17 00:00:00 2001
From: ejhusom <ejlgkvam@online.no>
Date: Wed, 17 Jan 2024 12:09:39 +0100
Subject: [PATCH] Update how cluster names are created

---
 params-fersa.yaml    | 16 ++++++++++++++++
 params-nova9.yaml    | 16 ++++++++++++++++
 params.yaml          |  2 +-
 src/cluster_utils.py |  3 ++-
 src/featurize.py     |  2 ++
 src/postprocess.py   | 45 ++++++++++++++++++++++++++++++++------------
 6 files changed, 70 insertions(+), 14 deletions(-)
 create mode 100644 params-fersa.yaml
 create mode 100644 params-nova9.yaml

diff --git a/params-fersa.yaml b/params-fersa.yaml
new file mode 100644
index 0000000..ff396c1
--- /dev/null
+++ b/params-fersa.yaml
@@ -0,0 +1,16 @@
+featurize:
+  columns: Channel_4_Data
+  convert_timestamp_to_datetime: true
+  dataset: nova10_p8_10hz-20230721121352
+  overlap: 0
+  timestamp_column: timestamp
+  window_size: 30
+postprocess:
+  min_segment_length: 1
+train:
+  annotations_dir: nova10_p8_10hz_annotations
+  fix_predefined_centroids: false
+  learning_method: minibatchkmeans
+  max_iter: 100
+  n_clusters: 7
+  use_predefined_centroids: true
diff --git a/params-nova9.yaml b/params-nova9.yaml
new file mode 100644
index 0000000..14542e9
--- /dev/null
+++ b/params-nova9.yaml
@@ -0,0 +1,16 @@
+featurize:
+  columns: Channel_4_Data
+  convert_timestamp_to_datetime: true
+  dataset: nova9_m4
+  overlap: 0
+  timestamp_column: timestamp
+  window_size: 10
+postprocess:
+  min_segment_length: 1
+train:
+  annotations_dir: nova9_m4_10hz_annotations
+  fix_predefined_centroids: false
+  learning_method: meanshift
+  max_iter: 100
+  n_clusters: 4
+  use_predefined_centroids: true
diff --git a/params.yaml b/params.yaml
index de4fb5f..b132c7b 100644
--- a/params.yaml
+++ b/params.yaml
@@ -4,7 +4,7 @@ featurize:
   dataset: nova10_p8_10hz
   overlap: 0
   timestamp_column: timestamp
-  window_size: 30
+  window_size: 35
 postprocess:
   min_segment_length: 1
 train:
diff --git a/src/cluster_utils.py b/src/cluster_utils.py
index 25c9f9d..e95e8ab 100644
--- a/src/cluster_utils.py
+++ b/src/cluster_utils.py
@@ -420,7 +420,8 @@ def create_event_log(labels, identifier="",
     segments = find_segments(labels)
     event_log = create_event_log_from_segments(segments,
             feature_vector_timestamps)
-    event_log["case"] = identifier
+    event_log["source"] = identifier
+    event_log["case"] = ""
 
     return event_log
 
diff --git a/src/featurize.py b/src/featurize.py
index 51f315e..77d8fe4 100644
--- a/src/featurize.py
+++ b/src/featurize.py
@@ -58,6 +58,8 @@ def featurize(dir_path="", inference=False, inference_df=None):
     with open("params.yaml", "r") as params_file:
         params = yaml.safe_load(params_file)
 
+    print(params)
+    print("=========")
     dataset = params["featurize"]["dataset"]
     columns = params["featurize"]["columns"]
     window_size = params["featurize"]["window_size"]
diff --git a/src/postprocess.py b/src/postprocess.py
index ceeb7b8..3638563 100644
--- a/src/postprocess.py
+++ b/src/postprocess.py
@@ -257,20 +257,39 @@ def generate_cluster_names(model, cluster_centers):
     """
 
     levels = ["lowest", "low", "medium", "high", "highest"]
+    cluster_labels = []
     cluster_names = []
+    cluster_characteristics = []
+    cluster_colors = []
+
     n_clusters = cluster_centers.shape[0]
 
+    # Seed per-cluster label, color, and empty name/characteristics entries
     for i in range(n_clusters):
-        cluster_names.append(f"{i} ({COLORS[i]}): ")
+        cluster_colors.append(COLORS[i])
+        cluster_labels.append(i)
+        cluster_names.append("")
+        cluster_characteristics.append("")
 
     maxs = cluster_centers.argmax(axis=0)
     mins = cluster_centers.argmin(axis=0)
 
     for i in range(len(FEATURE_NAMES)):
-        cluster_names[maxs[i]] += "highest " + FEATURE_NAMES[i] + ", "
-        cluster_names[mins[i]] += "lowest " + FEATURE_NAMES[i] + ", "
-
-    cluster_names = pd.DataFrame(cluster_names, columns=["cluster_name"])
+        # cluster_names[maxs[i]] += "highest " + FEATURE_NAMES[i] + ", "
+        # cluster_names[mins[i]] += "lowest " + FEATURE_NAMES[i] + ", "
+        cluster_characteristics[maxs[i]] += "highest " + FEATURE_NAMES[i] + ", "
+        cluster_characteristics[mins[i]] += "lowest " + FEATURE_NAMES[i] + ", "
+
+    print(cluster_labels)
+    print(cluster_names)
+    print(cluster_characteristics)
+
+    # cluster_names = pd.DataFrame([cluster_labels, cluster_names, cluster_characteristics], columns=["cluster_label", "cluster_name", "cluster_characteristics"])
+    cluster_names = pd.DataFrame({
+        "cluster_label": cluster_labels,
+        "cluster_name": cluster_names,
+        "cluster_characteristics": cluster_characteristics
+        })
 
     return cluster_names
 
@@ -346,12 +365,13 @@ def postprocess(model, cluster_centers, feature_vectors, labels):
     if use_predefined_centroids:
         if len(predefined_centroids_dict) == n_clusters:
             for i, key in enumerate(predefined_centroids_dict):
-                cluster_names["cluster_name"][i] = (
-                    str(cluster_names["cluster_name"][i].split(":")[0])
-                    + ": "
-                    + f" {key}, ".upper()
-                    + str(cluster_names["cluster_name"][i].split(":")[1])
-                )
+                # cluster_names["cluster_name"][i] = (
+                #     str(cluster_names["cluster_name"][i].split(":")[0])
+                #     + ": "
+                #     + f" {key}, ".upper()
+                #     + str(cluster_names["cluster_name"][i].split(":")[1])
+                # )
+                cluster_names["cluster_name"][i] = key.upper()
 
                 if expectations != None:
                     # Add number to expectations
@@ -359,7 +379,8 @@ def postprocess(model, cluster_centers, feature_vectors, labels):
                         if expectation["name"].lower() == key.lower():
                             expectation["label"] = i
 
-    cluster_names.to_csv(OUTPUT_PATH / "cluster_names.csv")
+    cluster_names["source"] = params["featurize"]["dataset"]
+    cluster_names.to_csv(OUTPUT_PATH / "cluster_names.csv", index=False)
 
     if expectations != None:
         event_log_score(event_log, expectations)