From 994554ef4eaacff59e0fb1c4f0b5c183cdfe0093 Mon Sep 17 00:00:00 2001 From: ejhusom Date: Wed, 17 Jan 2024 12:09:39 +0100 Subject: [PATCH] Update how cluster names are created --- params-fersa.yaml | 16 ++++++++++++++++ params-nova9.yaml | 16 ++++++++++++++++ params.yaml | 2 +- src/cluster_utils.py | 3 ++- src/featurize.py | 2 ++ src/postprocess.py | 45 ++++++++++++++++++++++++++++++++------------ 6 files changed, 70 insertions(+), 14 deletions(-) create mode 100644 params-fersa.yaml create mode 100644 params-nova9.yaml diff --git a/params-fersa.yaml b/params-fersa.yaml new file mode 100644 index 0000000..ff396c1 --- /dev/null +++ b/params-fersa.yaml @@ -0,0 +1,16 @@ +featurize: + columns: Channel_4_Data + convert_timestamp_to_datetime: true + dataset: nova10_p8_10hz-20230721121352 + overlap: 0 + timestamp_column: timestamp + window_size: 30 +postprocess: + min_segment_length: 1 +train: + annotations_dir: nova10_p8_10hz_annotations + fix_predefined_centroids: false + learning_method: minibatchkmeans + max_iter: 100 + n_clusters: 7 + use_predefined_centroids: true diff --git a/params-nova9.yaml b/params-nova9.yaml new file mode 100644 index 0000000..14542e9 --- /dev/null +++ b/params-nova9.yaml @@ -0,0 +1,16 @@ +featurize: + columns: Channel_4_Data + convert_timestamp_to_datetime: true + dataset: nova9_m4 + overlap: 0 + timestamp_column: timestamp + window_size: 10 +postprocess: + min_segment_length: 1 +train: + annotations_dir: nova9_m4_10hz_annotations + fix_predefined_centroids: false + learning_method: meanshift + max_iter: 100 + n_clusters: 4 + use_predefined_centroids: true diff --git a/params.yaml b/params.yaml index de4fb5f..b132c7b 100644 --- a/params.yaml +++ b/params.yaml @@ -4,7 +4,7 @@ featurize: dataset: nova10_p8_10hz overlap: 0 timestamp_column: timestamp - window_size: 30 + window_size: 35 postprocess: min_segment_length: 1 train: diff --git a/src/cluster_utils.py b/src/cluster_utils.py index 25c9f9d..e95e8ab 100644 --- a/src/cluster_utils.py +++ b/src/cluster_utils.py @@ -420,7 +420,8 @@ def create_event_log(labels, identifier="", segments = find_segments(labels) event_log = create_event_log_from_segments(segments, feature_vector_timestamps) - event_log["case"] = identifier + event_log["source"] = identifier + event_log["case"] = "" return event_log diff --git a/src/featurize.py b/src/featurize.py index 51f315e..77d8fe4 100644 --- a/src/featurize.py +++ b/src/featurize.py @@ -58,6 +58,8 @@ def featurize(dir_path="", inference=False, inference_df=None): with open("params.yaml", "r") as params_file: params = yaml.safe_load(params_file) + print(params) + print("=========") dataset = params["featurize"]["dataset"] columns = params["featurize"]["columns"] window_size = params["featurize"]["window_size"] diff --git a/src/postprocess.py b/src/postprocess.py index ceeb7b8..3638563 100644 --- a/src/postprocess.py +++ b/src/postprocess.py @@ -257,20 +257,39 @@ def generate_cluster_names(model, cluster_centers): """ levels = ["lowest", "low", "medium", "high", "highest"] + cluster_labels = [] cluster_names = [] + cluster_characteristics = [] + cluster_colors = [] + n_clusters = cluster_centers.shape[0] + # Add index and color for each cluster for i in range(n_clusters): - cluster_names.append(f"{i} ({COLORS[i]}): ") + cluster_colors.append(COLORS[i]) + cluster_labels.append(i) + cluster_names.append("") + cluster_characteristics.append("") maxs = cluster_centers.argmax(axis=0) mins = cluster_centers.argmin(axis=0) for i in range(len(FEATURE_NAMES)): - cluster_names[maxs[i]] += "highest " + FEATURE_NAMES[i] + ", " - cluster_names[mins[i]] += "lowest " + FEATURE_NAMES[i] + ", " - - cluster_names = pd.DataFrame(cluster_names, columns=["cluster_name"]) + # cluster_names[maxs[i]] += "highest " + FEATURE_NAMES[i] + ", " + # cluster_names[mins[i]] += "lowest " + FEATURE_NAMES[i] + ", " + cluster_characteristics[maxs[i]] += "highest " + FEATURE_NAMES[i] + ", " + cluster_characteristics[mins[i]] += "lowest " + FEATURE_NAMES[i] + ", " + + print(cluster_labels) + print(cluster_names) + print(cluster_characteristics) + + # cluster_names = pd.DataFrame([cluster_labels, cluster_names, cluster_characteristics], columns=["cluster_label", "cluster_name", "cluster_characteristics"]) + cluster_names = pd.DataFrame({ + "cluster_label": cluster_labels, + "cluster_name": cluster_names, + "cluster_characteristics": cluster_characteristics + }) return cluster_names @@ -346,12 +365,13 @@ def postprocess(model, cluster_centers, feature_vectors, labels): if use_predefined_centroids: if len(predefined_centroids_dict) == n_clusters: for i, key in enumerate(predefined_centroids_dict): - cluster_names["cluster_name"][i] = ( - str(cluster_names["cluster_name"][i].split(":")[0]) - + ": " - + f" {key}, ".upper() - + str(cluster_names["cluster_name"][i].split(":")[1]) - ) + # cluster_names["cluster_name"][i] = ( + # str(cluster_names["cluster_name"][i].split(":")[0]) + # + ": " + # + f" {key}, ".upper() + # + str(cluster_names["cluster_name"][i].split(":")[1]) + # ) + cluster_names["cluster_name"][i] = key.upper() if expectations != None: # Add number to expectations @@ -359,7 +379,8 @@ def postprocess(model, cluster_centers, feature_vectors, labels): if expectation["name"].lower() == key.lower(): expectation["label"] = i - cluster_names.to_csv(OUTPUT_PATH / "cluster_names.csv") + cluster_names["source"] = params["featurize"]["dataset"] + cluster_names.to_csv(OUTPUT_PATH / "cluster_names.csv", index=False) if expectations != None: event_log_score(event_log, expectations)