Skip to content

Commit

Permalink
Fix input of labels: now you can import a dictionary with the time ser…
Browse files Browse the repository at this point in the history
…ies that you want to label.
  • Loading branch information
protti committed Jun 21, 2024
1 parent 1f46b39 commit 6c594bb
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 57 deletions.
83 changes: 55 additions & 28 deletions FeatTS/featTS_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
from tsfresh import feature_selection
import multiprocessing as mp

import numpy as np

class FeatTS(object):
"""
Expand Down Expand Up @@ -41,7 +41,7 @@ class FeatTS(object):
(default is Greedy).
"""

def __init__(self, n_clusters, n_jobs=4, value_PFA=0.9, max_numb_feat=20,
def __init__(self, n_clusters, n_jobs=1, value_PFA=0.9, max_numb_feat=20,
random_feat=False, threshold_community=0.8, algorithm_community='Greedy') :
"""
initialize FeatTS method
Expand All @@ -55,7 +55,7 @@ def __init__(self, n_clusters, n_jobs=4, value_PFA=0.9, max_numb_feat=20,
self.algorithm_community = {algorithm_community:{}}
self.feats_selected_ = []

def fit(self, X, y=[], train_semi_supervised=0, external_feat: pandas.DataFrame = None):
def fit(self, X, labels=None, external_feat: pd.DataFrame = None):
"""
compute FeatTS on X
Expand All @@ -64,7 +64,7 @@ def fit(self, X, y=[], train_semi_supervised=0, external_feat: pandas.DataFrame
X : array of shape (n_samples, n_timestamps)
Training instances to cluster.
y : array of labels (n_samples)
y : dict of labels {idx:class}
train_perc : percentage of semi-supervision (float)
Expand All @@ -80,35 +80,66 @@ def fit(self, X, y=[], train_semi_supervised=0, external_feat: pandas.DataFrame
if external_feat is not None and X.shape[0] != external_feat.shape[0] :
raise ValueError("The external features should have a feature value for each time series in input")

if y!=[]:
datasetAdapted = {"listOut": util.adaptTimeSeriesUCR(X),'series': pd.Series((str(i) for i in y)),
"listOfClass": list(str(i) for i in y)}
if labels is not None:
datasetAdapted = {"listOut": util.adaptTimeSeriesUCR(X),'labels': labels}

else:
datasetAdapted = {"listOut": util.adaptTimeSeriesUCR(X), 'series': pd.Series(list(str(-100) for i in range(X.shape[0]))),
"listOfClass": list(-100 for i in range(X.shape[0]))}
datasetAdapted = {"listOut": util.adaptTimeSeriesUCR(X)}

self.feats_selected_, features_filtered_direct = self.__features_extraction_selection(datasetAdapted, external_feat, self.value_PFA)


self.feats_selected_, features_filtered_direct = self.__features_extraction_selection(datasetAdapted, train_semi_supervised, external_feat, self.value_PFA)
matrixNsym = self.__community_and_matrix_creation(self.feats_selected_, datasetAdapted, features_filtered_direct)
self.labels_ = self.__cluster(matrixNsym, datasetAdapted)

def __features_extraction_selection(self,datasetAdapted, train_semi_supervised, external_feat, value_PFA):
def __features_extraction_selection(self,datasetAdapted, external_feat, value_PFA):

# Create the dataframe for the extraction of the features
listOut = datasetAdapted["listOut"]
listOfClass = datasetAdapted["listOfClass"]

filtreFeat, seriesAcc, features_filtered_direct = util.extractFeature(listOut, listOfClass, train_semi_supervised, external_feat=external_feat)
features_filtered_direct = util.extractFeature(listOut, external_feat=external_feat)

if external_feat is not None:
external_feat = features_filtered_direct[external_feat.columns.tolist()].copy()
features_filtered_direct.drop(columns=external_feat.columns.tolist(), inplace=True)
# features_filtered_direct.drop(columns=external_feat.columns.tolist(), inplace=True)

pfa = PFA()
features_filtered_direct = util.cleaning(features_filtered_direct)
if train_semi_supervised > 0:

if 'labels' in list(datasetAdapted.keys()):
allAcc = list(datasetAdapted["labels"].keys())
seriesAcc = pd.Series((datasetAdapted["labels"][i] for i in allAcc))
filtreFeat = features_filtered_direct.loc[allAcc].reset_index(drop=True)

multiclass = False
significant_class = 1
if len(seriesAcc.unique()) > 2:
multiclass = True
significant_class = len(seriesAcc.unique())


if 'id' in filtreFeat.keys():
filtreFeat = filtreFeat.drop('id', axis='columns')
elif 'index' in filtreFeat.keys():
filtreFeat = filtreFeat.drop('index', axis='columns')


# Extract the relevance for each features and it will be ordered by importance
ris = feature_selection.relevance.calculate_relevance_table(filtreFeat, seriesAcc, ml_task="classification")
ris = feature_selection.relevance.calculate_relevance_table(filtreFeat, seriesAcc,
ml_task="classification",
n_jobs=self.n_jobs,
multiclass=multiclass,
n_significant=significant_class)
if external_feat is not None:
ris = ris[~ris['feature'].isin(external_feat.columns.tolist())]

if multiclass:
p_value_columns = [col for col in ris.columns if col.startswith('p_value')]
# Replace NaN values with inf in the p_value columns
ris[p_value_columns] = ris[p_value_columns].fillna(np.inf)
# Sum the p_value columns
ris['p_value'] = ris[p_value_columns].sum(axis=1)

ris = ris.sort_values(by='p_value')

if self.random_feat:
Expand All @@ -127,20 +158,18 @@ def __features_extraction_selection(self,datasetAdapted, train_semi_supervised,

if external_feat is not None:
featPFA.extend(external_feat.columns.tolist())
features_filtered_direct = features_filtered_direct.join(external_feat)
# Identify columns in external_feat that are not in features_filtered_direct
non_overlapping_columns = external_feat.columns.difference(features_filtered_direct.columns)
# Select only the non-overlapping columns from external_feat
external_feat_non_overlapping = external_feat[non_overlapping_columns]
# Perform the join with the non-overlapping columns
features_filtered_direct = features_filtered_direct.join(external_feat_non_overlapping)

return featPFA, features_filtered_direct
def __community_and_matrix_creation(self, featPFA, datasetAdapted, features_filtered_direct):

def __community_and_matrix_creation(self, featPFA, datasetAdapted, features_filtered_direct):
listOfId = set(datasetAdapted["listOut"]["id"])
dictOfT = {}
# Create of dataframe where there are the values of the features take in consideration
for value in set(list(datasetAdapted["series"])):
dictSing = {value: pd.Series([0], index=["count"])}
dictOfT.update(dictSing)

dictOfInfoTrain = {}

# Creation of the features that we want to use
listOfFeat = featPFA

Expand All @@ -167,7 +196,6 @@ def collect_result_Train(result):
dictSing = {'list': list(clusterInside), 'weight': dictOfInfoTrain[key]["weightFeat"]}
setCluster.append(dictSing)


# Creation of CoOccurrence Matrix
# print("Matrix Creation...")
matrixNsym = util.getTabNonSym(setCluster, list(listOfId))
Expand All @@ -178,14 +206,13 @@ def __cluster(self, matrixNsym, datasetAdapted):
# List of the cluster created in the training set. It will be used later for the intersaction
# with the cluster extract from the testing.
listOfId = set(datasetAdapted["listOut"]["id"])
series = datasetAdapted["series"]

listOfCommFindTest = util.getCluster(matrixNsym, listOfId, self.n_clusters)

listOfCommFindTest = util.createSet(listOfCommFindTest, self.n_clusters)

# Modify the index of the TimeSeries with their classes
y_pred = [0 for x in range(len(series))]
y_pred = [0 for x in range(len(listOfId))]
for value in range(len(listOfCommFindTest)):
for ind in listOfCommFindTest[value]["cluster"]:
y_pred[ind] = value
Expand Down
30 changes: 6 additions & 24 deletions FeatTS/utilFeatExtr.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,32 +78,14 @@ def getDataframeAcc(appSeries,perc):
allNotAccInd.append(i)
return list(sorted(allAccInd)),list(sorted(allNotAccInd))

def extractFeature(listOut, external_feat=None):
    """Extract and normalize tsfresh features for a long-format time-series frame.

    Parameters
    ----------
    listOut : pandas.DataFrame
        Long-format frame with 'id' and 'time' columns, as expected by
        tsfresh ``extract_features``.
    external_feat : pandas.DataFrame, optional
        Extra per-series feature columns to append (joined on index)
        before normalization.

    Returns
    -------
    pandas.DataFrame
        One row per series id, normalized feature values.
    """
    feats = extract_features(listOut, column_id='id', column_sort='time')
    # Append caller-supplied features so they are normalized together
    # with the extracted ones.
    if external_feat is not None:
        feats = feats.join(external_feat)
    feats = normalization_data(feats)
    return feats


def normalization_data(features_filtered_direct):
Expand Down
30 changes: 26 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,33 @@
import numpy as np
from FeatTS import FeatTS

import random
from collections import defaultdict

def select_random_percent(labels, perc):
    """Randomly sample a fraction of the indices of each class.

    Parameters
    ----------
    labels : sequence
        Class label for each position (index) in the dataset.
    perc : float
        Fraction of each class's indices to keep; at least one index is
        kept per class even when ``len(indices) * perc`` rounds to zero.

    Returns
    -------
    dict
        Mapping of selected index -> its class label.
    """
    # Bucket the positions belonging to every class label.
    by_class = defaultdict(list)
    for pos, cls in enumerate(labels):
        by_class[cls].append(pos)

    # Draw a per-class random sample and flatten it into {index: class}.
    picked = {}
    for cls, positions in by_class.items():
        sample_size = max(1, int(len(positions) * perc))  # keep >= 1 per class
        for pos in random.sample(positions, sample_size):
            picked[pos] = cls
    return picked

if __name__ == '__main__':

    # Load the ArrowHead dataset; drop the singleton channel axis so X is
    # (n_samples, n_timestamps), and coerce string class labels to int.
    raw = load_classification("ArrowHead")
    X = np.squeeze(raw[0], axis=1)
    y = raw[1].astype(int)

    # Semi-supervision: reveal the true class of a random 20% of each class,
    # as a {index: class} dict.
    labels = select_random_percent(y, 0.2)

    featTS = FeatTS(n_clusters=3, n_jobs=4)
    featTS.fit(X, labels=labels)

    # Score the discovered clustering against the ground truth and show
    # which features were selected.
    print(adjusted_mutual_info_score(featTS.labels_, y))
    print(featTS.feats_selected_)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from setuptools import setup, find_packages

__version__ = "0.0.2"
__version__ = "0.0.3"

setup(
name="FeatTS",
Expand Down

0 comments on commit 6c594bb

Please sign in to comment.