-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentropy_of_trajectory.py
179 lines (121 loc) · 4.84 KB
/
entropy_of_trajectory.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 7 17:03:38 2020
@author: lyubo
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
#%matplotlib inline
from sklearn.cluster import KMeans
from scipy.stats import entropy
from math import log, e
#import timeit
def k_mean_from_data(df, k, random_state=None):
    """Cluster all trip endpoints with k-means and label each trip.

    The start and stop coordinates of every trip are pooled into a single
    point cloud (all starts first, then all stops) and clustered together,
    so that a start and a stop at the same place share one cluster label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``latitudestart``, ``longitudestart``,
        ``latitudestop`` and ``longitudestop``.
    k : int
        Number of clusters passed to ``KMeans(n_clusters=k)``.
    random_state : int or None, optional
        Forwarded to ``KMeans`` so that a run can be made reproducible.
        The default ``None`` keeps the previous (unseeded) behaviour.

    Returns
    -------
    Kmean : sklearn.cluster.KMeans
        The fitted estimator. ``Kmean.labels_`` has one entry per endpoint:
        the labels of all trip starts followed by the labels of all stops.
    X_labels : numpy.ndarray of float, shape (n_trips, 2)
        Row ``i`` is ``[start_cluster, stop_cluster]`` for trip ``i``.
    """
    n_trips = len(df.latitudestop.values)

    # Pool the endpoints: starts occupy the first n_trips rows, stops the rest.
    longitudes = np.concatenate((df.longitudestart.values,
                                 df.longitudestop.values))
    latitudes = np.concatenate((df.latitudestart.values,
                                df.latitudestop.values))
    # Column 0 is longitude (x), column 1 is latitude (y).
    points = np.column_stack((longitudes, latitudes)).astype(float)

    Kmean = KMeans(n_clusters=k, random_state=random_state)
    Kmean.fit(points)

    # Split the flat label vector back into per-trip (start, stop) pairs.
    endpoint_labels = Kmean.labels_
    X_labels = np.column_stack((endpoint_labels[:n_trips],
                                endpoint_labels[n_trips:])).astype(float)
    return Kmean, X_labels
# function to count the distinct clusters seen so far (a simple diversity measure, not Shannon entropy)
def simple_entrop_data(Kmean_res, array_size):
    """Count the distinct cluster labels seen after each step.

    Despite the name, this is not an entropy: entry ``i`` of the result is
    the number of different labels among ``Kmean_res.labels_[0:i]``.

    Parameters
    ----------
    Kmean_res : object
        Anything exposing a 1-D ``labels_`` sequence, e.g. a fitted
        ``sklearn.cluster.KMeans``.
    array_size : any
        Unused; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray of float, length ``2 * len(Kmean_res.labels_)``
        Entry 0 is always 0. Entries ``1 .. len(labels_)`` hold the running
        distinct-label count. Entries beyond ``len(labels_)`` repeat the
        final count, because a prefix longer than the data is just the
        whole data.
        NOTE(review): the doubled length looks like it assumes ``labels_``
        has one entry per trip; it is preserved here for compatibility.
    """
    labels = np.asarray(Kmean_res.labels_)
    n_labels = int(labels.shape[0])
    distinct_counts = np.zeros(2 * n_labels)
    if n_labels == 0:
        return distinct_counts

    # A label raises the running count only at its first occurrence, so one
    # cumulative sum replaces calling np.unique on every prefix (O(n^2)).
    _, first_seen = np.unique(labels, return_index=True)
    is_new = np.zeros(n_labels)
    is_new[first_seen] = 1
    running = np.cumsum(is_new)

    distinct_counts[1:n_labels + 1] = running
    distinct_counts[n_labels + 1:] = running[-1]
    return distinct_counts
# function to estimate shannon entropy
def entropy_shannon(labels, base=None):
    """Return the Shannon entropy of the label distribution.

    Parameters
    ----------
    labels : array-like
        Labels of a set of points; each distinct value is one class.
    base : float or None, optional
        Logarithm base forwarded to ``scipy.stats.entropy``.

    Returns
    -------
    float
        Entropy of the empirical frequencies of the distinct labels.
    """
    # Only the per-class frequencies matter; scipy normalises them itself.
    frequencies = np.unique(labels, return_counts=True)[1]
    return entropy(frequencies, base=base)
# different way of calculation of entropy
def entropy2(labels, base=None):
    """Compute the entropy of a label distribution by direct summation.

    Parameters
    ----------
    labels : sequence
        Labels of a set of points; each distinct value is one class.
    base : float or None, optional
        Logarithm base; ``None`` selects the natural logarithm.

    Returns
    -------
    number
        ``0`` when there is at most one label or a single class, otherwise
        ``-sum(p * log(p, base))`` over the class probabilities.
    """
    total = len(labels)
    # Zero or one sample carries no uncertainty.
    if total <= 1:
        return 0

    _, occurrences = np.unique(labels, return_counts=True)
    probabilities = occurrences / total
    # A single class likewise has zero entropy.
    if np.count_nonzero(probabilities) <= 1:
        return 0

    if base is None:
        base = e

    result = 0.
    for p in probabilities:
        result = result - p * log(p, base)
    return result
########################################
# Main programs body
########################################
# load data on total trips, it may be heavy
# Input CSV locations; only ``filepath_full`` is read below.
filepath_before = 'C:/Users/lyubo/Documents/DATA_networks/mobilitydata/cityBrain/my_trips.csv'
filepath_full = 'C:/Users/lyubo/Documents/DATA_networks/mobilitydata/cityBrain/trips_updated.csv'
df_full = pd.read_csv(filepath_full)
print(df_full.shape)
print(df_full.columns)
df_full.head(5)  # bare expression: shows output only in an interactive session

#########################################
# run clustering code functions on data
k = 30
K_mean_df, X_labels = k_mean_from_data(df_full, k)
print(X_labels)

# run entropy code on data
print('plotting Shannon entropy as a function of time')
k = 50  # number of clusters
N_data_points = df_full.shape[0]  # number of trips (rows)
Kmean_res, X_labels = k_mean_from_data(df_full, k)

# One entry per trip endpoint (two per trip): entropy of all labels seen
# before that step.
shannon_entrop_array = np.zeros(2*N_data_points)
print('shape of labels ', np.shape(Kmean_res.labels_[:]))
# Start at 10, so entries 0..9 stay at their initial value of zero.
for time in range(10, 2*N_data_points):
    shannon_entrop_array[time] = entropy_shannon(Kmean_res.labels_[0:time])

# plot the entropy curve
plt.plot(shannon_entrop_array)
# The plotted quantity is the Shannon entropy (natural log, since no base
# is passed to entropy_shannon), not a count of unique locations.
plt.ylabel('Shannon entropy (nats)')
plt.xlabel('rescaled time - trips ')  # trips
plt.title('shannon entropy for one user')
plt.show()