# nifl_helper.py
## Helper functions:
## Load in flowlines from CSV
## Identify evenly-spaced points along them
## Pull a given dataset at these points
## Export a point time series to CSV
from math import sqrt
from scipy import interpolate
import collections
import datetime
import numpy as np
import statsmodels.api as sm
import pandas as pd
import iceutils as ice
from iceutils import tseries, timeutils
def Flowline_CSV(filename, has_width=False, flip_order=False):
"""Read in glacier flowlines in CSV format.
Based on automatically-selected flowline format produced by EHU.
Parameters
----------
filename : string
Name/path of CSV.
has_width : Boolean, optional
Whether the flowline has glacier width stored. The default is False.
flip_order : Boolean, optional
Whether lines need flipping to run from terminus upstream. The default is False.
Returns
-------
centrelines_list : list
list of flowlines
"""
    with open(filename, 'r') as f:
        header = f.readline()  # header line
        lines = f.readlines()
    hdr = header.strip('\r\n')
    keys = hdr.split(',')  # get names of variables
    data = {k: [] for k in keys}
    data['Length_ID'] = collections.OrderedDict()  # counts how many points (i.e. lines of file) are in each flowline; must be ordered for later iteration!
    data['Lineslist'] = []
temp = []
j = 0
for i,l in enumerate(lines):
linstrip = l.strip('\r\n')
parts = linstrip.split(',')
x_coord = float(parts[1])
y_coord = float(parts[2])
if parts[0] not in data['Length_ID'].keys(): #finding out where lines separate
temp = []
data['Lineslist'].append(temp) #initialize new empty array that can be modified in-place later
data['Length_ID'][parts[0]] = 1
j+=1
else:
data['Length_ID'][parts[0]] += 1
if has_width:
width = float(parts[3])
temp.append((x_coord, y_coord, width))
else:
temp.append((x_coord, y_coord))
data['Lineslist'][j-1] = np.array(temp) #need to modify an existing array rather than append to keep correct indexing
nlines = len(data['Length_ID'].keys())
if flip_order:
centrelines_list = [np.array(data['Lineslist'][j])[::-1] for j in range(nlines)] #making arrays, reversed to start at terminus rather than peak
else:
centrelines_list = [np.array(data['Lineslist'][j]) for j in range(nlines)] # arrays already start at terminus
return centrelines_list
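## Example usage (a minimal sketch; 'flowlines.csv' is a hypothetical path to a
## file in the EHU flowline format read above):
# lines = Flowline_CSV('flowlines.csv', has_width=True, flip_order=False)
# xy0 = lines[0][:, :2]  # (x, y) coordinates of the first flowline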
def ds(num, flarr):
"""Find path length between adjacent entries of a 2D array.
Parameters
----------
num : int
Index of first array element.
flarr : array
The array containing 2D coordinate positions to examine.
Returns
-------
ds : float
Cartesian distance between flarr[num] and flarr[num-1].
"""
if num==0:
return 0
else:
return sqrt((flarr[num,0]-flarr[num-1,0])**2 + (flarr[num,1]-flarr[num-1,1])**2)
def ArcArray(flowline):
"""Compute arc length along a flowline of coordinates
Parameters
----------
flowline : array
2D array of coordinates (xi, yi).
Returns
-------
arcarr: array
Corresponding arc length values at each point
"""
arcarr = []
xs = range(len(flowline))
for n in xs:
if n==0:
arcarr.append(0)
else:
arcarr.append(arcarr[-1]+ds(n, flowline))
return arcarr
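## Example (a sketch): arc length along a straight three-point line.
# line = np.array([[0., 0.], [3., 4.], [6., 8.]])
# ArcArray(line)  # -> [0, 5.0, 10.0]; each segment has length 5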
## Set up design matrix and perform lasso regression, according to Bryan's documentation
def build_collection(dates):
"""
Function that creates a list of basis functions for a given datetime vector dates.
"""
# Get date bounds
tstart, tend = dates[0], dates[-1]
    # Initialize a collection and relevant basis functions
collection = ice.tseries.timefn.TimefnCollection()
bspl = ice.tseries.timefn.fnmap['bspline']
ispl = ice.tseries.timefn.fnmap['isplineset']
poly = ice.tseries.timefn.fnmap['poly']
# Add polynomial first for secular components
collection.append(poly(tref=tstart, order=1, units='years'))
# Use B-splines for seasonal (short-term) signals
Δtdec = 1.0 / 3.0 # years
Δt = datetime.timedelta(days=int(Δtdec*365))
t_current = tstart
while t_current <= tend:
collection.append(bspl(order=3, scale=Δtdec, units='years', tref=t_current))
t_current += Δt
    # Integrated B-splines for transient signals
    # In general, we don't know the timescales of transients a priori.
    # Therefore, we add integrated B-splines of various timescales, where the
    # timescale is controlled by the 'nspl' value (i.e. the time vector is
    # divided into 'nspl' equally spaced spline center locations)
for nspl in [128, 64, 32, 16, 8, 4]:
collection.append(ispl(order=3, num=nspl, units='years', tmin=tstart, tmax=tend))
# Done
return collection
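## Example usage (a minimal sketch with hypothetical monthly acquisition dates):
# dates = [datetime.datetime(2015, 1, 1) + datetime.timedelta(days=30 * k)
#          for k in range(72)]
# collection = build_collection(dates)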
# Build a priori covariance matrix, mainly for repeating B-splines
def computeCm(collection, b_spl_sigma=0.5):
    """Build an a priori model covariance matrix, mainly to correlate repeating B-splines.

    Parameters
    ----------
    collection : ice.tseries.timefn.TimefnCollection
        Basis functions, e.g. as produced by build_collection.
    b_spl_sigma : float, optional
        Prior sigma for the seasonal B-spline components. The default is 0.5.

    Returns
    -------
    Cm : ndarray
        Block-diagonal prior covariance for the secular, seasonal, and
        transient model partitions.
    """
    from scipy.linalg import block_diag
# Define some prior sigmas (large prior sigmas for secular and transient)
sigma_secular = 100.0
sigma_bspl = b_spl_sigma
sigma_ispl = 100.0
# Get lengths of different model partitions
fnParts = ice.tseries.timefn.getFunctionTypes(collection)
nBspl = len(fnParts['seasonal'])
nIspl = len(fnParts['transient'])
nSecular = len(fnParts['secular'])
# Diagonal prior for secular components
C_sec = sigma_secular**2 * np.eye(nSecular)
# Get decimal times of B-spline centers
tdec = []
for basis in collection.data:
if basis.fnname == 'BSpline':
tdec.append(ice.datestr2tdec(pydtime=basis.tref))
tdec = np.array(tdec)
# Correlation with splines w/ similar knot times
# Decaying covariance w/ time
tau = 1.0
C_bspl = np.zeros((nBspl, nBspl))
for i in range(nBspl):
weight_decay = np.exp(-1.0 * np.abs(tdec - tdec[i]) / tau)
ind = np.zeros((nBspl,), dtype=bool)
        ind[i::5] = True   # flag every 5th spline forward from i
        ind[i::-5] = True  # and every 5th spline backward from i
C_bspl[i,ind] = weight_decay[ind]
C_bspl *= sigma_bspl**2
# Diagonal prior for integrated B-splines
C_ispl = sigma_ispl**2 * np.eye(nIspl)
return block_diag(C_sec, C_bspl, C_ispl)
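## Example usage (continuing the sketch above):
# Cm = computeCm(collection, b_spl_sigma=0.5)
# Cm.shape  # (n_basis, n_basis): one row/column per basis function in `collection`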
def VSeriesAtPoint(pt, vel_stack, collection, model, model_pred, solver, t_grid,
sigma=1.5, data_key='igram'):
"""
Invert for a continuous 1D velocity series at a given point
Parameters
----------
pt : tuple
Position (x,y) at which to pull series.
vel_stack : ice.MagStack
Stack of 2D velocity fields including this point.
collection : ice.tseries TimefnCollection
Basis functions for the tseries dates in vel_stack
model : ice.tseries.model.Model
Model instance for inversion
model_pred : ice.tseries.model.Model
Model instance for prediction - evenly spaced dates
solver : ice.tseries solver
Instance of solver that will do the inversion. LassoRegression has worked well.
t_grid : ndarray
Evenly spaced decimal times at which to sample spline-fit velocity
sigma : float, optional
B-spline sigma for a priori covariance at this point. The default is 1.5.
data_key : str, optional
Convention for accessing HDF5 data. The default is 'igram' (per B. Riel).
Returns
-------
    pred : dict
        Model prediction for this point. Keys are 'full', 'seasonal', 'transient',
        'secular', 'step'.
    series_short_term : ndarray
        Observed series with long-term (secular + transient) signals removed.
    series_long_term : ndarray
        Observed series with short-term (seasonal) signals removed.
"""
series = vel_stack.timeseries(xy=pt, key=data_key)
# Construct a priori covariance
Cm = computeCm(collection, b_spl_sigma=sigma)
solver.regMat = np.linalg.inv(Cm)
    # Perform inversion to get coefficient vector and coefficient covariance matrix
    status, m, Cm_post = solver.invert(model.G, series)
    assert status == ice.SUCCESS, 'Failed inversion'
# Model will perform predictions
pred = model_pred.predict(m)
# Separate out seasonal (short-term) and secular + transient (long-term) signals
short_term = pred['seasonal']
long_term = pred['secular'] + pred['transient']
# Remove long-term signals from data
series_short_term = series - np.interp(vel_stack.tdec, t_grid, long_term)
# Remove short-term signals from data
series_long_term = series - np.interp(vel_stack.tdec, t_grid, short_term)
return pred, series_short_term, series_long_term
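## Example usage (a sketch; `hel_stack`, `collection`, `model`, `model_pred`,
## `solver`, and `t_grid` are hypothetical objects set up with iceutils per
## B. Riel's time-series workflow):
# pred, st, lt = VSeriesAtPoint((308103., -2577200.), vel_stack=hel_stack,
#                               collection=collection, model=model,
#                               model_pred=model_pred, solver=solver,
#                               t_grid=t_grid, sigma=1.5, data_key='igram')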
def Xcorr1D(pt, series_func, series_dates, velocity_pred, t_grid, t_limits, diff=1, normalize=True, pos_only=False):
"""
Compute cross-correlation on coincident series of a 1D time series
(e.g. catchment-integrated runoff or SMB) versus velocity at a point.
Parameters
----------
pt : tuple
Position (x,y) at which to pull velocity series.
series_func : interpolate.interp1d
1D-interpolated function with values of data series over time.
series_dates : list
Decimal dates of data points
    velocity_pred : dict
        Output of iceutils prediction, e.g. `pred` from VSeriesAtPoint.
t_grid : ndarray
Evenly spaced decimal times at which spline-fit velocity is sampled
t_limits : tuple
Start and end dates (decimal) of the time period to study
diff : int, optional
Number of discrete differences to apply to data. Default is 1.
Setting diff=0 will process the input data as-is.
normalize : bool, optional
Whether to normalize for a cross-correlation in [-1,1]. Default is True.
This makes the output inter-comparable with normalized output for other
variables. If set to False, the signal amplitude will be larger but
the correlation values may exceed 1.
pos_only : bool, optional
Whether to analyse only xcorrs with positive lag values. Default is False.
This allows a bidirectional causal relationship. For a causal relationship
hypothesised to be single-directional, choose True to display only positive
lag values.
Returns
-------
    corr : array
        Cross-correlation coefficients between the input series and velocity.
    lags : array
        Time lag (in days) for each correlation value.
    ci : array
        95% confidence interval for each lag.
"""
t_min = max(min(series_dates), t_limits[0])
t_max = min(max(series_dates), t_limits[1])
coincident_dates = np.asarray([t for t in t_grid if (t>=t_min and t<t_max)])
coincident_series = series_func(coincident_dates) # sample at same dates as velocity series
series_diff = np.diff(coincident_series, n=diff)
vel_series_0 = velocity_pred['full'][np.where(t_grid>=t_min)]
vel_series = vel_series_0[np.where(t_grid[np.where(t_grid>=t_min)]<t_max)] # trim dates to match t_limits
vel_diff = np.diff(vel_series, n=diff)
if normalize:
series_diff = (series_diff-np.mean(series_diff)) / (np.std(series_diff)*len(series_diff))
vel_diff = (vel_diff-np.mean(vel_diff)) / (np.std(vel_diff))
corr = np.correlate(series_diff, vel_diff, mode='full')
lags = range(int(-0.5*len(corr)), int(0.5*len(corr)+1))
ci = [1.96/np.sqrt(len(coincident_series)-abs(k)) for k in lags]
## convert lags to physical units
lags = np.mean(np.diff(t_grid))*365.26*np.asarray(lags)
if pos_only:
corr = corr[np.argwhere(lags>=0)].squeeze()
ci = np.asarray(ci)[np.argwhere(lags>=0)].squeeze()
lags = lags[np.argwhere(lags>=0)].squeeze()
return corr, lags, ci
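## Example usage (a sketch; `runoff_func` is a hypothetical interp1d of runoff
## over decimal time, `runoff_dates` its dates, and `pred` the output of
## VSeriesAtPoint above):
# corr, lags, ci = Xcorr1D(pt=(308103., -2577200.), series_func=runoff_func,
#                          series_dates=runoff_dates, velocity_pred=pred,
#                          t_grid=t_grid, t_limits=(2009.0, 2017.0),
#                          diff=1, normalize=True, pos_only=True)
# peak_lag = lags[np.argmax(corr)]  # lag (days) of strongest correlation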
def Arima_ResidFunc(series, normalize=True, diff=1):
"""
Fit an ARIMA model of appropriate order to the input series, compute the
residuals we will use for the cross-correlation with velocity.
Parameters
----------
series : pd.Series
The "input variable" (SMB, runoff, terminus position...) expressed as a
one-dimensional time series. The timestamp of each data point
should be the "index" of the pandas Series.
normalize : bool, optional
Whether to normalize series before fitting ARIMA. Default is True.
        Normalized residuals will make the output of Xcorr1D inter-comparable
with normalized output for other variables. If set to False, the signal
amplitude will be larger but the correlation values may exceed 1.
diff : int, optional
Number of discrete differences to apply to data. Default is 1.
        We do this to home in on the effect of our variables on each other,
rather than a mutual response (background trend) to a shared
un-modelled factor.
Returns
-------
mod_fit: statsmodels.tsa.arima.model.ARIMAResultsWrapper
The ARIMA model fit to the series, which we will use in filtering
the velocity data for cross-correlation.
(Ref: https://online.stat.psu.edu/stat510/lesson/9/9.1)
resid_func : interpolate.interp1d
1D-interpolated function with values of residuals (series-ARIMA) over time.
We will use this to sample the residuals at the same time intervals
as the velocity, which is necessary for the cross-correlation.
    series_resid : pd.Series
        The residuals themselves, indexed by timestamp.
"""
if normalize:
series_normed = (series-np.mean(series)) / (np.std(series)*len(series))
else:
series_normed = series
## Prewhitening data: fit ARIMA model to series data
d = diff # order of differencing to apply in ARIMA model
## partial autocorrelation function helps us choose the AR term (p) in ARIMA
lag_pacf = sm.tsa.stattools.pacf(series_normed, nlags=20, method='ols')
upper_confidence = 1.96/np.sqrt(len(series_normed))
p_candidates = np.argwhere(lag_pacf > upper_confidence).squeeze()
    p = p_candidates[p_candidates > 0][0]  # choose first nonzero lag that exceeds the upper CI
## autocorrelation function helps us choose the MA term (q) in ARIMA
lag_acf = sm.tsa.stattools.acf(series_normed, nlags=20)
q_candidates = np.argwhere(lag_acf > upper_confidence).squeeze()
    q = q_candidates[q_candidates > 0][0]  # choose first nonzero lag that exceeds the upper CI
print('Fitting ARIMA of order ({},{},{})'.format(p,d,q))
mod = sm.tsa.arima.ARIMA(series_normed, order=(p,d,q))
mod_fit = mod.fit()
series_resid = pd.DataFrame(mod_fit.resid).squeeze()
resid_d = [d.utctimetuple() for d in series_resid.index]
resid_d_interp = [ice.timeutils.datestr2tdec(d[0], d[1], d[2]) for d in resid_d]
resid_func = interpolate.interp1d(resid_d_interp, series_resid)
return mod_fit, resid_func, series_resid
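## Example usage (a sketch; `smb_series` is a hypothetical pd.Series of SMB
## values indexed by datetime):
# mod_fit, resid_func, series_resid = Arima_ResidFunc(smb_series, normalize=True, diff=1)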
def Xcorr1D_prewhitened(resid_func, series_dates, velocity_pred, t_grid, t_limits,
mod_fit, normalize=True, pos_only=False):
"""
Compute cross-correlation on coincident series of a 1D time series
(e.g. catchment-integrated runoff or SMB) versus velocity at a point.
    Note that unlike Xcorr1D, we do not include the option to difference the
velocity time series here. Rather, we apply a convolution filter defined
by the parameters of the ARIMA model mod_fit.
Parameters
----------
    resid_func : interpolate.interp1d
        1D-interpolated function with values of the ARIMA residuals over time,
        as returned by Arima_ResidFunc. We assume that this is already normalized.
series_dates : list
Decimal dates of data points
    velocity_pred : dict
        Output of iceutils prediction, e.g. `pred` from VSeriesAtPoint.
t_grid : ndarray
Evenly spaced decimal times at which spline-fit velocity is sampled
t_limits : tuple
Start and end dates (decimal) of the time period to study
mod_fit : statsmodels.tsa.arima.model.ARIMAResultsWrapper
An ARIMA model fit to the input variable, which we will use in filtering
the velocity data for cross-correlation.
(Ref: https://online.stat.psu.edu/stat510/lesson/9/9.1)
normalize : bool, optional
Whether to normalize for a cross-correlation in [-1,1]. Default is True.
This makes the output inter-comparable with normalized output for other
variables. If set to False, the signal amplitude will be larger but
the correlation values may exceed 1.
pos_only : bool, optional
Whether to analyse only xcorrs with positive lag values. Default is False.
This allows a bidirectional causal relationship. For a causal relationship
hypothesised to be single-directional, choose True to display only positive
lag values.
Returns
-------
    corr : array
        Cross-correlation coefficients between the prewhitened input series
        and filtered velocity.
    lags : array
        Time lag (in days) for each correlation value.
    ci : array
        95% confidence interval for each lag.
"""
t_min = max(min(series_dates), t_limits[0])
t_max = min(max(series_dates), t_limits[1])
coincident_dates = np.asarray([t for t in t_grid if (t>=t_min and t<t_max)])
coincident_resid = resid_func(coincident_dates) # sample at same dates as velocity series
vel_series_0 = velocity_pred['full'][np.where(t_grid>=t_min)]
vel_series = vel_series_0[np.where(t_grid[np.where(t_grid>=t_min)]<t_max)] # trim dates to match t_limits
if normalize:
vel_series = (vel_series-np.mean(vel_series)) / (np.std(vel_series))
v_filt = sm.tsa.filters.convolution_filter(vel_series, filt=mod_fit.params)[1:-1] #trim nans from ends
# TODO: confirm whether this whitens correctly
corr = np.correlate(coincident_resid[1:-1], v_filt, mode='full')
lags = range(int(-0.5*len(corr)), int(0.5*len(corr)+1))
ci = [1.96/np.sqrt(len(coincident_resid)-abs(k)) for k in lags]
## convert lags to physical units
lags = np.mean(np.diff(t_grid))*365.26*np.asarray(lags)
if pos_only:
corr = corr[np.argwhere(lags>=0)].squeeze()
ci = np.asarray(ci)[np.argwhere(lags>=0)].squeeze()
lags = lags[np.argwhere(lags>=0)].squeeze()
return corr, lags, ci
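## Example usage (a sketch, continuing the hypothetical names above):
# resid_dates = resid_func.x  # decimal dates underlying the residual interpolant
# corr, lags, ci = Xcorr1D_prewhitened(resid_func, series_dates=resid_dates,
#                                      velocity_pred=pred, t_grid=t_grid,
#                                      t_limits=(2009.0, 2017.0), mod_fit=mod_fit,
#                                      normalize=True, pos_only=True)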