Skip to content

Commit

Permalink
Merge pull request #64 from Dianeod/dianedev
Browse files Browse the repository at this point in the history
Dianedev
  • Loading branch information
awilson-kx authored Jan 6, 2020
2 parents 9f653e8 + 4d51674 commit fab4c32
Show file tree
Hide file tree
Showing 9 changed files with 76 additions and 35 deletions.
2 changes: 1 addition & 1 deletion fresh/extract.q
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ fresh.createfeatures:{[data;aggs;cnames;conf]
p0:exec f from conf where valid,pnum=0;
p1:exec f,pnames,pvals from conf where valid,pnum>0;
calcs:p0,raze p1[`f]cross'p1[`pnames],'/:'(cross/)each p1`pvals;
calcs:(cnames:$[n:"j"$abs system"s";(n;0N)#;enlist]cnames)cross\:calcs;
calcs:(cnames:$[n:"j"$abs system"s";$[n<count cnames;(n;0N);(n)]#;enlist]cnames)cross\:calcs;
q:{flip[(` sv'`.ml.fresh.feat,'x[;1];x[;0])],'last@''2_'x}each calcs;
q:(`$ssr[;".";"o"]@''"_"sv''string raze@''calcs)!'q;
r:(uj/).[?[;();aggs!aggs;]]peach flip((cnames,\:aggs:aggs,())#\:data;q);
Expand Down
4 changes: 3 additions & 1 deletion fresh/select.q
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ fresh.i.fisherexact:.p.import[`scipy.stats]`:fisher_exact
/ q utils
fresh.i.ktau:{fresh.i.kendalltau[<;x;y]1}
fresh.i.fisher:{fresh.i.fisherexact[<;count@''@\:[group@'x value group y]distinct x]1}

/ Function changed due to a scipy update; see https://github.com/scipy/scipy/blob/v1.3.2/scipy/stats/stats.py#L5385-L5573
fresh.i.ks:{
k:max abs(-). value(1+d bin\:raze d)%n:count each d:asc each y group x;
fresh.i.ksdistrib[k*en+.12+.11%en:sqrt prd[n]%sum n]`}
fresh.i.ksdistrib[k*en:sqrt prd[n]%sum n]`}
fresh.i.ksyx:{fresh.i.ks[y;x]}

/ feature significance
Expand Down
39 changes: 16 additions & 23 deletions fresh/tests/features.t
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ that are present in the tsfresh documentation. It should be noted that for large
\l fresh/tests/test.p

xj:10000?10000;
xi:10000?10000i;
xi:1000?1000i;
xf:10000?50000f;
xh:10000?5000h;
xb:10000#0101101011b;
Expand Down Expand Up @@ -63,7 +63,7 @@ np:.p.import[`numpy]
.ml.fresh.feat.absenergy[xj] ~ "f"$abs_energy[xj]
.ml.fresh.feat.absenergy[xf] ~ abs_energy[xf]
.ml.fresh.feat.absenergy[xb] ~ "f"$abs_energy[xb]
.ml.fresh.feat.absenergy[xi] ~ "f"$abs_energy[xi]
.ml.fresh.feat.absenergy[xi] = "f"$abs_energy[xi]
.ml.fresh.feat.absenergy[x0] ~ "f"$abs_energy[x0]
.ml.fresh.feat.absenergy[x1] ~ "f"$abs_energy[x1]
.ml.fresh.feat.absenergy[x2] ~ "f"$abs_energy[x2]
Expand Down Expand Up @@ -242,14 +242,14 @@ np:.p.import[`numpy]
.ml.fresh.feat.mean2dercentral[xj] ~ mean_second_derivative_central[xj]
.ml.fresh.feat.mean2dercentral[xf] ~ mean_second_derivative_central[xf]
.ml.fresh.feat.mean2dercentral[xi] ~ mean_second_derivative_central[xi]
.ml.fresh.feat.mean2dercentral[xb] ~ 0.0005
.ml.fresh.feat.mean2dercentral[xb] ~ 0f
.ml.fresh.feat.mean2dercentral[x0] ~ 0n
.ml.fresh.feat.mean2dercentral[x1] ~ 0n
.ml.fresh.feat.mean2dercentral[x2] ~ 0n
.ml.fresh.feat.mean2dercentral[xnull] ~ 0n

.ml.fresh.feat.skewness[xj] ~ skewness_py[xj]
(.ml.fresh.feat.skewness[xf] - skewness_py[xf])<1e-15
(.ml.fresh.feat.skewness[xf] - skewness_py[xf])<1e-13
.ml.fresh.feat.skewness[xb] ~ skewness_py[xb]
.ml.fresh.feat.skewness[xi] ~ skewness_py[xi]
.ml.fresh.feat.skewness[x0] ~ 0n
Expand Down Expand Up @@ -390,7 +390,7 @@ abs[.ml.fresh.feat.binnedentropy[xnull;50]] ~ 0f
.ml.fresh.feat.treverseasymstat[xj;2] ~ time_reversal_asymmetry_statistic[xj;2]
.ml.fresh.feat.treverseasymstat[xf;2] ~ time_reversal_asymmetry_statistic[xf;2]
.ml.fresh.feat.treverseasymstat[xi;2] ~ time_reversal_asymmetry_statistic[xi;2]
.ml.fresh.feat.treverseasymstat[xb;2] ~ 0.001
.ml.fresh.feat.treverseasymstat[xb;2] ~ 0.0001
.ml.fresh.feat.treverseasymstat[x0;2] ~ 0f
.ml.fresh.feat.treverseasymstat[x1;2] ~ "f"$time_reversal_asymmetry_statistic[x1;2]
.ml.fresh.feat.treverseasymstat[x2;2] ~ "f"$time_reversal_asymmetry_statistic[x2;2]
Expand All @@ -401,25 +401,18 @@ abs[.ml.fresh.feat.binnedentropy[xnull;50]] ~ 0f
.ml.fresh.feat.indexmassquantile[xh;0.] ~ index_mass_quantile[xh;0.]
.ml.fresh.feat.indexmassquantile[xi;x0] ~ x0

.ml.fresh.feat.lastmax[xi] ~ first_location_of_maximum[xi]
.ml.fresh.feat.lastmax[xj] ~ first_location_of_maximum[xj]
.ml.fresh.feat.lastmax[xf] ~ first_location_of_maximum[xf]
.ml.fresh.feat.lastmax[xi] ~ last_location_of_maximum[xi]
.ml.fresh.feat.lastmax[xj] ~ last_location_of_maximum[xj]
.ml.fresh.feat.lastmax[xf] ~ last_location_of_maximum[xf]
.ml.fresh.feat.lastmax[x0] ~ 0n
.ml.fresh.feat.lastmax[xs] ~ 0f

.ml.fresh.feat.lastmin[xi] ~ first_location_of_minimum[xi]
.ml.fresh.feat.lastmin[xj] ~ first_location_of_minimum[xj]
.ml.fresh.feat.lastmin[xf] ~ first_location_of_minimum[xf]
.ml.fresh.feat.lastmin[xi] ~ last_location_of_minimum[xi]
.ml.fresh.feat.lastmin[xj] ~ last_location_of_minimum[xj]
.ml.fresh.feat.lastmin[xf] ~ last_location_of_minimum[xf]
.ml.fresh.feat.lastmin[x0] ~ 0n
.ml.fresh.feat.lastmin[xs] ~ 0f

(value .ml.fresh.feat.aggautocorr[xi]) ~ agg_autocorrelation[xi;]each autocorrkeys
(value .ml.fresh.feat.aggautocorr[xj]) ~ agg_autocorrelation[xj;]each autocorrkeys
(value .ml.fresh.feat.aggautocorr[xf]) ~ agg_autocorrelation[xf;]each autocorrkeys
(value .ml.fresh.feat.aggautocorr[xh]) ~ agg_autocorrelation[xh;]each autocorrkeys
(value .ml.fresh.feat.aggautocorr[xb]) ~ agg_autocorrelation[xb;]each autocorrkeys
(value .ml.fresh.feat.aggautocorr[xnull]) ~ 4#0f

(value .ml.fresh.feat.changequant[xf;0.2;0.8;1b]) ~ change_quantiles[xf;0.2;0.8;1b;]each changequantkeys
(value .ml.fresh.feat.changequant[xf;0.25;0.7;1b]) ~ change_quantiles[xf;0.25;0.7;1b;]each changequantkeys
(value .ml.fresh.feat.changequant[xf;0.2;0.65;1b]) ~ change_quantiles[xf;0.2;0.65;1b;]each changequantkeys
Expand Down Expand Up @@ -474,13 +467,13 @@ abs[.ml.fresh.feat.binnedentropy[xnull;50]] ~ 0f
(.ml.fresh.feat.lintrend[xnull]`intercept) ~ 0f
(.ml.fresh.feat.lintrend[xnull]`rval) ~ 0f

(value .ml.fresh.feat.aggautocorr[xj]) ~ agg_autocorrelation[xj;]each `mean`var`median`std
(value .ml.fresh.feat.aggautocorr[xf]) ~ agg_autocorrelation[xf;]each `mean`var`median`std
(value .ml.fresh.feat.aggautocorr[xb]) ~ agg_autocorrelation[xb;]each `mean`var`median`std
(value .ml.fresh.feat.aggautocorr[xi]) ~ agg_autocorrelation[xi;]each `mean`var`median`std
(value .ml.fresh.feat.aggautocorr[xj]) ~ agg_autocorrelation[xj;]each autocorrkeys
(value .ml.fresh.feat.aggautocorr[xf]) ~ agg_autocorrelation[xf;]each autocorrkeys
(1_value .ml.fresh.feat.aggautocorr[xb]) ~ 1_agg_autocorrelation[xb;]each autocorrkeys
(value .ml.fresh.feat.aggautocorr[xi]) ~ agg_autocorrelation[xi;]each autocorrkeys
(value .ml.fresh.feat.aggautocorr[x0]) ~ 4#0f
(value .ml.fresh.feat.aggautocorr[x1]) ~ 4#0f
(value .ml.fresh.feat.aggautocorr[x2]) ~ agg_autocorrelation[x2;]each `mean`var`median`std
(value .ml.fresh.feat.aggautocorr[x2]) ~ agg_autocorrelation[x2;]each autocorrkeys
(value .ml.fresh.feat.aggautocorr[xnull]) ~ 4#0f

(.ml.fresh.feat.fftaggreg[xj]`centroid) ~ fft_aggregated[xj][0]
Expand Down
2 changes: 1 addition & 1 deletion fresh/tests/significancetests.p
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ p)def< target_binary_feature_real_test(y, x):
x_y1 = x[y == y1]
x_y0 = x[y == y0]

KS, p_ks = stats.ks_2samp(x_y1, x_y0)
KS, p_ks = stats.ks_2samp(x_y1, x_y0,mode='asymp')
return p_ks

p)def< target_real_feature_real_test(x, y):
Expand Down
2 changes: 2 additions & 0 deletions fresh/tests/test.p
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ p)def< count_above_mean(x): x = np.asarray(x); m = np.mean(x); return np.where(x
p)def< count_below_mean(x): x = np.asarray(x); m = np.mean(x); return np.where(x < m)[0].shape[0]
p)def< first_location_of_maximum(x): x = np.asarray(x); return np.argmax(x) / len(x) if len(x) > 0 else np.nan  # index of the first maximum divided by len(x); NaN for empty input (np.nan: the np.NaN alias was removed in NumPy 2.0)
p)def< first_location_of_minimum(x): x = np.asarray(x); return np.argmin(x) / len(x) if len(x) > 0 else np.nan  # index of the first minimum divided by len(x); NaN for empty input (np.nan: the np.NaN alias was removed in NumPy 2.0)
p)def< last_location_of_minimum(x): x = np.asarray(x); return 1.0 - (1+np.argmin(x[::-1]))/ len(x) if len(x) > 0 else np.nan  # argmin on the reversed array locates the last minimum; NaN for empty input (np.nan: the np.NaN alias was removed in NumPy 2.0)
p)def< last_location_of_maximum(x): x = np.asarray(x); return 1.0 - (1+np.argmax(x[::-1]))/ len(x) if len(x) > 0 else np.nan  # argmax on the reversed array locates the last maximum; NaN for empty input (np.nan: the np.NaN alias was removed in NumPy 2.0)
p)def< ratio_val_num_to_t_series(x):return len(set(x))/len(x)
p)def< ratio_beyond_r_sigma(x,r):return sum(abs(x - np.mean(x)) > r * np.std(x))/len(x)
p)def< large_standard_deviation(x,r):x = np.asarray(x);return np.std(x) > (r * (max(x) - min(x)))
Expand Down
8 changes: 5 additions & 3 deletions util/mproc.q
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
\d .ml

.z.pd:`u#0#0i
if[not `mproc in key .ml;.z.pd:`u#0#0i;mproc.N:0]
.z.pc:{.z.pd:`u#.z.pd except x}
mproc.reg:{.z.pd,:.z.w;neg[.z.w]@/:mproc.cmds}
mproc.init:{[n;x]
if[not p:system"p";'"set port to multiprocess"];
mproc.cmds:x;
do[n;system"q ",path,"/util/mprocw.q -pp ",string p];}
neg[.z.pd]@\:/:x;
mproc.cmds,:x;
do[0|n-mproc.N;system"q ",path,"/util/mprocw.q -pp ",string p];
mproc.N|:n;}
16 changes: 16 additions & 0 deletions util/tests/utiltst.t
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,18 @@
np:.p.import[`numpy]

p)import pandas as pd
p)import numpy as np
p)import datetime
t:.p.eval"pd.DataFrame({'fcol':[0.1,0.2,0.3,0.4,0.5],'jcol':[10,20,30,40,50]})"
t2:.p.eval"pd.DataFrame({'fcol':[None,None,None,None,None],'jcol':[True,False,True,False,True]})"
t3:.p.eval"pd.DataFrame({'date':[datetime.date(2005, 7, 14),datetime.date(2005, 7, 15)],'time':[datetime.time(12, 10, 30,500)
,datetime.time(12, 13, 30,200)],'str':['h','i'],'ind':[1.3,2.5],'bool':[True,False]})"
t4:.p.eval"pd.DataFrame({'bool':[True,False],'date':[np.datetime64('2005-02-25'),np.datetime64('2015-12-22')],'timed':[datetime.timedelta(hours=-5),
datetime.timedelta(seconds=1000)]})"
p)dtT = pd.Series(pd.date_range('2019-01-01 1:30',periods=2)).to_frame(name='dt')
p)dtT['dt_with_tz']=dtT.dt.dt.tz_localize('CET')
t5: .p.eval "dtT"
dt1:2019.01.01D01:30:00.000000000 2019.01.02D01:30:00.000000000

plaintab:([]4 5 6.;1 2 3.;-1 -2 -3.;0.4 0.5 0.6)
xm:100 10#1000?100f
Expand All @@ -17,6 +27,8 @@ dfsj:.ml.tab2df tx:select by scol,jcol from tt
(dfsx:.ml.tab2df tx)[`:index][:;`:names;(`scol;::)]
(dfxj:.ml.tab2df tx)[`:index][:;`:names;(::;`jcol)]
(dfxx:.ml.tab2df tx)[`:index][:;`:names;(::;::)]
tt2:([]date:2005.07.14 2005.07.15;timesp:("N"$"12:10:30.000500000";"N"$"12:13:30.000200007");time:20:30:00.001 19:23:20.201;str:enlist each ("h";"i");ind:1.3 2.5;bool:10b)
112 112 112 10 -9 -1h~type each first (.ml.tab2df tt2)[`:values]`

.ml.shape[1 2 3*/:til 10] ~ np[`:shape][1 2 3*/:til 10]`
.ml.shape[enlist 1] ~ np[`:shape][enlist 1]`
Expand All @@ -43,6 +55,10 @@ first[.ml.eye[1]] ~ enlist 1f

.ml.df2tab[t]~([]fcol:0.1*1+til 5;jcol:10*1+til 5)
.ml.df2tab[t2]~([]fcol:5#(::);jcol:10101b)
.ml.df2tab_tz[t3;0b;1b]~([]date:2005.07.14 2005.07.15;time:("N"$"12:10:30.000500000";"N"$"12:13:30.000200000");str:enlist each ("h";"i");ind:1.3 2.5;bool:10b)
.ml.df2tab_tz[t4;0b;1b]~([]bool:10b;date:"p"$(2005.02.25;2015.12.22);timed:(neg "N"$"05:00:00";"N"$"00:16:40"))
.ml.df2tab_tz[t5;1b;0b]~([]dt:dt1;dt_with_tz:dt1)
.ml.df2tab_tz[t5;0b;0b]~([]dt:dt1;dt_with_tz:dt1-"T"$"01:00:00")

tt~update`$scol from .ml.df2tab df
tj~update`$scol from .ml.df2tab dfj
Expand Down
36 changes: 31 additions & 5 deletions util/util.q
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
/ shape of matrix/table: the count at each nesting level, found by repeatedly taking first
/ the final count belongs to the converged atom and is dropped
shape:{dims:count each first scan x;-1_dims}
/ values between x and y in steps of length z
arange:{x+z*til ceiling(y-x)%z}
arange:{x+z*til 0|ceiling(y-x)%z}
/ z evenly spaced values between x and y (both endpoints are produced when z>1)
/ the step divisor is floored at 1 so that z=1 returns enlist x instead of a null
/ (with a divisor of z-1=0 the step is infinite and 0*0w evaluates to 0n)
linspace:{x+til[z]*(y-x)%1|z-1}
/ identity matrix
Expand All @@ -19,12 +19,34 @@ tab2df:{
r:.p.import[`pandas;`:DataFrame;@[flip 0!x;i.fndcols[x]"pmdznuvt";i.q2npdt]][@;cols x];
$[count k:keys x;r[`:set_index]k;r]}
/ pandas dataframe to q tab
df2tab:{
df2tab_tz:{
n:$[enlist[::]~x[`:index.names]`;0;x[`:index.nlevels]`];
c:`$(x:$[n;x[`:reset_index][];x])[`:columns.get_values][]`;
d:x[`:select_dtypes][`exclude pykw`datetime][`:to_dict;`list]`;
d,:"p"$x[`:select_dtypes][`include pykw`datetime][`:astype;`int64][`:to_dict;<;`list]+1970.01.01D0;
c:`$(x:$[n;x[`:reset_index][];x])[`:columns.to_numpy][]`;
d:x[`:select_dtypes][pykwargs enlist[`exclude]!enlist`datetime`datetimetz`timedelta][`:to_dict;`list]`;
d,:dt_convert x[`:select_dtypes][`include pykw`datetime];
d,:dt_dict[x[`:select_dtypes][`include pykw`timedelta]]+"n"$0;
d,:tz_convert[;y]x[`:select_dtypes][`include pykw`datetimetz];
// check whether the first value of each column is a foreign (python) object
if[0<count dti:where 112h=type each first each value d;
d,:dtk!date_time_convert[;z] each d dtk:key[d]dti];
n!flip c#d}
// Convert time zone data (0b -> UTC time; 1b -> local time)
// x: embedPy object holding the timezone-aware datetime columns; y: timezone flag
// y~0b      -> x is handed to dt_convert
// otherwise -> values are cast to strings with astype, the last 6 characters of each
//              string are dropped (presumably the UTC offset suffix, e.g. "+01:00" -
//              TODO confirm) and the remainder is parsed as a timestamp with "P"$
tz_convert:{$[y~0b;dt_convert;{"P"$neg[6]_/:'x[`:astype;`str][`:to_dict;<;`list]}]x}
// Convert datetime/datetimetz to timestamp
// the integer values returned by dt_dict are added to 1970.01.01D0 and cast with "p"$
// (assumes those integers are nanoseconds since the Unix epoch - TODO confirm)
dt_convert:{"p"$dt_dict[x]+1970.01.01D0}
// Convert data to integer representation and return as a dict
// x: embedPy object; astype is called with `int64, then to_dict with `list,
// and the < in the call returns the result as q data
dt_dict:{x[`:astype;`int64][`:to_dict;<;`list]}
// Convert datetime.date/time types to kdb+ date/time
// x: list of values from one column; y: conversion flag
// y~0b      -> x is returned untouched
// otherwise -> the first value is wrapped and tested with isinstance against
//              datetime.time and then datetime.date; on a match every value is
//              rendered with isoformat and parsed with "N"$ or "D"$ respectively,
//              and when neither matches x is returned untouched
// only the first value is inspected, so the column is assumed to be homogeneous
date_time_convert:{
 $[y~0b;x;
 [ fval:.p.wrap first x;
 // convert datetime.time/date to iso string format and convert to kdb+
 // otherwise return foreign
 $[i.isinstance[fval;i.dt`:time];{"N"$.p.wrap[x][`:isoformat][]`}each x;
 i.isinstance[fval;i.dt`:date];{"D"$.p.wrap[x][`:isoformat][]`}each x;
 x]]]}
// pandas dataframe to q tab using the defaults of df2tab_tz:
// timezone flag (y) 0b -> UTC time, conversion flag (z) 0b -> datetime.date/time
// values are left unconverted
df2tab:df2tab_tz[;0b;0b]

/ split into train/test sets with sz% in test
/ x: data; y: targets, indexed with the same positions as x (assumed to have the same count)
/ sz: fraction of the rows placed in the test set
/ neg[n]?n draws a random permutation of til n, which is cut at floor n*1-sz:
/ the first part indexes the train rows and the remainder indexes the test rows
/ returns a dictionary with keys `xtrain`ytrain`xtest`ytest
traintestsplit:{[x;y;sz]`xtrain`ytrain`xtest`ytest!raze(x;y)@\:/:(0,floor n*1-sz)_neg[n]?n:count x}
Expand All @@ -33,3 +55,7 @@ traintestsplit:{[x;y;sz]`xtrain`ytrain`xtest`ytest!raze(x;y)@\:/:(0,floor n*1-sz
/ apply function x to data y, dispatching on the type of y:
/   general list     -> x each item
/   table            -> x on each column, result flipped back into a table
/   other non-dict   -> x y
/   keyed table      -> recurse on the value table and re-attach the keys
/   other dictionary -> x each value
/ the recursive call passes x explicitly: .z.s is a two-argument function, so
/ calling it with only the value table would return a projection, not a result
i.ap:{$[0=type y;x each y;98=type y;flip x each flip y;99<>type y;x y;98=type key y;key[y]!.z.s[x]value y;x each y]}
/ names of the columns of table x whose meta type character is in y
i.fndcols:{info:0!meta x;info[`c]where info[`t]in y}

// required python utilities for df2tab
// isinstance from the python builtins module; the < makes calls return q data
i.isinstance:.p.import[`builtins][`:isinstance;<]
// handle to the python datetime module, indexed as i.dt`:time and i.dt`:date
i.dt :.p.import[`datetime]
2 changes: 1 addition & 1 deletion xval/tests/xval.t
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ count[.ml.xv.kfstrat[k;1;xc;yc;fs[dtc][]]]~3
(rnd[(avg/).ml.gs.kfsplit[k;1;xc;yc;fs dtc;pc;0]]-rnd@[;0]gridsearchc[xc;yc])<.05

((@[;2].ml.gs.kfsplit[k;1;xf;yf;fs net;pr;.2])-@[;0]gridsearchr[xf;yf])<.05
((@[;2].ml.gs.kfsplit[k;1;xi;yi;fs net;pr;.2])-@[;0]gridsearchr[xi;yi])<.05
((@[;2].ml.gs.kfsplit[k;1;xi;yi;fs net;pr;.2])-@[;0]gridsearchr[xi;yi])<.06
((@[;2].ml.gs.kfsplit[k;1;xb;yb;fs dtc;pc;.2])-@[;0]gridsearchc[xb;yb])<.05
((@[;2].ml.gs.kfsplit[k;1;xc;yc;fs dtc;pc;.2])-@[;0]gridsearchc[xc;yc])<.05

Expand Down

0 comments on commit fab4c32

Please sign in to comment.