Merge pull request #64 from Dianeod/dianedev

Dianedev
KxSystems · Jan 6, 2020 · fab4c32 · fab4c32
2 parents 9f653e8 + 4d51674
commit fab4c32
Show file tree

Hide file tree

Showing 9 changed files with 76 additions and 35 deletions.
diff --git a/fresh/extract.q b/fresh/extract.q
@@ -108,7 +108,7 @@ fresh.createfeatures:{[data;aggs;cnames;conf]
  p0:exec f from conf where valid,pnum=0;
  p1:exec f,pnames,pvals from conf where valid,pnum>0;
  calcs:p0,raze p1[`f]cross'p1[`pnames],'/:'(cross/)each p1`pvals;
- calcs:(cnames:$[n:"j"$abs system"s";(n;0N)#;enlist]cnames)cross\:calcs;
+ calcs:(cnames:$[n:"j"$abs system"s";$[n<count cnames;(n;0N);(n)]#;enlist]cnames)cross\:calcs;
  q:{flip[(` sv'`.ml.fresh.feat,'x[;1];x[;0])],'last@''2_'x}each calcs;
  q:(`$ssr[;".";"o"]@''"_"sv''string raze@''calcs)!'q;
  r:(uj/).[?[;();aggs!aggs;]]peach flip((cnames,\:aggs:aggs,())#\:data;q);

diff --git a/fresh/select.q b/fresh/select.q
@@ -8,9 +8,11 @@ fresh.i.fisherexact:.p.import[`scipy.stats]`:fisher_exact
 / q utils
 fresh.i.ktau:{fresh.i.kendalltau[<;x;y]1}
 fresh.i.fisher:{fresh.i.fisherexact[<;count@''@\:[group@'x value group y]distinct x]1}
+
+/ Two-sample Kolmogorov-Smirnov test of the values y split by the labels x.
+/ (-). subtracts exactly two groups, so x is assumed to hold two distinct values.
+/ k is the largest absolute gap between the two empirical CDFs, each evaluated at
+/ every pooled value; en is sqrt prd[n]%sum n for the group sizes n.
+/ Returns fresh.i.ksdistrib applied to k*en (ksdistrib is defined outside this hunk,
+/ presumably an embedPy import whose result the trailing ` converts to q - confirm).
+/ Function change due to scipy update https://github.com/scipy/scipy/blob/v1.3.2/scipy/stats/stats.py#L5385-L5573
+/ (the previous en+.12+.11%en scaling of k was replaced by plain en)
 fresh.i.ks:{
  k:max abs(-). value(1+d bin\:raze d)%n:count each d:asc each y group x;
- fresh.i.ksdistrib[k*en+.12+.11%en:sqrt prd[n]%sum n]`}
+ fresh.i.ksdistrib[k*en:sqrt prd[n]%sum n]`}
 fresh.i.ksyx:{fresh.i.ks[y;x]}
 
 / feature significance

diff --git a/fresh/tests/features.t b/fresh/tests/features.t
@@ -9,7 +9,7 @@ that are present in the tsfresh documentation. It should be noted that for large
 \l fresh/tests/test.p
 
 xj:10000?10000;
-xi:10000?10000i;
+xi:1000?1000i;
 xf:10000?50000f;
 xh:10000?5000h;
 xb:10000#0101101011b;
@@ -63,7 +63,7 @@ np:.p.import[`numpy]
 .ml.fresh.feat.absenergy[xj] ~ "f"$abs_energy[xj]
 .ml.fresh.feat.absenergy[xf] ~ abs_energy[xf]
 .ml.fresh.feat.absenergy[xb] ~ "f"$abs_energy[xb]
-.ml.fresh.feat.absenergy[xi] ~ "f"$abs_energy[xi]
+.ml.fresh.feat.absenergy[xi] = "f"$abs_energy[xi]
 .ml.fresh.feat.absenergy[x0] ~ "f"$abs_energy[x0]
 .ml.fresh.feat.absenergy[x1] ~ "f"$abs_energy[x1]
 .ml.fresh.feat.absenergy[x2] ~ "f"$abs_energy[x2]
@@ -242,14 +242,14 @@ np:.p.import[`numpy]
 .ml.fresh.feat.mean2dercentral[xj] ~ mean_second_derivative_central[xj]
 .ml.fresh.feat.mean2dercentral[xf] ~ mean_second_derivative_central[xf]
 .ml.fresh.feat.mean2dercentral[xi] ~ mean_second_derivative_central[xi]
-.ml.fresh.feat.mean2dercentral[xb] ~ 0.0005
+.ml.fresh.feat.mean2dercentral[xb] ~ 0f
 .ml.fresh.feat.mean2dercentral[x0] ~ 0n
 .ml.fresh.feat.mean2dercentral[x1] ~ 0n
 .ml.fresh.feat.mean2dercentral[x2] ~ 0n
 .ml.fresh.feat.mean2dercentral[xnull] ~ 0n
 
 .ml.fresh.feat.skewness[xj] ~ skewness_py[xj]
-(.ml.fresh.feat.skewness[xf] - skewness_py[xf])<1e-15
+abs[.ml.fresh.feat.skewness[xf] - skewness_py[xf]]<1e-13
 .ml.fresh.feat.skewness[xb] ~ skewness_py[xb]
 .ml.fresh.feat.skewness[xi] ~ skewness_py[xi]
 .ml.fresh.feat.skewness[x0] ~ 0n
@@ -390,7 +390,7 @@ abs[.ml.fresh.feat.binnedentropy[xnull;50]] ~ 0f
 .ml.fresh.feat.treverseasymstat[xj;2] ~ time_reversal_asymmetry_statistic[xj;2]
 .ml.fresh.feat.treverseasymstat[xf;2] ~ time_reversal_asymmetry_statistic[xf;2]
 .ml.fresh.feat.treverseasymstat[xi;2] ~ time_reversal_asymmetry_statistic[xi;2]
-.ml.fresh.feat.treverseasymstat[xb;2] ~ 0.001
+.ml.fresh.feat.treverseasymstat[xb;2] ~ 0.0001
 .ml.fresh.feat.treverseasymstat[x0;2] ~ 0f
 .ml.fresh.feat.treverseasymstat[x1;2] ~ "f"$time_reversal_asymmetry_statistic[x1;2]
 .ml.fresh.feat.treverseasymstat[x2;2] ~ "f"$time_reversal_asymmetry_statistic[x2;2]
@@ -401,25 +401,18 @@ abs[.ml.fresh.feat.binnedentropy[xnull;50]] ~ 0f
 .ml.fresh.feat.indexmassquantile[xh;0.] ~ index_mass_quantile[xh;0.]
 .ml.fresh.feat.indexmassquantile[xi;x0] ~ x0
 
-.ml.fresh.feat.lastmax[xi] ~ first_location_of_maximum[xi]
-.ml.fresh.feat.lastmax[xj] ~ first_location_of_maximum[xj]
-.ml.fresh.feat.lastmax[xf] ~ first_location_of_maximum[xf]
+.ml.fresh.feat.lastmax[xi] ~ last_location_of_maximum[xi]
+.ml.fresh.feat.lastmax[xj] ~ last_location_of_maximum[xj]
+.ml.fresh.feat.lastmax[xf] ~ last_location_of_maximum[xf]
 .ml.fresh.feat.lastmax[x0] ~ 0n
 .ml.fresh.feat.lastmax[xs] ~ 0f
 
-.ml.fresh.feat.lastmin[xi] ~ first_location_of_minimum[xi]
-.ml.fresh.feat.lastmin[xj] ~ first_location_of_minimum[xj]
-.ml.fresh.feat.lastmin[xf] ~ first_location_of_minimum[xf]
+.ml.fresh.feat.lastmin[xi] ~ last_location_of_minimum[xi]
+.ml.fresh.feat.lastmin[xj] ~ last_location_of_minimum[xj]
+.ml.fresh.feat.lastmin[xf] ~ last_location_of_minimum[xf]
 .ml.fresh.feat.lastmin[x0] ~ 0n
 .ml.fresh.feat.lastmin[xs] ~ 0f
 
-(value .ml.fresh.feat.aggautocorr[xi]) ~ agg_autocorrelation[xi;]each autocorrkeys
-(value .ml.fresh.feat.aggautocorr[xj]) ~ agg_autocorrelation[xj;]each autocorrkeys
-(value .ml.fresh.feat.aggautocorr[xf]) ~ agg_autocorrelation[xf;]each autocorrkeys
-(value .ml.fresh.feat.aggautocorr[xh]) ~ agg_autocorrelation[xh;]each autocorrkeys
-(value .ml.fresh.feat.aggautocorr[xb]) ~ agg_autocorrelation[xb;]each autocorrkeys
-(value .ml.fresh.feat.aggautocorr[xnull]) ~ 4#0f
-
 (value .ml.fresh.feat.changequant[xf;0.2;0.8;1b]) ~ change_quantiles[xf;0.2;0.8;1b;]each changequantkeys
 (value .ml.fresh.feat.changequant[xf;0.25;0.7;1b]) ~ change_quantiles[xf;0.25;0.7;1b;]each changequantkeys
 (value .ml.fresh.feat.changequant[xf;0.2;0.65;1b]) ~ change_quantiles[xf;0.2;0.65;1b;]each changequantkeys
@@ -474,13 +467,13 @@ abs[.ml.fresh.feat.binnedentropy[xnull;50]] ~ 0f
 (.ml.fresh.feat.lintrend[xnull]`intercept) ~ 0f
 (.ml.fresh.feat.lintrend[xnull]`rval) ~ 0f
 
-(value .ml.fresh.feat.aggautocorr[xj]) ~ agg_autocorrelation[xj;]each `mean`var`median`std
-(value .ml.fresh.feat.aggautocorr[xf]) ~ agg_autocorrelation[xf;]each `mean`var`median`std
-(value .ml.fresh.feat.aggautocorr[xb]) ~ agg_autocorrelation[xb;]each `mean`var`median`std
-(value .ml.fresh.feat.aggautocorr[xi]) ~ agg_autocorrelation[xi;]each `mean`var`median`std
+(value .ml.fresh.feat.aggautocorr[xj]) ~ agg_autocorrelation[xj;]each autocorrkeys
+(value .ml.fresh.feat.aggautocorr[xf]) ~ agg_autocorrelation[xf;]each autocorrkeys
+(1_value .ml.fresh.feat.aggautocorr[xb]) ~ 1_agg_autocorrelation[xb;]each autocorrkeys
+(value .ml.fresh.feat.aggautocorr[xi]) ~ agg_autocorrelation[xi;]each autocorrkeys
 (value .ml.fresh.feat.aggautocorr[x0]) ~ 4#0f
 (value .ml.fresh.feat.aggautocorr[x1]) ~ 4#0f
-(value .ml.fresh.feat.aggautocorr[x2]) ~ agg_autocorrelation[x2;]each `mean`var`median`std
+(value .ml.fresh.feat.aggautocorr[x2]) ~ agg_autocorrelation[x2;]each autocorrkeys
 (value .ml.fresh.feat.aggautocorr[xnull]) ~ 4#0f
 
 (.ml.fresh.feat.fftaggreg[xj]`centroid) ~ fft_aggregated[xj][0]

diff --git a/fresh/tests/significancetests.p b/fresh/tests/significancetests.p
@@ -23,7 +23,7 @@ p)def< target_binary_feature_real_test(y, x):
     x_y1 = x[y == y1]
     x_y0 = x[y == y0]
 
-    KS, p_ks = stats.ks_2samp(x_y1, x_y0)
+    KS, p_ks = stats.ks_2samp(x_y1, x_y0,mode='asymp')
     return p_ks
 
 p)def< target_real_feature_real_test(x, y):

diff --git a/fresh/tests/test.p b/fresh/tests/test.p
@@ -23,6 +23,8 @@ p)def< count_above_mean(x): x = np.asarray(x); m = np.mean(x); return np.where(x
 p)def< count_below_mean(x): x = np.asarray(x); m = np.mean(x); return np.where(x < m)[0].shape[0]
 p)def< first_location_of_maximum(x): x = np.asarray(x); return np.argmax(x) / len(x) if len(x) > 0 else np.nan
 p)def< first_location_of_minimum(x): x = np.asarray(x); return np.argmin(x) / len(x) if len(x) > 0 else np.nan
+p)def< last_location_of_minimum(x): x = np.asarray(x); return 1.0 - (1+np.argmin(x[::-1]))/ len(x) if len(x) > 0 else np.nan
+p)def< last_location_of_maximum(x): x = np.asarray(x); return 1.0 - (1+np.argmax(x[::-1]))/ len(x) if len(x) > 0 else np.nan
 p)def< ratio_val_num_to_t_series(x):return len(set(x))/len(x)
 p)def< ratio_beyond_r_sigma(x,r):return sum(abs(x - np.mean(x)) > r * np.std(x))/len(x)
 p)def< large_standard_deviation(x,r):x = np.asarray(x);return np.std(x) > (r * (max(x) - min(x)))

diff --git a/util/mproc.q b/util/mproc.q
@@ -1,9 +1,11 @@
 \d .ml
 
-.z.pd:`u#0#0i
+/ initialise the worker handle list (.z.pd) and the worker count (mproc.N) only when
+/ `mproc is not yet a key of .ml, so re-loading this script does not reset them
+if[not `mproc in key .ml;.z.pd:`u#0#0i;mproc.N:0]
+/ on connection close, drop the closed handle x from the worker handle list
 .z.pc:{.z.pd:`u#.z.pd except x}
+/ add the calling handle (.z.w) to the worker list and asynchronously send it every stored command
 mproc.reg:{.z.pd,:.z.w;neg[.z.w]@/:mproc.cmds}
+/ mproc.init[n;x]: ensure n worker processes have been started and run the commands x on them
+/   n - number of workers wanted; only the 0|n-mproc.N not yet started are launched
+/   x - list of commands; sent asynchronously to every handle already in .z.pd and
+/       appended to mproc.cmds, which mproc.reg replays to handles that register later
+/ signals "set port to multiprocess" when system"p" returns 0
+/ path is defined outside this hunk; mprocw.q presumably connects back to port p and calls mproc.reg - confirm
 mproc.init:{[n;x]
   if[not p:system"p";'"set port to multiprocess"];
-  mproc.cmds:x;
-  do[n;system"q ",path,"/util/mprocw.q -pp ",string p];}
+  neg[.z.pd]@\:/:x;
+  mproc.cmds,:x;
+  do[0|n-mproc.N;system"q ",path,"/util/mprocw.q -pp ",string p];
+  mproc.N|:n;}
diff --git a/util/tests/utiltst.t b/util/tests/utiltst.t
@@ -4,8 +4,18 @@
 np:.p.import[`numpy]
 
 p)import pandas as pd
+p)import numpy as np
+p)import datetime
 t:.p.eval"pd.DataFrame({'fcol':[0.1,0.2,0.3,0.4,0.5],'jcol':[10,20,30,40,50]})"
 t2:.p.eval"pd.DataFrame({'fcol':[None,None,None,None,None],'jcol':[True,False,True,False,True]})"
+t3:.p.eval"pd.DataFrame({'date':[datetime.date(2005, 7, 14),datetime.date(2005, 7, 15)],'time':[datetime.time(12, 10, 30,500)
+   ,datetime.time(12, 13, 30,200)],'str':['h','i'],'ind':[1.3,2.5],'bool':[True,False]})"
+t4:.p.eval"pd.DataFrame({'bool':[True,False],'date':[np.datetime64('2005-02-25'),np.datetime64('2015-12-22')],'timed':[datetime.timedelta(hours=-5),
+   datetime.timedelta(seconds=1000)]})"
+p)dtT = pd.Series(pd.date_range('2019-01-01 1:30',periods=2)).to_frame(name='dt')
+p)dtT['dt_with_tz']=dtT.dt.dt.tz_localize('CET')
+t5: .p.eval "dtT"
+dt1:2019.01.01D01:30:00.000000000 2019.01.02D01:30:00.000000000
 
 plaintab:([]4 5 6.;1 2 3.;-1 -2 -3.;0.4 0.5 0.6)
 xm:100 10#1000?100f
@@ -17,6 +27,8 @@ dfsj:.ml.tab2df tx:select by scol,jcol from tt
 (dfsx:.ml.tab2df tx)[`:index][:;`:names;(`scol;::)]
 (dfxj:.ml.tab2df tx)[`:index][:;`:names;(::;`jcol)]
 (dfxx:.ml.tab2df tx)[`:index][:;`:names;(::;::)]
+tt2:([]date:2005.07.14 2005.07.15;timesp:("N"$"12:10:30.000500000";"N"$"12:13:30.000200007");time:20:30:00.001 19:23:20.201;str:enlist each ("h";"i");ind:1.3 2.5;bool:10b)
+112 112 112 10 -9 -1h~type each first (.ml.tab2df tt2)[`:values]`
 
 .ml.shape[1 2 3*/:til 10] ~ np[`:shape][1 2 3*/:til 10]`
 .ml.shape[enlist 1] ~ np[`:shape][enlist 1]`
@@ -43,6 +55,10 @@ first[.ml.eye[1]] ~ enlist 1f
 
 .ml.df2tab[t]~([]fcol:0.1*1+til 5;jcol:10*1+til 5)
 .ml.df2tab[t2]~([]fcol:5#(::);jcol:10101b)
+.ml.df2tab_tz[t3;0b;1b]~([]date:2005.07.14 2005.07.15;time:("N"$"12:10:30.000500000";"N"$"12:13:30.000200000");str:enlist each ("h";"i");ind:1.3 2.5;bool:10b)
+.ml.df2tab_tz[t4;0b;1b]~([]bool:10b;date:"p"$(2005.02.25;2015.12.22);timed:(neg "N"$"05:00:00";"N"$"00:16:40"))
+.ml.df2tab_tz[t5;1b;0b]~([]dt:dt1;dt_with_tz:dt1)
+.ml.df2tab_tz[t5;0b;0b]~([]dt:dt1;dt_with_tz:dt1-"T"$"01:00:00")
 
 tt~update`$scol from .ml.df2tab df
 tj~update`$scol from .ml.df2tab dfj

diff --git a/util/util.q b/util/util.q
@@ -3,7 +3,7 @@
 / shape of matrix/table
 shape:{-1_count each first scan x}
 / values from x up to (but excluding) y in steps of length z;
 / 0| keeps the count passed to til non-negative, so the result is empty when (y-x)%z is not positive
-arange:{x+z*til ceiling(y-x)%z}
+arange:{x+z*til 0|ceiling(y-x)%z}
 / z evenly spaced values from x to y inclusive; z=1 yields just x
 / (1|z-1 keeps the divisor non-zero, otherwise 0*0w would give 0n)
 linspace:{x+til[z]*(y-x)%1|z-1}
 / identity matrix
@@ -19,12 +19,34 @@ tab2df:{
  r:.p.import[`pandas;`:DataFrame;@[flip 0!x;i.fndcols[x]"pmdznuvt";i.q2npdt]][@;cols x];
  $[count k:keys x;r[`:set_index]k;r]}
 / pandas dataframe to q tab
+/ df2tab_tz[x;y;z]
+/   x - pandas DataFrame (embedPy object); when the index is named, its n levels are
+/       reset into columns and become the key columns of the result (n!...)
+/   y - timezone-aware datetime columns: 0b -> UTC time; 1b -> local time (passed to tz_convert)
+/   z - datetime.date/datetime.time objects: 0b -> left as foreign; 1b -> converted (passed to date_time_convert)
+/ datetime columns become timestamps and timedelta columns become timespans (int64 value + "n"$0)
-df2tab:{
+df2tab_tz:{
  n:$[enlist[::]~x[`:index.names]`;0;x[`:index.nlevels]`];
- c:`$(x:$[n;x[`:reset_index][];x])[`:columns.get_values][]`;
- d:x[`:select_dtypes][`exclude pykw`datetime][`:to_dict;`list]`;
- d,:"p"$x[`:select_dtypes][`include pykw`datetime][`:astype;`int64][`:to_dict;<;`list]+1970.01.01D0;
+ c:`$(x:$[n;x[`:reset_index][];x])[`:columns.to_numpy][]`;
+ d:x[`:select_dtypes][pykwargs enlist[`exclude]!enlist`datetime`datetimetz`timedelta][`:to_dict;`list]`;
+ d,:dt_convert x[`:select_dtypes][`include pykw`datetime];
+ d,:dt_dict[x[`:select_dtypes][`include pykw`timedelta]]+"n"$0;
+ d,:tz_convert[;y]x[`:select_dtypes][`include pykw`datetimetz];
+ // columns whose first value is still a foreign (type 112h) are passed to date_time_convert
+ if[0<count dti:where 112h=type each first each value d;
+    d,:dtk!date_time_convert[;z] each d dtk:key[d]dti];
  n!flip c#d}
+// Convert timezone-aware datetime columns x to q timestamps (y: 0b -> UTC time; otherwise local time).
+// UTC goes through dt_convert; for local time every value is cast to str, its last 6 characters
+// are dropped (presumably the +HH:MM offset suffix - confirm) and the remainder is parsed with "P"$
+tz_convert:{$[y~0b;dt_convert;{"P"$neg[6]_/:'x[`:astype;`str][`:to_dict;<;`list]}]x}
+// Convert datetime/datetimetz columns to timestamps: the int64 values from dt_dict are offset
+// by 1970.01.01D0 and cast with "p"$ (assumes the integers are nanoseconds since the Unix epoch)
+dt_convert:{"p"$dt_dict[x]+1970.01.01D0}
+// Cast the columns of x to int64 and return them as a q dictionary of column name -> list of values
+dt_dict:{x[`:astype;`int64][`:to_dict;<;`list]}
+// Convert a column x of python datetime.time/datetime.date objects to kdb+ timespans/dates
+//   x - list of foreign objects (one column)
+//   y - 0b returns x untouched; any other value attempts the conversion
+// Only the first element is inspected to decide the type of the whole column.
+// NOTE(review): datetime.datetime is a subclass of datetime.date, so such values would take
+// the "D"$ branch - confirm this is intended
+date_time_convert:{
+  $[y~0b;x;
+    [ fval:.p.wrap first x;
+     // datetime.time -> "N"$ of its isoformat string, datetime.date -> "D"$ of its isoformat string;
+     // anything else is returned unchanged as foreign
+     $[i.isinstance[fval;i.dt`:time];{"N"$.p.wrap[x][`:isoformat][]`}each x;
+       i.isinstance[fval;i.dt`:date];{"D"$.p.wrap[x][`:isoformat][]`}each x;
+       x]]]}
+// df2tab keeps its one-argument interface as a projection of df2tab_tz with y=0b
+// (timezone-aware columns returned as UTC) and z=0b (datetime.date/time values left unconverted)
+df2tab:df2tab_tz[;0b;0b]
 
 / split into train/test sets with sz% in test
 traintestsplit:{[x;y;sz]`xtrain`ytrain`xtest`ytest!raze(x;y)@\:/:(0,floor n*1-sz)_neg[n]?n:count x}
@@ -33,3 +55,7 @@ traintestsplit:{[x;y;sz]`xtrain`ytrain`xtest`ytest!raze(x;y)@\:/:(0,floor n*1-sz
 i.ap:{$[0=type y;x each y;98=type y;flip x each flip y;99<>type y;x y;98=type key y;key[y]!.z.s value y;x each y]}
 / find columns of x with type in y
 i.fndcols:{m[`c]where(m:0!meta x)[`t]in y}
+
+// python utilities used by date_time_convert (and so by df2tab_tz/df2tab):
+// i.isinstance is builtins.isinstance with its result converted to q (<); i.dt is the datetime module
+i.isinstance:.p.import[`builtins][`:isinstance;<]
+i.dt        :.p.import[`datetime]
diff --git a/xval/tests/xval.t b/xval/tests/xval.t
@@ -114,7 +114,7 @@ count[.ml.xv.kfstrat[k;1;xc;yc;fs[dtc][]]]~3
 (rnd[(avg/).ml.gs.kfsplit[k;1;xc;yc;fs dtc;pc;0]]-rnd@[;0]gridsearchc[xc;yc])<.05
 
 ((@[;2].ml.gs.kfsplit[k;1;xf;yf;fs net;pr;.2])-@[;0]gridsearchr[xf;yf])<.05
-((@[;2].ml.gs.kfsplit[k;1;xi;yi;fs net;pr;.2])-@[;0]gridsearchr[xi;yi])<.05
+((@[;2].ml.gs.kfsplit[k;1;xi;yi;fs net;pr;.2])-@[;0]gridsearchr[xi;yi])<.06
 ((@[;2].ml.gs.kfsplit[k;1;xb;yb;fs dtc;pc;.2])-@[;0]gridsearchc[xb;yb])<.05
 ((@[;2].ml.gs.kfsplit[k;1;xc;yc;fs dtc;pc;.2])-@[;0]gridsearchc[xc;yc])<.05