diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8aaea2b --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +# IntelliJ +.idea/ +*.iml + +# vscode +.vscode/ +*.code-workspace diff --git a/.travis.yml b/.travis.yml index c1be208..06428d6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,13 @@ language: go go: master -script: go test -v ./dataframe ./series +before_script: + - echo 'Checking code quality issues.' + - go vet ./... + - echo 'Checking that gofmt was used.' + - diff -u <(echo -n) <(gofmt -d .) + - echo 'Checking tidiness of go mod.' + - go mod tidy + - test -z "$(git status --porcelain)" +script: + - echo 'Running tests.' + - go test -v ./... diff --git a/CHANGELOG.md b/CHANGELOG.md index db736fe..aad6416 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,26 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). +## [0.11.0] - 2021-04-25 +### Added +- Rolling window Mean and StdDev +- GroupBy and Aggregate +- Numeric column index +- Read HTML tables +- Extra checks for TravisCI +- Combining filters with AND +- User-defined filters +- Concatenation of DataFrames + +### Changed +- Make fixColnames faster +- Use Go 1.16 +- Update dependencies + +### Fixed +- Linter issues +- Failing tests + ## [0.10.1] - 2019-11-08 ### Fixed - LoadRecords printing type debug information diff --git a/README.md b/README.md index 5b8bf55..49e5742 100644 --- a/README.md +++ b/README.md @@ -182,14 +182,80 @@ column "B" is greater than 4: fil := df.Filter( dataframe.F{"A", series.Eq, "a"}, dataframe.F{"B", series.Greater, 4}, +) + +filAlt := df.FilterAggregation( + dataframe.Or, + dataframe.F{Colname: "A", Comparator: series.Eq, Comparando: "a"}, + dataframe.F{Colname: "B", Comparator: series.Greater, Comparando: 4}, ) +``` + +Filters inside Filter are combined as OR operations; alternatively, we can use `df.FilterAggregation` with `dataframe.Or`. 
+ +If we want to combine filters with AND operations, we can use `df.FilterAggregation` with `dataframe.And`. + +```go +fil := df.FilterAggregation( + dataframe.And, + dataframe.F{Colname: "A", Comparator: series.Eq, Comparando: "a"}, + dataframe.F{Colname: "D", Comparator: series.Eq, Comparando: true}, +) +``` + +To combine AND and OR operations, we can use chaining of filters. + +```go +// combine filters with OR +fil := df.Filter( + dataframe.F{Colname: "A", Comparator: series.Eq, Comparando: "a"}, + dataframe.F{Colname: "B", Comparator: series.Greater, Comparando: 4}, +) +// chain a second Filter call to AND its condition with the rows kept in fil fil2 := fil.Filter( dataframe.F{"D", series.Eq, true}, ) ``` -Filters inside Filter are combined as OR operations whereas if we chain -Filter methods, they will behave as AND. +Filtering is based on predefined comparison operators: +* `series.Eq` +* `series.Neq` +* `series.Greater` +* `series.GreaterEq` +* `series.Less` +* `series.LessEq` +* `series.In` + +However, if these filter operations are not sufficient, we can use user-defined comparators. +We use `series.CompFunc` and a user-defined function with the signature `func(series.Element) bool` to provide user-defined filters to `df.Filter` and `df.FilterAggregation`. + +```go +hasPrefix := func(prefix string) func(el series.Element) bool { + return func (el series.Element) bool { + if el.Type() == series.String { + if val, ok := el.Val().(string); ok { + return strings.HasPrefix(val, prefix) + } + } + return false + } + } + +fil := df.Filter( + dataframe.F{Colname: "A", Comparator: series.CompFunc, Comparando: hasPrefix("aa")}, +) +``` + +This example filters rows based on whether they have a cell value starting with `"aa"` in column `"A"`. 
+ +#### GroupBy && Aggregation + +Rows can be grouped by one or more key columns with `df.GroupBy`; each group can then be summarized with `Aggregation`. + +```go +groups := df.GroupBy("key1", "key2") // Group by column "key1", and column "key2" +aggre := groups.Aggregation([]dataframe.AggregationType{dataframe.Aggregation_MAX, dataframe.Aggregation_MIN}, []string{"values", "values2"}) // Maximum value in column "values", Minimum value in column "values2" +``` #### Arrange diff --git a/dataframe/dataframe.go b/dataframe/dataframe.go index cf1ae41..df95d55 100644 --- a/dataframe/dataframe.go +++ b/dataframe/dataframe.go @@ -14,6 +14,8 @@ import ( "unicode/utf8" "github.com/go-gota/gota/series" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) // DataFrame is a data structure designed for operating on table like data (Such @@ -233,7 +235,7 @@ func (df DataFrame) print( } } if i < len(notShowing) { - notShownArr = append(notShownArr, notShowing[i:len(notShowing)]) + notShownArr = append(notShownArr, notShowing[i:]) } for k, ns := range notShownArr { notShown += strings.Join(ns, ", ") @@ -250,7 +252,7 @@ func (df DataFrame) print( // Subsetting, mutating and transforming DataFrame methods // ======================================================= -// Set will update the values of a DataFrame for all rows selected via indexes. +// Set will update the values of a DataFrame for the rows selected via indexes. 
func (df DataFrame) Set(indexes series.Indexes, newvalues DataFrame) DataFrame { if df.Err != nil { return df @@ -371,6 +373,148 @@ func (df DataFrame) Drop(indexes SelectIndexes) DataFrame { return df } +const KEY_ERROR = "KEY_ERROR" + +//GroupBy Group dataframe by columns +func (df DataFrame) GroupBy(colnames ...string) *Groups { + if len(colnames) <= 0 { + return nil + } + groupDataFrame := make(map[string]DataFrame) + groupSeries := make(map[string][]map[string]interface{}) + // Check that colname exist on dataframe + for _, c := range colnames { + if idx := findInStringSlice(c, df.Names()); idx == -1 { + return &Groups{Err: fmt.Errorf("GroupBy: can't find column name: %s", c)} + } + } + + for _, s := range df.Maps() { + // Gen Key for per Series + key := "" + for i, c := range colnames { + format := "" + if i == 0 { + format = "%s%" + } else { + format = "%s_%" + } + switch s[c].(type) { + case string, bool: + format += "s" + case int, int16, int32, int64: + format += "d" + case float32, float64: + format += "f" + default: + return &Groups{Err: fmt.Errorf("GroupBy: type not found")} + } + key = fmt.Sprintf(format, key, s[c]) + } + groupSeries[key] = append(groupSeries[key], s) + } + + for k, cMaps := range groupSeries { + groupDataFrame[k] = LoadMaps(cMaps) + } + groups := &Groups{groups: groupDataFrame, colnames: colnames} + return groups +} + +//AggregationType Aggregation method type +type AggregationType int + +const ( + Aggregation_MAX AggregationType = 0 + Aggregation_MIN AggregationType = 1 + Aggregation_MEAN AggregationType = 2 + Aggregation_MEDIAN AggregationType = 3 + Aggregation_STD AggregationType = 4 + Aggregation_SUM AggregationType = 5 + Aggregation_COUNT AggregationType = 6 +) + +func (aggregation AggregationType) String() string { + switch aggregation { + case Aggregation_MAX: + return "MAX" + case Aggregation_MIN: + return "MIN" + case Aggregation_MEAN: + return "MEAN" + case Aggregation_MEDIAN: + return "MEDIAN" + case Aggregation_STD: + 
return "STD" + case Aggregation_SUM: + return "SUM" + case Aggregation_COUNT: + return "COUNT" + default: + return "UNKNOWN" + } +} + +//Groups : structure generated by groupby +type Groups struct { + groups map[string]DataFrame + colnames []string + aggregation DataFrame + Err error +} + +// Aggregation :Aggregate dataframe by aggregation type and aggregation column name +func (gps Groups) Aggregation(typs []AggregationType, colnames []string) DataFrame { + if gps.groups == nil { + return DataFrame{Err: fmt.Errorf("Aggregation: input is nil")} + } + if len(typs) != len(colnames) { + return DataFrame{Err: fmt.Errorf("Aggregation: len(typs) != len(colanmes)")} + } + dfMaps := make([]map[string]interface{}, 0) + for _, df := range gps.groups { + targetMap := df.Maps()[0] + curMap := make(map[string]interface{}) + // add columns of group by + for _, c := range gps.colnames { + if value, ok := targetMap[c]; ok { + curMap[c] = value + } else { + return DataFrame{Err: fmt.Errorf("Aggregation: can't find column name: %s", c)} + } + } + // Aggregation + for i, c := range colnames { + curSeries := df.Col(c) + var value float64 + switch typs[i] { + case Aggregation_MAX: + value = curSeries.Max() + case Aggregation_MEAN: + value = curSeries.Mean() + case Aggregation_MEDIAN: + value = curSeries.Median() + case Aggregation_MIN: + value = curSeries.Min() + case Aggregation_STD: + value = curSeries.StdDev() + case Aggregation_SUM: + value = curSeries.Sum() + case Aggregation_COUNT: + value = float64(curSeries.Len()) + default: + return DataFrame{Err: fmt.Errorf("Aggregation: this method %s not found", typs[i])} + + } + curMap[fmt.Sprintf("%s_%s", c, typs[i])] = value + } + dfMaps = append(dfMaps, curMap) + + } + gps.aggregation = LoadMaps(dfMaps) + return gps.aggregation +} + // Rename changes the name of one of the columns of a DataFrame func (df DataFrame) Rename(newname, oldname string) DataFrame { if df.Err != nil { @@ -427,6 +571,54 @@ func (df DataFrame) RBind(dfb 
DataFrame) DataFrame { return New(expandedSeries...) } +// Concat concatenates rows of two DataFrames like RBind, but also including +// unmatched columns. +func (df DataFrame) Concat(dfb DataFrame) DataFrame { + if df.Err != nil { + return df + } + if dfb.Err != nil { + return dfb + } + + uniques := make(map[string]struct{}) + cols := []string{} + for _, t := range []DataFrame{df, dfb} { + for _, u := range t.Names() { + if _, ok := uniques[u]; !ok { + uniques[u] = struct{}{} + cols = append(cols, u) + } + } + } + + expandedSeries := make([]series.Series, len(cols)) + for k, v := range cols { + aidx := findInStringSlice(v, df.Names()) + bidx := findInStringSlice(v, dfb.Names()) + + // aidx and bidx must not be -1 at the same time. + var a, b series.Series + if aidx != -1 { + a = df.columns[aidx] + } else { + bb := dfb.columns[bidx] + a = series.New(make([]struct{}, df.nrows), bb.Type(), bb.Name) + } + if bidx != -1 { + b = dfb.columns[bidx] + } else { + b = series.New(make([]struct{}, dfb.nrows), a.Type(), a.Name) + } + newSeries := a.Concat(b) + if err := newSeries.Err; err != nil { + return DataFrame{Err: fmt.Errorf("concat: %v", err)} + } + expandedSeries[k] = newSeries + } + return New(expandedSeries...) +} + // Mutate changes a column of the DataFrame with the given Series or adds it as // a new column if the column name does not exist. func (df DataFrame) Mutate(s series.Series) DataFrame { @@ -463,6 +655,7 @@ func (df DataFrame) Mutate(s series.Series) DataFrame { // F is the filtering structure type F struct { + Colidx int Colname string Comparator series.Comparator Comparando interface{} @@ -473,14 +666,47 @@ type F struct { // whereas if we chain Filter calls, every filter will act as an AND operation // with regards to the rest. func (df DataFrame) Filter(filters ...F) DataFrame { + return df.FilterAggregation(Or, filters...) 
+} + +// Aggregation defines the filter aggregation +type Aggregation int + +func (a Aggregation) String() string { + switch a { + case Or: + return "or" + case And: + return "and" + } + return fmt.Sprintf("unknown aggragation %d", a) +} + +const ( + // Or aggregates filters with logical or + Or Aggregation = iota + // And aggregates filters with logical and + And +) + +// FilterAggregation will filter the rows of a DataFrame based on the given filters. All +// filters on the argument of a Filter call are aggregated depending on the supplied +// aggregation. +func (df DataFrame) FilterAggregation(agg Aggregation, filters ...F) DataFrame { if df.Err != nil { return df } + compResults := make([]series.Series, len(filters)) for i, f := range filters { - idx := findInStringSlice(f.Colname, df.Names()) - if idx < 0 { - return DataFrame{Err: fmt.Errorf("filter: can't find column name")} + var idx int + if f.Colname == "" { + idx = f.Colidx + } else { + idx = findInStringSlice(f.Colname, df.Names()) + if idx < 0 { + return DataFrame{Err: fmt.Errorf("filter: can't find column name")} + } } res := df.columns[idx].Compare(f.Comparator, f.Comparando) if err := res.Err; err != nil { @@ -488,10 +714,11 @@ func (df DataFrame) Filter(filters ...F) DataFrame { } compResults[i] = res } - // Join compResults via "OR" + if len(compResults) == 0 { return df.Copy() } + res, err := compResults[0].Bool() if err != nil { return DataFrame{Err: fmt.Errorf("filter: %v", err)} @@ -502,7 +729,14 @@ func (df DataFrame) Filter(filters ...F) DataFrame { return DataFrame{Err: fmt.Errorf("filter: %v", err)} } for j := 0; j < len(res); j++ { - res[j] = res[j] || nextRes[j] + switch agg { + case Or: + res[j] = res[j] || nextRes[j] + case And: + res[j] = res[j] && nextRes[j] + default: + panic(agg) + } } } return df.Subset(res) @@ -1106,7 +1340,9 @@ func ReadCSV(r io.Reader, options ...LoadOption) DataFrame { // resulting records. 
func ReadJSON(r io.Reader, options ...LoadOption) DataFrame { var m []map[string]interface{} - err := json.NewDecoder(r).Decode(&m) + d := json.NewDecoder(r) + d.UseNumber() + err := d.Decode(&m) if err != nil { return DataFrame{Err: err} } @@ -1160,6 +1396,131 @@ func (df DataFrame) WriteJSON(w io.Writer) error { return json.NewEncoder(w).Encode(df.Maps()) } +// Internal state for implementing ReadHTML +type remainder struct { + index int + text string + nrows int +} + +func readRows(trs []*html.Node) [][]string { + rems := []remainder{} + rows := [][]string{} + for _, tr := range trs { + xrems := []remainder{} + row := []string{} + index := 0 + text := "" + for j, td := 0, tr.FirstChild; td != nil; j, td = j+1, td.NextSibling { + if td.Type == html.ElementNode && td.DataAtom == atom.Td { + + for len(rems) > 0 { + v := rems[0] + if v.index > index { + break + } + v, rems = rems[0], rems[1:] + row = append(row, v.text) + if v.nrows > 1 { + xrems = append(xrems, remainder{v.index, v.text, v.nrows - 1}) + } + index++ + } + + rowspan, colspan := 1, 1 + for _, attr := range td.Attr { + switch attr.Key { + case "rowspan": + if k, err := strconv.Atoi(attr.Val); err == nil { + rowspan = k + } + case "colspan": + if k, err := strconv.Atoi(attr.Val); err == nil { + colspan = k + } + } + } + for c := td.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.TextNode { + text = strings.TrimSpace(c.Data) + } + } + + for k := 0; k < colspan; k++ { + row = append(row, text) + if rowspan > 1 { + xrems = append(xrems, remainder{index, text, rowspan - 1}) + } + index++ + } + } + } + for j := 0; j < len(rems); j++ { + v := rems[j] + row = append(row, v.text) + if v.nrows > 1 { + xrems = append(xrems, remainder{v.index, v.text, v.nrows - 1}) + } + } + rows = append(rows, row) + rems = xrems + } + for len(rems) > 0 { + xrems := []remainder{} + row := []string{} + for i := 0; i < len(rems); i++ { + v := rems[i] + row = append(row, v.text) + if v.nrows > 1 { + xrems = 
append(xrems, remainder{v.index, v.text, v.nrows - 1}) + } + } + rows = append(rows, row) + rems = xrems + } + return rows +} + +func ReadHTML(r io.Reader, options ...LoadOption) []DataFrame { + var err error + var dfs []DataFrame + var doc *html.Node + var f func(*html.Node) + + doc, err = html.Parse(r) + if err != nil { + return []DataFrame{DataFrame{Err: err}} + } + + f = func(n *html.Node) { + if n.Type == html.ElementNode && n.DataAtom == atom.Table { + trs := []*html.Node{} + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.ElementNode && c.DataAtom == atom.Tbody { + for cc := c.FirstChild; cc != nil; cc = cc.NextSibling { + if cc.Type == html.ElementNode && (cc.DataAtom == atom.Th || cc.DataAtom == atom.Tr) { + trs = append(trs, cc) + } + } + } + } + + df := LoadRecords(readRows(trs), options...) + if df.Err == nil { + dfs = append(dfs, df) + } + return + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + + f(doc) + return dfs +} + // Getters/Setters for DataFrame fields // ==================================== @@ -1392,7 +1753,7 @@ func (df DataFrame) LeftJoin(b DataFrame, keys ...string) DataFrame { newCols[ii].Append(elem) ii++ } - for _ = range iNotKeysB { + for range iNotKeysB { newCols[ii].Append(nil) ii++ } @@ -1496,7 +1857,7 @@ func (df DataFrame) RightJoin(b DataFrame, keys ...string) DataFrame { newCols[ii].Append(elem) ii++ } - for _ = range iNotKeysA { + for range iNotKeysA { newCols[ii].Append(nil) ii++ } @@ -1598,7 +1959,7 @@ func (df DataFrame) OuterJoin(b DataFrame, keys ...string) DataFrame { newCols[ii].Append(elem) ii++ } - for _ = range iNotKeysB { + for range iNotKeysB { newCols[ii].Append(nil) ii++ } @@ -1624,7 +1985,7 @@ func (df DataFrame) OuterJoin(b DataFrame, keys ...string) DataFrame { newCols[ii].Append(elem) ii++ } - for _ = range iNotKeysA { + for range iNotKeysA { newCols[ii].Append(nil) ii++ } @@ -1717,7 +2078,7 @@ func (df DataFrame) Elem(r, c int) series.Element { // 
fixColnames assigns a name to the missing column names and makes it so that the // column names are unique. func fixColnames(colnames []string) { - // Find duplicated colnames + // Find duplicated and missing colnames dupnamesidx := make(map[string][]int) var missingnames []int for i := 0; i < len(colnames); i++ { @@ -1726,16 +2087,17 @@ func fixColnames(colnames []string) { missingnames = append(missingnames, i) continue } - for j := 0; j < len(colnames); j++ { - b := colnames[j] - if i != j && a == b { - temp := dupnamesidx[a] - if !inIntSlice(i, temp) { - dupnamesidx[a] = append(temp, i) - } - } + // for now, dupnamesidx contains the indices of *all* the columns + // the columns with unique locations will be removed after this loop + dupnamesidx[a] = append(dupnamesidx[a], i) + } + // NOTE: deleting a map key in a range is legal and correct in Go. + for k, places := range dupnamesidx { + if len(places) < 2 { + delete(dupnamesidx, k) } } + // Now: dupnameidx contains only keys that appeared more than once // Autofill missing column names counter := 0 diff --git a/dataframe/dataframe_test.go b/dataframe/dataframe_test.go index 114c0e4..564d193 100644 --- a/dataframe/dataframe_test.go +++ b/dataframe/dataframe_test.go @@ -2,6 +2,7 @@ package dataframe import ( "bytes" + "fmt" "reflect" "strconv" "strings" @@ -553,6 +554,117 @@ func TestDataFrame_RBind(t *testing.T) { } } +func TestDataFrame_Concat(t *testing.T) { + type NA struct{} + + a := New( + series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ) + table := []struct { + dfa DataFrame + dfb DataFrame + expDf DataFrame + }{ + { + a, + New( + series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + New( + 
series.New([]string{"b", "a", "b", "c", "d", "b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4, 1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + { + a, + New( + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + New( + series.New([]string{"b", "a", "b", "c", "d", "1", "2", "4", "5", "4"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4, 1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + + { + a, + New( + series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + New( + series.New([]string{"b", "a", "b", "c", "d", "b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2").Concat(series.New([]NA{NA{}, NA{}, NA{}, NA{}, NA{}}, series.Int, "")), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + { + a, + New( + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + series.New([]string{"a", "b", "c", "d", "e"}, series.String, "COL.4"), + ), + New( + series.New([]string{"b", "a", "b", "c", "d", "1", "2", "4", "5", "4"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4, 1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + series.New([]NA{NA{}, NA{}, NA{}, NA{}, NA{}}, series.String, "COL.4").Concat(series.New([]string{"a", "b", "c", "d", "e"}, series.String, "COL.4")), + ), + }, + 
{ + a, + New( + series.New([]string{"a", "b", "c", "d", "e"}, series.String, "COL.0"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + New( + series.New([]string{"b", "a", "b", "c", "d", "1", "2", "4", "5", "4"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4, 1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + series.New([]NA{NA{}, NA{}, NA{}, NA{}, NA{}}, series.String, "COL.0").Concat(series.New([]string{"a", "b", "c", "d", "e"}, series.String, "COL.0")), + ), + }, + { + DataFrame{}, + a, + a, + }, + } + for i, tc := range table { + b := tc.dfa.Concat(tc.dfb) + + if b.Err != nil { + t.Errorf("Test: %d\nError:%v", i, b.Err) + } + //if err := checkAddrDf(a, b); err != nil { + //t.Error(err) + //} + // Check that the types are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Types(), b.Types()) { + t.Errorf("Test: %d\nDifferent types:\nA:%v\nB:%v", i, tc.expDf.Types(), b.Types()) + } + // Check that the colnames are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Names(), b.Names()) { + t.Errorf("Test: %d\nDifferent colnames:\nA:%v\nB:%v", i, tc.expDf.Names(), b.Names()) + } + // Check that the values are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Records(), b.Records()) { + t.Errorf("Test: %d\nDifferent values:\nA:%v\nB:%v", i, tc.expDf.Records(), b.Records()) + } + } +} func TestDataFrame_Records(t *testing.T) { a := New( series.New([]string{"a", "b", "c"}, series.String, "COL.1"), @@ -634,7 +746,7 @@ func TestDataFrame_Mutate(t *testing.T) { } } -func TestDataFrame_Filter(t *testing.T) { +func TestDataFrame_Filter_Or(t *testing.T) { a := New( series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), 
@@ -645,7 +757,7 @@ func TestDataFrame_Filter(t *testing.T) { expDf DataFrame }{ { - []F{{"COL.2", series.GreaterEq, 4}}, + []F{{0, "COL.2", series.GreaterEq, 4}}, New( series.New([]string{"b", "c", "d"}, series.String, "COL.1"), series.New([]int{4, 5, 4}, series.Int, "COL.2"), @@ -654,8 +766,8 @@ func TestDataFrame_Filter(t *testing.T) { }, { []F{ - {"COL.2", series.Greater, 4}, - {"COL.2", series.Eq, 1}, + {0, "COL.2", series.Greater, 4}, + {0, "COL.2", series.Eq, 1}, }, New( series.New([]string{"b", "c"}, series.String, "COL.1"), @@ -665,9 +777,21 @@ func TestDataFrame_Filter(t *testing.T) { }, { []F{ - {"COL.2", series.Greater, 4}, - {"COL.2", series.Eq, 1}, - {"COL.1", series.Eq, "d"}, + {0, "COL.2", series.Greater, 4}, + {0, "COL.2", series.Eq, 1}, + {0, "COL.1", series.Eq, "d"}, + }, + New( + series.New([]string{"b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + { + []F{ + {1, "", series.Greater, 4}, + {1, "", series.Eq, 1}, + {0, "", series.Eq, "d"}, }, New( series.New([]string{"b", "c", "d"}, series.String, "COL.1"), @@ -697,6 +821,117 @@ func TestDataFrame_Filter(t *testing.T) { if !reflect.DeepEqual(tc.expDf.Records(), b.Records()) { t.Errorf("Test: %d\nDifferent values:\nA:%v\nB:%v", i, tc.expDf.Records(), b.Records()) } + + b2 := a.FilterAggregation(Or, tc.filters...) 
+ + // Check that the types are the same between both DataFrames + if !reflect.DeepEqual(b.Types(), b2.Types()) { + t.Errorf("Test: %d\nDifferent types:\nB:%v\nB2:%v", i, b.Types(), b2.Types()) + } + // Check that the colnames are the same between both DataFrames + if !reflect.DeepEqual(b.Names(), b2.Names()) { + t.Errorf("Test: %d\nDifferent colnames:\nB:%v\nB2:%v", i, b.Names(), b2.Names()) + } + // Check that the values are the same between both DataFrames + if !reflect.DeepEqual(b.Records(), b2.Records()) { + t.Errorf("Test: %d\nDifferent values:\nB:%v\nB2:%v", i, b.Records(), b2.Records()) + } + } +} + +func TestDataFrame_Filter_And(t *testing.T) { + a := New( + series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ) + table := []struct { + filters []F + expDf DataFrame + }{ + { + []F{{Colname: "COL.2", Comparator: series.GreaterEq, Comparando: 4}}, + New( + series.New([]string{"b", "c", "d"}, series.String, "COL.1"), + series.New([]int{4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + { + []F{{Colidx: 1, Comparator: series.GreaterEq, Comparando: 4}}, + New( + series.New([]string{"b", "c", "d"}, series.String, "COL.1"), + series.New([]int{4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + // should not have any rows + { + []F{ + {Colname: "COL.2", Comparator: series.Greater, Comparando: 4}, + {Colname: "COL.2", Comparator: series.Eq, Comparando: 1}, + }, + New( + series.New([]string{}, series.String, "COL.1"), + series.New([]int{}, series.Int, "COL.2"), + series.New([]float64{}, series.Float, "COL.3"), + ), + }, + { + []F{ + {Colidx: 1, Comparator: series.Greater, Comparando: 4}, + {Colidx: 1, Comparator: series.Eq, Comparando: 1}, + }, + New( + series.New([]string{}, series.String, "COL.1"), + 
series.New([]int{}, series.Int, "COL.2"), + series.New([]float64{}, series.Float, "COL.3"), + ), + }, + { + []F{ + {Colname: "COL.2", Comparator: series.Less, Comparando: 4}, + {Colname: "COL.1", Comparator: series.Eq, Comparando: "b"}, + }, + New( + series.New([]string{"b"}, series.String, "COL.1"), + series.New([]int{1}, series.Int, "COL.2"), + series.New([]float64{3.0}, series.Float, "COL.3"), + ), + }, + { + []F{ + {Colidx: 1, Comparator: series.Less, Comparando: 4}, + {Colidx: 0, Comparator: series.Eq, Comparando: "b"}, + }, + New( + series.New([]string{"b"}, series.String, "COL.1"), + series.New([]int{1}, series.Int, "COL.2"), + series.New([]float64{3.0}, series.Float, "COL.3"), + ), + }, + } + for i, tc := range table { + b := a.FilterAggregation(And, tc.filters...) + + if b.Err != nil { + t.Errorf("Test: %d\nError:%v", i, b.Err) + } + //if err := checkAddrDf(a, b); err != nil { + //t.Error(err) + //} + // Check that the types are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Types(), b.Types()) { + t.Errorf("Test: %d\nDifferent types:\nA:%v\nB:%v", i, tc.expDf.Types(), b.Types()) + } + // Check that the colnames are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Names(), b.Names()) { + t.Errorf("Test: %d\nDifferent colnames:\nA:%v\nB:%v", i, tc.expDf.Names(), b.Names()) + } + // Check that the values are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Records(), b.Records()) { + t.Errorf("Test: %d\nDifferent values:\nA:%v\nB:%v", i, tc.expDf.Records(), b.Records()) + } } } @@ -1008,13 +1243,13 @@ func TestLoadMaps(t *testing.T) { { // Test: 0 LoadMaps( []map[string]interface{}{ - map[string]interface{}{ + { "A": "a", "B": 1, "C": true, "D": 0, }, - map[string]interface{}{ + { "A": "b", "B": 2, "C": true, @@ -1032,13 +1267,13 @@ func TestLoadMaps(t *testing.T) { { // Test: 1 LoadMaps( []map[string]interface{}{ - map[string]interface{}{ + { "A": "a", "B": 1, "C": true, "D": 0, }, - 
map[string]interface{}{ + { "A": "b", "B": 2, "C": true, @@ -1059,13 +1294,13 @@ func TestLoadMaps(t *testing.T) { { // Test: 2 LoadMaps( []map[string]interface{}{ - map[string]interface{}{ + { "A": "a", "B": 1, "C": true, "D": 0, }, - map[string]interface{}{ + { "A": "b", "B": 2, "C": true, @@ -1086,13 +1321,13 @@ func TestLoadMaps(t *testing.T) { { // Test: 3 LoadMaps( []map[string]interface{}{ - map[string]interface{}{ + { "A": "a", "B": 1, "C": true, "D": 0, }, - map[string]interface{}{ + { "A": "b", "B": 2, "C": true, @@ -1117,13 +1352,13 @@ func TestLoadMaps(t *testing.T) { { // Test: 4 LoadMaps( []map[string]interface{}{ - map[string]interface{}{ + { "A": "a", "B": 1, "C": true, "D": 0, }, - map[string]interface{}{ + { "A": "b", "B": 2, "C": true, @@ -1191,13 +1426,13 @@ func TestReadJSON(t *testing.T) { expDf DataFrame }{ { - `[{"COL.1":null,"COL.2":1,"COL.3":3},{"COL.1":5,"COL.2":2,"COL.3":2},{"COL.1":6,"COL.2":3,"COL.3":1}]`, + `[{"COL.1":null,"COL.2":1,"COL.3":3},{"COL.1":5,"COL.2":2,"COL.3":2},{"COL.1":6,"COL.2":3,"COL.3":20180428}]`, LoadRecords( [][]string{ {"COL.1", "COL.2", "COL.3"}, {"NaN", "1", "3"}, {"5", "2", "2"}, - {"6", "3", "1"}, + {"6", "3", "20180428"}, }, DetectTypes(false), DefaultType(series.Int), @@ -1238,6 +1473,79 @@ func TestReadJSON(t *testing.T) { } } +func TestReadHTML(t *testing.T) { + table := []struct { + htmlStr string + expDf []DataFrame + }{ + { + "", + []DataFrame{}, + }, + { + ` + + + + +
COL.1
100
+ + `, + []DataFrame{ + LoadRecords( + [][]string{ + {"COL.1"}, + {"100"}, + }), + }, + }, + { + ` + + + + +
COL.1COL.2COL.3
100
+ + `, + []DataFrame{ + LoadRecords( + [][]string{ + {"COL.1", "COL.2", "COL.3"}, + {"COL.1", "COL.2", "100"}, + }), + }, + }, + } + + for i, tc := range table { + cs := ReadHTML(strings.NewReader(tc.htmlStr)) + if tc.htmlStr != "" && len(cs) == 0 { + t.Errorf("Test: %d, got zero dataframes: %#v", i, cs) + } + for j, c := range cs { + if len(cs) != len(tc.expDf) { + t.Errorf("Test: %d\n got len(%d), want len(%d)", i, len(cs), len(tc.expDf)) + } + if c.Err != nil { + t.Errorf("Test: %d\nError:%v", i, c.Err) + } + // Check that the types are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf[j].Types(), c.Types()) { + t.Errorf("Test: %d\nDifferent types:\nA:%v\nB:%v", i, tc.expDf[j].Types(), c.Types()) + } + // Check that the colnames are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf[j].Names(), c.Names()) { + t.Errorf("Test: %d\nDifferent colnames:\nA:%v\nB:%v", i, tc.expDf[j].Names(), c.Names()) + } + // Check that the values are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf[j].Records(), c.Records()) { + t.Errorf("Test: %d\nDifferent values:\nA:%v\nB:%v", i, tc.expDf[j].Records(), c.Records()) + } + } + } +} + func TestDataFrame_SetNames(t *testing.T) { a := New( series.New([]string{"a", "b", "c"}, series.String, "COL.1"), @@ -1641,17 +1949,17 @@ func TestDataFrame_Maps(t *testing.T) { ) m := a.Maps() expected := []map[string]interface{}{ - map[string]interface{}{ + { "COL.1": "a", "COL.2": nil, "COL.3": nil, }, - map[string]interface{}{ + { "COL.1": "b", "COL.2": 2, "COL.3": nil, }, - map[string]interface{}{ + { "COL.1": "c", "COL.2": 3, "COL.3": 3, @@ -2491,11 +2799,11 @@ func TestDescribe(t *testing.T) { { LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"b", "4", "6.0", "true"}, - []string{"c", "3", "6.0", "false"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"b", "4", "6.0", "true"}, + {"c", "3", 
"6.0", "false"}, + {"a", "2", "7.1", "false"}, }), New( @@ -2558,3 +2866,69 @@ func TestDescribe(t *testing.T) { } } } + +const MIN = 0.000001 + +func IsEqual(f1, f2 float64) bool { + if f1 > f2 { + return math.Dim(f1, f2) < MIN + } else { + return math.Dim(f2, f1) < MIN + } +} +func TestDataFrame_GroupBy(t *testing.T) { + a := New( + series.New([]string{"b", "a", "b", "a", "b"}, series.String, "key1"), + series.New([]int{1, 2, 1, 2, 2}, series.Int, "key2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "values"), + ) + groups := a.GroupBy("key1", "key2") + resultMap := make(map[string]float32, 3) + resultMap[fmt.Sprintf("%s_%d", "a", 2)] = 4 + 3.2 + resultMap[fmt.Sprintf("%s_%d", "b", 1)] = 3 + 5.3 + resultMap[fmt.Sprintf("%s_%d", "b", 2)] = 1.2 + + for k, values := range groups.groups { + curV := 0.0 + for _, vMap := range values.Maps() { + curV += vMap["values"].(float64) + } + targetV, ok := resultMap[k] + if !ok { + t.Errorf("GroupBy: %s not found", k) + return + } + if !IsEqual(float64(targetV), curV) { + t.Errorf("GroupBy: expect %f , but got %f", targetV, curV) + } + } + + b := New( + series.New([]string{"b", "a", "b", "a", "b"}, series.String, "key3"), + ) + groups = b.GroupBy("key1", "key2") + if groups.Err == nil { + t.Errorf("GroupBy: COLUMNS NOT FOUND") + } +} + +func TestDataFrame_Aggregation(t *testing.T) { + a := New( + series.New([]string{"b", "a", "b", "a", "b"}, series.String, "key1"), + series.New([]int{1, 2, 1, 2, 2}, series.Int, "key2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "values"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "values2"), + ) + groups := a.GroupBy("key1", "key2") + df := groups.Aggregation([]AggregationType{Aggregation_MAX, Aggregation_MIN, Aggregation_COUNT, Aggregation_SUM}, []string{"values", "values2", "values2", "values2"}) + resultMap := make(map[string]float32, 3) + resultMap[fmt.Sprintf("%s_%d", "a", 2)] = 4 + resultMap[fmt.Sprintf("%s_%d", "b", 1)] = 5.3 + 
resultMap[fmt.Sprintf("%s_%d", "b", 2)] = 1.2 + for _, m := range df.Maps() { + key := fmt.Sprintf("%s_%d", m["key1"], m["key2"]) + if !IsEqual(m["values_MAX"].(float64), float64(resultMap[key])) { + t.Errorf("Aggregation: expect %f , but got %f", float64(resultMap[key]), m["values_MAX"].(float64)) + } + } +} diff --git a/dataframe/examples_test.go b/dataframe/examples_test.go index 6687ea7..8cdb36c 100644 --- a/dataframe/examples_test.go +++ b/dataframe/examples_test.go @@ -24,9 +24,9 @@ func ExampleLoadStructs() { Accuracy float64 } users := []User{ - User{"Aram", 17, 0.2}, - User{"Juan", 18, 0.8}, - User{"Ana", 22, 0.5}, + {"Aram", 17, 0.2}, + {"Juan", 18, 0.8}, + {"Ana", 22, 0.5}, } df := dataframe.LoadStructs(users) fmt.Println(df) @@ -35,11 +35,11 @@ func ExampleLoadStructs() { func ExampleLoadRecords() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"k", "5", "7.0", "true"}, - []string{"k", "4", "6.0", "true"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"k", "5", "7.0", "true"}, + {"k", "4", "6.0", "true"}, + {"a", "2", "7.1", "false"}, }, ) fmt.Println(df) @@ -48,11 +48,11 @@ func ExampleLoadRecords() { func ExampleLoadRecords_options() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"k", "5", "7.0", "true"}, - []string{"k", "4", "6.0", "true"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"k", "5", "7.0", "true"}, + {"k", "4", "6.0", "true"}, + {"a", "2", "7.1", "false"}, }, dataframe.DetectTypes(false), dataframe.DefaultType(series.Float), @@ -67,13 +67,13 @@ func ExampleLoadRecords_options() { func ExampleLoadMaps() { df := dataframe.LoadMaps( []map[string]interface{}{ - map[string]interface{}{ + { "A": "a", "B": 1, "C": true, "D": 0, }, - map[string]interface{}{ + { "A": "b", "B": 2, "C": true, @@ -109,11
+109,11 @@ func ExampleReadJSON() { func ExampleDataFrame_Subset() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"k", "5", "7.0", "true"}, - []string{"k", "4", "6.0", "true"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"k", "5", "7.0", "true"}, + {"k", "4", "6.0", "true"}, + {"a", "2", "7.1", "false"}, }, ) sub := df.Subset([]int{0, 2}) @@ -123,11 +123,11 @@ func ExampleDataFrame_Subset() { func ExampleDataFrame_Select() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"k", "5", "7.0", "true"}, - []string{"k", "4", "6.0", "true"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"k", "5", "7.0", "true"}, + {"k", "4", "6.0", "true"}, + {"a", "2", "7.1", "false"}, }, ) sel1 := df.Select([]int{0, 2}) @@ -139,11 +139,11 @@ func ExampleDataFrame_Select() { func ExampleDataFrame_Filter() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"k", "5", "7.0", "true"}, - []string{"k", "4", "6.0", "true"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"k", "5", "7.0", "true"}, + {"k", "4", "6.0", "true"}, + {"a", "2", "7.1", "false"}, }, ) fil := df.Filter( @@ -172,11 +172,11 @@ func ExampleDataFrame_Filter() { func ExampleDataFrame_Mutate() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"k", "5", "7.0", "true"}, - []string{"k", "4", "6.0", "true"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"k", "5", "7.0", "true"}, + {"k", "4", "6.0", "true"}, + {"a", "2", "7.1", "false"}, }, ) // Change column C with a new one @@ -194,20 +194,20 @@ func ExampleDataFrame_Mutate() { func 
ExampleDataFrame_InnerJoin() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"k", "5", "7.0", "true"}, - []string{"k", "4", "6.0", "true"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"k", "5", "7.0", "true"}, + {"k", "4", "6.0", "true"}, + {"a", "2", "7.1", "false"}, }, ) df2 := dataframe.LoadRecords( [][]string{ - []string{"A", "F", "D"}, - []string{"1", "1", "true"}, - []string{"4", "2", "false"}, - []string{"2", "8", "false"}, - []string{"5", "9", "false"}, + {"A", "F", "D"}, + {"1", "1", "true"}, + {"4", "2", "false"}, + {"2", "8", "false"}, + {"5", "9", "false"}, }, ) join := df.InnerJoin(df2, "D") @@ -217,20 +217,20 @@ func ExampleDataFrame_InnerJoin() { func ExampleDataFrame_Set() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"k", "5", "7.0", "true"}, - []string{"k", "4", "6.0", "true"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"k", "5", "7.0", "true"}, + {"k", "4", "6.0", "true"}, + {"a", "2", "7.1", "false"}, }, ) df2 := df.Set( series.Ints([]int{0, 2}), dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"b", "4", "6.0", "true"}, - []string{"c", "3", "6.0", "false"}, + {"A", "B", "C", "D"}, + {"b", "4", "6.0", "true"}, + {"c", "3", "6.0", "false"}, }, ), ) @@ -240,11 +240,11 @@ func ExampleDataFrame_Set() { func ExampleDataFrame_Arrange() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"b", "4", "6.0", "true"}, - []string{"c", "3", "6.0", "false"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"b", "4", "6.0", "true"}, + {"c", "3", "6.0", "false"}, + {"a", "2", "7.1", "false"}, }, ) sorted := df.Arrange( @@ -257,11 +257,11 @@ func ExampleDataFrame_Arrange() 
{ func ExampleDataFrame_Describe() { df := dataframe.LoadRecords( [][]string{ - []string{"A", "B", "C", "D"}, - []string{"a", "4", "5.1", "true"}, - []string{"b", "4", "6.0", "true"}, - []string{"c", "3", "6.0", "false"}, - []string{"a", "2", "7.1", "false"}, + {"A", "B", "C", "D"}, + {"a", "4", "5.1", "true"}, + {"b", "4", "6.0", "true"}, + {"c", "3", "6.0", "false"}, + {"a", "2", "7.1", "false"}, }, ) fmt.Println(df.Describe()) diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..6af1705 --- /dev/null +++ b/go.mod @@ -0,0 +1,8 @@ +module github.com/go-gota/gota + +go 1.16 + +require ( + golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6 + gonum.org/v1/gonum v0.9.1 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..20a4dc3 --- /dev/null +++ b/go.sum @@ -0,0 +1,72 @@ +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +gioui.org v0.0.0-20210308172011-57750fc8a0a6/go.mod h1:RSH6KIUZ0p2xy5zHDxgAM4zumjgTw83q2ge/PI+yyw8= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= +github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= +github.com/fogleman/gg v1.3.0/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= +github.com/go-fonts/dejavu v0.1.0/go.mod h1:4Wt4I4OU2Nq9asgDCteaAaWZOV24E+0/Pwo0gppep4g= +github.com/go-fonts/latin-modern v0.2.0/go.mod h1:rQVLdDMK+mK1xscDwsqM5J8U2jrRa3T0ecnM9pNujks= +github.com/go-fonts/liberation v0.1.1/go.mod h1:K6qoJYypsmfVjWg8KOVDQhLc8UDgIK2HYqyqAO9z7GY= +github.com/go-fonts/stix v0.1.0/go.mod 
h1:w/c1f0ldAUlJmLBvlbkvVXLAD+tAMqobIIQpmnUIzUY= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= +github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= +github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY= +github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3 
h1:n9HxLrNxWWtEb1cA950nuEEj3QnKbtsCJ6KjcgisNUs= +golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20190910094157-69e4b8554b2a/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20200119044424-58c23975cae1/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20200430140353-33d19683fad8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20200618115811-c13761719519/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6 h1:0PC75Fz/kyMGhL0e1QnypqK2kQMqKt9csD1GnMJR+Zk= +golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys 
v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210304124612-50617c2ba197/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190927191325-030b2cf1153e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= +gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/gonum v0.9.1 h1:HCWmqqNoELL0RAQeKBXWtkp04mGk8koafcB4He6+uhc= +gonum.org/v1/gonum v0.9.1/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= 
+gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= +gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/series/rolling_window.go b/series/rolling_window.go new file mode 100644 index 0000000..e2f1740 --- /dev/null +++ b/series/rolling_window.go @@ -0,0 +1,52 @@ +package series + +// RollingWindow is used for rolling window calculations. +type RollingWindow struct { + window int + series Series +} + +// Rolling creates new RollingWindow +func (s Series) Rolling(window int) RollingWindow { + return RollingWindow{ + window: window, + series: s, + } +} + +// Mean returns the rolling mean. +func (r RollingWindow) Mean() (s Series) { + s = New([]float64{}, Float, "Mean") + for _, block := range r.getBlocks() { + s.Append(block.Mean()) + } + + return +} + +// StdDev returns the rolling standard deviation. +func (r RollingWindow) StdDev() (s Series) { + s = New([]float64{}, Float, "StdDev") + for _, block := range r.getBlocks() { + s.Append(block.StdDev()) + } + + return +} + +func (r RollingWindow) getBlocks() (blocks []Series) { + for i := 1; i <= r.series.Len(); i++ { + if i < r.window { + blocks = append(blocks, r.series.Empty()) + continue + } + + index := []int{} + for j := i - r.window; j < i; j++ { + index = append(index, j) + } + blocks = append(blocks, r.series.Subset(index)) + } + + return +} diff --git a/series/rolling_window_test.go b/series/rolling_window_test.go new file mode 100644 index 0000000..77ca331 --- /dev/null +++ b/series/rolling_window_test.go @@ -0,0 +1,85 @@ +package series + +import ( + "math" + "strings" + "testing" +) + +func TestSeries_RollingMean(t *testing.T) { + tests := []struct { + window int + series Series + expected Series + }{ + { + 3, + Ints([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), + Floats([]float64{math.NaN(), math.NaN(), 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}), + }, + { +
2, + Floats([]float64{1.0, 2.0, 3.0}), + Floats([]float64{math.NaN(), 1.5, 2.5}), + }, + { + 0, + Floats([]float64{}), + Floats([]float64{}), + }, + } + + for testnum, test := range tests { + expected := test.expected + received := test.series.Rolling(test.window).Mean() + + for i := 0; i < expected.Len(); i++ { + if strings.Compare(expected.Elem(i).String(), + received.Elem(i).String()) != 0 { + t.Errorf( + "Test:%v\nExpected:\n%v\nReceived:\n%v", + testnum, expected, received, + ) + } + } + } +} + +func TestSeries_RollingStdDev(t *testing.T) { + tests := []struct { + window int + series Series + expected Series + }{ + { + 3, + Ints([]int{5, 5, 6, 7, 5, 5, 5}), + Floats([]float64{math.NaN(), math.NaN(), 0.5773502691896257, 1.0, 1.0, 1.1547005383792515, 0.0}), + }, + { + 2, + Floats([]float64{1.0, 2.0, 3.0}), + Floats([]float64{math.NaN(), 0.7071067811865476, 0.7071067811865476}), + }, + { + 0, + Floats([]float64{}), + Floats([]float64{}), + }, + } + + for testnum, test := range tests { + expected := test.expected + received := test.series.Rolling(test.window).StdDev() + + for i := 0; i < expected.Len(); i++ { + if strings.Compare(expected.Elem(i).String(), + received.Elem(i).String()) != 0 { + t.Errorf( + "Test:%v\nExpected:\n%v\nReceived:\n%v", + testnum, expected, received, + ) + } + } + } +} diff --git a/series/series.go b/series/series.go index 79fe0a0..408773d 100644 --- a/series/series.go +++ b/series/series.go @@ -93,15 +93,19 @@ type Comparator string // Supported Comparators const ( - Eq Comparator = "==" // Equal - Neq Comparator = "!=" // Non equal - Greater Comparator = ">" // Greater than - GreaterEq Comparator = ">=" // Greater or equal than - Less Comparator = "<" // Lesser than - LessEq Comparator = "<=" // Lesser or equal than - In Comparator = "in" // Inside + Eq Comparator = "==" // Equal + Neq Comparator = "!=" // Non equal + Greater Comparator = ">" // Greater than + GreaterEq Comparator = ">=" // Greater or equal than + Less Comparator = "<" 
// Lesser than + LessEq Comparator = "<=" // Lesser or equal than + In Comparator = "in" // Inside + CompFunc Comparator = "func" // user-defined comparison function ) +// compFunc defines a user-defined comparator function. Used internally for type assertions +type compFunc = func(el Element) bool + // Type is a convenience alias that can be used for a more type safe way of // reason and use Series types. type Type string @@ -390,9 +394,25 @@ func (s Series) Compare(comparator Comparator, comparando interface{}) Series { return ret, nil } - comp := New(comparando, s.t, "") bools := make([]bool, s.Len()) - // In comparator comparation + + // CompFunc comparator comparison + if comparator == CompFunc { + f, ok := comparando.(compFunc) + if !ok { + panic("comparando is not a comparison function of type func(el Element) bool") + } + + for i := 0; i < s.Len(); i++ { + e := s.elements.Elem(i) + bools[i] = f(e) + } + + return Bools(bools) + } + + comp := New(comparando, s.t, "") + // In comparator comparison if comparator == In { for i := 0; i < s.Len(); i++ { e := s.elements.Elem(i) @@ -777,7 +797,6 @@ func (s Series) Quantile(p float64) float64 { // the function passed in via argument `f` will not expect another type, but // instead expects to handle Element(s) of type Float. 
func (s Series) Map(f MapFunction) Series { - mappedValues := make([]Element, s.Len()) for i := 0; i < s.Len(); i++ { value := f(s.elements.Elem(i)) @@ -785,3 +804,17 @@ func (s Series) Map(f MapFunction) Series { } return New(mappedValues, s.Type(), s.Name) } + +// Sum calculates the sum value of a series +func (s Series) Sum() float64 { + if s.elements.Len() == 0 || s.Type() == String || s.Type() == Bool { + return math.NaN() + } + sFloat := s.Float() + sum := sFloat[0] + for i := 1; i < len(sFloat); i++ { + elem := sFloat[i] + sum += elem + } + return sum +} diff --git a/series/series_test.go b/series/series_test.go index c7d0516..d8e24fe 100644 --- a/series/series_test.go +++ b/series/series_test.go @@ -4,8 +4,8 @@ import ( "fmt" "math" "reflect" - "testing" "strings" + "testing" ) // Check that there are no shared memory addreses between the elements of two Series @@ -421,6 +421,76 @@ func TestSeries_Compare(t *testing.T) { } } +func TestSeries_Compare_CompFunc(t *testing.T) { + table := []struct { + series Series + comparator Comparator + comparando interface{} + expected Series + panic bool + }{ + { + Strings([]string{"A", "B", "C", "B", "D", "BADA"}), + CompFunc, + func(el Element) bool { + if el.Type() == String { + if val, ok := el.Val().(string); ok { + return strings.HasPrefix(val, "B") + } + return false + } + return false + }, + Bools([]bool{false, true, false, true, false, true}), + false, + }, + { + Strings([]string{"A", "B", "C", "B", "D", "BADA"}), + CompFunc, + func(el Element) {}, + Bools([]bool{false, false, false, false, false}), + true, + }, + } + for testnum, test := range table { + func() { + defer func() { + if r := recover(); r != nil { + // recovered + if !test.panic { + t.Errorf("did not expected panic but was '%v'", r) + } + } else { + // nothing to recover from + if test.panic { + t.Errorf("exptected panic but did not panic") + } + } + }() + + a := test.series + b := a.Compare(test.comparator, test.comparando) + if err := b.Err; err 
!= nil { + t.Errorf("Test:%v\nError:%v", testnum, err) + } + expected := test.expected.Records() + received := b.Records() + if !reflect.DeepEqual(expected, received) { + t.Errorf( + "Test:%v\nExpected:\n%v\nReceived:\n%v", + testnum, expected, received, + ) + } + if err := checkTypes(b); err != nil { + t.Errorf( + "Test:%v\nError:%v", + testnum, err, + ) + } + }() + } +} + func TestSeries_Subset(t *testing.T) { table := []struct { series Series @@ -1525,9 +1595,8 @@ func TestSeries_Quantile(t *testing.T) { } } - func TestSeries_Map(t *testing.T) { - tests := []struct { + tests := []struct { series Series expected Series }{ @@ -1564,11 +1633,11 @@ func TestSeries_Map(t *testing.T) { doubleFloat64 := func(e Element) Element { var result Element result = e.Copy() - result.Set(result.Float() * 2) + result.Set(result.Float() * 2) return Element(result) } - // and two booleans + // and two booleans and := func(e Element) Element { var result Element result = e.Copy() @@ -1588,11 +1657,11 @@ func TestSeries_Map(t *testing.T) { i, err := result.Int() if err != nil { return Element(&intElement{ - e: +5, + e: +5, nan: false, }) } - result.Set(i + 5) + result.Set(i + 5) return Element(result) } @@ -1604,12 +1673,12 @@ func TestSeries_Map(t *testing.T) { return Element(result) } - for testnum, test := range tests { + for testnum, test := range tests { switch test.series.Type() { case Bool: expected := test.expected received := test.series.Map(and) - for i := 0 ; i