diff --git a/README.md b/README.md index 33e8366..54872b5 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,16 @@ - ![](media/logo_med.png) # Preludio ### A PRQL based data transformation language + Preludio is a data transformation language based on PRQL. It is a language that allows you to transform and manipulate data in a simple and intuitive way, batteries included. No libraries or external dependencies are required to run the language. ### Example + This is a simple example of what you can already do with Preludio. It reads a CSV file, derives two new columns, selects some columns and writes the result to a new CSV file. @@ -18,6 +19,7 @@ let clean = ( readCSV "test_files\\Cars.csv" delimiter: ";" header:true strReplace [MPG, Displacement, Horsepower, Acceleration] old:"," new:"." asFloat [MPG, Displacement, Horsepower, Acceleration] + orderBy [-Origin, Cylinders, -MPG] ) let europe5Cylinders = ( @@ -41,25 +43,29 @@ let europe5Cylinders = ( ![](media/repl_example.gif) ### Features -- [x] Arithmetic and logical operators -- [x] Read and write CSV files -- [x] Derive new columns -- [x] Select columns -- [x] Filter rows -- [ ] Sort rows -- [ ] Group by and aggregate -- [ ] Join tables + +- [x] Arithmetic and logical operators +- [x] Read and write CSV files +- [x] Derive new columns +- [x] Select columns +- [x] Filter rows +- [x] Sort rows +- [ ] Group by and aggregate +- [ ] Join tables ### Installation + To run it, you need to have [Go](https://golang.org/doc/install) installed. Once you have Go, you can clone this repository. To run the program, you can use the following command: + ```bash go run . ``` ### Future Features + - [x] Move to [Gandalff](https://github.com/caerbannogwhite/preludio/tree/main/core/gandalff) library - [ ] Add statistical functions - [ ] Add support for Excel files @@ -80,16 +86,23 @@ In case the language becomes quite successful, I will consider adding: - [ ] Integration with OpenAI (https://openai.com/blog/openai-api/), ie. 
image to table ### Contributing + If you want to contribute to this project, you can do so by forking the repository and submitting a pull request. ### Developers + If the grammar is changed, the parser must be regenerated. To do this, run the following command: (on Windows) + ``` make.ps1 ``` ### Log - - **2 / 08 / 2023** Preludio is now using the Gandalff library for managing data. - - **21 / 03 / 2023** First publishing of the repository. Many things are still not working. + +- **20 / 08 / 2023** After exactly one year from the first commit, Preludio is fairly stable and usable. The language is still missing a few core features (like `join` and aggregators, already supported by Gandalff), but it is already possible to perform many operations with it. +- **02 / 08 / 2023** Preludio is now using the Gandalff library for managing data. +- **21 / 03 / 2023** First publishing of the repository. Many things are still not working. +- **18 / 03 / 2023** Gandalff library: first commit. +- **20 / 08 / 2022** Preludio: first commit. 
diff --git a/core/bytefeeder/bytefeeder.go b/core/bytefeeder/bytefeeder.go index 6f81933..cf29f3f 100644 --- a/core/bytefeeder/bytefeeder.go +++ b/core/bytefeeder/bytefeeder.go @@ -379,7 +379,6 @@ func (bf *ByteFeeder) ExitExprUnary(ctx *ExprUnaryContext) { case "not": bf.AppendInstruction(typesys.OP_UNARY_NOT, 0, 0) - } } } diff --git a/core/full_language_test.go b/core/full_language_test.go index cb8fb31..86eaeaa 100644 --- a/core/full_language_test.go +++ b/core/full_language_test.go @@ -3,6 +3,7 @@ package preludiocore import ( "bytefeeder" "gandalff" + "math" "os" "testing" ) @@ -17,144 +18,205 @@ func Test_Expressions(t *testing.T) { bytecode, _, _ = bytefeeder.CompileSource(`true`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, true); err != nil { + if err = checkCurrentResult(be, true); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`false`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, false); err != nil { + if err = checkCurrentResult(be, false); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`true * false`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, int64(0)); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`true / false`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, math.Inf(1)); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`true % false`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, math.NaN()); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`true ** false`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, int64(1)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`true + false`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, int64(1)); err != nil { + if err = checkCurrentResult(be, int64(1)); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`true - 
false`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, int64(1)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`true and false`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, false); err != nil { + if err = checkCurrentResult(be, false); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`true or false`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, true); err != nil { + if err = checkCurrentResult(be, true); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`not true`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, false); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`true or (false and true)`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, true); err != nil { + if err = checkCurrentResult(be, true); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`true or not false and true or not true`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, true); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`1 * 5`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, int64(5)); err != nil { + if err = checkCurrentResult(be, int64(5)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`1 / 3`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, float64(0.3333333333333333)); err != nil { + if err = checkCurrentResult(be, float64(0.3333333333333333)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`4682 % 427`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, float64(412)); err != nil { + if err = checkCurrentResult(be, float64(412)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`3 ** 4`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, int64(81)); err != nil { + if err = checkCurrentResult(be, int64(81)); err != nil { t.Error(err) } bytecode, _, 
_ = bytefeeder.CompileSource(`2 ** (2 + 1 * 2)`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, int64(16)); err != nil { + if err = checkCurrentResult(be, int64(16)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`1 - 2`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, int64(-1)); err != nil { + if err = checkCurrentResult(be, int64(-1)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`1 + 2`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, int64(3)); err != nil { + if err = checkCurrentResult(be, int64(3)); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`+1 + 2`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, int64(3)); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`-1`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, int64(-1)); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`-1 + 2`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, int64(1)); err != nil { + t.Error(err) + } + + bytecode, _, _ = bytefeeder.CompileSource(`-1.0 - 2`) + be.RunBytecode(bytecode) + if err = checkCurrentResult(be, float64(-3)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`1.325235e-3 * 5`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, float64(0.006626175)); err != nil { + if err = checkCurrentResult(be, float64(0.006626175)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`1.325235e-3 / 3`) be.RunBytecode(bytecode) - - if be.__currentResult == nil { - t.Error("Expected result, got nil") - } else if be.__currentResult.isFloat64Scalar() == false { - t.Error("Expected float scalar, got", be.__currentResult) - } else if f, err := be.__currentResult.getFloat64Scalar(); err != nil || f != 0.00044174499999999995 { - t.Error("Expected 0.00044174499999999995, got", f, err) + if err = checkCurrentResult(be, 
float64(0.00044174499999999995)); err != nil { + t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`"hello" + "world"`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, "helloworld"); err != nil { + if err = checkCurrentResult(be, "helloworld"); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`1 + 2 * 3 - 4 + 5 * 6`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, int64(33)); err != nil { + if err = checkCurrentResult(be, int64(33)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`1 + 2 * 3 - 4 + 5 * 6 % 7 + "hello"`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, "5hello"); err != nil { + if err = checkCurrentResult(be, "5hello"); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`3.4 + 2.3 * 3.2 - 4.1 + 5.0 * 6.9`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, float64(41.16)); err != nil { + if err = checkCurrentResult(be, float64(41.16)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`(1 + 2) * (3 - 4) + 5 * 6`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, int64(27)); err != nil { + if err = checkCurrentResult(be, int64(27)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`(1 + (2 * 3)) - (4 + (5 * (6 % 7 + 8))) / ((9) + (10 * 11 - 12 % 13))`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, float64(6.308411214953271)); err != nil { + if err = checkCurrentResult(be, float64(6.308411214953271)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`(1 + (2 * (3 - (4 + (5 * (6 % (7 + (8 - (9 + (10 * (11 - (12 + (13 * (14 % (15 + (16 - (17 + (18 * (19 - (20 + (21 * (22 % (23 + (24 - (25 + (26 * (27 - (28 + (29 * (30 - (31 + (32 * (33 - (34 + (35 * (36 % (37 + (38 - (39 + (40 * (41 - (42 + (43 * (44 % (45 + (46 - (47 + (48 * (49 - (50 + (51 * (52 % (53 + (54 - (55 + (56 * (57 - (58 + (59 * (60 % (61 + (62 - (63 + (64 * (65 - (66 + (67 * (68 % (69 + (70 
- (71 + (72 * (73 - (74 + (75 * (76 % (77 + (78 - (79 + (80 * (81 - (82 + (83 * (84 % (85 + (86 - (87 + (88 * (89 - (90 + (91 * (92 % (93 + (94 - (95 + (96 * (97 - (98 + (99 * (100))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, float64(-61.0)); err != nil { + if err = checkCurrentResult(be, float64(-61.0)); err != nil { t.Error(err) } bytecode, _, _ = bytefeeder.CompileSource(`1e30 / 1.000001 / 1.000002 / 1.000003 / 1.000004 / 1.000005 / 1.000006 / 1.000007 / 1.000008 / 1.000009 / 1.000010 / 1.000011 / 1.000012 / 1.000013 / 1.000014 / 1.000015 / 1.000016 / 1.000017 / 1.000018 / 1.000019 / 1.000020 / 1.000021 / 1.000022 / 1.000023 / 1.000024 / 1.000025 / 1.000026 / 1.000027 / 1.000028 / 1.000029 / 1.000030 / 1.000031 / 1.000032 / 1.000033 / 1.000034 / 1.000035 / 1.000036 / 1.000037 / 1.000038 / 1.000039 / 1.000040 / 1.000041 / 1.000042 / 1.000043 / 1.000044 / 1.000045 / 1.000046 / 1.000047 / 1.000048 / 1.000049 / 1.000050 / 1.000051 / 1.000052 / 1.000053 / 1.000054 / 1.000055 / 1.000056 / 1.000057 / 1.000058 / 1.000059 / 1.000060 / 1.000061 / 1.000062 / 1.000063 / 1.000064 / 1.000065 / 1.000066 / 1.000067 / 1.000068 / 1.000069 / 1.000070 / 1.000071 / 1.000072 / 1.000073 / 1.000074 / 1.000075 / 1.000076 / 1.000077 / 1.000078 / 1.000079 / 1.000080 / 1.000081 / 1.000082 / 1.000083 / 1.000084 / 1.000085 / 1.000086 / 1.000087 / 1.000088 / 1.000089 / 1.000090 / 1.000091 / 1.000092 / 1.000093 / 1.000094 / 1.000095 / 1.000096 / 1.000097 / 1.000098 / 1.000099 / 1.000100`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, float64(9.949628981268441e+29)); err != nil { + if err = checkCurrentResult(be, float64(9.949628981268441e+29)); err != nil { t.Error(err) } } @@ -172,7 +234,7 @@ func Test_Assignements(t *testing.T) { bytecode, _, _ = bytefeeder.CompileSource(source) be.RunBytecode(bytecode) - if err = currentResultChecker(be, int64(33)); err != nil 
{ + if err = checkCurrentResult(be, int64(33)); err != nil { t.Error(err) } @@ -185,7 +247,7 @@ func Test_Assignements(t *testing.T) { bytecode, _, _ = bytefeeder.CompileSource(`d * (e * (f * (g * (d * (e * (f * (g)))))))`) be.RunBytecode(bytecode) - if err = currentResultChecker(be, 1.000000040200004); err != nil { + if err = checkCurrentResult(be, 1.000000040200004); err != nil { t.Error(err) } diff --git a/core/gandalff/README.md b/core/gandalff/README.md index 843afa2..dd87fe8 100644 --- a/core/gandalff/README.md +++ b/core/gandalff/README.md @@ -78,22 +78,18 @@ The data types not checked are not yet supported, but might be in the future. - [x] FilterByMask - [x] FilterByIndex -- [ ] Group +- [x] Group - - [x] Group - - [x] SubGroup - - [ ] Group with nulls - - [ ] SubGroup with nulls + - [x] Group (with nulls) + - [x] SubGroup (with nulls) - [x] Map -- [ ] Sort +- [x] Sort - - [x] Sort - - [ ] SortRev - - [ ] Sort with nulls - - [ ] SortRev with nulls + - [x] Sort (with nulls) + - [x] SortRev (with nulls) -- [ ] Take +- [x] Take ### Supported operations for DataFrame @@ -112,9 +108,9 @@ The data types not checked are not yet supported, but might be in the future. - [ ] Outer with nulls - [ ] Map -- [ ] OrderBy +- [x] OrderBy - [x] Select -- [ ] Take +- [x] Take ### Supported stats functions @@ -142,8 +138,13 @@ type Series interface { Len() int // Returns the name of the series. Name() string + // Sets the name of the series. + SetName(name string) Series + // Returns the type of the series. Type() typesys.BaseType + // Returns the type and cardinality of the series. + TypeCard() typesys.BaseTypeCard // Returns if the series is grouped. IsGrouped() bool @@ -182,11 +183,7 @@ type Series interface { // Set the element at index i. Set(i int, v any) Series // Take the elements according to the given interval. - Take(start, end, step int) Series - - // Sort Interface. 
- Less(i, j int) bool - Swap(i, j int) + Take(params ...int) Series // Append elements to the series. // Value can be a single value, slice of values, @@ -212,17 +209,24 @@ type Series interface { // Filters out the elements by the given mask. // Mask can be a bool series, a slice of bools or a slice of ints. Filter(mask any) Series + filterIntSlice(mask []int) Series // Maps the elements of the series. Map(f GDLMapFunc, stringPool *StringPool) Series // Group the elements in the series. - Group() Series - SubGroup(gp SeriesPartition) Series + group() Series + GroupBy(gp SeriesPartition) Series + UnGroup() Series // Get the partition of the series. GetPartition() SeriesPartition + // Sort Interface. + Less(i, j int) bool + equal(i, j int) bool + Swap(i, j int) + // Sorts the elements of the series. Sort() Series SortRev() Series @@ -249,9 +253,6 @@ type Series interface { - [ ] Implement memory optimized Bool series with uint64. - [ ] Using uint64 for null mask. -- [ ] Implement and test grouped sorting for all types. -- [ ] Implement and test grouped with nulls. - [ ] Implement chunked series. -- [ ] Implement CSV writer. - [ ] Implement Excel reader and writer (https://github.com/tealeg/xlsx). - [ ] Implement JSON reader and writer. 
diff --git a/core/gandalff/benchmarking/data/gandalff_1.tsv b/core/gandalff/benchmarking/data/gandalff_1_0.tsv similarity index 100% rename from core/gandalff/benchmarking/data/gandalff_1.tsv rename to core/gandalff/benchmarking/data/gandalff_1_0.tsv diff --git a/core/gandalff/benchmarking/data/gandalff_2.tsv b/core/gandalff/benchmarking/data/gandalff_2_0.tsv similarity index 100% rename from core/gandalff/benchmarking/data/gandalff_2.tsv rename to core/gandalff/benchmarking/data/gandalff_2_0.tsv diff --git a/core/gandalff/benchmarking/data/gandalff_3_0.tsv b/core/gandalff/benchmarking/data/gandalff_3_0.tsv new file mode 100644 index 0000000..04f16d1 --- /dev/null +++ b/core/gandalff/benchmarking/data/gandalff_3_0.tsv @@ -0,0 +1,21 @@ +solution question in_rows time_ns +gandalff_3_0 Q1 1e4 316380 +gandalff_3_0 Q1 1e5 1482389 +gandalff_3_0 Q1 1e6 10871940 +gandalff_3_0 Q1 1e7 113146250 +gandalff_3_0 Q2 1e4 1324013 +gandalff_3_0 Q2 1e5 6776976 +gandalff_3_0 Q2 1e6 61533190 +gandalff_3_0 Q2 1e7 888291650 +gandalff_3_0 Q3 1e4 318091 +gandalff_3_0 Q3 1e5 2491238 +gandalff_3_0 Q3 1e6 30301374 +gandalff_3_0 Q3 1e7 529594500 +gandalff_3_0 Q4 1e4 72138 +gandalff_3_0 Q4 1e5 913448 +gandalff_3_0 Q4 1e6 10418554 +gandalff_3_0 Q4 1e7 122941510 +gandalff_3_0 Q5 1e4 73908 +gandalff_3_0 Q5 1e5 881757 +gandalff_3_0 Q5 1e6 32939966 +gandalff_3_0 Q5 1e7 522937250 \ No newline at end of file diff --git a/core/gandalff/benchmarking/gen.ps1 b/core/gandalff/benchmarking/gen.ps1 index 0681fc4..94899b1 100644 --- a/core/gandalff/benchmarking/gen.ps1 +++ b/core/gandalff/benchmarking/gen.ps1 @@ -1,6 +1,13 @@ +# With 0% NAs Rscript.exe groupby-datagen.R 1e4 1e2 0 0 Rscript.exe groupby-datagen.R 1e5 1e2 0 0 Rscript.exe groupby-datagen.R 1e6 1e2 0 0 Rscript.exe groupby-datagen.R 1e7 1e2 0 0 +# With 10% NAs +Rscript.exe groupby-datagen.R 1e4 1e2 10 0 +Rscript.exe groupby-datagen.R 1e5 1e2 10 0 +Rscript.exe groupby-datagen.R 1e6 1e2 10 0 +Rscript.exe groupby-datagen.R 1e7 1e2 10 0 + Move-Item 
-Force G1_* ..\testdata\ \ No newline at end of file diff --git a/core/gandalff/benchmarking/pandas_groupby.py b/core/gandalff/benchmarking/pandas_groupby.py index d146341..3fb28d1 100644 --- a/core/gandalff/benchmarking/pandas_groupby.py +++ b/core/gandalff/benchmarking/pandas_groupby.py @@ -18,7 +18,10 @@ fun = ".groupby" cache = "TRUE" on_disk = "FALSE" -data_names = ["G1_1e4_1e2_0_0", "G1_1e5_1e2_0_0", "G1_1e6_1e2_0_0", "G1_1e7_1e2_0_0"] +data_names = [ + "G1_1e4_1e2_0_0", "G1_1e5_1e2_0_0", "G1_1e6_1e2_0_0", "G1_1e7_1e2_0_0", + "G1_1e4_1e2_10_0", "G1_1e5_1e2_10_0", "G1_1e6_1e2_10_0", "G1_1e7_1e2_10_0" +] for data_name in data_names: filepath = os.path.join("..", "testdata", data_name+".csv") diff --git a/core/gandalff/benchmarking/plots.ipynb b/core/gandalff/benchmarking/plots.ipynb index 1b08a1e..4c58959 100644 --- a/core/gandalff/benchmarking/plots.ipynb +++ b/core/gandalff/benchmarking/plots.ipynb @@ -138,8 +138,8 @@ " print(\"New Gandalff VS Polars\")\n", " print(t.to_markdown(index=False))\n", "\n", - "calculate_speedup(\"gandalff_1\", \"data\\\\gandalff_1.tsv\")\n", - "calculate_speedup(\"gandalff_2_5\", \"data\\\\gandalff_2_5.tsv\")\n", + "calculate_speedup(\"gandalff_1_0\", \"data\\\\gandalff_1_0.tsv\")\n", + "calculate_speedup(\"gandalff_3_0\", \"data\\\\gandalff_3_0.tsv\")\n", "\n" ] } diff --git a/core/gandalff/benchmarking/polars_groupby.py b/core/gandalff/benchmarking/polars_groupby.py index 35db866..2dbd891 100644 --- a/core/gandalff/benchmarking/polars_groupby.py +++ b/core/gandalff/benchmarking/polars_groupby.py @@ -18,7 +18,10 @@ fun = ".groupby" cache = "TRUE" on_disk = "FALSE" -data_names = ["G1_1e4_1e2_0_0", "G1_1e5_1e2_0_0", "G1_1e6_1e2_0_0", "G1_1e7_1e2_0_0"] +data_names = [ + "G1_1e4_1e2_0_0", "G1_1e5_1e2_0_0", "G1_1e6_1e2_0_0", "G1_1e7_1e2_0_0", + "G1_1e4_1e2_10_0", "G1_1e5_1e2_10_0", "G1_1e6_1e2_10_0", "G1_1e7_1e2_10_0" +] for data_name in data_names: filepath = os.path.join("..", "testdata", data_name+".csv") diff --git 
a/core/gandalff/gdl_consts.go b/core/gandalff/gdl_consts.go index b617774..def9ed1 100644 --- a/core/gandalff/gdl_consts.go +++ b/core/gandalff/gdl_consts.go @@ -19,8 +19,9 @@ const ( MINIMUM_PARALLEL_SIZE_1 = 16_384 MINIMUM_PARALLEL_SIZE_2 = 131_072 - HASH_MAGIC_NUMBER = int64(0xa8f4979b77e3f93) - HASH_NULL_KEY = int64(0x7ff8000000000001) + HASH_MAGIC_NUMBER = int64(0xa8f4979b77e3f93) + HASH_MAGIC_NUMBER_NULL = int64(0x7fff4979b77e3f93) + HASH_NULL_KEY = int64(0x7ff8000000000001) ) //////////////////////////////// ENUMS diff --git a/core/gandalff/gdl_dataframe.go b/core/gandalff/gdl_dataframe.go index 519d9b5..8ec0cc7 100644 --- a/core/gandalff/gdl_dataframe.go +++ b/core/gandalff/gdl_dataframe.go @@ -38,7 +38,7 @@ type DataFrame interface { // Add new series to the dataframe. // AddSeries adds a generic series to the dataframe. - AddSeries(series Series) DataFrame + AddSeries(series ...Series) DataFrame // AddSeriesFromBool adds a series of bools to the dataframe. AddSeriesFromBool(name string, isNullable, makeCopy bool, data []bool) DataFrame // AddSeriesFromInt32 adds a series of ints to the dataframe. @@ -77,11 +77,17 @@ type DataFrame interface { Agg(...aggregator) DataFrame + // Sort the dataframe. 
+ Len() int + Less(i, j int) bool + Swap(i, j int) + OrderBy(params ...SortParam) DataFrame + // IO Describe() string Records(header bool) [][]string - PrettyPrint(nrows int) DataFrame + PrettyPrint(nrows ...int) DataFrame FromCSV() *CsvReader ToCSV() *CsvWriter @@ -227,3 +233,19 @@ func (agg stdAggregator) getAggregateType() AggregateType { func Std(name string) aggregator { return stdAggregator{name, AGGREGATE_STD} } + +//////////////////////// SORT + +type SortParam struct { + asc bool + name string + series Series +} + +func Asc(name string) SortParam { + return SortParam{asc: true, name: name} +} + +func Desc(name string) SortParam { + return SortParam{asc: false, name: name} +} diff --git a/core/gandalff/gdl_dataframe_base.go b/core/gandalff/gdl_dataframe_base.go index d7dcfb5..42412f0 100644 --- a/core/gandalff/gdl_dataframe_base.go +++ b/core/gandalff/gdl_dataframe_base.go @@ -20,6 +20,7 @@ type BaseDataFrame struct { series []Series pool *StringPool partitions []BaseDataFramePartitionEntry + sortParams []SortParam } func NewBaseDataFrame() DataFrame { @@ -98,7 +99,7 @@ func (df BaseDataFrame) GetSeriesIndex(name string) int { return -1 } -func (df BaseDataFrame) AddSeries(series Series) DataFrame { +func (df BaseDataFrame) AddSeries(series ...Series) DataFrame { if df.err != nil { return df } @@ -108,12 +109,14 @@ func (df BaseDataFrame) AddSeries(series Series) DataFrame { return df } - if df.NCols() > 0 && series.Len() != df.NRows() { - df.err = fmt.Errorf("BaseDataFrame.AddSeries: series length (%d) does not match dataframe length (%d)", series.Len(), df.NRows()) - return df - } + for _, series_ := range series { + if df.NCols() > 0 && series_.Len() != df.NRows() { + df.err = fmt.Errorf("BaseDataFrame.AddSeries: series length (%d) does not match dataframe length (%d)", series_.Len(), df.NRows()) + return df + } - df.series = append(df.series, series) + df.series = append(df.series, series_) + } return df } @@ -261,15 +264,6 @@ func (df BaseDataFrame) 
SeriesAt(index int) Series { return df.series[index] } -// Returns the series at the given index. -// For internal use only: returns nil if the series is not found. -func (df BaseDataFrame) __seriesAt(index int) Series { - if index < 0 || index >= len(df.series) { - return nil - } - return df.series[index] -} - func (df BaseDataFrame) Select(names ...string) DataFrame { if df.err != nil { return df @@ -300,13 +294,10 @@ func (df BaseDataFrame) SelectAt(indices ...int) DataFrame { selected := NewBaseDataFrame() for _, index := range indices { - series := df.__seriesAt(index) - if series != nil { - selected.AddSeries(series) + if index >= 0 && index < len(df.series) { // valid index: select the series; the else branch reports out-of-range indices + selected.AddSeries(df.series[index]) } else { - return BaseDataFrame{ - err: fmt.Errorf("BaseDataFrame.SelectAt: series at index %d not found", index), - } + return BaseDataFrame{err: fmt.Errorf("BaseDataFrame.SelectAt: index %d out of bounds", index)} } } @@ -372,7 +363,7 @@ func (df BaseDataFrame) GroupBy(by ...string) DataFrame { df.partitions[partitionsIndex] = BaseDataFramePartitionEntry{ index: i, name: name, - partition: series.Group().GetPartition(), + partition: series.group().GetPartition(), } } else @@ -381,7 +372,7 @@ func (df BaseDataFrame) GroupBy(by ...string) DataFrame { df.partitions[partitionsIndex] = BaseDataFramePartitionEntry{ index: i, name: name, - partition: series.SubGroup(df.partitions[partitionsIndex-1].partition).GetPartition(), + partition: series.GroupBy(df.partitions[partitionsIndex-1].partition).GetPartition(), } } } @@ -428,15 +419,15 @@ func (df BaseDataFrame) groupHelper() (DataFrame, *[][]int, *[]int) { // The last partition tells us how many groups there are // and how many rows are in each group - indeces := make([][]int, 0, df.partitions[len(df.partitions)-1].partition.GetSize()) - for _, group := range df.partitions[len(df.partitions)-1].partition.GetMap() { + indeces := make([][]int, 0, df.partitions[len(df.partitions)-1].partition.getSize()) + for _, group := 
range df.partitions[len(df.partitions)-1].partition.getMap() { indeces = append(indeces, group) } // Keep only the grouped series for _, partition := range df.partitions { seriesIndices[partition.index] = false - old := df.__seriesAt(partition.index) + old := df.series[partition.index] // TODO: null masks, null values are all mapped to the same group @@ -650,8 +641,8 @@ func (df BaseDataFrame) Join(how DataFrameJoinType, other DataFrame, on ...strin pB := otherGrouped.getPartitions() // Get the maps, keys and sort them - mapA := pA[len(pA)-1].GetMap() - mapB := pB[len(pB)-1].GetMap() + mapA := pA[len(pA)-1].getMap() + mapB := pB[len(pB)-1].getMap() keysA := make([]int64, 0, len(mapA)) keysB := make([]int64, 0, len(mapB)) @@ -976,6 +967,64 @@ func (df BaseDataFrame) Take(params ...int) DataFrame { return taken } +func (df BaseDataFrame) Len() int { + if df.err != nil || len(df.series) < 1 { + return 0 + } + + return df.series[0].Len() +} + +func (df BaseDataFrame) Less(i, j int) bool { + for _, param := range df.sortParams { + if !param.series.equal(i, j) { + return (param.asc && param.series.Less(i, j)) || (!param.asc && param.series.Less(j, i)) + } + } + + return false +} + +func (df BaseDataFrame) Swap(i, j int) { + for _, series := range df.series { + series.Swap(i, j) + } +} + +func (df BaseDataFrame) OrderBy(params ...SortParam) DataFrame { + if df.err != nil { + return df + } + + if df.isGrouped { + df.err = fmt.Errorf("BaseDataFrame.OrderBy: cannot order grouped DataFrame") + return df + } + + // CHECK: params must have unique names and names must be valid + paramNames := make(map[string]bool) + for i, param := range params { + if paramNames[param.name] { + df.err = fmt.Errorf("BaseDataFrame.OrderBy: series names must be unique") + return df + } + paramNames[param.name] = true + + if series := df.__series(param.name); series != nil { + params[i].series = series + } else { + df.err = fmt.Errorf("BaseDataFrame.OrderBy: series \"%s\" not found", param.name) + 
return df + } + } + + df.sortParams = params + sort.Sort(df) + df.sortParams = nil + + return df +} + //////////////////////// SUMMARY func (df BaseDataFrame) Agg(aggregators ...aggregator) DataFrame { @@ -1129,7 +1178,7 @@ func (df BaseDataFrame) Records(header bool) [][]string { if header { out[0] = make([]string, df.NCols()) for j := 0; j < df.NCols(); j++ { - out[0][j] = df.__seriesAt(j).Name() + out[0][j] = df.series[j].Name() } h = 1 @@ -1138,14 +1187,14 @@ func (df BaseDataFrame) Records(header bool) [][]string { for i := 0 + h; i < df.NRows()+h; i++ { out[i] = make([]string, df.NCols()) for j := 0; j < df.NCols(); j++ { - out[i][j] = df.__seriesAt(j).GetString(i - h) + out[i][j] = df.series[j].GetString(i - h) } } return out } -func (df BaseDataFrame) PrettyPrint(nrows int) DataFrame { +func (df BaseDataFrame) PrettyPrint(nrowsParam ...int) DataFrame { if df.err != nil { fmt.Println(df.err) return df @@ -1226,6 +1275,17 @@ func (df BaseDataFrame) PrettyPrint(nrows int) DataFrame { } fmt.Println("+") + var nrows int + if len(nrowsParam) == 0 { + if df.NRows() < 20 { + nrows = df.NRows() + } else { + nrows = 10 + } + } else { + nrows = nrowsParam[0] + } + // data if nrows >= 0 { nrows = int(math.Min(float64(nrows), float64(df.NRows()))) diff --git a/core/gandalff/gdl_dataframe_base_bench_test.go b/core/gandalff/gdl_dataframe_base_bench_test.go index cacb430..1246cc6 100644 --- a/core/gandalff/gdl_dataframe_base_bench_test.go +++ b/core/gandalff/gdl_dataframe_base_bench_test.go @@ -11,6 +11,10 @@ var G1_1e4_1e2_0_0_df *DataFrame var G1_1e5_1e2_0_0_df *DataFrame var G1_1e6_1e2_0_0_df *DataFrame var G1_1e7_1e2_0_0_df *DataFrame +var G1_1e4_1e2_10_0_df *DataFrame +var G1_1e5_1e2_10_0_df *DataFrame +var G1_1e6_1e2_10_0_df *DataFrame +var G1_1e7_1e2_10_0_df *DataFrame func read_G1_1e4_1e2_0_0() { f, err := os.OpenFile(filepath.Join("testdata", "G1_1e4_1e2_0_0.csv"), os.O_RDONLY, 0666) @@ -80,11 +84,83 @@ func read_G1_1e7_1e2_0_0() { } } +func read_G1_1e4_1e2_10_0() 
{ + f, err := os.OpenFile(filepath.Join("testdata", "G1_1e4_1e2_10_0.csv"), os.O_RDONLY, 0666) + if err == nil { + df := NewBaseDataFrame(). + FromCSV(). + SetDelimiter(','). + SetReader(f). + Read() + + f.Close() + + G1_1e4_1e2_10_0_df = &df + } else { + G1_1e4_1e2_10_0_df = nil + } +} + +func read_G1_1e5_1e2_10_0() { + f, err := os.OpenFile(filepath.Join("testdata", "G1_1e5_1e2_10_0.csv"), os.O_RDONLY, 0666) + if err == nil { + df := NewBaseDataFrame(). + FromCSV(). + SetDelimiter(','). + SetReader(f). + Read() + + f.Close() + + G1_1e5_1e2_10_0_df = &df + } else { + G1_1e5_1e2_10_0_df = nil + } +} + +func read_G1_1e6_1e2_10_0() { + f, err := os.OpenFile(filepath.Join("testdata", "G1_1e6_1e2_10_0.csv"), os.O_RDONLY, 0666) + if err == nil { + df := NewBaseDataFrame(). + FromCSV(). + SetDelimiter(','). + SetReader(f). + Read() + + f.Close() + + G1_1e6_1e2_10_0_df = &df + } else { + G1_1e6_1e2_10_0_df = nil + } +} + +func read_G1_1e7_1e2_10_0() { + f, err := os.OpenFile(filepath.Join("testdata", "G1_1e7_1e2_10_0.csv"), os.O_RDONLY, 0666) + if err == nil { + df := NewBaseDataFrame(). + FromCSV(). + SetDelimiter(','). + SetReader(f). + Read() + + f.Close() + + G1_1e7_1e2_10_0_df = &df + } else { + G1_1e7_1e2_10_0_df = nil + } +} + func init() { read_G1_1e4_1e2_0_0() read_G1_1e5_1e2_0_0() read_G1_1e6_1e2_0_0() read_G1_1e7_1e2_0_0() + read_G1_1e4_1e2_10_0() + read_G1_1e5_1e2_10_0() + read_G1_1e6_1e2_10_0() + read_G1_1e7_1e2_10_0() } func Benchmark_Filter_Q1_1e5(b *testing.B) { @@ -363,6 +439,98 @@ func Test_GroupBy_Q1_1e7(t *testing.T) { } } +func Test_GroupBy_Q1_1e4_10PercNAs(t *testing.T) { + if G1_1e4_1e2_10_0_df == nil { + t.Skip("G1_1e4_1e2_10_0 dataframe not loaded") + } + + df := (*G1_1e4_1e2_10_0_df). + GroupBy("id1"). 
+ Agg(Sum("v1")) + + if df.NRows() != 91 { + t.Errorf("Expected 91 rows, got %d", df.NRows()) + } + + if df.NCols() != 2 { + t.Errorf("Expected 2 columns, got %d", df.NCols()) + } + + check := df.Agg(Sum("v1")).Series("v1").Get(0).(float64) + if check != 27044 { + t.Errorf("Expected 27044, got %f", check) + } +} + +func Test_GroupBy_Q1_1e5_10PercNAs(t *testing.T) { + if G1_1e5_1e2_10_0_df == nil { + t.Skip("G1_1e5_1e2_10_0 dataframe not loaded") + } + + df := (*G1_1e5_1e2_10_0_df). + GroupBy("id1"). + Agg(Sum("v1")) + + if df.NRows() != 91 { + t.Errorf("Expected 91 rows, got %d", df.NRows()) + } + + if df.NCols() != 2 { + t.Errorf("Expected 2 columns, got %d", df.NCols()) + } + + check := df.Agg(Sum("v1")).Series("v1").Get(0).(float64) + if check != 270421 { + t.Errorf("Expected 270421, got %f", check) + } +} + +func Test_GroupBy_Q1_1e6_10PercNAs(t *testing.T) { + if G1_1e6_1e2_10_0_df == nil { + t.Skip("G1_1e6_1e2_10_0 dataframe not loaded") + } + + df := (*G1_1e6_1e2_10_0_df). + GroupBy("id1"). + Agg(Sum("v1")) + + if df.NRows() != 91 { + t.Errorf("Expected 91 rows, got %d", df.NRows()) + } + + if df.NCols() != 2 { + t.Errorf("Expected 2 columns, got %d", df.NCols()) + } + + check := df.Agg(Sum("v1")).Series("v1").Get(0).(float64) + if check != 2700684 { + t.Errorf("Expected 2700684, got %f", check) + } +} + +func Test_GroupBy_Q1_1e7_10PercNAs(t *testing.T) { + if G1_1e7_1e2_10_0_df == nil { + t.Skip("G1_1e7_1e2_10_0 dataframe not loaded") + } + + df := (*G1_1e7_1e2_10_0_df). + GroupBy("id1"). 
+ Agg(Sum("v1")) + + if df.NRows() != 91 { + t.Errorf("Expected 91 rows, got %d", df.NRows()) + } + + if df.NCols() != 2 { + t.Errorf("Expected 2 columns, got %d", df.NCols()) + } + + check := df.Agg(Sum("v1")).Series("v1").Get(0).(float64) + if check != 26998588 { + t.Errorf("Expected 26998588, got %f", check) + } +} + func Test_GroupBy_Q2_1e4(t *testing.T) { if G1_1e4_1e2_0_0_df == nil { t.Skip("G1_1e4_1e2_0_0 dataframe not loaded") diff --git a/core/gandalff/gdl_dataframe_base_test.go b/core/gandalff/gdl_dataframe_base_test.go index 644674b..f60fbbf 100644 --- a/core/gandalff/gdl_dataframe_base_test.go +++ b/core/gandalff/gdl_dataframe_base_test.go @@ -731,3 +731,243 @@ func Test_BaseDataFrame_Join(t *testing.T) { checkEqSliceString(resBexp, res.SeriesAt(1).Data().([]string), t, "Full Join") checkEqSliceString(resCexp, res.SeriesAt(2).Data().([]string), t, "Full Join") } + +func Test_BaseDataFrame_Sort(t *testing.T) { + var res DataFrame + + df := NewBaseDataFrame(). + AddSeriesFromInt64("A", false, false, []int64{1, 5, 2, 1, 4, 1, 5, 1, 2, 1}). + AddSeriesFromString("B", false, []string{"a", "b", "c", "d", "e", "f", "g", "a", "b", "c"}). + AddSeriesFromFloat64("C", false, false, []float64{1.2, 2.3, 3.4, 4.5, 5.6, 7.8, 8.9, 1.2, 2.3, 3.4}). 
+ AddSeriesFromBool("D", false, false, []bool{true, false, true, true, false, true, true, false, true, false}) + + res = df.OrderBy(Asc("A")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 1, 1, 2, 2, 4, 5, 5}, nil, "") { + t.Error("BaseDataFrame Sort, column A asc failed") + } + + res = df.OrderBy(Desc("A")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{5, 5, 4, 2, 2, 1, 1, 1, 1, 1}, nil, "") { + t.Error("BaseDataFrame Sort, column A desc failed") + } + + res = df.OrderBy(Asc("B")) + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"a", "a", "b", "b", "c", "c", "d", "e", "f", "g"}, nil, "") { + t.Error("BaseDataFrame Sort, column B asc failed") + } + + res = df.OrderBy(Desc("B")) + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"g", "f", "e", "d", "c", "c", "b", "b", "a", "a"}, nil, "") { + t.Error("BaseDataFrame Sort, column B desc failed") + } + + res = df.OrderBy(Asc("C")) + if !checkEqSliceFloat64(res.Series("C").(SeriesFloat64).Float64s(), []float64{1.2, 1.2, 2.3, 2.3, 3.4, 3.4, 4.5, 5.6, 7.8, 8.9}, nil, "") { + t.Error("BaseDataFrame Sort, column C asc failed") + } + + res = df.OrderBy(Desc("C")) + if !checkEqSliceFloat64(res.Series("C").(SeriesFloat64).Float64s(), []float64{8.9, 7.8, 5.6, 4.5, 3.4, 3.4, 2.3, 2.3, 1.2, 1.2}, nil, "") { + t.Error("BaseDataFrame Sort, column C desc failed") + } + + res = df.OrderBy(Asc("D")) + if !checkEqSliceBool(res.Series("D").(SeriesBool).Bools(), []bool{false, false, false, false, true, true, true, true, true, true}, nil, "") { + t.Error("BaseDataFrame Sort, column D asc failed") + } + + res = df.OrderBy(Desc("D")) + if !checkEqSliceBool(res.Series("D").(SeriesBool).Bools(), []bool{true, true, true, true, true, true, false, false, false, false}, nil, "") { + t.Error("BaseDataFrame Sort, column D desc failed") + } + + //////////////////////// .Sort() with 2 columns + + res = df.OrderBy(Asc("A"), Asc("B")) + if 
!checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 1, 1, 2, 2, 4, 5, 5}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc: A failed") + } + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"a", "a", "c", "d", "f", "b", "c", "e", "b", "g"}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc: B failed") + } + + res = df.OrderBy(Asc("A"), Desc("B")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 1, 1, 2, 2, 4, 5, 5}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B desc: A failed") + } + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"f", "d", "c", "a", "a", "c", "b", "e", "g", "b"}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B desc: B failed") + } + + res = df.OrderBy(Desc("A"), Asc("B")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{5, 5, 4, 2, 2, 1, 1, 1, 1, 1}, nil, "") { + t.Error("BaseDataFrame Sort A desc, B asc: A failed") + } + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"b", "g", "e", "b", "c", "a", "a", "c", "d", "f"}, nil, "") { + t.Error("BaseDataFrame Sort A desc, B asc: B failed") + } + + res = df.OrderBy(Desc("A"), Desc("B")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{5, 5, 4, 2, 2, 1, 1, 1, 1, 1}, nil, "") { + t.Error("BaseDataFrame Sort A desc, B desc: A failed") + } + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"g", "b", "e", "c", "b", "f", "d", "c", "a", "a"}, nil, "") { + t.Error("BaseDataFrame Sort A desc, B desc: B failed") + } + + res = df.OrderBy(Asc("A"), Asc("C")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 1, 1, 2, 2, 4, 5, 5}, nil, "") { + t.Error("BaseDataFrame Sort A asc, C asc: A failed") + } + if !checkEqSliceFloat64(res.Series("C").(SeriesFloat64).Float64s(), []float64{1.2, 1.2, 3.4, 4.5, 7.8, 2.3, 3.4, 5.6, 2.3, 8.9}, nil, "") { + t.Error("BaseDataFrame Sort A 
asc, C asc: C failed") + } + + res = df.OrderBy(Asc("A"), Desc("C")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 1, 1, 2, 2, 4, 5, 5}, nil, "") { + t.Error("BaseDataFrame Sort A asc, C desc: A failed") + } + if !checkEqSliceFloat64(res.Series("C").(SeriesFloat64).Float64s(), []float64{7.8, 4.5, 3.4, 1.2, 1.2, 3.4, 2.3, 5.6, 8.9, 2.3}, nil, "") { + t.Error("BaseDataFrame Sort A asc, C desc: C failed") + } + + res = df.OrderBy(Desc("A"), Asc("C")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{5, 5, 4, 2, 2, 1, 1, 1, 1, 1}, nil, "") { + t.Error("BaseDataFrame Sort A desc, C asc: A failed") + } + if !checkEqSliceFloat64(res.Series("C").(SeriesFloat64).Float64s(), []float64{2.3, 8.9, 5.6, 2.3, 3.4, 1.2, 1.2, 3.4, 4.5, 7.8}, nil, "") { + t.Error("BaseDataFrame Sort A desc, C asc: C failed") + } + + res = df.OrderBy(Desc("A"), Desc("C")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{5, 5, 4, 2, 2, 1, 1, 1, 1, 1}, nil, "") { + t.Error("BaseDataFrame Sort A desc, C desc: A failed") + } + if !checkEqSliceFloat64(res.Series("C").(SeriesFloat64).Float64s(), []float64{8.9, 2.3, 5.6, 3.4, 2.3, 7.8, 4.5, 3.4, 1.2, 1.2}, nil, "") { + t.Error("BaseDataFrame Sort A desc, C desc: C failed") + } + + //////////////////////// .Sort() with 3 columns + + res = df.OrderBy(Asc("A"), Asc("B"), Asc("D")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 1, 1, 2, 2, 4, 5, 5}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc, D asc: A failed") + } + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"a", "a", "c", "d", "f", "b", "c", "e", "b", "g"}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc, D asc: B failed") + } + if !checkEqSliceBool(res.Series("D").(SeriesBool).Bools(), []bool{false, true, false, true, true, true, true, false, false, true}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc, D asc: D failed") + } + + res = 
df.OrderBy(Asc("A"), Asc("B"), Desc("D")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 1, 1, 2, 2, 4, 5, 5}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc, D desc: A failed") + } + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"a", "a", "c", "d", "f", "b", "c", "e", "b", "g"}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc, D desc: B failed") + } + if !checkEqSliceBool(res.Series("D").(SeriesBool).Bools(), []bool{true, false, false, true, true, true, true, false, false, true}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc, D desc: D failed") + } + + res = df.OrderBy(Asc("A"), Desc("B"), Asc("D")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 1, 1, 2, 2, 4, 5, 5}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B desc, D asc: A failed") + } + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"f", "d", "c", "a", "a", "c", "b", "e", "g", "b"}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B desc, D asc: B failed") + } + if !checkEqSliceBool(res.Series("D").(SeriesBool).Bools(), []bool{true, true, false, false, true, true, true, false, true, false}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B desc, D asc: D failed") + } + + res = df.OrderBy(Asc("A"), Desc("B"), Desc("D")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 1, 1, 2, 2, 4, 5, 5}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B desc, D desc: A failed") + } + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"f", "d", "c", "a", "a", "c", "b", "e", "g", "b"}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B desc, D desc: B failed") + } + if !checkEqSliceBool(res.Series("D").(SeriesBool).Bools(), []bool{true, true, false, true, false, true, true, false, true, false}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B desc, D desc: D failed") + } + + //////////////////////// + + res = 
df.OrderBy(Desc("D"), Asc("C"), Desc("B")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 2, 2, 1, 1, 5, 1, 5, 1, 4}, nil, "") { + t.Error("BaseDataFrame Sort D desc, C asc, B desc: A failed") + } + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"a", "b", "c", "d", "f", "g", "a", "b", "c", "e"}, nil, "") { + t.Error("BaseDataFrame Sort D desc, C asc, B desc: B failed") + } + if !checkEqSliceFloat64(res.Series("C").(SeriesFloat64).Float64s(), []float64{1.2, 2.3, 3.4, 4.5, 7.8, 8.9, 1.2, 2.3, 3.4, 5.6}, nil, "") { + t.Error("BaseDataFrame Sort D desc, C asc, B desc: C failed") + } + if !checkEqSliceBool(res.Series("D").(SeriesBool).Bools(), []bool{true, true, true, true, true, true, false, false, false, false}, nil, "") { + t.Error("BaseDataFrame Sort D desc, C asc, B desc: D failed") + } +} + +func Test_BaseDataFrame_Sort_Nulls(t *testing.T) { + var res DataFrame + + a := NewSeriesInt64("A", false, false, []int64{1, 4, 2, 1, 4, 1, 4, 1, 2, 1}). + SetNullMask([]bool{false, false, false, false, false, true, false, false, true, true}) + b := NewSeriesString("B", false, []string{"a", "b", "c", "d", "e", "f", "g", "a", "b", "c"}, NewStringPool()). + SetNullMask([]bool{true, true, false, false, false, true, false, false, false, false}) + c := NewSeriesFloat64("C", false, false, []float64{1.2, 2.3, 3.4, 4.5, 5.6, 7.8, 8.9, 1.2, 2.3, 3.4}). + SetNullMask([]bool{false, false, false, false, false, true, false, false, true, true}) + d := NewSeriesBool("D", false, false, []bool{true, false, true, true, false, true, true, false, true, false}). + SetNullMask([]bool{false, false, false, false, false, true, false, false, true, true}) + + df := NewBaseDataFrame(). 
+ AddSeries(a, b, c, d) + + res = df.OrderBy(Asc("A")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 2, 4, 4, 4, 1, 2, 1}, nil, "") { + t.Error("BaseDataFrame Sort A asc: A failed") + } + if !checkEqSliceBool(res.Series("A").GetNullMask(), []bool{false, false, false, false, false, false, false, true, true, true}, nil, "") { + t.Error("BaseDataFrame Sort A asc: A nullmask failed") + } + + res = df.OrderBy(Desc("A")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 2, 1, 4, 4, 4, 2, 1, 1, 1}, nil, "") { + t.Error("BaseDataFrame Sort A desc: A failed") + } + if !checkEqSliceBool(res.Series("A").GetNullMask(), []bool{true, true, true, false, false, false, false, false, false, false}, nil, "") { + t.Error("BaseDataFrame Sort A desc: A nullmask failed") + } + + res = df.OrderBy(Asc("A"), Asc("B")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 2, 4, 4, 4, 2, 1, 1}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc: A failed") + } + if !checkEqSliceBool(res.Series("A").GetNullMask(), []bool{false, false, false, false, false, false, false, true, true, true}, nil, "") { + t.Error("BaseDataFrame Sort A asc: A nullmask failed") + } + + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"a", "d", "a", "c", "e", "g", "b", "b", "c", "f"}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc: B failed") + } + if !checkEqSliceBool(res.Series("B").GetNullMask(), []bool{false, false, true, false, false, false, true, false, false, true}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc: B nullmask failed") + } + + res = df.OrderBy(Asc("A"), Desc("B")) + if !checkEqSliceInt64(res.Series("A").(SeriesInt64).Int64s(), []int64{1, 1, 1, 2, 4, 4, 4, 1, 1, 2}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc: A failed") + } + if !checkEqSliceBool(res.Series("A").GetNullMask(), []bool{false, false, false, false, false, false, false, true, true, true}, nil, "") { 
+ t.Error("BaseDataFrame Sort A asc: A nullmask failed") + } + + if !checkEqSliceString(res.Series("B").(SeriesString).Strings(), []string{"a", "d", "a", "c", "b", "g", "e", "f", "c", "b"}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc: B failed") + } + if !checkEqSliceBool(res.Series("B").GetNullMask(), []bool{true, false, false, false, true, false, false, true, false, false}, nil, "") { + t.Error("BaseDataFrame Sort A asc, B asc: B nullmask failed") + } +} diff --git a/core/gandalff/gdl_grouping.go b/core/gandalff/gdl_grouping.go index c8f9b19..2246081 100644 --- a/core/gandalff/gdl_grouping.go +++ b/core/gandalff/gdl_grouping.go @@ -5,11 +5,181 @@ import ( "sync" ) +func __series_groupby( + threadNum, minParallelSize, dataLen int, hasNulls bool, + worker func(threadNum, start, end int, map_ map[int64][]int), + workerNulls func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int), +) map[int64][]int { + + // If the data is too small, just run the worker function + if dataLen < minParallelSize { + map_ := make(map[int64][]int) + if hasNulls { + nulls := make([]int, 0) + workerNulls(0, 0, dataLen, map_, &nulls) + + // Add the nulls to the map + if len(nulls) > 0 { + nullKey := __series_get_nullkey(map_, HASH_NULL_KEY) + map_[nullKey] = nulls + } + } else { + worker(0, 0, dataLen, map_) + } + + return map_ + } + + // Initialize the maps + maps := make([]map[int64][]int, THREADS_NUMBER) + for i := 0; i < THREADS_NUMBER; i++ { + maps[i] = make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) + } + + var nulls [][]int + if hasNulls { + nulls = make([][]int, THREADS_NUMBER) + for i := 0; i < THREADS_NUMBER; i++ { + nulls[i] = make([]int, 0) + } + } + + // Initialize the wait groups array, one for each level where level is the + // log2 of the number of threads. + // The first lever of wait groups has THREADS_NUMBER/2 wait groups, and each + // wait group waits for two threads. 
+ // + // Example: if THREADS_NUMBER = 16, then + // - the FIRST level has 8 wait groups (each wait group waits for 2 threads) + // - the 1st element waits for threads 0 and 1 + // - the 2nd element waits for threads 2 and 3 + // - ... + // When workers 0 and 1 are done, the 1st element is notified so the fist + // merger of the second level can start + // + // - the second level has 4 wait groups + // - ... + levels := int(math.Log2(float64(threadNum))) + wg := make([][]sync.WaitGroup, levels) + for i := 0; i < levels; i++ { + wg[i] = make([]sync.WaitGroup, threadNum/(1< wait for wg[0][0] + // - idx1 = 2, idx2 = 3 -> wait for wg[0][1] + // - ... + // - idx1 = 14, idx2 = 15 -> wait for wg[0][7] + wg[level][idx1>>uint(level+1)].Wait() + + for k, v := range maps[idx2] { + maps[idx1][k] = append(maps[idx1][k], v...) + } + + if nulls != nil { + nulls[idx1] = append(nulls[idx1], nulls[idx2]...) + } + + // Notify the wait groups at the next level + // + // Example: if THREADS_NUMBER = 16 and level = 0, then + // - idx1 = 0, idx2 = 1 -> notify wg[1][0] + // - idx1 = 2, idx2 = 3 -> notify wg[1][0] + // - ... + // - idx1 = 14, idx2 = 15 -> notify wg[1][3] + wg[level+1][idx1>>uint(level+2)].Done() + } + } else { + actualWorker = func(idx int) { + start := idx * dataLen / threadNum + end := (idx + 1) * dataLen / threadNum + if idx == threadNum-1 { + end = dataLen + } + + worker(idx, start, end, maps[idx]) + + // Notify the wait groups at the first level + wg[0][idx/2].Done() + } + + merger = func(level, idx1, idx2 int) { + // Example: if THREADS_NUMBER = 16 and level = 0, then + // - idx1 = 0, idx2 = 1 -> wait for wg[0][0] + // - idx1 = 2, idx2 = 3 -> wait for wg[0][1] + // - ... + // - idx1 = 14, idx2 = 15 -> wait for wg[0][7] + wg[level][idx1>>uint(level+1)].Wait() + + for k, v := range maps[idx2] { + maps[idx1][k] = append(maps[idx1][k], v...) 
+ } + + // Notify the wait groups at the next level + // + // Example: if THREADS_NUMBER = 16 and level = 0, then + // - idx1 = 0, idx2 = 1 -> notify wg[1][0] + // - idx1 = 2, idx2 = 3 -> notify wg[1][0] + // - ... + // - idx1 = 14, idx2 = 15 -> notify wg[1][3] + wg[level+1][idx1>>uint(level+2)].Done() + } + } + + // Compute the submaps + for i := 0; i < threadNum; i++ { + go actualWorker(i) + } + + // Merge the submaps + for level := 0; level < levels; level++ { + for i := 0; i < threadNum; i += (1 << uint(level+1)) { + go merger(level, i, i+(1< 0 { + nullKey := __series_get_nullkey(maps[0], HASH_NULL_KEY) + maps[0][nullKey] = nulls[0] + } + + return maps[0] +} + func __series_groupby_multithreaded( threadNum, dataLen int, maps []map[int64][]int, nulls [][]int, worker func(threadNum, start, end int), ) { - // Initialize the wait groups array, one for each level where level is the // log2 of the number of threads. // The first lever of wait groups has THREADS_NUMBER/2 wait groups, and each diff --git a/core/gandalff/gdl_hashing.go b/core/gandalff/gdl_hashing.go deleted file mode 100644 index b5ed774..0000000 --- a/core/gandalff/gdl_hashing.go +++ /dev/null @@ -1,28 +0,0 @@ -package gandalff - -type HMElem struct { - Key uint64 - Value *[]int -} - -type CustomHM [][]HMElem - -func NewCustomHM(size int) *CustomHM { - hm := make(CustomHM, size) - return &hm -} - -func (hm *CustomHM) Get(key uint64) *[]int { - index := int(key % uint64(len(*hm))) - for _, elem := range (*hm)[index] { - if elem.Key == key { - return elem.Value - } - } - return nil -} - -func (hm *CustomHM) Put(key uint64, value *[]int) { - index := int(key % uint64(len(*hm))) - (*hm)[index] = append((*hm)[index], HMElem{key, value}) -} diff --git a/core/gandalff/gdl_series.go b/core/gandalff/gdl_series.go index 054d07e..c5d0043 100644 --- a/core/gandalff/gdl_series.go +++ b/core/gandalff/gdl_series.go @@ -60,10 +60,6 @@ type Series interface { // Take the elements according to the given interval. 
Take(params ...int) Series - // Sort Interface. - Less(i, j int) bool - Swap(i, j int) - // Append elements to the series. // Value can be a single value, slice of values, // a nullable value, a slice of nullable values or a series. @@ -94,12 +90,18 @@ type Series interface { Map(f GDLMapFunc, stringPool *StringPool) Series // Group the elements in the series. - Group() Series - SubGroup(gp SeriesPartition) Series + group() Series + GroupBy(gp SeriesPartition) Series + UnGroup() Series // Get the partition of the series. GetPartition() SeriesPartition + // Sort Interface. + Less(i, j int) bool + equal(i, j int) bool + Swap(i, j int) + // Sorts the elements of the series. Sort() Series SortRev() Series @@ -140,14 +142,8 @@ func NewSeries(name string, t typesys.BaseType, nullable bool, makeCopy bool, da type SeriesPartition interface { // Returns the number partitions. - GetSize() int + getSize() int + // Returns the indices of the groups. - GetMap() map[int64][]int - // Returns the indices for a given value. The value must be of the same type as the series. - // If val is nil then the indices of the null values are returned. - GetValueIndices(val any) []int - // Returns the keys of the groups. - GetKeys() any - - debugPrint() + getMap() map[int64][]int } diff --git a/core/gandalff/gdl_series_bool.go b/core/gandalff/gdl_series_bool.go index 15d233e..5410b0a 100644 --- a/core/gandalff/gdl_series_bool.go +++ b/core/gandalff/gdl_series_bool.go @@ -2,6 +2,7 @@ package gandalff import ( "fmt" + "sort" "typesys" ) @@ -42,7 +43,7 @@ func (s SeriesBool) Type() typesys.BaseType { // Returns the type and cardinality of the series. func (s SeriesBool) TypeCard() typesys.BaseTypeCard { - return typesys.BaseTypeCard{typesys.BoolType, s.Len()} + return typesys.BaseTypeCard{Base: typesys.BoolType, Card: s.Len()} } // Returns if the series is grouped. 
@@ -222,35 +223,6 @@ func (s SeriesBool) Take(params ...int) Series { return s.filterIntSlice(indeces) } -func (s SeriesBool) Less(i, j int) bool { - if s.isNullable { - if s.nullMask[i>>3]&(1< 0 { - return false - } - if s.nullMask[j>>3]&(1< 0 { - return true - } - } - return !s.data[i] && s.data[j] -} - -func (s SeriesBool) Swap(i, j int) { - if s.isNullable { - if s.nullMask[i>>3]&(1< 0 { - s.nullMask[j>>3] |= 1 << uint(j%8) - } else { - s.nullMask[j>>3] &= ^(1 << uint(j%8)) - } - if s.nullMask[j>>3]&(1< 0 { - s.nullMask[i>>3] |= 1 << uint(i%8) - } else { - s.nullMask[i>>3] &= ^(1 << uint(i%8)) - } - } - - s.data[i], s.data[j] = s.data[j], s.data[i] -} - // Append appends a value or a slice of values to the series. func (s SeriesBool) Append(v any) Series { switch v := v.(type) { @@ -764,97 +736,114 @@ func (s SeriesBool) Map(f GDLMapFunc, stringPool *StringPool) Series { // which means no sub-grouping). // So is for the null group, which has the same size as the partition vector. type SeriesBoolPartition struct { - series *SeriesBool partition map[int64][]int - nulls []int } -func (p SeriesBoolPartition) GetSize() int { - return len(p.partition) +func (gp *SeriesBoolPartition) getSize() int { + return len(gp.partition) } -func (p SeriesBoolPartition) GetMap() map[int64][]int { - return p.partition +func (gp *SeriesBoolPartition) getMap() map[int64][]int { + return gp.partition } -func (p SeriesBoolPartition) GetValueIndices(val any) []int { - if val == nil { - return p.nulls - } else if v, ok := val.(bool); ok { - if v { - return p.partition[1] - } else { - return p.partition[0] +func (s SeriesBool) group() Series { + + // Define the worker callback + worker := func(threadNum, start, end int, map_ map[int64][]int) { + for i := start; i < end; i++ { + if s.data[i] { + map_[1] = append(map_[1], i) + } else { + map_[0] = append(map_[0], i) + } } } - return make([]int, 0) -} + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, 
map_ map[int64][]int, nulls *[]int) { + for i := start; i < end; i++ { + if s.IsNull(i) { + (*nulls) = append((*nulls), i) + } else if s.data[i] { + map_[1] = append(map_[1], i) + } else { + map_[0] = append(map_[0], i) + } -func (gp SeriesBoolPartition) GetKeys() any { - keys := make([]bool, 0, 2) - return keys -} + } + } -func (gp SeriesBoolPartition) debugPrint() { - fmt.Println("SeriesBoolPartition") - data := gp.series.Data().([]bool) - for k, v := range gp.partition { - fmt.Printf("%10v - %5v: %v\n", k, data[v[0]], v) + partition := SeriesBoolPartition{ + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_1, s.Len(), s.HasNull(), + worker, workerNulls), } + + s.isGrouped = true + s.partition = &partition + + return s } -func (s SeriesBool) Group() Series { - map_ := make(map[int64][]int) - for index := 0; index < len(s.data); index++ { - if s.data[index] { - map_[1] = append(map_[1], index) - } else { - map_[0] = append(map_[0], index) +func (s SeriesBool) GroupBy(partition SeriesPartition) Series { + // collect all keys + otherIndeces := partition.getMap() + keys := make([]int64, len(otherIndeces)) + i := 0 + for k := range otherIndeces { + keys[i] = k + i++ + } + + // Define the worker callback + worker := func(threadNum, start, end int, map_ map[int64][]int) { + var newHash int64 + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function + if s.data[index] { + newHash = (1 + HASH_MAGIC_NUMBER) + (h << 13) + (h >> 4) + } else { + newHash = HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) + } + map_[newHash] = append(map_[newHash], index) + } } } - return SeriesBool{ - isGrouped: true, - isNullable: s.isNullable, - sorted: s.sorted, - name: s.name, - data: s.data, - nullMask: s.nullMask, - partition: &SeriesBoolPartition{ - series: &s, - partition: map_, - nulls: nil, - }} -} - -func (s SeriesBool) SubGroup(partition SeriesPartition) Series 
{ - newMap := make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) - - var newHash int64 - for h, indexes := range partition.GetMap() { - for _, index := range indexes { - if s.data[index] { - newHash = 1 + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) - } else { - newHash = HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int) { + var newHash int64 + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function + if s.IsNull(index) { + newHash = HASH_MAGIC_NUMBER_NULL + (h << 13) + (h >> 4) + } else if s.data[index] { + newHash = (1 + HASH_MAGIC_NUMBER) + (h << 13) + (h >> 4) + } else { + newHash = HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) + } + map_[newHash] = append(map_[newHash], index) } - newMap[newHash] = append(newMap[newHash], index) } } - return SeriesBool{ - isGrouped: true, - isNullable: s.isNullable, - sorted: s.sorted, - name: s.name, - data: s.data, - nullMask: s.nullMask, - partition: &SeriesBoolPartition{ - series: &s, - partition: newMap, - nulls: nil, - }} + newPartition := SeriesBoolPartition{ + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_1, len(keys), s.HasNull(), + worker, workerNulls), + } + + s.isGrouped = true + s.partition = &newPartition + + return s +} + +func (s SeriesBool) UnGroup() Series { + s.isGrouped = false + s.partition = nil + return s } func (s SeriesBool) GetPartition() SeriesPartition { @@ -863,10 +852,61 @@ func (s SeriesBool) GetPartition() SeriesPartition { //////////////////////// SORTING OPERATIONS +func (s SeriesBool) Less(i, j int) bool { + if s.isNullable { + if s.nullMask[i>>3]&(1< 0 { + return false + } + if s.nullMask[j>>3]&(1< 0 { + return true + } + } + return !s.data[i] && s.data[j] +} + +func (s SeriesBool) equal(i, j int) bool { + if s.isNullable { + if (s.nullMask[i>>3] & 
(1 << uint(i%8))) > 0 { + return (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 + } + if (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 { + return false + } + } + + return s.data[i] == s.data[j] +} + +func (s SeriesBool) Swap(i, j int) { + if s.isNullable { + // i is null, j is not null + if s.nullMask[i>>3]&(1< 0 && s.nullMask[j>>3]&(1<>3] &= ^(1 << uint(i%8)) + s.nullMask[j>>3] |= 1 << uint(j%8) + } else + + // i is not null, j is null + if s.nullMask[i>>3]&(1<>3]&(1< 0 { + s.nullMask[i>>3] |= 1 << uint(i%8) + s.nullMask[j>>3] &= ^(1 << uint(j%8)) + } + } + + s.data[i], s.data[j] = s.data[j], s.data[i] +} + func (s SeriesBool) Sort() Series { + if s.sorted != SORTED_ASC { + sort.Sort(s) + s.sorted = SORTED_ASC + } return s } func (s SeriesBool) SortRev() Series { + if s.sorted != SORTED_DESC { + sort.Sort(sort.Reverse(s)) + s.sorted = SORTED_DESC + } return s } diff --git a/core/gandalff/gdl_series_bool_memopt.go b/core/gandalff/gdl_series_bool_memopt.go index f84ab61..a791463 100644 --- a/core/gandalff/gdl_series_bool_memopt.go +++ b/core/gandalff/gdl_series_bool_memopt.go @@ -53,7 +53,7 @@ func (s SeriesBoolMemOpt) Type() typesys.BaseType { // Returns the type and cardinality of the series. func (s SeriesBoolMemOpt) TypeCard() typesys.BaseTypeCard { - return typesys.BaseTypeCard{typesys.BoolType, s.Len()} + return typesys.BaseTypeCard{Base: typesys.BoolType, Card: s.Len()} } // Returns if the series is grouped. 
@@ -241,43 +241,6 @@ func (s SeriesBoolMemOpt) Take(params ...int) Series { return s.filterIntSlice(indeces) } -func (s SeriesBoolMemOpt) Less(i, j int) bool { - if s.isNullable { - if s.nullMask[i>>3]&(1< 0 { - return false - } - if s.nullMask[j>>3]&(1< 0 { - return true - } - } - return s.data[i>>3]&(1< 0 && s.data[j>>3]&(1<>3]&(1< 0 { - s.nullMask[j>>3] |= 1 << uint(j%8) - } else { - s.nullMask[j>>3] &= ^(1 << uint(j%8)) - } - if s.nullMask[j>>3]&(1< 0 { - s.nullMask[i>>3] |= 1 << uint(i%8) - } else { - s.nullMask[i>>3] &= ^(1 << uint(i%8)) - } - } - if s.data[i>>3]&(1< 0 { - s.data[j>>3] |= 1 << uint(j%8) - } else { - s.data[j>>3] &= ^(1 << uint(j%8)) - } - if s.data[j>>3]&(1< 0 { - s.data[i>>3] |= 1 << uint(i%8) - } else { - s.data[i>>3] &= ^(1 << uint(i%8)) - } -} - // Append appends a value or a slice of values to the series. func (s SeriesBoolMemOpt) Append(v any) Series { switch v := v.(type) { @@ -964,47 +927,24 @@ func (s SeriesBoolMemOpt) Map(f GDLMapFunc, stringPool *StringPool) Series { // which means no sub-grouping). // So is for the null group, which has the same size as the partition vector. 
type SeriesBoolMemOptPartition struct { - series *SeriesBoolMemOpt - partition map[int64][]int - nulls []int + seriesList []Series + partition map[int64][]int + nulls []int } -func (p SeriesBoolMemOptPartition) GetSize() int { - return len(p.partition) +func (gp *SeriesBoolMemOptPartition) getSize() int { + return len(gp.partition) } -func (p SeriesBoolMemOptPartition) GetMap() map[int64][]int { - return p.partition +func (gp *SeriesBoolMemOptPartition) getMap() map[int64][]int { + return gp.partition } -func (p SeriesBoolMemOptPartition) GetValueIndices(val any) []int { - if val == nil { - return p.nulls - } else if v, ok := val.(bool); ok { - if v { - return p.partition[1] - } else { - return p.partition[0] - } - } - - return make([]int, 0) -} - -func (gp SeriesBoolMemOptPartition) GetKeys() any { - keys := make([]bool, 0, 2) - return keys -} - -func (gp SeriesBoolMemOptPartition) debugPrint() { - fmt.Println("SeriesBoolMemOptPartition") - data := gp.series.Data().([]bool) - for k, v := range gp.partition { - fmt.Printf("%10v - %5v: %v\n", k, data[v[0]], v) - } +func (gp *SeriesBoolMemOptPartition) getSeriesList() []Series { + return gp.seriesList } -func (s SeriesBoolMemOpt) Group() Series { +func (s SeriesBoolMemOpt) group() Series { map_ := make(map[int64][]int) for index := 0; index < s.size; index++ { map_[int64((s.data[index>>3]&(1<<(index%8)))>>int64(index%8))] = append(map_[int64((s.data[index>>3]&(1<<(index%8)))>>int64(index%8))], index) @@ -1018,17 +958,16 @@ func (s SeriesBoolMemOpt) Group() Series { data: s.data, nullMask: s.nullMask, partition: &SeriesBoolMemOptPartition{ - series: &s, partition: map_, nulls: nil, }} } -func (s SeriesBoolMemOpt) SubGroup(partition SeriesPartition) Series { +func (s SeriesBoolMemOpt) GroupBy(partition SeriesPartition) Series { newMap := make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) var newHash int64 - for h, indexes := range partition.GetMap() { + for h, indexes := range partition.getMap() { for _, index := 
range indexes { newHash = int64((s.data[index>>3]&(1<<(index%8)))>>int64(index%8)) + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) newMap[newHash] = append(newMap[newHash], index) @@ -1043,16 +982,67 @@ func (s SeriesBoolMemOpt) SubGroup(partition SeriesPartition) Series { data: s.data, nullMask: s.nullMask, partition: &SeriesBoolMemOptPartition{ - series: &s, partition: newMap, nulls: nil, }} } +func (s SeriesBoolMemOpt) UnGroup() Series { + s.isGrouped = false + s.partition = nil + return s +} + func (s SeriesBoolMemOpt) GetPartition() SeriesPartition { return s.partition } +func (s SeriesBoolMemOpt) Less(i, j int) bool { + if s.isNullable { + if s.nullMask[i>>3]&(1< 0 { + return false + } + if s.nullMask[j>>3]&(1< 0 { + return true + } + } + return s.data[i>>3]&(1< 0 && s.data[j>>3]&(1<>3]&(1< 0) && (s.nullMask[j>>3]&(1< 0) { + return true + } + } + return ((s.data[i>>3]&(1< 0) && (s.data[j>>3]&(1< 0)) || ((s.data[i>>3]&(1<>3]&(1<>3]&(1< 0 { + s.nullMask[j>>3] |= 1 << uint(j%8) + } else { + s.nullMask[j>>3] &= ^(1 << uint(j%8)) + } + if s.nullMask[j>>3]&(1< 0 { + s.nullMask[i>>3] |= 1 << uint(i%8) + } else { + s.nullMask[i>>3] &= ^(1 << uint(i%8)) + } + } + if s.data[i>>3]&(1< 0 { + s.data[j>>3] |= 1 << uint(j%8) + } else { + s.data[j>>3] &= ^(1 << uint(j%8)) + } + if s.data[j>>3]&(1< 0 { + s.data[i>>3] |= 1 << uint(i%8) + } else { + s.data[i>>3] &= ^(1 << uint(i%8)) + } +} + func (s SeriesBoolMemOpt) Sort() Series { return s } diff --git a/core/gandalff/gdl_series_bool_test.go b/core/gandalff/gdl_series_bool_test.go index 0d608e7..b37cba6 100644 --- a/core/gandalff/gdl_series_bool_test.go +++ b/core/gandalff/gdl_series_bool_test.go @@ -698,6 +698,122 @@ func Test_SeriesBool_Map(t *testing.T) { } } +func Test_SeriesBool_Group(t *testing.T) { + var partMap map[int64][]int + + data1 := []bool{true, true, true, true, true, true, true, true, true, true} + data1Mask := []bool{false, false, false, false, false, true, true, true, true, true} + data2 := []bool{true, true, 
false, false, true, true, false, false, true, true} + data2Mask := []bool{false, true, false, true, false, true, false, true, false, true} + data3 := []bool{true, false, true, false, true, false, true, false, true, false} + data3Mask := []bool{false, false, true, false, false, true, false, false, true, false} + + // Test 1 + s1 := NewSeriesBool("s1", true, true, data1). + SetNullMask(data1Mask). + group() + + p1 := s1.GetPartition().getMap() + if len(p1) != 2 { + t.Errorf("Expected 2 groups, got %d", len(p1)) + } + + partMap = map[int64][]int{ + 0: {0, 1, 2, 3, 4}, + 1: {5, 6, 7, 8, 9}, + } + if !checkEqPartitionMap(p1, partMap, nil, "Bool Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p1) + } + + // Test 2 + s2 := NewSeriesBool("s2", true, true, data2). + SetNullMask(data2Mask). + GroupBy(s1.GetPartition()) + + p2 := s2.GetPartition().getMap() + if len(p2) != 6 { + t.Errorf("Expected 6 groups, got %d", len(p2)) + } + + partMap = map[int64][]int{ + 0: {0, 4}, + 1: {1, 3}, + 2: {2}, + 3: {5, 7, 9}, + 4: {6}, + 5: {8}, + } + if !checkEqPartitionMap(p2, partMap, nil, "Bool Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p2) + } + + // Test 3 + s3 := NewSeriesBool("test", true, true, data3). + SetNullMask(data3Mask). 
+ GroupBy(s2.GetPartition()) + + p3 := s3.GetPartition().getMap() + if len(p3) != 7 { + t.Errorf("Expected 7 groups, got %d", len(p3)) + } + + partMap = map[int64][]int{ + 0: {0, 4}, + 1: {1, 3}, + 2: {2}, + 3: {5}, + 4: {6}, + 5: {7, 9}, + 6: {8}, + } + if !checkEqPartitionMap(p3, partMap, nil, "Bool Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p3) + } + + // debugPrintPartition(s1.GetPartition(), s1) + // debugPrintPartition(s2.GetPartition(), s1, s2) + // debugPrintPartition(s3.GetPartition(), s1, s2, s3) + + partMap = nil +} + +func Test_SeriesBool_Sort(t *testing.T) { + data := []bool{false, false, false, true, true, true, false, true, false, false, true, true, true, false, true, true, true, false, true, false} + mask := []bool{false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true} + + // Create a new series. + s := NewSeriesBool("test", false, true, data) + + // Sort the series. + sorted := s.Sort() + + // Check the data. + expected := []bool{false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true, true} + if !checkEqSliceBool(sorted.Data().([]bool), expected, nil, "") { + t.Errorf("SeriesBool.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]bool)) + } + + // Create a new series. + s = NewSeriesBool("test", true, true, data). + SetNullMask(mask) + + // Sort the series. + sorted = s.Sort() + + // Check the data. + expected = []bool{false, false, false, false, true, true, true, true, true, true, true, true, false, false, true, true, true, false, false, false} + if !checkEqSliceBool(sorted.Data().([]bool), expected, nil, "") { + t.Errorf("SeriesBool.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]bool)) + } + + // Check the null mask. 
+ expectedMask := []bool{false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true} + if !checkEqSliceBool(sorted.GetNullMask(), expectedMask, nil, "") { + t.Errorf("SeriesBool.Sort() failed, expecting %v, got %v", expectedMask, sorted.GetNullMask()) + } +} + func Test_SeriesBool_Arithmetic_Mul(t *testing.T) { bools := NewSeriesBool("test", true, false, []bool{true}).(SeriesBool) boolv := NewSeriesBool("test", true, false, []bool{true, false, true, false, true, false, true, true, false, false}).(SeriesBool) diff --git a/core/gandalff/gdl_series_error.go b/core/gandalff/gdl_series_error.go index 0f14e50..0371099 100644 --- a/core/gandalff/gdl_series_error.go +++ b/core/gandalff/gdl_series_error.go @@ -58,7 +58,7 @@ func (s SeriesError) Type() typesys.BaseType { // Returns the type and cardinality of the series. func (s SeriesError) TypeCard() typesys.BaseTypeCard { - return typesys.BaseTypeCard{typesys.ErrorType, s.Len()} + return typesys.BaseTypeCard{Base: typesys.ErrorType, Card: s.Len()} } // Returns if the series has null values. @@ -115,13 +115,6 @@ func (s SeriesError) Take(params ...int) Series { return s } -// Sort interface. -func (s SeriesError) Less(i, j int) bool { - return false -} - -func (s SeriesError) Swap(i, j int) {} - // Append elements to the series. func (s SeriesError) Append(v any) Series { return s @@ -171,18 +164,33 @@ func (s SeriesError) Map(f GDLMapFunc, stringPool *StringPool) Series { } // Group the elements in the series. -func (s SeriesError) Group() Series { +func (s SeriesError) group() Series { return nil } -func (s SeriesError) SubGroup(gp SeriesPartition) Series { +func (s SeriesError) GroupBy(gp SeriesPartition) Series { return nil } +func (s SeriesError) UnGroup() Series { + return s +} + func (s SeriesError) GetPartition() SeriesPartition { return nil } +// Sort interface. 
+func (s SeriesError) Less(i, j int) bool { + return false +} + +func (s SeriesError) equal(i, j int) bool { + return false +} + +func (s SeriesError) Swap(i, j int) {} + func (s SeriesError) Sort() Series { return s } diff --git a/core/gandalff/gdl_series_float64.go b/core/gandalff/gdl_series_float64.go index e8ffc11..c2a7cb2 100644 --- a/core/gandalff/gdl_series_float64.go +++ b/core/gandalff/gdl_series_float64.go @@ -2,6 +2,7 @@ package gandalff import ( "fmt" + "sort" "typesys" "unsafe" ) @@ -42,7 +43,7 @@ func (s SeriesFloat64) Type() typesys.BaseType { // Returns the type and cardinality of the series. func (s SeriesFloat64) TypeCard() typesys.BaseTypeCard { - return typesys.BaseTypeCard{typesys.Float64Type, s.Len()} + return typesys.BaseTypeCard{Base: typesys.Float64Type, Card: s.Len()} } // Returns if the series is grouped. @@ -212,34 +213,6 @@ func (s SeriesFloat64) Take(params ...int) Series { return s.filterIntSlice(indeces) } -func (s SeriesFloat64) Less(i, j int) bool { - if s.isNullable { - if s.nullMask[i>>3]&(1< 0 { - return false - } - if s.nullMask[j>>3]&(1< 0 { - return true - } - } - return s.data[i] < s.data[j] -} - -func (s SeriesFloat64) Swap(i, j int) { - if s.isNullable { - if s.nullMask[i>>3]&(1< 0 { - s.nullMask[j>>3] |= 1 << uint(j%8) - } else { - s.nullMask[j>>3] &= ^(1 << uint(j%8)) - } - if s.nullMask[j>>3]&(1< 0 { - s.nullMask[i>>3] |= 1 << uint(i%8) - } else { - s.nullMask[i>>3] &= ^(1 << uint(i%8)) - } - } - s.data[i], s.data[j] = s.data[j], s.data[i] -} - func (s SeriesFloat64) Append(v any) Series { switch v := v.(type) { case float64, []float64: @@ -753,121 +726,42 @@ func (s SeriesFloat64) Map(f GDLMapFunc, stringPool *StringPool) Series { //////////////////////// GROUPING OPERATIONS type SeriesFloat64Partition struct { - series *SeriesFloat64 - seriesSize int partition map[int64][]int indexToGroup []int } -func (gp SeriesFloat64Partition) GetSize() int { +func (gp *SeriesFloat64Partition) getSize() int { return 
len(gp.partition) } -func (gp SeriesFloat64Partition) beginSorting() SeriesFloat64Partition { - gp.indexToGroup = make([]int, gp.seriesSize) - for i, part := range gp.partition { - for _, idx := range part { - gp.indexToGroup[idx] = int(i) - } - } - - return gp -} - -func (gp SeriesFloat64Partition) endSorting() SeriesFloat64Partition { - // newPartition := make(map[int64][]int, len(gp.partition)) - // newNullGroup := make([]int, len(gp.nulls)) - - // for i, part := range gp.partition { - // newPartition[i] = make([]int, 0, len(part)) - // } - - // for i, g := range gp.indexToGroup { - // if g < len(gp.partition) { - // newPartition[int64(g)] = append(newPartition[int64(g)], i) - // } else { - // newNullGroup[g-len(gp.partition)] = append(newNullGroup[g-len(gp.partition)], i) - // } - // } - - gp.indexToGroup = nil - return gp -} - -func (gp SeriesFloat64Partition) GetMap() map[int64][]int { +func (gp *SeriesFloat64Partition) getMap() map[int64][]int { return gp.partition } -func (gp SeriesFloat64Partition) GetValueIndices(val any) []int { - if val == nil { - if nulls, ok := gp.partition[HASH_NULL_KEY]; ok { - return nulls - } - } else if v, ok := val.(float64); ok { - if part, ok := gp.partition[int64(v)]; ok { - return part - } - } - - return make([]int, 0) -} +func (s SeriesFloat64) group() Series { -func (gp SeriesFloat64Partition) GetKeys() any { - keys := make([]float64, 0, len(gp.partition)) - for k := range gp.partition { - if k != HASH_NULL_KEY { - keys = append(keys, *(*float64)(unsafe.Pointer(&k))) + // Define the worker callback + worker := func(threadNum, start, end int, map_ map[int64][]int) { + for i := start; i < end; i++ { + map_[*(*int64)(unsafe.Pointer((&s.data[i])))] = append(map_[*(*int64)(unsafe.Pointer((&s.data[i])))], i) } } - return keys -} - -func (gp SeriesFloat64Partition) debugPrint() { - fmt.Println("SeriesFloat64Partition") - data := gp.series.Data().([]float64) - for k, v := range gp.partition { - // f := 
*(*float64)(unsafe.Pointer(&k)) - fmt.Printf("%v - %10.4f: %v\n", k, data[v[0]], v) - } -} - -func (s SeriesFloat64) Group() Series { - - var partition SeriesFloat64Partition - if len(s.data) < MINIMUM_PARALLEL_SIZE_2 { - map_ := make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) - for i, v := range s.data { - map_[*(*int64)(unsafe.Pointer((&v)))] = append(map_[*(*int64)(unsafe.Pointer((&v)))], i) - } - - partition = SeriesFloat64Partition{ - series: &s, - seriesSize: s.Len(), - partition: map_, - } - } else { - // Initialize the maps and the wait groups - allMaps := make([]map[int64][]int, THREADS_NUMBER) - for i := 0; i < THREADS_NUMBER; i++ { - allMaps[i] = make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) - } - - // Define the worker callback - worker := func(threadNum, start, end int) { - map_ := allMaps[threadNum] - for i := start; i < end; i++ { + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int) { + for i := start; i < end; i++ { + if s.IsNull(i) { + (*nulls) = append((*nulls), i) + } else { map_[*(*int64)(unsafe.Pointer((&s.data[i])))] = append(map_[*(*int64)(unsafe.Pointer((&s.data[i])))], i) } } + } - __series_groupby_multithreaded(THREADS_NUMBER, len(s.data), allMaps, nil, worker) - - partition = SeriesFloat64Partition{ - series: &s, - seriesSize: s.Len(), - partition: allMaps[0], - } + partition := SeriesFloat64Partition{ + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_2, len(s.data), s.HasNull(), + worker, workerNulls), } s.isGrouped = true @@ -876,62 +770,46 @@ func (s SeriesFloat64) Group() Series { return s } -func (s SeriesFloat64) SubGroup(partition SeriesPartition) Series { - var newPartition SeriesFloat64Partition - otherIndeces := partition.GetMap() - - if len(s.data) < MINIMUM_PARALLEL_SIZE_2 { - - map_ := make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) +func (s SeriesFloat64) GroupBy(partition SeriesPartition) Series { + // 
collect all keys + otherIndeces := partition.getMap() + keys := make([]int64, len(otherIndeces)) + i := 0 + for k := range otherIndeces { + keys[i] = k + i++ + } + // Define the worker callback + worker := func(threadNum, start, end int, map_ map[int64][]int) { var newHash int64 - for h, v := range otherIndeces { - for _, index := range v { + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function newHash = *(*int64)(unsafe.Pointer((&(s.data)[index]))) + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) map_[newHash] = append(map_[newHash], index) } } + } - newPartition = SeriesFloat64Partition{ - series: &s, - seriesSize: s.Len(), - partition: map_, - } - } else { - - // collect all keys - keys := make([]int64, len(otherIndeces)) - i := 0 - for k := range otherIndeces { - keys[i] = k - i++ - } - - // Initialize the maps and the wait groups - allMaps := make([]map[int64][]int, THREADS_NUMBER) - for i := 0; i < THREADS_NUMBER; i++ { - allMaps[i] = make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) - } - - // Define the worker callback - worker := func(threadNum, start, end int) { - var newHash int64 - map_ := allMaps[threadNum] - for _, h := range keys[start:end] { - for _, index := range otherIndeces[h] { + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int) { + var newHash int64 + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function + if s.IsNull(index) { + newHash = HASH_MAGIC_NUMBER_NULL + (h << 13) + (h >> 4) + } else { newHash = *(*int64)(unsafe.Pointer((&(s.data)[index]))) + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) - map_[newHash] = append(map_[newHash], index) } + map_[newHash] = append(map_[newHash], index) } } + } - __series_groupby_multithreaded(THREADS_NUMBER, 
len(keys), allMaps, nil, worker) - - newPartition = SeriesFloat64Partition{ - series: &s, - seriesSize: s.Len(), - partition: allMaps[0], - } + newPartition := SeriesFloat64Partition{ + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_1, len(keys), s.HasNull(), + worker, workerNulls), } s.isGrouped = true @@ -940,16 +818,74 @@ func (s SeriesFloat64) SubGroup(partition SeriesPartition) Series { return s } +func (s SeriesFloat64) UnGroup() Series { + s.isGrouped = false + s.partition = nil + return s +} + func (s SeriesFloat64) GetPartition() SeriesPartition { return s.partition } +//////////////////////// SORTING OPERATIONS + +func (s SeriesFloat64) Less(i, j int) bool { + if s.isNullable { + if s.nullMask[i>>3]&(1< 0 { + return false + } + if s.nullMask[j>>3]&(1< 0 { + return true + } + } + + return s.data[i] < s.data[j] +} + +func (s SeriesFloat64) equal(i, j int) bool { + if s.isNullable { + if (s.nullMask[i>>3] & (1 << uint(i%8))) > 0 { + return (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 + } + if (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 { + return false + } + } + + return s.data[i] == s.data[j] +} + +func (s SeriesFloat64) Swap(i, j int) { + if s.isNullable { + // i is null, j is not null + if s.nullMask[i>>3]&(1< 0 && s.nullMask[j>>3]&(1<>3] &= ^(1 << uint(i%8)) + s.nullMask[j>>3] |= 1 << uint(j%8) + } else + + // i is not null, j is null + if s.nullMask[i>>3]&(1<>3]&(1< 0 { + s.nullMask[i>>3] |= 1 << uint(i%8) + s.nullMask[j>>3] &= ^(1 << uint(j%8)) + } + } + + s.data[i], s.data[j] = s.data[j], s.data[i] +} + func (s SeriesFloat64) Sort() Series { + if s.sorted != SORTED_ASC { + sort.Sort(s) + s.sorted = SORTED_ASC + } return s } func (s SeriesFloat64) SortRev() Series { + if s.sorted != SORTED_DESC { + sort.Sort(sort.Reverse(s)) + s.sorted = SORTED_DESC + } return s } - -//////////////////////// SORTING OPERATIONS diff --git a/core/gandalff/gdl_series_float64_test.go b/core/gandalff/gdl_series_float64_test.go index 07ba87e..3b57d49 100644 
--- a/core/gandalff/gdl_series_float64_test.go +++ b/core/gandalff/gdl_series_float64_test.go @@ -577,6 +577,123 @@ func Test_SeriesFloat64_Map(t *testing.T) { } } +func Test_SeriesFloat64_Group(t *testing.T) { + var partMap map[int64][]int + + data1 := []float64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1} + data1Mask := []bool{false, false, false, false, false, true, true, true, true, true} + data2 := []float64{1, 1, 2, 2, 1, 1, 2, 2, 1, 1} + data2Mask := []bool{false, true, false, true, false, true, false, true, false, true} + data3 := []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + data3Mask := []bool{false, false, false, false, false, true, true, true, true, true} + + // Test 1 + s1 := NewSeriesFloat64("s1", true, true, data1). + SetNullMask(data1Mask). + group() + + p1 := s1.GetPartition().getMap() + if len(p1) != 2 { + t.Errorf("Expected 2 groups, got %d", len(p1)) + } + + partMap = map[int64][]int{ + 0: {0, 1, 2, 3, 4}, + 1: {5, 6, 7, 8, 9}, + } + if !checkEqPartitionMap(p1, partMap, nil, "Float64 Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p1) + } + + // Test 2 + s2 := NewSeriesFloat64("s2", true, true, data2). + SetNullMask(data2Mask). + GroupBy(s1.GetPartition()) + + p2 := s2.GetPartition().getMap() + if len(p2) != 6 { + t.Errorf("Expected 6 groups, got %d", len(p2)) + } + + partMap = map[int64][]int{ + 0: {0, 4}, + 1: {1, 3}, + 2: {2}, + 3: {5, 7, 9}, + 4: {6}, + 5: {8}, + } + if !checkEqPartitionMap(p2, partMap, nil, "Float64 Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p2) + } + + // Test 3 + s3 := NewSeriesFloat64("test", true, true, data3). + SetNullMask(data3Mask). 
+ GroupBy(s2.GetPartition()) + + p3 := s3.GetPartition().getMap() + if len(p3) != 8 { + t.Errorf("Expected 8 groups, got %d", len(p3)) + } + + partMap = map[int64][]int{ + 0: {0}, + 1: {1}, + 2: {2}, + 3: {3}, + 4: {4}, + 5: {5, 7, 9}, + 6: {6}, + 7: {8}, + } + if !checkEqPartitionMap(p3, partMap, nil, "Float64 Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p3) + } + + // debugPrintPartition(s1.GetPartition(), s1) + // debugPrintPartition(s2.GetPartition(), s1, s2) + // debugPrintPartition(s3.GetPartition(), s1, s2, s3) + + partMap = nil +} + +func Test_SeriesFloat64_Sort(t *testing.T) { + data := []float64{3.8, 5.7, -2.3, -0.2, 6.6, -0.5, -6.4, -2.4, 0.2, -2.8, -7.1, -1.7, -4.2, 1.3, -6.2, -2.8, -4.4, -0.6, 0.0, 9.3} + mask := []bool{false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true} + + // Create a new series. + s := NewSeriesFloat64("test", false, true, data) + + // Sort the series. + sorted := s.Sort() + + // Check the data. + expected := []float64{-7.1, -6.4, -6.2, -4.4, -4.2, -2.8, -2.8, -2.4, -2.3, -1.7, -0.6, -0.5, -0.2, 0.0, 0.2, 1.3, 3.8, 5.7, 6.6, 9.3} + if !checkEqSliceFloat64(sorted.Data().([]float64), expected, nil, "") { + t.Errorf("SeriesFloat64.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]float64)) + } + + // Create a new series. + s = NewSeriesFloat64("test", true, true, data). + SetNullMask(mask) + + // Sort the series. + sorted = s.Sort() + + // Check the data. + expected = []float64{-7.1, -6.4, -6.2, -4.4, -4.2, -2.3, 0.0, 0.2, 3.8, 6.6, -0.5, -1.7, -2.8, 1.3, -2.4, -2.8, -0.2, -0.6, 5.7, 9.3} + if !checkEqSliceFloat64(sorted.Data().([]float64), expected, nil, "") { + t.Errorf("SeriesFloat64.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]float64)) + } + + // Check the null mask. 
+ expectedMask := []bool{false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true} + if !checkEqSliceBool(sorted.GetNullMask(), expectedMask, nil, "") { + t.Errorf("SeriesFloat64.Sort() failed, expecting %v, got %v", expectedMask, sorted.GetNullMask()) + } +} + func Test_SeriesFloat64_Arithmetic_Mul(t *testing.T) { bools := NewSeriesBool("test", true, false, []bool{true}).(SeriesBool) boolv := NewSeriesBool("test", true, false, []bool{true, false, true, false, true, false, true, true, false, false}).(SeriesBool) diff --git a/core/gandalff/gdl_series_int32.go b/core/gandalff/gdl_series_int32.go index 861d7a2..9a5fd92 100644 --- a/core/gandalff/gdl_series_int32.go +++ b/core/gandalff/gdl_series_int32.go @@ -43,7 +43,7 @@ func (s SeriesInt32) Type() typesys.BaseType { // Returns the type and cardinality of the series. func (s SeriesInt32) TypeCard() typesys.BaseTypeCard { - return typesys.BaseTypeCard{typesys.Int32Type, s.Len()} + return typesys.BaseTypeCard{Base: typesys.Int32Type, Card: s.Len()} } // Returns if the series is grouped. 
@@ -235,50 +235,6 @@ func (s SeriesInt32) Take(params ...int) Series { return s.filterIntSlice(indeces) } -func (s SeriesInt32) Less(i, j int) bool { - if s.isGrouped { - if s.partition.indexToGroup[i] != s.partition.indexToGroup[j] { - return s.partition.indexToGroup[i] < s.partition.indexToGroup[j] - } - return s.data[i] < s.data[j] - } else - - // if s is grouped the null element are is the same group - // so there is no need to check if the element is null - if s.isNullable { - if s.nullMask[i>>3]&(1< 0 { - return false - } - if s.nullMask[j>>3]&(1< 0 { - return true - } - } - - return s.data[i] < s.data[j] -} - -func (s SeriesInt32) Swap(i, j int) { - if s.isGrouped { - s.partition.indexToGroup[i], s.partition.indexToGroup[j] = s.partition.indexToGroup[j], s.partition.indexToGroup[i] - } - - if s.isNullable { - // i is null, j is not null - if s.nullMask[i>>3]&(1< 0 && s.nullMask[j>>3]&(1<>3] &= ^(1 << uint(i%8)) - s.nullMask[j>>3] |= 1 << uint(j%8) - } else - - // i is not null, j is null - if s.nullMask[i>>3]&(1<>3]&(1< 0 { - s.nullMask[i>>3] |= 1 << uint(i%8) - s.nullMask[j>>3] &= ^(1 << uint(j%8)) - } - } - - s.data[i], s.data[j] = s.data[j], s.data[i] -} - func (s SeriesInt32) Append(v any) Series { switch v := v.(type) { case int32, []int32: @@ -807,149 +763,53 @@ func (s SeriesInt32) Map(f GDLMapFunc, stringPool *StringPool) Series { //////////////////////// GROUPING OPERATIONS type SeriesInt32Partition struct { - isDense bool - seriesSize int partition map[int64][]int + isDense bool partitionDenseMin int32 partitionDense [][]int partitionDenseNulls []int - indexToGroup []int } -func (gp SeriesInt32Partition) GetSize() int { +func (gp *SeriesInt32Partition) getSize() int { if gp.isDense { - nulls := 0 - if len(gp.partitionDenseNulls) > 0 { - nulls = 1 + if gp.partitionDenseNulls != nil && len(gp.partitionDenseNulls) > 0 { + return len(gp.partitionDense) + 1 } - return len(gp.partitionDense) + nulls + return len(gp.partitionDense) } return 
len(gp.partition) } -func (gp SeriesInt32Partition) beginSorting() SeriesInt32Partition { - gp.indexToGroup = make([]int, gp.seriesSize) +func (gp *SeriesInt32Partition) getMap() map[int64][]int { if gp.isDense { + map_ := make(map[int64][]int, len(gp.partitionDense)) for i, part := range gp.partitionDense { - for _, idx := range part { - gp.indexToGroup[idx] = i - } + map_[int64(i)+int64(gp.partitionDenseMin)] = part } - for _, idx := range gp.partitionDenseNulls { - gp.indexToGroup[idx] = len(gp.partitionDense) + // Merge the nulls to the map + if gp.partitionDenseNulls != nil && len(gp.partitionDenseNulls) > 0 { + nullKey := __series_get_nullkey(map_, HASH_NULL_KEY) + map_[nullKey] = gp.partitionDenseNulls } - } else { - for i, part := range gp.partition { - for _, idx := range part { - gp.indexToGroup[idx] = int(i) - } - } - } - return gp -} - -func (gp SeriesInt32Partition) endSorting() SeriesInt32Partition { - if gp.isDense { - newPartitionDense := make([][]int, len(gp.partitionDense)) - newPartitionDenseNulls := make([]int, len(gp.partitionDenseNulls)) - - for _, part := range gp.partitionDense { - newPartitionDense[gp.indexToGroup[part[0]]] = make([]int, len(part)) - } - - for i, idx := range gp.indexToGroup { - if idx == len(gp.partitionDense) { - newPartitionDenseNulls = append(newPartitionDenseNulls, i) - } else { - newPartitionDense[idx] = append(newPartitionDense[idx], i) - } - } - - gp.partitionDense = newPartitionDense - gp.partitionDenseNulls = newPartitionDenseNulls - } else { - newPartition := make(map[int64][]int, len(gp.partition)) - for _, part := range gp.partition { - newPartition[int64(gp.indexToGroup[part[0]])] = make([]int, len(part)) - } - - for i, idx := range gp.indexToGroup { - newPartition[int64(idx)] = append(newPartition[int64(idx)], i) - } - - gp.partition = newPartition - } - - gp.indexToGroup = nil - return gp -} - -func (gp SeriesInt32Partition) GetMap() map[int64][]int { - if gp.isDense { - map_ := make(map[int64][]int, 
len(gp.partitionDense)) - for i, part := range gp.partitionDense { - map_[int64(i+int(gp.partitionDenseMin))] = part - } return map_ } return gp.partition } -func (gp SeriesInt32Partition) GetValueIndices(val any) []int { - if val == nil { - if gp.isDense { - return gp.partitionDenseNulls - } else if nulls, ok := gp.partition[HASH_NULL_KEY]; ok { - return nulls - } - } else if v, ok := val.(int32); ok { - if gp.isDense { - return gp.partitionDense[v] - } else if part, ok := gp.partition[int64(v)]; ok { - return part - } - } - - return make([]int, 0) -} - -func (gp SeriesInt32Partition) GetKeys() any { - var keys []int - if gp.isDense { - keys = make([]int, 0, len(gp.partitionDense)) - for k, indeces := range gp.partitionDense { - if len(indeces) > 0 { - keys = append(keys, k) - } - } - } else { - keys = make([]int, 0, len(gp.partition)) - for k := range gp.partition { - if k != HASH_NULL_KEY { - keys = append(keys, int(k)) - } - } - } - - return keys -} - -func (gp SeriesInt32Partition) debugPrint() { - fmt.Println("SeriesInt32Partition") - map_ := gp.GetMap() - for k, v := range map_ { - fmt.Printf("%4d: %v\n", k, v) - } -} - -func (s SeriesInt32) Group() Series { +func (s SeriesInt32) group() Series { + var useDenseMap bool + var min, max int32 var partition SeriesInt32Partition + + // If the number of elements is small, + // look for the minimum and maximum values if len(s.data) < MINIMUM_PARALLEL_SIZE_2 { - max := s.data[0] - min := s.data[0] + useDenseMap = true + max = s.data[0] + min = s.data[0] for _, v := range s.data { if v > max { max = v @@ -958,33 +818,49 @@ func (s SeriesInt32) Group() Series { min = v } } + } + + // If the difference between the maximum and minimum values is acceptable, + // then we can use a dense map, otherwise we use a sparse map + if useDenseMap && (max-min >= MINIMUM_PARALLEL_SIZE_1) { + useDenseMap = false + } + // DENSE MAP + if useDenseMap { + var nulls []int map_ := make([][]int, max-min+1) for i := 0; i < len(map_); i++ { 
map_[i] = make([]int, 0, DEFAULT_DENSE_MAP_ARRAY_INITIAL_CAPACITY) } - for i, v := range s.data { - map_[v-min] = append(map_[v-min], i) + if s.HasNull() { + nulls = make([]int, 0, DEFAULT_DENSE_MAP_ARRAY_INITIAL_CAPACITY) + for i, v := range s.data { + if s.IsNull(i) { + nulls = append(nulls, i) + } else { + map_[v-min] = append(map_[v-min], i) + } + } + } else { + for i, v := range s.data { + map_[v-min] = append(map_[v-min], i) + } } partition = SeriesInt32Partition{ - isDense: true, - seriesSize: s.Len(), - partitionDenseMin: min, - partitionDense: map_, - } - - } else { - // Initialize the maps - allMaps := make([]map[int64][]int, THREADS_NUMBER) - for i := 0; i < THREADS_NUMBER; i++ { - allMaps[i] = make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) + isDense: true, + partitionDenseMin: min, + partitionDense: map_, + partitionDenseNulls: nulls, } + } else + // SPARSE MAP + { // Define the worker callback - worker := func(threadNum, start, end int) { - map_ := allMaps[threadNum] + worker := func(threadNum, start, end int, map_ map[int64][]int) { up := end - ((end - start) % 8) for i := start; i < up; { map_[int64(s.data[i])] = append(map_[int64(s.data[i])], i) @@ -1010,12 +886,22 @@ func (s SeriesInt32) Group() Series { } } - __series_groupby_multithreaded(THREADS_NUMBER, len(s.data), allMaps, nil, worker) + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int) { + for i := start; i < end; i++ { + if s.IsNull(i) { + (*nulls) = append((*nulls), i) + } else { + map_[int64(s.data[i])] = append(map_[int64(s.data[i])], i) + } + } + } partition = SeriesInt32Partition{ - isDense: false, - seriesSize: s.Len(), - partition: allMaps[0], + isDense: false, + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_2, len(s.data), s.HasNull(), + worker, workerNulls), } } @@ -1025,60 +911,50 @@ func (s SeriesInt32) Group() Series { return s } -func (s SeriesInt32) SubGroup(partition 
SeriesPartition) Series { - var newPartition SeriesInt32Partition - otherIndeces := partition.GetMap() - - if len(s.data) < MINIMUM_PARALLEL_SIZE_2 { +func (s SeriesInt32) GroupBy(partition SeriesPartition) Series { + if partition == nil { + return s + } - map_ := make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) + // collect all keys + otherIndeces := partition.getMap() + keys := make([]int64, len(otherIndeces)) + i := 0 + for k := range otherIndeces { + keys[i] = k + i++ + } + // Define the worker callback + worker := func(threadNum, start, end int, map_ map[int64][]int) { var newHash int64 - for h, v := range otherIndeces { - for _, index := range v { + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function newHash = int64(s.data[index]) + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) map_[newHash] = append(map_[newHash], index) } } + } - newPartition = SeriesInt32Partition{ - seriesSize: s.Len(), - partition: map_, - } - } else { - - // collect all keys - keys := make([]int64, len(otherIndeces)) - i := 0 - for k := range otherIndeces { - keys[i] = k - i++ - } - - // Initialize the maps and the wait groups - allMaps := make([]map[int64][]int, THREADS_NUMBER) - for i := 0; i < THREADS_NUMBER; i++ { - allMaps[i] = make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) - } - - // Define the worker callback - worker := func(threadNum, start, end int) { - var newHash int64 - map_ := allMaps[threadNum] - for _, h := range keys[start:end] { - for _, index := range otherIndeces[h] { + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int) { + var newHash int64 + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function + if s.IsNull(index) { + newHash = HASH_MAGIC_NUMBER_NULL + (h 
<< 13) + (h >> 4) + } else { newHash = int64(s.data[index]) + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) - map_[newHash] = append(map_[newHash], index) } + map_[newHash] = append(map_[newHash], index) } } + } - __series_groupby_multithreaded(THREADS_NUMBER, len(keys), allMaps, nil, worker) - - newPartition = SeriesInt32Partition{ - seriesSize: s.Len(), - partition: allMaps[0], - } + newPartition := SeriesInt32Partition{ + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_2, len(keys), s.HasNull(), + worker, workerNulls), } s.isGrouped = true @@ -1087,21 +963,65 @@ func (s SeriesInt32) SubGroup(partition SeriesPartition) Series { return s } +func (s SeriesInt32) UnGroup() Series { + s.isGrouped = false + s.partition = nil + return s +} + func (s SeriesInt32) GetPartition() SeriesPartition { return s.partition } //////////////////////// SORTING OPERATIONS +func (s SeriesInt32) Less(i, j int) bool { + if s.isNullable { + if s.nullMask[i>>3]&(1< 0 { + return false + } + if s.nullMask[j>>3]&(1< 0 { + return true + } + } + + return s.data[i] < s.data[j] +} + +func (s SeriesInt32) equal(i, j int) bool { + if s.isNullable { + if (s.nullMask[i>>3] & (1 << uint(i%8))) > 0 { + return (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 + } + if (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 { + return false + } + } + + return s.data[i] == s.data[j] +} + +func (s SeriesInt32) Swap(i, j int) { + if s.isNullable { + // i is null, j is not null + if s.nullMask[i>>3]&(1< 0 && s.nullMask[j>>3]&(1<>3] &= ^(1 << uint(i%8)) + s.nullMask[j>>3] |= 1 << uint(j%8) + } else + + // i is not null, j is null + if s.nullMask[i>>3]&(1<>3]&(1< 0 { + s.nullMask[i>>3] |= 1 << uint(i%8) + s.nullMask[j>>3] &= ^(1 << uint(j%8)) + } + } + + s.data[i], s.data[j] = s.data[j], s.data[i] +} + func (s SeriesInt32) Sort() Series { if s.sorted != SORTED_ASC { - if s.isGrouped { - *s.partition = (*s.partition).beginSorting() - sort.Sort(s) - *s.partition = (*s.partition).endSorting() - } else { - 
sort.Sort(s) - } + sort.Sort(s) s.sorted = SORTED_ASC } return s @@ -1109,13 +1029,7 @@ func (s SeriesInt32) Sort() Series { func (s SeriesInt32) SortRev() Series { if s.sorted != SORTED_DESC { - if s.isGrouped { - *s.partition = (*s.partition).beginSorting() - sort.Sort(sort.Reverse(s)) - *s.partition = (*s.partition).endSorting() - } else { - sort.Sort(sort.Reverse(s)) - } + sort.Sort(sort.Reverse(s)) s.sorted = SORTED_DESC } return s diff --git a/core/gandalff/gdl_series_int32_test.go b/core/gandalff/gdl_series_int32_test.go index 06cb397..f071248 100644 --- a/core/gandalff/gdl_series_int32_test.go +++ b/core/gandalff/gdl_series_int32_test.go @@ -678,212 +678,123 @@ func Test_SeriesInt32_Map(t *testing.T) { } } -func Test_SeriesInt32_Sort(t *testing.T) { +func Test_SeriesInt32_Group(t *testing.T) { + var partMap map[int64][]int - data := []int32{2, 323, 42, 4, 9, 674, 42, 48, 9811, 79, 3, 12, 492, 47005, -173, -28, 323, 42, 4, 9, 31, 425, 2} - mask := []bool{false, false, true, false, false, true, false, false, true, false, false, true, false, false, true, false, false, true, false, false, true, false, false} + data1 := []int32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1} + data1Mask := []bool{false, false, false, false, false, true, true, true, true, true} + data2 := []int32{1, 1, 2, 2, 1, 1, 2, 2, 1, 1} + data2Mask := []bool{false, true, false, true, false, true, false, true, false, true} + data3 := []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + data3Mask := []bool{false, false, false, false, false, true, true, true, true, true} - // Create a new series. - s := NewSeriesInt32("test", true, true, data) + // Test 1 + s1 := NewSeriesInt32("s1", true, true, data1). + SetNullMask(data1Mask). + group() - // Sort the series. - sorted := s.Sort() - - // Check the length. - if sorted.Len() != 23 { - t.Errorf("Expected length of 23, got %d", sorted.Len()) + p1 := s1.GetPartition().getMap() + if len(p1) != 2 { + t.Errorf("Expected 2 groups, got %d", len(p1)) } - // Check the data. 
- result := []int32{-173, -28, 2, 2, 3, 4, 4, 9, 9, 12, 31, 42, 42, 42, 48, 79, 323, 323, 425, 492, 674, 9811, 47005} - for i, v := range sorted.Data().([]int32) { - if v != result[i] { - t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - } + partMap = map[int64][]int{ + 0: {0, 1, 2, 3, 4}, + 1: {5, 6, 7, 8, 9}, + } + if !checkEqPartitionMap(p1, partMap, nil, "Int32 Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p1) } - ///////////////////////////////////////////////////////////////////////////////////// - - // Create a new series. - s = NewSeriesInt32("test", true, true, data) - - // Set the null mask. - s.SetNullMask(mask) - - // Sort the series. - sorted = s.Sort() + // Test 2 + s2 := NewSeriesInt32("s2", true, true, data2). + SetNullMask(data2Mask). + GroupBy(s1.GetPartition()) - // Check the length. - if sorted.Len() != 23 { - t.Errorf("Expected length of 23, got %d", sorted.Len()) + p2 := s2.GetPartition().getMap() + if len(p2) != 6 { + t.Errorf("Expected 6 groups, got %d", len(p2)) } - // Check the data. - result = []int32{-28, 2, 2, 3, 4, 4, 9, 9, 42, 48, 79, 323, 323, 425, 492, 47005, 42, 674, 9811, 12, -173, 42, 31} - for i, v := range sorted.Data().([]int32) { - if i < 16 && v != result[i] { - t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - } + partMap = map[int64][]int{ + 0: {0, 4}, + 1: {1, 3}, + 2: {2}, + 3: {5, 7, 9}, + 4: {6}, + 5: {8}, } - - // Check the null mask. 
- for i, v := range sorted.GetNullMask() { - if i < 16 && v != false { - t.Errorf("Expected nullMask of %v, got %v at index %d", false, v, i) - } else if i >= 16 && v != true { - t.Errorf("Expected nullMask of %v, got %v at index %d", true, v, i) - } + if !checkEqPartitionMap(p2, partMap, nil, "Int32 Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p2) } -} -func Test_SeriesInt32_GroupedSort(t *testing.T) { - data := []int32{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1} - // mask := []bool{false, true, false, false, false, false, true, false, false, false, false, true, false, false, false} + // Test 3 + s3 := NewSeriesInt32("test", true, true, data3). + SetNullMask(data3Mask). + GroupBy(s2.GetPartition()) - partData := []int32{3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2} - p := NewSeriesInt32("part", true, true, partData).Group() - - // Create a new series. - s := NewSeriesInt32("test", true, true, data). - SubGroup(p.GetPartition()). - Sort() - - // Check the length. - if s.Len() != 15 { - t.Errorf("Expected length of 15, got %d", s.Len()) + p3 := s3.GetPartition().getMap() + if len(p3) != 8 { + t.Errorf("Expected 8 groups, got %d", len(p3)) } - // Check the data. - result := []int32{6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 11, 12, 13, 14, 15} - for i, v := range s.Data().([]int32) { - if v != result[i] { - t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - } + partMap = map[int64][]int{ + 0: {0}, + 1: {1}, + 2: {2}, + 3: {3}, + 4: {4}, + 5: {5, 7, 9}, + 6: {6}, + 7: {8}, + } + if !checkEqPartitionMap(p3, partMap, nil, "Int32 Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p3) } - // ///////////////////////////////////////////////////////////////////////////////////// - - // s = NewSeriesInt32("test", true, true, data). - // SetNullMask(mask). - // SubGroup(p.GetPartition()). - // Sort() - - // // Check the length. 
- // if s.Len() != 15 { - // t.Errorf("Expected length of 15, got %d", s.Len()) - // } - - // // Check the data. - // result = []int{6, 7, 8, 10, 1, 2, 3, 5, 11, 12, 13, 15, 9, 4, 14} - // for i, v := range s.Data().([]int) { - // if i < 14 && v != result[i] { - // t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - // } - // } - - // // Check the null mask. - // for i, v := range s.GetNullMask() { - // if i < 12 && v != false { - // t.Errorf("Expected nullMask of %v, got %v at index %d", false, v, i) - // } else if i >= 12 && v != true { - // t.Errorf("Expected nullMask of %v, got %v at index %d", true, v, i) - // } - // } + // debugPrintPartition(s1.GetPartition(), s1) + // debugPrintPartition(s2.GetPartition(), s1, s2) + // debugPrintPartition(s3.GetPartition(), s1, s2, s3) - // ///////////////////////////////////////////////////////////////////////////////////// - // // Reverse sort. + partMap = nil +} - dataRev := []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} - // maskRev := []bool{false, true, false, false, false, false, true, false, false, false, false, true, false, false, false} +func Test_SeriesInt32_Sort(t *testing.T) { + data := []int32{-195, -27, 33, 679, -67, 920, -352, -674, 250, 767, 697, 873, -802, -123, 308, -558, -518, 169, 313, 593} + mask := []bool{false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true} - s = NewSeriesInt32("test", true, true, dataRev). - SubGroup(p.GetPartition()). - SortRev() + // Create a new series. + s := NewSeriesInt32("test", false, true, data) - // Check the length. - if s.Len() != 15 { - t.Errorf("Expected length of 15, got %d", s.Len()) - } + // Sort the series. + sorted := s.Sort() // Check the data. 
- result = []int32{5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6} - for i, v := range s.Data().([]int32) { - if v != result[i] { - t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - } + expected := []int32{-802, -674, -558, -518, -352, -195, -123, -67, -27, 33, 169, 250, 308, 313, 593, 679, 697, 767, 873, 920} + if !checkEqSliceInt32(sorted.Data().([]int32), expected, nil, "") { + t.Errorf("SeriesInt32.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]int32)) } - ///////////////////////////////////////////////////////////////////////////////////// - - // s = NewSeriesInt32("test", true, true, dataRev). - // SetNullMask(maskRev). - // SubGroup(p.GetPartition()). - // SortRev() + // Create a new series. + s = NewSeriesInt32("test", true, true, data). + SetNullMask(mask) - // // Check the length. - // if s.Len() != 15 { - // t.Errorf("Expected length of 15, got %d", s.Len()) - // } + // Sort the series. + sorted = s.Sort() - // // Check the data. - // result = []int{5, 4, 3, 1, 10, 9, 8, 6, 15, 14, 13, 11, 2, 7, 12} - // for i, v := range s.Data().([]int) { - // if i < 14 && v != result[i] { - // t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - // } - // } + // Check the data. + expected = []int32{-802, -518, -352, -195, -67, 33, 250, 308, 313, 697, 920, 873, 767, -123, -674, -558, 679, 169, -27, 593} + if !checkEqSliceInt32(sorted.Data().([]int32), expected, nil, "") { + t.Errorf("SeriesInt32.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]int32)) + } // Check the null mask. 
- // for i, v := range s.GetNullMask() { - // if i < 12 && v != false { - // t.Errorf("Expected nullMask of %v, got %v at index %d", false, v, i) - // } else if i >= 12 && v != true { - // t.Errorf("Expected nullMask of %v, got %v at index %d", true, v, i) - // } - // } + expectedMask := []bool{false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true} + if !checkEqSliceBool(sorted.GetNullMask(), expectedMask, nil, "") { + t.Errorf("SeriesInt32.Sort() failed, expecting %v, got %v", expectedMask, sorted.GetNullMask()) + } } -// func Test_SeriesInt32_Multiplication(t *testing.T) { - -// data := []int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20} - -// // s * 2 -// res := NewSeriesInt32("test", true, true, &data).Mul(NewSeriesInt32("test", true, true, &[]int{2})) -// if e, ok := res.(SeriesError); ok { -// t.Errorf("Got error: %v", e) -// } - -// // Check the length. -// if res.Len() != 20 { -// t.Errorf("Expected length of 20, got %d", res.Len()) -// } - -// // Check the data. -// for i, v := range res.Data().([]int) { -// if v != data[i]*2 { -// t.Errorf("Expected %v, got %v at index %d", data[i]*2, v, i) -// } -// } - -// // 2 * s -// res = NewSeriesInt32("test", true, true, &[]int{2}).Mul(NewSeriesInt32("test", true, true, &data)) -// if e, ok := res.(SeriesError); ok { -// t.Errorf("Got error: %v", e) -// } - -// // Check the length. -// if res.Len() != 20 { -// t.Errorf("Expected length of 20, got %d", res.Len()) -// } - -// // Check the data. 
-// for i, v := range res.Data().([]int) { -// if v != data[i]*2 { -// t.Errorf("Expected %v, got %v at index %d", data[i]*2, v, i) -// } -// } -// } - func Test_SeriesInt32_Arithmetic_Mul(t *testing.T) { bools := NewSeriesBool("test", true, false, []bool{true}).(SeriesBool) boolv := NewSeriesBool("test", true, false, []bool{true, false, true, false, true, false, true, true, false, false}).(SeriesBool) diff --git a/core/gandalff/gdl_series_int64.go b/core/gandalff/gdl_series_int64.go index 57b0cd0..9bad313 100644 --- a/core/gandalff/gdl_series_int64.go +++ b/core/gandalff/gdl_series_int64.go @@ -43,7 +43,7 @@ func (s SeriesInt64) Type() typesys.BaseType { // Returns the type and cardinality of the series. func (s SeriesInt64) TypeCard() typesys.BaseTypeCard { - return typesys.BaseTypeCard{typesys.Int64Type, s.Len()} + return typesys.BaseTypeCard{Base: typesys.Int64Type, Card: s.Len()} } // Returns if the series is grouped. @@ -244,50 +244,6 @@ func (s SeriesInt64) Take(params ...int) Series { return s.filterIntSlice(indeces) } -func (s SeriesInt64) Less(i, j int) bool { - if s.isGrouped { - if s.partition.indexToGroup[i] != s.partition.indexToGroup[j] { - return s.partition.indexToGroup[i] < s.partition.indexToGroup[j] - } - return s.data[i] < s.data[j] - } else - - // if s is grouped the null element are is the same group - // so there is no need to check if the element is null - if s.isNullable { - if s.nullMask[i>>3]&(1< 0 { - return false - } - if s.nullMask[j>>3]&(1< 0 { - return true - } - } - - return s.data[i] < s.data[j] -} - -func (s SeriesInt64) Swap(i, j int) { - if s.isGrouped { - s.partition.indexToGroup[i], s.partition.indexToGroup[j] = s.partition.indexToGroup[j], s.partition.indexToGroup[i] - } - - if s.isNullable { - // i is null, j is not null - if s.nullMask[i>>3]&(1< 0 && s.nullMask[j>>3]&(1<>3] &= ^(1 << uint(i%8)) - s.nullMask[j>>3] |= 1 << uint(j%8) - } else - - // i is not null, j is null - if s.nullMask[i>>3]&(1<>3]&(1< 0 { - 
s.nullMask[i>>3] |= 1 << uint(i%8) - s.nullMask[j>>3] &= ^(1 << uint(j%8)) - } - } - - s.data[i], s.data[j] = s.data[j], s.data[i] -} - func (s SeriesInt64) Append(v any) Series { switch v := v.(type) { case int64, []int64: @@ -813,172 +769,53 @@ func (s SeriesInt64) Map(f GDLMapFunc, stringPool *StringPool) Series { //////////////////////// GROUPING OPERATIONS type SeriesInt64Partition struct { - isDense bool - seriesSize int partition map[int64][]int - nullKey NullableInt64 + isDense bool partitionDenseMin int64 partitionDense [][]int partitionDenseNulls []int - indexToGroup []int64 } -func (gp SeriesInt64Partition) GetSize() int { +func (gp *SeriesInt64Partition) getSize() int { if gp.isDense { - nulls := 0 - if len(gp.partitionDenseNulls) > 0 { - nulls = 1 + if gp.partitionDenseNulls != nil && len(gp.partitionDenseNulls) > 0 { + return len(gp.partitionDense) + 1 } - return len(gp.partitionDense) + nulls + return len(gp.partitionDense) } return len(gp.partition) } -func (gp SeriesInt64Partition) beginSorting() SeriesInt64Partition { - gp.indexToGroup = make([]int64, gp.seriesSize) - if gp.isDense { - for i, part := range gp.partitionDense { - for _, idx := range part { - gp.indexToGroup[idx] = int64(i) - } - } - - // put nulls at the end - for _, idx := range gp.partitionDenseNulls { - gp.indexToGroup[idx] = int64(len(gp.partitionDense)) - } - } else { - if gp.nullKey.Valid { - nulls := gp.partition[gp.nullKey.Value] - delete(gp.partition, gp.nullKey.Value) - - for i, part := range gp.partition { - for _, idx := range part { - gp.indexToGroup[idx] = i - } - } - - // put nulls at the end - for _, idx := range nulls { - gp.indexToGroup[idx] = int64(len(gp.partition)) - } - } else { - for i, part := range gp.partition { - for _, idx := range part { - gp.indexToGroup[idx] = i - } - } - } - } - - return gp -} - -func (gp SeriesInt64Partition) endSorting() SeriesInt64Partition { - if gp.isDense { - newPartitionDense := make([][]int, len(gp.partitionDense)) - 
newPartitionDenseNulls := make([]int, len(gp.partitionDenseNulls)) - - for _, part := range gp.partitionDense { - newPartitionDense[gp.indexToGroup[part[0]]] = make([]int, len(part)) - } - - if len(gp.partitionDenseNulls) > 0 { - for i, idx := range gp.indexToGroup { - if idx == int64(len(gp.partitionDense)) { - newPartitionDenseNulls = append(newPartitionDenseNulls, i) - } else { - newPartitionDense[idx] = append(newPartitionDense[idx], i) - } - } - } else { - for i, idx := range gp.indexToGroup { - newPartitionDense[idx] = append(newPartitionDense[idx], i) - } - } - - gp.partitionDense = newPartitionDense - gp.partitionDenseNulls = newPartitionDenseNulls - } else { - newPartition := make(map[int64][]int, len(gp.partition)) - - for _, part := range gp.partition { - newPartition[int64(gp.indexToGroup[part[0]])] = make([]int, len(part)) - } - - for i, idx := range gp.indexToGroup { - newPartition[int64(idx)] = append(newPartition[int64(idx)], i) - } - - gp.partition = newPartition - } - - gp.indexToGroup = nil - return gp -} - -func (gp SeriesInt64Partition) GetMap() map[int64][]int { +func (gp *SeriesInt64Partition) getMap() map[int64][]int { if gp.isDense { map_ := make(map[int64][]int, len(gp.partitionDense)) for i, part := range gp.partitionDense { map_[int64(i)+gp.partitionDenseMin] = part } - return map_ - } - - return gp.partition -} -func (gp SeriesInt64Partition) GetValueIndices(val any) []int { - if val == nil { - if gp.isDense { - return gp.partitionDenseNulls - } else if nulls, ok := gp.partition[HASH_NULL_KEY]; ok { - return nulls + // Merge the nulls to the map + if gp.partitionDenseNulls != nil && len(gp.partitionDenseNulls) > 0 { + nullKey := __series_get_nullkey(map_, HASH_NULL_KEY) + map_[nullKey] = gp.partitionDenseNulls } - } else if v, ok := val.(int32); ok { - if gp.isDense { - return gp.partitionDense[v] - } else if part, ok := gp.partition[int64(v)]; ok { - return part - } - } - - return make([]int, 0) -} -func (gp SeriesInt64Partition) 
GetKeys() any { - var keys []int64 - if gp.isDense { - keys = make([]int64, 0, len(gp.partitionDense)) - for k, indeces := range gp.partitionDense { - if len(indeces) > 0 { - keys = append(keys, int64(k)) - } - } - } else { - keys = make([]int64, 0, len(gp.partition)) - for k := range gp.partition { - keys = append(keys, k) - } + return map_ } - return keys -} - -func (gp SeriesInt64Partition) debugPrint() { - fmt.Println("SeriesInt64Partition") - map_ := gp.GetMap() - for k, v := range map_ { - fmt.Printf("%4d: %v\n", k, v) - } + return gp.partition } -func (s SeriesInt64) Group() Series { +func (s SeriesInt64) group() Series { + var useDenseMap bool + var min, max int64 var partition SeriesInt64Partition + + // If the number of elements is small, + // look for the minimum and maximum values if len(s.data) < MINIMUM_PARALLEL_SIZE_2 { - max := s.data[0] - min := s.data[0] + useDenseMap = true + max = s.data[0] + min = s.data[0] for _, v := range s.data { if v > max { max = v @@ -987,7 +824,16 @@ func (s SeriesInt64) Group() Series { min = v } } + } + + // If the difference between the maximum and minimum values is acceptable, + // then we can use a dense map, otherwise we use a sparse map + if useDenseMap && (max-min >= MINIMUM_PARALLEL_SIZE_1) { + useDenseMap = false + } + // DENSE MAP + if useDenseMap { var nulls []int map_ := make([][]int, max-min+1) for i := 0; i < len(map_); i++ { @@ -1011,75 +857,57 @@ func (s SeriesInt64) Group() Series { partition = SeriesInt64Partition{ isDense: true, - seriesSize: s.Len(), partitionDenseMin: min, partitionDense: map_, partitionDenseNulls: nulls, } - } else { - var nullKey NullableInt64 - - // Initialize the maps - allMaps := make([]map[int64][]int, THREADS_NUMBER) - for i := 0; i < THREADS_NUMBER; i++ { - allMaps[i] = make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) - } + } else - if s.HasNull() { - allNulls := make([][]int, THREADS_NUMBER) - - // Define the worker callback - worker := func(threadNum, start, end 
int) { - map_ := allMaps[threadNum] - for i := start; i < end; i++ { - if s.IsNull(i) { - allNulls[threadNum] = append(allNulls[threadNum], i) - } else { - map_[s.data[i]] = append(map_[s.data[i]], i) - } - } + // SPARSE MAP + { + // Define the worker callback + worker := func(threadNum, start, end int, map_ map[int64][]int) { + up := end - ((end - start) % 8) + for i := start; i < up; { + map_[s.data[i]] = append(map_[s.data[i]], i) + i++ + map_[s.data[i]] = append(map_[s.data[i]], i) + i++ + map_[s.data[i]] = append(map_[s.data[i]], i) + i++ + map_[s.data[i]] = append(map_[s.data[i]], i) + i++ + map_[s.data[i]] = append(map_[s.data[i]], i) + i++ + map_[s.data[i]] = append(map_[s.data[i]], i) + i++ + map_[s.data[i]] = append(map_[s.data[i]], i) + i++ + map_[s.data[i]] = append(map_[s.data[i]], i) + i++ } - __series_groupby_multithreaded(THREADS_NUMBER, len(s.data), allMaps, allNulls, worker) - - nullKey = NullableInt64{Valid: true, Value: __series_get_nullkey(allMaps[0], HASH_NULL_KEY)} - } else { - // Define the worker callback - worker := func(threadNum, start, end int) { - map_ := allMaps[threadNum] - up := end - ((end - start) % 8) - for i := start; i < up; { - map_[s.data[i]] = append(map_[s.data[i]], i) - i++ - map_[s.data[i]] = append(map_[s.data[i]], i) - i++ - map_[s.data[i]] = append(map_[s.data[i]], i) - i++ - map_[s.data[i]] = append(map_[s.data[i]], i) - i++ - map_[s.data[i]] = append(map_[s.data[i]], i) - i++ - map_[s.data[i]] = append(map_[s.data[i]], i) - i++ - map_[s.data[i]] = append(map_[s.data[i]], i) - i++ - map_[s.data[i]] = append(map_[s.data[i]], i) - i++ - } + for i := up; i < end; i++ { + map_[s.data[i]] = append(map_[s.data[i]], i) + } + } - for i := up; i < end; i++ { + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int) { + for i := start; i < end; i++ { + if s.IsNull(i) { + (*nulls) = append((*nulls), i) + } else { map_[s.data[i]] = append(map_[s.data[i]], i) } } 
- - __series_groupby_multithreaded(THREADS_NUMBER, len(s.data), allMaps, nil, worker) } partition = SeriesInt64Partition{ - isDense: false, - seriesSize: s.Len(), - partition: allMaps[0], - nullKey: nullKey, + isDense: false, + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_2, len(s.data), s.HasNull(), + worker, workerNulls), } } @@ -1089,62 +917,50 @@ func (s SeriesInt64) Group() Series { return s } -func (s SeriesInt64) SubGroup(partition SeriesPartition) Series { - var newPartition SeriesInt64Partition - otherIndeces := partition.GetMap() - - if len(s.data) < MINIMUM_PARALLEL_SIZE_2 { +func (s SeriesInt64) GroupBy(partition SeriesPartition) Series { + if partition == nil { + return s + } - map_ := make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) + // collect all keys + otherIndeces := partition.getMap() + keys := make([]int64, len(otherIndeces)) + i := 0 + for k := range otherIndeces { + keys[i] = k + i++ + } + // Define the worker callback + worker := func(threadNum, start, end int, map_ map[int64][]int) { var newHash int64 - for h, v := range otherIndeces { - for _, index := range v { + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function newHash = s.data[index] + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) map_[newHash] = append(map_[newHash], index) } } + } - newPartition = SeriesInt64Partition{ - seriesSize: s.Len(), - partition: map_, - } - } else { - // collect all keys - keys := make([]int64, len(otherIndeces)) - i := 0 - for k := range otherIndeces { - keys[i] = k - i++ - } - - // Initialize the maps - allMaps := make([]map[int64][]int, THREADS_NUMBER) - for i := 0; i < THREADS_NUMBER; i++ { - allMaps[i] = make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) - } - - if s.HasNull() { - } else { - // Define the worker callback - worker := func(threadNum, start, end int) { - var newHash int64 - map_ := 
allMaps[threadNum] - for _, h := range keys[start:end] { - for _, index := range otherIndeces[h] { - newHash = s.data[index] + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) - map_[newHash] = append(map_[newHash], index) - } + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int) { + var newHash int64 + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function + if s.IsNull(index) { + newHash = HASH_MAGIC_NUMBER_NULL + (h << 13) + (h >> 4) + } else { + newHash = s.data[index] + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) } + map_[newHash] = append(map_[newHash], index) } - - __series_groupby_multithreaded(THREADS_NUMBER, len(keys), allMaps, nil, worker) } + } - newPartition = SeriesInt64Partition{ - seriesSize: s.Len(), - partition: allMaps[0], - } + newPartition := SeriesInt64Partition{ + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_2, len(keys), s.HasNull(), + worker, workerNulls), } s.isGrouped = true @@ -1153,21 +969,65 @@ func (s SeriesInt64) SubGroup(partition SeriesPartition) Series { return s } +func (s SeriesInt64) UnGroup() Series { + s.isGrouped = false + s.partition = nil + return s +} + func (s SeriesInt64) GetPartition() SeriesPartition { return s.partition } //////////////////////// SORTING OPERATIONS +func (s SeriesInt64) Less(i, j int) bool { + if s.isNullable { + if s.nullMask[i>>3]&(1< 0 { + return false + } + if s.nullMask[j>>3]&(1< 0 { + return true + } + } + + return s.data[i] < s.data[j] +} + +func (s SeriesInt64) equal(i, j int) bool { + if s.isNullable { + if (s.nullMask[i>>3] & (1 << uint(i%8))) > 0 { + return (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 + } + if (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 { + return false + } + } + + return s.data[i] == s.data[j] +} + +func (s SeriesInt64) Swap(i, j int) { + if s.isNullable { + // i is null, j is not 
null + if s.nullMask[i>>3]&(1< 0 && s.nullMask[j>>3]&(1<>3] &= ^(1 << uint(i%8)) + s.nullMask[j>>3] |= 1 << uint(j%8) + } else + + // i is not null, j is null + if s.nullMask[i>>3]&(1<>3]&(1< 0 { + s.nullMask[i>>3] |= 1 << uint(i%8) + s.nullMask[j>>3] &= ^(1 << uint(j%8)) + } + } + + s.data[i], s.data[j] = s.data[j], s.data[i] +} + func (s SeriesInt64) Sort() Series { if s.sorted != SORTED_ASC { - if s.isGrouped { - *s.partition = (*s.partition).beginSorting() - sort.Sort(s) - *s.partition = (*s.partition).endSorting() - } else { - sort.Sort(s) - } + sort.Sort(s) s.sorted = SORTED_ASC } return s @@ -1175,13 +1035,7 @@ func (s SeriesInt64) Sort() Series { func (s SeriesInt64) SortRev() Series { if s.sorted != SORTED_DESC { - if s.isGrouped { - *s.partition = (*s.partition).beginSorting() - sort.Sort(sort.Reverse(s)) - *s.partition = (*s.partition).endSorting() - } else { - sort.Sort(sort.Reverse(s)) - } + sort.Sort(sort.Reverse(s)) s.sorted = SORTED_DESC } return s diff --git a/core/gandalff/gdl_series_int64_test.go b/core/gandalff/gdl_series_int64_test.go index 1e1753c..d098962 100644 --- a/core/gandalff/gdl_series_int64_test.go +++ b/core/gandalff/gdl_series_int64_test.go @@ -663,212 +663,123 @@ func Test_SeriesInt64_Map(t *testing.T) { } } -func Test_SeriesInt64_Sort(t *testing.T) { - - data := []int64{2, 323, 42, 4, 9, 674, 42, 48, 9811, 79, 3, 12, 492, 47005, -173, -28, 323, 42, 4, 9, 31, 425, 2} - mask := []bool{false, false, true, false, false, true, false, false, true, false, false, true, false, false, true, false, false, true, false, false, true, false, false} +func Test_SeriesInt64_Group(t *testing.T) { + var partMap map[int64][]int - // Create a new series. 
- s := NewSeriesInt64("test", true, true, data) + data1 := []int64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1} + data1Mask := []bool{false, false, false, false, false, true, true, true, true, true} + data2 := []int64{1, 1, 2, 2, 1, 1, 2, 2, 1, 1} + data2Mask := []bool{false, true, false, true, false, true, false, true, false, true} + data3 := []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + data3Mask := []bool{false, false, false, false, false, true, true, true, true, true} - // Sort the series. - sorted := s.Sort() + // Test 1 + s1 := NewSeriesInt64("s1", true, true, data1). + SetNullMask(data1Mask). + group() - // Check the length. - if sorted.Len() != 23 { - t.Errorf("Expected length of 23, got %d", sorted.Len()) + p1 := s1.GetPartition().getMap() + if len(p1) != 2 { + t.Errorf("Expected 2 groups, got %d", len(p1)) } - // Check the data. - result := []int64{-173, -28, 2, 2, 3, 4, 4, 9, 9, 12, 31, 42, 42, 42, 48, 79, 323, 323, 425, 492, 674, 9811, 47005} - for i, v := range sorted.Data().([]int64) { - if v != result[i] { - t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - } + partMap = map[int64][]int{ + 0: {0, 1, 2, 3, 4}, + 1: {5, 6, 7, 8, 9}, + } + if !checkEqPartitionMap(p1, partMap, nil, "Int64 Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p1) } - ///////////////////////////////////////////////////////////////////////////////////// + // Test 2 + s2 := NewSeriesInt64("s2", true, true, data2). + SetNullMask(data2Mask). + GroupBy(s1.GetPartition()) - // Create a new series. - s = NewSeriesInt64("test", true, true, data) + p2 := s2.GetPartition().getMap() + if len(p2) != 6 { + t.Errorf("Expected 6 groups, got %d", len(p2)) + } - // Set the null mask. - s.SetNullMask(mask) + partMap = map[int64][]int{ + 0: {0, 4}, + 1: {1, 3}, + 2: {2}, + 3: {5, 7, 9}, + 4: {6}, + 5: {8}, + } + if !checkEqPartitionMap(p2, partMap, nil, "Int64 Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p2) + } - // Sort the series. 
- sorted = s.Sort() + // Test 3 + s3 := NewSeriesInt64("test", true, true, data3). + SetNullMask(data3Mask). + GroupBy(s2.GetPartition()) - // Check the length. - if sorted.Len() != 23 { - t.Errorf("Expected length of 23, got %d", sorted.Len()) + p3 := s3.GetPartition().getMap() + if len(p3) != 8 { + t.Errorf("Expected 8 groups, got %d", len(p3)) } - // Check the data. - result = []int64{-28, 2, 2, 3, 4, 4, 9, 9, 42, 48, 79, 323, 323, 425, 492, 47005, 42, 674, 9811, 12, -173, 42, 31} - for i, v := range sorted.Data().([]int64) { - if i < 16 && v != result[i] { - t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - } + partMap = map[int64][]int{ + 0: {0}, + 1: {1}, + 2: {2}, + 3: {3}, + 4: {4}, + 5: {5, 7, 9}, + 6: {6}, + 7: {8}, } - - // Check the null mask. - for i, v := range sorted.GetNullMask() { - if i < 16 && v != false { - t.Errorf("Expected nullMask of %v, got %v at index %d", false, v, i) - } else if i >= 16 && v != true { - t.Errorf("Expected nullMask of %v, got %v at index %d", true, v, i) - } + if !checkEqPartitionMap(p3, partMap, nil, "Int64 Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p3) } -} -func Test_SeriesInt64_GroupedSort(t *testing.T) { - data := []int64{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1} - // mask := []bool{false, true, false, false, false, false, true, false, false, false, false, true, false, false, false} + // debugPrintPartition(s1.GetPartition(), s1) + // debugPrintPartition(s2.GetPartition(), s1, s2) + // debugPrintPartition(s3.GetPartition(), s1, s2, s3) + + partMap = nil +} - partData := []int64{3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2} - p := NewSeriesInt64("part", true, true, partData).Group() +func Test_SeriesInt64_Sort(t *testing.T) { + data := []int64{821, 258, -547, -624, 337, -909, -715, 317, -827, -103, 271, 159, 230, -346, 471, 897, 801, 492, 45, -70} + mask := []bool{false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, 
false, true, false, true} // Create a new series. - s := NewSeriesInt64("test", true, true, data). - SubGroup(p.GetPartition()). - Sort() + s := NewSeriesInt64("test", false, true, data) - // Check the length. - if s.Len() != 15 { - t.Errorf("Expected length of 15, got %d", s.Len()) - } + // Sort the series. + sorted := s.Sort() // Check the data. - result := []int64{6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 11, 12, 13, 14, 15} - for i, v := range s.Data().([]int64) { - if v != result[i] { - t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - } + expected := []int64{-909, -827, -715, -624, -547, -346, -103, -70, 45, 159, 230, 258, 271, 317, 337, 471, 492, 801, 821, 897} + if !checkEqSliceInt64(sorted.Data().([]int64), expected, nil, "") { + t.Errorf("SeriesInt64.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]int64)) } - ///////////////////////////////////////////////////////////////////////////////////// - - // s = NewSeriesInt64("test", true, true, data). - // SetNullMask(mask). - // SubGroup(p.GetPartition()). - // Sort() - - // // Check the length. - // if s.Len() != 15 { - // t.Errorf("Expected length of 15, got %d", s.Len()) - // } - - // // Check the data. - // result = []int64{6, 7, 8, 10, 1, 2, 3, 5, 11, 12, 13, 15, 9, 4, 14} - // for i, v := range s.Data().([]int64) { - // if i < 14 && v != result[i] { - // t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - // } - // } - - // // Check the null mask. - // for i, v := range s.GetNullMask() { - // if i < 12 && v != false { - // t.Errorf("Expected nullMask of %v, got %v at index %d", false, v, i) - // } else if i >= 12 && v != true { - // t.Errorf("Expected nullMask of %v, got %v at index %d", true, v, i) - // } - // } - - ///////////////////////////////////////////////////////////////////////////////////// - // Reverse sort. 
- - dataRev := []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} - // maskRev := []bool{false, true, false, false, false, false, true, false, false, false, false, true, false, false, false} - - s = NewSeriesInt64("test", true, true, dataRev). - SubGroup(p.GetPartition()). - SortRev() + // Create a new series. + s = NewSeriesInt64("test", true, true, data). + SetNullMask(mask) - // Check the length. - if s.Len() != 15 { - t.Errorf("Expected length of 15, got %d", s.Len()) - } + // Sort the series. + sorted = s.Sort() // Check the data. - result = []int64{5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6} - for i, v := range s.Data().([]int64) { - if v != result[i] { - t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - } + expected = []int64{-827, -715, -547, 45, 230, 271, 337, 471, 801, 821, -909, 159, -103, -346, 317, 897, -624, 492, 258, -70} + if !checkEqSliceInt64(sorted.Data().([]int64), expected, nil, "") { + t.Errorf("SeriesInt64.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]int64)) } - /////////////////////////////////////////////////////////////////////////////////// - - // s = NewSeriesInt64("test", true, true, dataRev). - // SetNullMask(maskRev). - // SubGroup(p.GetPartition()). - // SortRev() - - // // Check the length. - // if s.Len() != 15 { - // t.Errorf("Expected length of 15, got %d", s.Len()) - // } - - // // Check the data. - // result = []int64{5, 4, 3, 1, 10, 9, 8, 6, 15, 14, 13, 11, 2, 7, 12} - // for i, v := range s.Data().([]int64) { - // if i < 14 && v != result[i] { - // t.Errorf("Expected %v, got %v at index %d", result[i], v, i) - // } - // } - - // // Check the null mask. - // for i, v := range s.GetNullMask() { - // if i < 12 && v != false { - // t.Errorf("Expected nullMask of %v, got %v at index %d", false, v, i) - // } else if i >= 12 && v != true { - // t.Errorf("Expected nullMask of %v, got %v at index %d", true, v, i) - // } - // } + // Check the null mask. 
+ expectedMask := []bool{false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true} + if !checkEqSliceBool(sorted.GetNullMask(), expectedMask, nil, "") { + t.Errorf("SeriesInt64.Sort() failed, expecting %v, got %v", expectedMask, sorted.GetNullMask()) + } } -// func Test_SeriesInt64_Multiplication(t *testing.T) { - -// data := []int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20} - -// // s * 2 -// res := NewSeriesInt64("test", true, true, &data).Mul(NewSeriesInt64("test", true, true, &[]int{2})) -// if e, ok := res.(SeriesError); ok { -// t.Errorf("Got error: %v", e) -// } - -// // Check the length. -// if res.Len() != 20 { -// t.Errorf("Expected length of 20, got %d", res.Len()) -// } - -// // Check the data. -// for i, v := range res.Data().([]int64) { -// if v != data[i]*2 { -// t.Errorf("Expected %v, got %v at index %d", data[i]*2, v, i) -// } -// } - -// // 2 * s -// res = NewSeriesInt64("test", true, true, &[]int{2}).Mul(NewSeriesInt64("test", true, true, &data)) -// if e, ok := res.(SeriesError); ok { -// t.Errorf("Got error: %v", e) -// } - -// // Check the length. -// if res.Len() != 20 { -// t.Errorf("Expected length of 20, got %d", res.Len()) -// } - -// // Check the data. 
-// for i, v := range res.Data().([]int64) { -// if v != data[i]*2 { -// t.Errorf("Expected %v, got %v at index %d", data[i]*2, v, i) -// } -// } -// } - func Test_SeriesInt64_Arithmetic_Mul(t *testing.T) { bools := NewSeriesBool("test", true, false, []bool{true}).(SeriesBool) boolv := NewSeriesBool("test", true, false, []bool{true, false, true, false, true, false, true, true, false, false}).(SeriesBool) diff --git a/core/gandalff/gdl_series_string.go b/core/gandalff/gdl_series_string.go index 6c04169..309c945 100644 --- a/core/gandalff/gdl_series_string.go +++ b/core/gandalff/gdl_series_string.go @@ -2,6 +2,7 @@ package gandalff import ( "fmt" + "sort" "strconv" "strings" "sync" @@ -70,7 +71,7 @@ func (s SeriesString) Type() typesys.BaseType { // Returns the type and cardinality of the series. func (s SeriesString) TypeCard() typesys.BaseTypeCard { - return typesys.BaseTypeCard{typesys.StringType, s.Len()} + return typesys.BaseTypeCard{Base: typesys.StringType, Card: s.Len()} } // Returns if the series is grouped. @@ -254,34 +255,6 @@ func (s SeriesString) Take(params ...int) Series { return s.filterIntSlice(indeces) } -func (s SeriesString) Less(i, j int) bool { - if s.isNullable { - if s.nullMask[i>>3]&(1< 0 { - return false - } - if s.nullMask[j>>3]&(1< 0 { - return true - } - } - return strings.Compare(*s.data[i], *s.data[j]) < 0 -} - -func (s SeriesString) Swap(i, j int) { - if s.isNullable { - if s.nullMask[i>>3]&(1< 0 { - s.nullMask[j>>3] |= 1 << uint(j%8) - } else { - s.nullMask[j>>3] &= ^(1 << uint(j%8)) - } - if s.nullMask[j>>3]&(1< 0 { - s.nullMask[i>>3] |= 1 << uint(i%8) - } else { - s.nullMask[i>>3] &= ^(1 << uint(i%8)) - } - } - s.data[i], s.data[j] = s.data[j], s.data[i] -} - // Append appends a value or a slice of values to the series. 
func (s SeriesString) Append(v any) Series { switch v := v.(type) { @@ -979,99 +952,47 @@ func (s SeriesString) Map(f GDLMapFunc, stringPool *StringPool) Series { //////////////////////// GROUPING OPERATIONS type SeriesStringPartition struct { - seriesSize int - partition map[int64][]int - indexToGroup []int - pool *StringPool + partition map[int64][]int + pool *StringPool } -func (gp SeriesStringPartition) GetSize() int { +func (gp *SeriesStringPartition) getSize() int { return len(gp.partition) } -func (gp SeriesStringPartition) GetMap() map[int64][]int { +func (gp *SeriesStringPartition) getMap() map[int64][]int { return gp.partition } -func (gp SeriesStringPartition) GetValueIndices(val any) []int { - if val == nil { - if nulls, ok := gp.partition[HASH_NULL_KEY]; ok { - return nulls - } - } else { - if v, ok := val.(string); ok { - if addr := gp.pool.Put(v); addr == nil { - if vals, ok := gp.partition[(*(*int64)(unsafe.Pointer(unsafe.Pointer(addr))))]; ok { - return vals - } - } - } - } - - return make([]int, 0) -} - -func (gp SeriesStringPartition) GetKeys() any { - keys := make([]string, 0, len(gp.partition)) - for k := range gp.partition { - if k != HASH_NULL_KEY { - keys = append(keys, *(*string)(unsafe.Pointer(&k))) - } - } - return keys -} - -func (gp SeriesStringPartition) debugPrint() { - fmt.Println("SeriesStringPartition") - map_ := gp.GetMap() - for k, v := range map_ { - ptr := (*string)(unsafe.Pointer(uintptr(k))) - fmt.Printf("%10s: %v\n", *ptr, v) - } -} - -func (s SeriesString) Group() Series { - - var partition SeriesStringPartition - if len(s.data) < MINIMUM_PARALLEL_SIZE_1 { - map_ := make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) +func (s SeriesString) group() Series { + // Define the worker callback + worker := func(threadNum, start, end int, map_ map[int64][]int) { var ptr unsafe.Pointer - for i := 0; i < len(s.data); i++ { + for i := start; i < end; i++ { ptr = unsafe.Pointer(s.data[i]) map_[(*(*int64)(unsafe.Pointer(&ptr)))] = 
append(map_[(*(*int64)(unsafe.Pointer(&ptr)))], i) } + } - partition = SeriesStringPartition{ - seriesSize: s.Len(), - partition: map_, - pool: s.pool, - } - } else { - - // Initialize the maps and the wait groups - allMaps := make([]map[int64][]int, THREADS_NUMBER) - for i := 0; i < THREADS_NUMBER; i++ { - allMaps[i] = make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) - } - - // Define the worker callback - worker := func(threadNum, start, end int) { - map_ := allMaps[threadNum] - var ptr unsafe.Pointer - for i := start; i < end; i++ { + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int) { + var ptr unsafe.Pointer + for i := start; i < end; i++ { + if s.IsNull(i) { + (*nulls) = append((*nulls), i) + } else { ptr = unsafe.Pointer(s.data[i]) map_[(*(*int64)(unsafe.Pointer(&ptr)))] = append(map_[(*(*int64)(unsafe.Pointer(&ptr)))], i) } } + } - __series_groupby_multithreaded(THREADS_NUMBER, len(s.data), allMaps, nil, worker) - - partition = SeriesStringPartition{ - seriesSize: s.Len(), - partition: allMaps[0], - pool: s.pool, - } + partition := SeriesStringPartition{ + pool: s.pool, + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_1, len(s.data), s.HasNull(), + worker, workerNulls), } s.isGrouped = true @@ -1080,67 +1001,51 @@ func (s SeriesString) Group() Series { return s } -func (s SeriesString) SubGroup(partition SeriesPartition) Series { - var newPartition SeriesStringPartition - otherIndeces := partition.GetMap() - - if len(s.data) < MINIMUM_PARALLEL_SIZE_1 { - - map_ := make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) +func (s SeriesString) GroupBy(partition SeriesPartition) Series { + // collect all keys + otherIndeces := partition.getMap() + keys := make([]int64, len(otherIndeces)) + i := 0 + for k := range otherIndeces { + keys[i] = k + i++ + } + // Define the worker callback + worker := func(threadNum, start, end int, map_ map[int64][]int) { var 
newHash int64 var ptr unsafe.Pointer - for h, v := range otherIndeces { - for _, index := range v { + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function ptr = unsafe.Pointer(s.data[index]) newHash = *(*int64)(unsafe.Pointer(&ptr)) + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) map_[newHash] = append(map_[newHash], index) } } + } - newPartition = SeriesStringPartition{ - seriesSize: s.Len(), - partition: map_, - pool: s.pool, - } - } else { - - // collect all keys - keys := make([]int64, len(otherIndeces)) - i := 0 - for k := range otherIndeces { - keys[i] = k - i++ - } - - // Initialize the maps and the wait groups - allMaps := make([]map[int64][]int, THREADS_NUMBER) - for i := 0; i < THREADS_NUMBER; i++ { - allMaps[i] = make(map[int64][]int, DEFAULT_HASH_MAP_INITIAL_CAPACITY) - } - - // Define the worker callback - worker := func(threadNum, start, end int) { - var newHash int64 - var ptr unsafe.Pointer - - map_ := allMaps[threadNum] - for _, h := range keys[start:end] { - for _, index := range otherIndeces[h] { + // Define the worker callback for nulls + workerNulls := func(threadNum, start, end int, map_ map[int64][]int, nulls *[]int) { + var newHash int64 + var ptr unsafe.Pointer + for _, h := range keys[start:end] { // keys is defined outside the function + for _, index := range otherIndeces[h] { // otherIndeces is defined outside the function + if s.IsNull(index) { + newHash = HASH_MAGIC_NUMBER_NULL + (h << 13) + (h >> 4) + } else { ptr = unsafe.Pointer(s.data[index]) newHash = *(*int64)(unsafe.Pointer(&ptr)) + HASH_MAGIC_NUMBER + (h << 13) + (h >> 4) - map_[newHash] = append(map_[newHash], index) } + map_[newHash] = append(map_[newHash], index) } } + } - __series_groupby_multithreaded(THREADS_NUMBER, len(keys), allMaps, nil, worker) - - newPartition = SeriesStringPartition{ - seriesSize: s.Len(), - partition: allMaps[0], - pool: s.pool, - } + 
newPartition := SeriesStringPartition{ + pool: s.pool, + partition: __series_groupby( + THREADS_NUMBER, MINIMUM_PARALLEL_SIZE_1, len(keys), s.HasNull(), + worker, workerNulls), } s.isGrouped = true @@ -1149,20 +1054,78 @@ func (s SeriesString) SubGroup(partition SeriesPartition) Series { return s } +func (s SeriesString) UnGroup() Series { + s.isGrouped = false + s.partition = nil + return s +} + func (s SeriesString) GetPartition() SeriesPartition { return s.partition } +//////////////////////// SORTING OPERATIONS + +func (s SeriesString) Less(i, j int) bool { + if s.isNullable { + if s.nullMask[i>>3]&(1< 0 { + return false + } + if s.nullMask[j>>3]&(1< 0 { + return true + } + } + + return (*s.data[i]) < (*s.data[j]) +} + +func (s SeriesString) equal(i, j int) bool { + if s.isNullable { + if (s.nullMask[i>>3] & (1 << uint(i%8))) > 0 { + return (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 + } + if (s.nullMask[j>>3] & (1 << uint(j%8))) > 0 { + return false + } + } + + return (*s.data[i]) == (*s.data[j]) +} + +func (s SeriesString) Swap(i, j int) { + if s.isNullable { + // i is null, j is not null + if s.nullMask[i>>3]&(1< 0 && s.nullMask[j>>3]&(1<>3] &= ^(1 << uint(i%8)) + s.nullMask[j>>3] |= 1 << uint(j%8) + } else + + // i is not null, j is null + if s.nullMask[i>>3]&(1<>3]&(1< 0 { + s.nullMask[i>>3] |= 1 << uint(i%8) + s.nullMask[j>>3] &= ^(1 << uint(j%8)) + } + } + + s.data[i], s.data[j] = s.data[j], s.data[i] +} + func (s SeriesString) Sort() Series { + if s.sorted != SORTED_ASC { + sort.Sort(s) + s.sorted = SORTED_ASC + } return s } func (s SeriesString) SortRev() Series { + if s.sorted != SORTED_DESC { + sort.Sort(sort.Reverse(s)) + s.sorted = SORTED_DESC + } return s } -//////////////////////// SORTING OPERATIONS - //////////////////////// STRING OPERATIONS func (s SeriesString) ToUpper() Series { diff --git a/core/gandalff/gdl_series_string_test.go b/core/gandalff/gdl_series_string_test.go index 7506aae..7c00183 100644 --- 
a/core/gandalff/gdl_series_string_test.go +++ b/core/gandalff/gdl_series_string_test.go @@ -586,6 +586,124 @@ func Test_SeriesString_Map(t *testing.T) { } } +func Test_SeriesString_Group(t *testing.T) { + var partMap map[int64][]int + pool := NewStringPool() + + data1 := []string{"foo", "foo", "foo", "foo", "foo", "foo", "foo", "foo", "foo", "foo"} + data1Mask := []bool{false, false, false, false, false, true, true, true, true, true} + data2 := []string{"foo", "foo", "bar", "bar", "foo", "foo", "bar", "bar", "foo", "foo"} + data2Mask := []bool{false, true, false, true, false, true, false, true, false, true} + data3 := []string{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"} + data3Mask := []bool{false, false, false, false, false, true, true, true, true, true} + + // Test 1 + s1 := NewSeriesString("s1", true, data1, pool). + SetNullMask(data1Mask). + group() + + p1 := s1.GetPartition().getMap() + if len(p1) != 2 { + t.Errorf("Expected 2 groups, got %d", len(p1)) + } + + partMap = map[int64][]int{ + 0: {0, 1, 2, 3, 4}, + 1: {5, 6, 7, 8, 9}, + } + if !checkEqPartitionMap(p1, partMap, nil, "String Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p1) + } + + // Test 2 + s2 := NewSeriesString("s2", true, data2, pool). + SetNullMask(data2Mask). + GroupBy(s1.GetPartition()) + + p2 := s2.GetPartition().getMap() + if len(p2) != 6 { + t.Errorf("Expected 6 groups, got %d", len(p2)) + } + + partMap = map[int64][]int{ + 0: {0, 4}, + 1: {1, 3}, + 2: {2}, + 3: {5, 7, 9}, + 4: {6}, + 5: {8}, + } + if !checkEqPartitionMap(p2, partMap, nil, "String Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p2) + } + + // Test 3 + s3 := NewSeriesString("test", true, data3, pool). + SetNullMask(data3Mask). 
+ GroupBy(s2.GetPartition()) + + p3 := s3.GetPartition().getMap() + if len(p3) != 8 { + t.Errorf("Expected 8 groups, got %d", len(p3)) + } + + partMap = map[int64][]int{ + 0: {0}, + 1: {1}, + 2: {2}, + 3: {3}, + 4: {4}, + 5: {5, 7, 9}, + 6: {6}, + 7: {8}, + } + if !checkEqPartitionMap(p3, partMap, nil, "String Group") { + t.Errorf("Expected partition map of %v, got %v", partMap, p3) + } + + // debugPrintPartition(s1.GetPartition(), s1) + // debugPrintPartition(s2.GetPartition(), s1, s2) + // debugPrintPartition(s3.GetPartition(), s1, s2, s3) + + partMap = nil +} + +func Test_SeriesString_Sort(t *testing.T) { + data := []string{"w", "w", "d", "y", "b", "e", "a", "e", "e", "b", "l", "u", "a", "g", "w", "u", "{", "x", "t", "h"} + mask := []bool{false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true} + + // Create a new series. + s := NewSeriesString("test", false, data, NewStringPool()) + + // Sort the series. + sorted := s.Sort() + + // Check the data. + expected := []string{"a", "a", "b", "b", "d", "e", "e", "e", "g", "h", "l", "t", "u", "u", "w", "w", "w", "x", "y", "{"} + if !checkEqSliceString(sorted.Data().([]string), expected, nil, "") { + t.Errorf("SeriesString.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]string)) + } + + // Create a new series. + s = NewSeriesString("test", true, data, NewStringPool()). + SetNullMask(mask) + + // Sort the series. + sorted = s.Sort() + + // Check the data. + expected = []string{"a", "a", "b", "d", "e", "l", "t", "w", "w", "{", "e", "u", "b", "g", "e", "u", "y", "x", "w", "h"} + if !checkEqSliceString(sorted.Data().([]string), expected, nil, "") { + t.Errorf("SeriesString.Sort() failed, expecting %v, got %v", expected, sorted.Data().([]string)) + } + + // Check the null mask. 
+ expectedMask := []bool{false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true} + if !checkEqSliceBool(sorted.GetNullMask(), expectedMask, nil, "") { + t.Errorf("SeriesString.Sort() failed, expecting %v, got %v", expectedMask, sorted.GetNullMask()) + } +} + func Test_SeriesString_Arithmetic_Add(t *testing.T) { pool := NewStringPool() diff --git a/core/gandalff/gdl_series_utils.go b/core/gandalff/gdl_series_utils.go index 538514e..c4edb34 100644 --- a/core/gandalff/gdl_series_utils.go +++ b/core/gandalff/gdl_series_utils.go @@ -65,3 +65,31 @@ func seriesTakePreprocess(size int, params ...int) ([]int, error) { return nil, fmt.Errorf("series.Take: invalid number of parameters: %d", len(params)) } } + +func debugPrintPartition(p SeriesPartition, series ...Series) { + map_ := p.getMap() + + header := "" + separators := "" + for _, s := range series { + header += fmt.Sprintf("| %-10s ", s.Name()) + separators += "|------------" + } + + fmt.Println() + fmt.Printf(" | %-20s %s | %-20s |\n", "Key", header, "Indeces") + fmt.Printf(" |%s%s-|%s|\n", "----------------------", separators, "----------------------") + for k, v := range map_ { + vals := "" + for _, s := range series { + vals += fmt.Sprintf("| %-10s ", s.GetString(v[0])) + } + + indeces := "" + for _, i := range v { + indeces += fmt.Sprintf("%d ", i) + } + fmt.Printf(" | %-20d %s | %-20s |\n", k, vals, indeces) + } + fmt.Println() +} diff --git a/core/gandalff/utils.go b/core/gandalff/utils.go index 03186c2..485806a 100644 --- a/core/gandalff/utils.go +++ b/core/gandalff/utils.go @@ -3,6 +3,7 @@ package gandalff import ( "fmt" "math" + "sort" "testing" ) @@ -32,7 +33,11 @@ func checkEqSliceBool(a, b []bool, t *testing.T, msg string) bool { if t == nil { for i, x := range a { if x != b[i] { - fmt.Printf(" %s: %4d - expected '%v', got '%v'\n", msg, i, b[i], a[i]) + if msg != "" { + fmt.Printf(" %s: %4d - expected '%v', got '%v'\n", msg, i, 
b[i], a[i]) + } else { + fmt.Printf(" checkEqSliceBool: %4d - expected '%v', got '%v'\n", i, b[i], a[i]) + } return false } } @@ -47,6 +52,20 @@ func checkEqSliceBool(a, b []bool, t *testing.T, msg string) bool { return true } +func checkEqSliceInt(a, b []int) bool { + if len(a) != len(b) { + return false + } + + for i, x := range a { + if x != b[i] { + return false + } + } + + return true +} + func checkEqSliceInt32(a, b []int32, t *testing.T, msg string) bool { if len(a) != len(b) { return false @@ -55,7 +74,11 @@ func checkEqSliceInt32(a, b []int32, t *testing.T, msg string) bool { if t == nil { for i, x := range a { if x != b[i] { - fmt.Printf(" %s: %4d - expected '%v', got '%v'\n", msg, i, b[i], a[i]) + if msg != "" { + fmt.Printf(" %s: %4d - expected '%v', got '%v'\n", msg, i, b[i], a[i]) + } else { + fmt.Printf(" checkEqSliceInt32: %4d - expected '%v', got '%v'\n", i, b[i], a[i]) + } return false } } @@ -78,7 +101,11 @@ func checkEqSliceInt64(a, b []int64, t *testing.T, msg string) bool { if t == nil { for i, x := range a { if x != b[i] { - fmt.Printf(" %s: %4d - expected '%v', got '%v'\n", msg, i, b[i], a[i]) + if msg != "" { + fmt.Printf(" %s: %4d - expected '%v', got '%v'\n", msg, i, b[i], a[i]) + } else { + fmt.Printf(" checkEqSliceInt64: %4d - expected '%v', got '%v'\n", i, b[i], a[i]) + } return false } } @@ -104,7 +131,11 @@ func checkEqSliceFloat64(a, b []float64, t *testing.T, msg string) bool { continue } if x != b[i] { - fmt.Printf(" %s: %4d - expected '%v', got '%v'\n", msg, i, b[i], a[i]) + if msg != "" { + fmt.Printf(" %s: %4d - expected '%v', got '%v'\n", msg, i, b[i], a[i]) + } else { + fmt.Printf(" checkEqSliceFloat64: %4d - expected '%v', got '%v'\n", i, b[i], a[i]) + } return false } } @@ -130,7 +161,11 @@ func checkEqSliceString(a, b []string, t *testing.T, msg string) bool { if t == nil { for i, x := range a { if x != b[i] { - fmt.Printf(" %s: %4d - expected '%v', got '%v'\n", msg, i, b[i], a[i]) + if msg != "" { + fmt.Printf(" %s: %4d - 
expected '%v', got '%v'\n", msg, i, b[i], a[i]) + } else { + fmt.Printf(" checkEqSliceString: %4d - expected '%v', got '%v'\n", i, b[i], a[i]) + } return false } } @@ -144,3 +179,45 @@ func checkEqSliceString(a, b []string, t *testing.T, msg string) bool { } return true } + +func checkEqPartitionMap(a, b map[int64][]int, t *testing.T, msg string) bool { + if len(a) != len(b) { + return false + } + + // check if the two maps represent the same partitioning + // the keys can be different, but the values must be the same + if t == nil { + for _, v := range a { + found := false + vSorted := sort.IntSlice(v) + for _, w := range b { + if checkEqSliceInt(vSorted, sort.IntSlice(w)) { + found = true + break + } + } + if !found { + fmt.Printf(" %s: expected partition %v not found\n", msg, v) + return false + } + } + } else { + for _, v := range a { + found := false + vSorted := sort.IntSlice(v) + for _, w := range b { + if checkEqSliceInt(vSorted, sort.IntSlice(w)) { + found = true + break + } + } + if !found { + t.Errorf("%s: expected partition %v not found\n", msg, v) + return false + } + } + } + + return true +} diff --git a/core/intern.go b/core/intern.go index de06f27..f9860ab 100644 --- a/core/intern.go +++ b/core/intern.go @@ -404,11 +404,26 @@ func (i *__p_intern__) listToSeriesString() (gandalff.Series, error) { } } -func (lhs *__p_intern__) appendOperand(op typesys.OPCODE, rhs *__p_intern__) { +// isNeg returns true if the expression is a negative number +// used for special cases like orderBy +func (i *__p_intern__) isNeg() bool { + if len(i.expr) == 2 { + if op, ok := i.expr[1].(typesys.OPCODE); ok && op == typesys.OP_UNARY_SUB { + return true + } + } + return false +} + +func (lhs *__p_intern__) appendBinaryOperation(op typesys.OPCODE, rhs *__p_intern__) { lhs.expr = append(lhs.expr, rhs.expr...) 
lhs.expr = append(lhs.expr, op) } +func (rhs *__p_intern__) appendUnaryOperation(op typesys.OPCODE) { + rhs.expr = append(rhs.expr, op) +} + func isOperator(t interface{}) (typesys.OPCODE, bool) { if v, ok := t.(typesys.OPCODE); ok { return v, true diff --git a/core/operators.go b/core/operators.go index b5f9c7e..c8eebd9 100644 --- a/core/operators.go +++ b/core/operators.go @@ -6,7 +6,7 @@ import ( "typesys" ) -func solveExpr(vm *ByteEater, i *__p_intern__) error { +func (vm *ByteEater) solveExpr(i *__p_intern__) error { // TODO: check if this is possible and // if it's the case to raise an error if i == nil || i.expr == nil || len(i.expr) == 0 { @@ -23,48 +23,73 @@ func solveExpr(vm *ByteEater, i *__p_intern__) error { case __p_list__: for idx := range l { - if err := solveExpr(vm, &l[idx]); err != nil { + if err := vm.solveExpr(&l[idx]); err != nil { return err } } - - default: } } stack := make([]interface{}, 0) + var op typesys.OPCODE + var ok, errorMode bool + var exprIdx int + var result interface{} + for len(i.expr) > 1 { // Load the stack until we find an operators - var ok bool - var op typesys.OPCODE - for { - if op, ok = isOperator(i.expr[0]); ok { - i.expr = i.expr[1:len(i.expr)] - break - } - - stack = append(stack, i.expr[0]) - i.expr = i.expr[1:len(i.expr)] + ok = false + for exprIdx = 0; !ok; op, ok = i.expr[exprIdx].(typesys.OPCODE) { + exprIdx++ } + stack = append(stack, i.expr[0:exprIdx]...) 
+ i.expr = i.expr[exprIdx+1 : len(i.expr)] - var result interface{} + errorMode = false + result = gandalff.SeriesError{} // UNARY - // if op, ok := isOperator(t2); ok { - // i.expr = i.expr[2:len(i.expr)] + if op.IsUnaryOp() { + t1 := stack[len(stack)-1] + stack = stack[0 : len(stack)-1] - // if s, ok := t1.(__p_symbol__); ok { - // t1 = vm.symbolResolution(s) - // } + if s, ok := t1.(__p_symbol__); ok { + t1 = vm.symbolResolution(s) + } - // switch op { - // case typesys.OP_UNARY_ADD: - // case typesys.OP_UNARY_SUB: - // case typesys.OP_UNARY_NOT: - // } - // } else + switch op { + case typesys.OP_UNARY_ADD: + result = t1 + + case typesys.OP_UNARY_SUB: + switch s1 := t1.(type) { + case gandalff.SeriesInt32: + result = s1.Neg() + case gandalff.SeriesInt64: + result = s1.Neg() + case gandalff.SeriesFloat64: + result = s1.Neg() + default: + errorMode = true + } + + case typesys.OP_UNARY_NOT: + if s1, ok := t1.(gandalff.SeriesBool); ok { + result = s1.Not() + } else { + errorMode = true + } + } + + // Check for errors + if _, ok := result.(gandalff.SeriesError); ok || errorMode { + return fmt.Errorf("unary operator %s not supported for %s", + operatorToCode(op), + t1.(gandalff.Series).TypeCard().ToString()) + } + } else // BINARY { @@ -81,83 +106,68 @@ func solveExpr(vm *ByteEater, i *__p_intern__) error { t2 = vm.symbolResolution(s) } + // Type check + s1 := t1.(gandalff.Series) + s2 := t2.(gandalff.Series) + switch op { case typesys.OP_BINARY_MUL: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Mul(t2.(gandalff.Series)) - } + result = s1.Mul(s2) case typesys.OP_BINARY_DIV: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Div(t2.(gandalff.Series)) - } + result = s1.Div(s2) case typesys.OP_BINARY_MOD: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Mod(t2.(gandalff.Series)) - } + result = s1.Mod(s2) case typesys.OP_BINARY_POW: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Pow(t2.(gandalff.Series)) - } + result = s1.Pow(s2) case 
typesys.OP_BINARY_ADD: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Add(t2.(gandalff.Series)) - } + result = s1.Add(s2) case typesys.OP_BINARY_SUB: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Sub(t2.(gandalff.Series)) - } - - case typesys.OP_BINARY_AND: - if s1, ok := t1.(gandalff.SeriesBool); ok { - result = s1.And(t2.(gandalff.Series)) - } - - case typesys.OP_BINARY_OR: - if s1, ok := t1.(gandalff.SeriesBool); ok { - result = s1.Or(t2.(gandalff.Series)) - } + result = s1.Sub(s2) case typesys.OP_BINARY_EQ: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Eq(t2.(gandalff.Series)) - } + result = s1.Eq(s2) case typesys.OP_BINARY_NE: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Ne(t2.(gandalff.Series)) - } + result = s1.Ne(s2) case typesys.OP_BINARY_LT: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Lt(t2.(gandalff.Series)) - } + result = s1.Lt(s2) case typesys.OP_BINARY_LE: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Le(t2.(gandalff.Series)) - } + result = s1.Le(s2) case typesys.OP_BINARY_GT: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Gt(t2.(gandalff.Series)) - } + result = s1.Gt(s2) case typesys.OP_BINARY_GE: - if s1, ok := t1.(gandalff.Series); ok { - result = s1.Ge(t2.(gandalff.Series)) + result = s1.Ge(s2) + + case typesys.OP_BINARY_AND: + if s1, ok := t1.(gandalff.SeriesBool); ok { + result = s1.And(s2) + } else { + errorMode = true + } + + case typesys.OP_BINARY_OR: + if s1, ok := t1.(gandalff.SeriesBool); ok { + result = s1.Or(s2) + } else { + errorMode = true } } - if _, ok := result.(gandalff.SeriesError); ok { - return fmt.Errorf("operator %s not supported between %s and %s", + // Check for errors + if _, ok := result.(gandalff.SeriesError); ok || errorMode { + return fmt.Errorf("binary operator %s not supported between %s and %s", operatorToString(op), - t1.(gandalff.Series).TypeCard().ToString(), - t2.(gandalff.Series).TypeCard().ToString()) + s1.TypeCard().ToString(), + 
s2.TypeCard().ToString()) } } diff --git a/core/operators_test.go b/core/operators_test.go index 51aebf2..a68ddc0 100644 --- a/core/operators_test.go +++ b/core/operators_test.go @@ -1,6 +1,7 @@ package preludiocore import ( + "fmt" "math" "testing" "typesys" @@ -14,8 +15,6 @@ func init() { func Test_Operator_Mul(t *testing.T) { - var err error - b1 := be.newPInternTerm([]bool{true, false, true, false}) b2 := be.newPInternTerm([]bool{false, false, true, true}) in := be.newPInternTerm([]int64{1, 2, 3, 4}) @@ -25,119 +24,63 @@ func Test_Operator_Mul(t *testing.T) { // BOOL { // BOOL * BOOL - b1.appendOperand(typesys.OP_BINARY_MUL, b2) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_MUL, b2) + if err := checkExpression(be, b1, []int64{0, 0, 1, 0}); err != nil { t.Error(err) - } else if !b1.isInt64Vector() { - t.Error("Expected integer vector type") - } else { - v, _ := b1.getInt64Vector() - if v[0] != 0 || v[1] != 0 || v[2] != 1 || v[3] != 0 { - t.Error("Expected [0, 0, 1, 0], got", v) - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL * INTEGER - b1.appendOperand(typesys.OP_BINARY_MUL, in) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_MUL, in) + if err := checkExpression(be, b1, []int64{1, 0, 3, 0}); err != nil { t.Error(err) - } else if !b1.isInt64Vector() { - t.Error("Expected integer vector type") - } else { - v, _ := b1.getInt64Vector() - if v[0] != 1 || v[1] != 0 || v[2] != 3 || v[3] != 0 { - t.Error("Expected [1, 0, 3, 0] got", v) - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL * FLOAT - b1.appendOperand(typesys.OP_BINARY_MUL, fl) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_MUL, fl) + if err := checkExpression(be, b1, []float64{5.0, 0.0, 7.0, 0.0}); err != nil { t.Error(err) - } else if !b1.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - 
v, _ := b1.getFloat64Vector() - if v[0] != 5.0 || v[1] != 0.0 || v[2] != 7.0 || v[3] != 0.0 { - t.Error("Expected [5.0, 0.0, 7.0, 0.0] got", v) - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL * STRING - b1.appendOperand(typesys.OP_BINARY_MUL, st) - err = solveExpr(be, b1) - - if err == nil || err.Error() != "operator * not supported between Bool[4] and String[4]" { - t.Error("Expected error") + b1.appendBinaryOperation(typesys.OP_BINARY_MUL, st) + if err := checkExpression(be, b1, fmt.Errorf("binary operator * not supported between Bool[4] and String[4]")); err != nil { + t.Error(err) } } // INTEGER { // INTEGER * BOOL - in.appendOperand(typesys.OP_BINARY_MUL, b2) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_MUL, b2) + if err := checkExpression(be, in, []int64{0, 0, 3, 4}); err != nil { t.Error(err) - } else if !in.isInt64Vector() { - t.Error("Expected integer vector type") - } else { - v, _ := in.getInt64Vector() - if v[0] != 0 || v[1] != 0 || v[2] != 3 || v[3] != 4 { - t.Error("Expected [0, 0, 3, 4] got", v) - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER * INTEGER - in.appendOperand(typesys.OP_BINARY_MUL, in) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_MUL, in) + if err := checkExpression(be, in, []int64{1, 4, 9, 16}); err != nil { t.Error(err) - } else if !in.isInt64Vector() { - t.Error("Expected integer vector type") - } else { - v, _ := in.getInt64Vector() - if v[0] != 1 || v[1] != 4 || v[2] != 9 || v[3] != 16 { - t.Error("Expected [1, 4, 9, 16] got", v) - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER * FLOAT - in.appendOperand(typesys.OP_BINARY_MUL, fl) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_MUL, fl) + if err := checkExpression(be, in, []float64{5.0, 12.0, 21.0, 32.0}); err != nil { t.Error(err) - } else if !in.isFloat64Vector() { - 
t.Error("Expected float vector type") - } else { - v, _ := in.getFloat64Vector() - if v[0] != 5.0 || v[1] != 12.0 || v[2] != 21.0 || v[3] != 32.0 { - t.Error("Expected [5.0, 12.0, 21.0, 32.0] got", v) - } } // reset in @@ -147,54 +90,27 @@ func Test_Operator_Mul(t *testing.T) { // FLOAT { // FLOAT * BOOL - fl.appendOperand(typesys.OP_BINARY_MUL, b2) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_MUL, b2) + if err := checkExpression(be, fl, []float64{0.0, 0.0, 7.0, 8.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 0.0 || v[1] != 0.0 || v[2] != 7.0 || v[3] != 8.0 { - t.Error("Expected [0.0, 0.0, 7.0, 8.0] got", v) - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT * INTEGER - fl.appendOperand(typesys.OP_BINARY_MUL, in) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_MUL, in) + if err := checkExpression(be, fl, []float64{5.0, 12.0, 21.0, 32.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 5.0 || v[1] != 12.0 || v[2] != 21.0 || v[3] != 32.0 { - t.Error("Expected [0.0, 0.0, 21.0, 0.0] got", v) - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT * FLOAT - fl.appendOperand(typesys.OP_BINARY_MUL, fl) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_MUL, fl) + if err := checkExpression(be, fl, []float64{25.0, 36.0, 49.0, 64.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 25.0 || v[1] != 36.0 || v[2] != 49.0 || v[3] != 64.0 { - t.Error("Expected [25.0, 36.0, 49.0, 64.0] got", v) - } } // reset fl @@ -204,52 +120,42 @@ func Test_Operator_Mul(t 
*testing.T) { // STRING { // STRING * BOOL - st.appendOperand(typesys.OP_BINARY_MUL, b2) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator * not supported between String[4] and Bool[4]" { - t.Error("Expected error") + st.appendBinaryOperation(typesys.OP_BINARY_MUL, b2) + if err := checkExpression(be, st, fmt.Errorf("binary operator * not supported between String[4] and Bool[4]")); err != nil { + t.Error(err) } // reset st st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING * INTEGER - st.appendOperand(typesys.OP_BINARY_MUL, in) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator * not supported between String[4] and Int64[4]" { - t.Error("Expected error") + st.appendBinaryOperation(typesys.OP_BINARY_MUL, in) + if err := checkExpression(be, st, fmt.Errorf("binary operator * not supported between String[4] and Int64[4]")); err != nil { + t.Error(err) } // reset st st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING * FLOAT - st.appendOperand(typesys.OP_BINARY_MUL, fl) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator * not supported between String[4] and Float64[4]" { - t.Error("Expected error") + st.appendBinaryOperation(typesys.OP_BINARY_MUL, fl) + if err := checkExpression(be, st, fmt.Errorf("binary operator * not supported between String[4] and Float64[4]")); err != nil { + t.Error(err) } // reset st st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING * STRING - st.appendOperand(typesys.OP_BINARY_MUL, st) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator * not supported between String[4] and String[4]" { - t.Error("Expected error") + st.appendBinaryOperation(typesys.OP_BINARY_MUL, st) + if err := checkExpression(be, st, fmt.Errorf("binary operator * not supported between String[4] and String[4]")); err != nil { + t.Error(err) } } } func Test_Operator_Div(t *testing.T) { - var err error - b1 := be.newPInternTerm([]bool{true, false, true, false}) 
b2 := be.newPInternTerm([]bool{false, false, true, true}) in := be.newPInternTerm([]int64{1, 2, 3, 4}) @@ -259,65 +165,36 @@ func Test_Operator_Div(t *testing.T) { // BOOL { // BOOL / BOOL - b1.appendOperand(typesys.OP_BINARY_DIV, b2) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_DIV, b2) + if err := checkExpression(be, b1, []float64{math.Inf(1), math.NaN(), 1.0, 0.0}); err != nil { t.Error(err) - } else if !b1.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := b1.getFloat64Vector() - if !math.IsInf(v[0], 1) || !math.IsNaN(v[1]) || v[2] != 1.0 || v[3] != 0.0 { - t.Error("Expected [+Inf, NaN, 1.0, 0.0]") - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL / INTEGER - b1.appendOperand(typesys.OP_BINARY_DIV, in) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_DIV, in) + if err := checkExpression(be, b1, []float64{1.0, 0.0, 0.3333333333333333, 0.0}); err != nil { t.Error(err) - } else if !b1.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := b1.getFloat64Vector() - if v[0] != 1.0 || v[1] != 0.0 || v[2] != 0.3333333333333333 || v[3] != 0.0 { - t.Error("Expected [1.0, 0.0, 0.3333333333333333, 0.0]") - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL / FLOAT - b1.appendOperand(typesys.OP_BINARY_DIV, fl) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_DIV, fl) + if err := checkExpression(be, b1, []float64{0.2, 0.0, 0.14285714285714285, 0.0}); err != nil { t.Error(err) - } else if !b1.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := b1.getFloat64Vector() - if v[0] != 0.2 || v[1] != 0.0 || v[2] != 0.14285714285714285 || v[3] != 0.0 { - t.Error("Expected [0.2, 0.0, 0.14285714285714285, 0.0]") - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL / STRING - 
b1.appendOperand(typesys.OP_BINARY_DIV, st) - err = solveExpr(be, b1) - - if err == nil || err.Error() != "operator / not supported between Bool[4] and String[4]" { - t.Error("Expected error") + b1.appendBinaryOperation(typesys.OP_BINARY_DIV, st) + if err := checkExpression(be, b1, fmt.Errorf("binary operator / not supported between Bool[4] and String[4]")); err != nil { + t.Error(err) } // reset b1 @@ -327,64 +204,35 @@ func Test_Operator_Div(t *testing.T) { // INTEGER { // INTEGER / BOOL - in.appendOperand(typesys.OP_BINARY_DIV, b1) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_DIV, b1) + if err := checkExpression(be, in, []float64{1.0, math.Inf(1), 3.0, math.Inf(1)}); err != nil { t.Error(err) - } else if !in.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := in.getFloat64Vector() - if v[0] != 1.0 || !math.IsInf(v[1], 1) || v[2] != 3.0 || !math.IsInf(v[3], 1) { - t.Error("Expected [1.0, +Inf, 3.0, +Inf]") - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER / INTEGER - in.appendOperand(typesys.OP_BINARY_DIV, in) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_DIV, in) + if err := checkExpression(be, in, []float64{1.0, 1.0, 1.0, 1.0}); err != nil { t.Error(err) - } else if !in.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := in.getFloat64Vector() - if v[0] != 1.0 || v[1] != 1.0 || v[2] != 1.0 || v[3] != 1.0 { - t.Error("Expected [1.0, 1.0, 1.0, 1.0]") - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER / FLOAT - in.appendOperand(typesys.OP_BINARY_DIV, fl) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_DIV, fl) + if err := checkExpression(be, in, []float64{0.2, 0.3333333333333333, 0.42857142857142855, 0.5}); err != nil { t.Error(err) - } else if !in.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := 
in.getFloat64Vector() - if v[0] != 0.2 || v[1] != 0.3333333333333333 || v[2] != 0.42857142857142855 || v[3] != 0.5 { - t.Error("Expected [0.2, 0.3333333333333333, 0.42857142857142855, 0.5]") - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER / STRING - in.appendOperand(typesys.OP_BINARY_DIV, st) - err = solveExpr(be, in) - - if err == nil || err.Error() != "operator / not supported between Int64[4] and String[4]" { + in.appendBinaryOperation(typesys.OP_BINARY_DIV, st) + if err := checkExpression(be, in, fmt.Errorf("binary operator / not supported between Int64[4] and String[4]")); err != nil { t.Error(err) } @@ -395,64 +243,35 @@ func Test_Operator_Div(t *testing.T) { // FLOAT { // FLOAT / BOOL - fl.appendOperand(typesys.OP_BINARY_DIV, b1) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_DIV, b1) + if err := checkExpression(be, fl, []float64{5.0, math.Inf(1), 7.0, math.Inf(1)}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 5.0 || !math.IsInf(v[1], 1) || v[2] != 7.0 || !math.IsInf(v[3], 1) { - t.Error("Expected [5.0, +Inf, 7.0, +Inf]") - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT / INTEGER - fl.appendOperand(typesys.OP_BINARY_DIV, in) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_DIV, in) + if err := checkExpression(be, fl, []float64{5.0, 3.0, 2.3333333333333335, 2.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 5.0 || v[1] != 3.0 || v[2] != 2.3333333333333335 || v[3] != 2.0 { - t.Error("Expected [5.0, 3.0, 2.3333333333333335, 2.0]") - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT / FLOAT - fl.appendOperand(typesys.OP_BINARY_DIV, fl) - err = solveExpr(be, fl) - - if 
err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_DIV, fl) + if err := checkExpression(be, fl, []float64{1.0, 1.0, 1.0, 1.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 1 || v[1] != 1 || v[2] != 1 || v[3] != 1 { - t.Error("Expected [1, 1, 1, 1]") - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT / STRING - fl.appendOperand(typesys.OP_BINARY_DIV, st) - err = solveExpr(be, fl) - - if err == nil || err.Error() != "operator / not supported between Float64[4] and String[4]" { + fl.appendBinaryOperation(typesys.OP_BINARY_DIV, st) + if err := checkExpression(be, fl, fmt.Errorf("binary operator / not supported between Float64[4] and String[4]")); err != nil { t.Error(err) } @@ -463,10 +282,8 @@ func Test_Operator_Div(t *testing.T) { // STRING { // STRING / BOOL - st.appendOperand(typesys.OP_BINARY_DIV, b1) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator / not supported between String[4] and Bool[4]" { + st.appendBinaryOperation(typesys.OP_BINARY_DIV, b1) + if err := checkExpression(be, st, fmt.Errorf("binary operator / not supported between String[4] and Bool[4]")); err != nil { t.Error(err) } @@ -474,10 +291,8 @@ func Test_Operator_Div(t *testing.T) { st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING / INTEGER - st.appendOperand(typesys.OP_BINARY_DIV, in) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator / not supported between String[4] and Int64[4]" { + st.appendBinaryOperation(typesys.OP_BINARY_DIV, in) + if err := checkExpression(be, st, fmt.Errorf("binary operator / not supported between String[4] and Int64[4]")); err != nil { t.Error(err) } @@ -485,10 +300,8 @@ func Test_Operator_Div(t *testing.T) { st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING / FLOAT - st.appendOperand(typesys.OP_BINARY_DIV, fl) - err = solveExpr(be, st) - - if err == 
nil || err.Error() != "operator / not supported between String[4] and Float64[4]" { + st.appendBinaryOperation(typesys.OP_BINARY_DIV, fl) + if err := checkExpression(be, st, fmt.Errorf("binary operator / not supported between String[4] and Float64[4]")); err != nil { t.Error(err) } @@ -496,10 +309,8 @@ func Test_Operator_Div(t *testing.T) { st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING / STRING - st.appendOperand(typesys.OP_BINARY_DIV, st) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator / not supported between String[4] and String[4]" { + st.appendBinaryOperation(typesys.OP_BINARY_DIV, st) + if err := checkExpression(be, st, fmt.Errorf("binary operator / not supported between String[4] and String[4]")); err != nil { t.Error(err) } } @@ -507,8 +318,6 @@ func Test_Operator_Div(t *testing.T) { func Test_Operator_Add(t *testing.T) { - var err error - b1 := be.newPInternTerm([]bool{true, false, true, false}) b2 := be.newPInternTerm([]bool{false, false, true, true}) in := be.newPInternTerm([]int64{1, 2, 3, 4}) @@ -518,72 +327,36 @@ func Test_Operator_Add(t *testing.T) { // BOOL { // BOOL + BOOL - b1.appendOperand(typesys.OP_BINARY_ADD, b2) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_ADD, b2) + if err := checkExpression(be, b1, []int64{1, 0, 2, 1}); err != nil { t.Error(err) - } else if !b1.isInt64Vector() { - t.Error("Expected integer vector type") - } else { - v, _ := b1.getInt64Vector() - if v[0] != 1 || v[1] != 0 || v[2] != 2 || v[3] != 1 { - t.Error("Expected [1, 0, 2, 1] got", v) - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL + INTEGER - b1.appendOperand(typesys.OP_BINARY_ADD, in) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_ADD, in) + if err := checkExpression(be, b1, []int64{2, 2, 4, 4}); err != nil { t.Error(err) - } else if !b1.isInt64Vector() { - t.Error("Expected int vector type") - } else { - 
v, _ := b1.getInt64Vector() - if v[0] != 2 || v[1] != 2 || v[2] != 4 || v[3] != 4 { - t.Error("Expected [2, 2, 4, 4] got", v) - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL + FLOAT - b1.appendOperand(typesys.OP_BINARY_ADD, fl) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_ADD, fl) + if err := checkExpression(be, b1, []float64{6.0, 6.0, 8.0, 8.0}); err != nil { t.Error(err) - } else if !b1.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := b1.getFloat64Vector() - if v[0] != 6.0 || v[1] != 6.0 || v[2] != 8.0 || v[3] != 8.0 { - t.Error("Expected [6.0, 6.0, 8.0, 8.0] got", v) - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL + STRING - b1.appendOperand(typesys.OP_BINARY_ADD, st) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_ADD, st) + if err := checkExpression(be, b1, []string{"truea", "falseb", "truec", "falsed"}); err != nil { t.Error(err) - } else if !b1.isStringVector() { - t.Error("Expected string vector type") - } else { - v, _ := b1.getStringVector() - if v[0] != "truea" || v[1] != "falseb" || v[2] != "truec" || v[3] != "falsed" { - t.Error("Expected [truea, falseb, truec, falsed] got", v) - } } // reset b1 @@ -593,72 +366,36 @@ func Test_Operator_Add(t *testing.T) { // INTEGER { // INTEGER + BOOL - in.appendOperand(typesys.OP_BINARY_ADD, b1) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_ADD, b1) + if err := checkExpression(be, in, []int64{2, 2, 4, 4}); err != nil { t.Error(err) - } else if !in.isInt64Vector() { - t.Error("Expected int vector type") - } else { - v, _ := in.getInt64Vector() - if v[0] != 2 || v[1] != 2 || v[2] != 4 || v[3] != 4 { - t.Error("Expected [2, 2, 4, 4] got", v) - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER + INTEGER - in.appendOperand(typesys.OP_BINARY_ADD, in) - err = 
solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_ADD, in) + if err := checkExpression(be, in, []int64{2, 4, 6, 8}); err != nil { t.Error(err) - } else if !in.isInt64Vector() { - t.Error("Expected int vector type") - } else { - v, _ := in.getInt64Vector() - if v[0] != 2 || v[1] != 4 || v[2] != 6 || v[3] != 8 { - t.Error("Expected [2, 4, 6, 8] got", v) - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER + FLOAT - in.appendOperand(typesys.OP_BINARY_ADD, fl) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_ADD, fl) + if err := checkExpression(be, in, []float64{6.0, 8.0, 10.0, 12.0}); err != nil { t.Error(err) - } else if !in.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := in.getFloat64Vector() - if v[0] != 6.0 || v[1] != 8.0 || v[2] != 10.0 || v[3] != 12.0 { - t.Error("Expected [6.0, 8.0, 10.0, 12.0] got", v) - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER + STRING - in.appendOperand(typesys.OP_BINARY_ADD, st) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_ADD, st) + if err := checkExpression(be, in, []string{"1a", "2b", "3c", "4d"}); err != nil { t.Error(err) - } else if !in.isStringVector() { - t.Error("Expected string vector type") - } else { - v, _ := in.getStringVector() - if v[0] != "1a" || v[1] != "2b" || v[2] != "3c" || v[3] != "4d" { - t.Error("Expected [1a, 2b, 3c, 4d] got", v) - } } // reset in @@ -668,72 +405,36 @@ func Test_Operator_Add(t *testing.T) { // FLOAT { // FLOAT + BOOL - fl.appendOperand(typesys.OP_BINARY_ADD, b1) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_ADD, b1) + if err := checkExpression(be, fl, []float64{6.0, 6.0, 8.0, 8.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 6.0 || v[1] != 
6.0 || v[2] != 8.0 || v[3] != 8.0 { - t.Error("Expected [6.0, 6.0, 8.0, 8.0] got", v) - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT + INTEGER - fl.appendOperand(typesys.OP_BINARY_ADD, in) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_ADD, in) + if err := checkExpression(be, fl, []float64{6.0, 8.0, 10.0, 12.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 6.0 || v[1] != 8.0 || v[2] != 10.0 || v[3] != 12.0 { - t.Error("Expected [6.0, 8.0, 10.0, 12.0] got", v) - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT + FLOAT - fl.appendOperand(typesys.OP_BINARY_ADD, fl) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_ADD, fl) + if err := checkExpression(be, fl, []float64{10.0, 12.0, 14.0, 16.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 10.0 || v[1] != 12.0 || v[2] != 14.0 || v[3] != 16.0 { - t.Error("Expected [10.0, 12.0, 14.0, 16.0] got", v) - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT + STRING - fl.appendOperand(typesys.OP_BINARY_ADD, st) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_ADD, st) + if err := checkExpression(be, fl, []string{"5a", "6b", "7c", "8d"}); err != nil { t.Error(err) - } else if !fl.isStringVector() { - t.Error("Expected string vector type") - } else { - v, _ := fl.getStringVector() - if v[0] != "5a" || v[1] != "6b" || v[2] != "7c" || v[3] != "8d" { - t.Error("Expected [5a, 6b, 7c, 8d] got", v) - } } // reset fl @@ -743,80 +444,42 @@ func Test_Operator_Add(t *testing.T) { // STRING { // STRING + BOOL - st.appendOperand(typesys.OP_BINARY_ADD, b1) - err = solveExpr(be, st) - - if err != nil { + 
st.appendBinaryOperation(typesys.OP_BINARY_ADD, b1) + if err := checkExpression(be, st, []string{"atrue", "bfalse", "ctrue", "dfalse"}); err != nil { t.Error(err) - } else if !st.isStringVector() { - t.Error("Expected string vector type") - } else { - v, _ := st.getStringVector() - if v[0] != "atrue" || v[1] != "bfalse" || v[2] != "ctrue" || v[3] != "dfalse" { - t.Error("Expected [atrue, bfalse, ctrue, dfalse] got", v) - } } // reset st st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING + INTEGER - st.appendOperand(typesys.OP_BINARY_ADD, in) - err = solveExpr(be, st) - - if err != nil { + st.appendBinaryOperation(typesys.OP_BINARY_ADD, in) + if err := checkExpression(be, st, []string{"a1", "b2", "c3", "d4"}); err != nil { t.Error(err) - } else if !st.isStringVector() { - t.Error("Expected string vector type") - } else { - v, _ := st.getStringVector() - if v[0] != "a1" || v[1] != "b2" || v[2] != "c3" || v[3] != "d4" { - t.Error("Expected [a1, b2, c3, d4] got", v) - } } // reset st st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING + FLOAT - st.appendOperand(typesys.OP_BINARY_ADD, fl) - err = solveExpr(be, st) - - if err != nil { + st.appendBinaryOperation(typesys.OP_BINARY_ADD, fl) + if err := checkExpression(be, st, []string{"a5", "b6", "c7", "d8"}); err != nil { t.Error(err) - } else if !st.isStringVector() { - t.Error("Expected string vector type") - } else { - v, _ := st.getStringVector() - if v[0] != "a5" || v[1] != "b6" || v[2] != "c7" || v[3] != "d8" { - t.Error("Expected [a5, b6, c7, d8] got", v) - } } // reset st st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING + STRING - st.appendOperand(typesys.OP_BINARY_ADD, st) - err = solveExpr(be, st) - - if err != nil { + st.appendBinaryOperation(typesys.OP_BINARY_ADD, st) + if err := checkExpression(be, st, []string{"aa", "bb", "cc", "dd"}); err != nil { t.Error(err) - } else if !st.isStringVector() { - t.Error("Expected string vector type") - } else { - v, _ := 
st.getStringVector() - if v[0] != "aa" || v[1] != "bb" || v[2] != "cc" || v[3] != "dd" { - t.Error("Expected [aa, bb, cc, dd] got", v) - } } } } func Test_Operator_Sub(t *testing.T) { - var err error - b1 := be.newPInternTerm([]bool{true, false, true, false}) b2 := be.newPInternTerm([]bool{false, false, true, true}) in := be.newPInternTerm([]int64{1, 2, 3, 4}) @@ -826,65 +489,36 @@ func Test_Operator_Sub(t *testing.T) { // BOOL { // BOOL - BOOL - b1.appendOperand(typesys.OP_BINARY_SUB, b2) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_SUB, b2) + if err := checkExpression(be, b1, []int64{1, 0, 0, -1}); err != nil { t.Error(err) - } else if !b1.isInt64Vector() { - t.Error("Expected integer vector type") - } else { - v, _ := b1.getInt64Vector() - if v[0] != 1 || v[1] != 0 || v[2] != 0 || v[3] != -1 { - t.Error("Expected [1, 0, 0, -1] got", v) - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL - INTEGER - b1.appendOperand(typesys.OP_BINARY_SUB, in) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_SUB, in) + if err := checkExpression(be, b1, []int64{0, -2, -2, -4}); err != nil { t.Error(err) - } else if !b1.isInt64Vector() { - t.Error("Expected integer vector type") - } else { - v, _ := b1.getInt64Vector() - if v[0] != 0 || v[1] != -2 || v[2] != -2 || v[3] != -4 { - t.Error("Expected [0, -2, -2, -4] got", v) - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL - FLOAT - b1.appendOperand(typesys.OP_BINARY_SUB, fl) - err = solveExpr(be, b1) - - if err != nil { + b1.appendBinaryOperation(typesys.OP_BINARY_SUB, fl) + if err := checkExpression(be, b1, []float64{-4.0, -6.0, -6.0, -8.0}); err != nil { t.Error(err) - } else if !b1.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := b1.getFloat64Vector() - if v[0] != -4.0 || v[1] != -6.0 || v[2] != -6.0 || v[3] != -8.0 { - t.Error("Expected [-4.0, -6.0, 
-6.0, -8.0] got", v) - } } // reset b1 b1 = be.newPInternTerm([]bool{true, false, true, false}) // BOOL - STRING - b1.appendOperand(typesys.OP_BINARY_SUB, st) - err = solveExpr(be, b1) - - if err == nil { - t.Error("Expected error") + b1.appendBinaryOperation(typesys.OP_BINARY_SUB, st) + if err := checkExpression(be, b1, fmt.Errorf("binary operator - not supported between Bool[4] and String[4]")); err != nil { + t.Error(err) } // reset b1 @@ -894,65 +528,36 @@ func Test_Operator_Sub(t *testing.T) { // INTEGER { // INTEGER - BOOL - in.appendOperand(typesys.OP_BINARY_SUB, b1) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_SUB, b1) + if err := checkExpression(be, in, []int64{0, 2, 2, 4}); err != nil { t.Error(err) - } else if !in.isInt64Vector() { - t.Error("Expected integer vector type") - } else { - v, _ := in.getInt64Vector() - if v[0] != 0 || v[1] != 2 || v[2] != 2 || v[3] != 4 { - t.Error("Expected [0, 2, 2, 4] got", v) - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER - INTEGER - in.appendOperand(typesys.OP_BINARY_SUB, in) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_SUB, in) + if err := checkExpression(be, in, []int64{0, 0, 0, 0}); err != nil { t.Error(err) - } else if !in.isInt64Vector() { - t.Error("Expected integer vector type") - } else { - v, _ := in.getInt64Vector() - if v[0] != 0 || v[1] != 0 || v[2] != 0 || v[3] != 0 { - t.Error("Expected [0, 0, 0, 0] got", v) - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER - FLOAT - in.appendOperand(typesys.OP_BINARY_SUB, fl) - err = solveExpr(be, in) - - if err != nil { + in.appendBinaryOperation(typesys.OP_BINARY_SUB, fl) + if err := checkExpression(be, in, []float64{-4.0, -4.0, -4.0, -4.0}); err != nil { t.Error(err) - } else if !in.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := in.getFloat64Vector() - if v[0] != -4.0 || v[1] != -4.0 || v[2] != 
-4.0 || v[3] != -4.0 { - t.Error("Expected [-4.0, -4.0, -4.0, -4.0] got", v) - } } // reset in in = be.newPInternTerm([]int64{1, 2, 3, 4}) // INTEGER - STRING - in.appendOperand(typesys.OP_BINARY_SUB, st) - err = solveExpr(be, in) - - if err == nil { - t.Error("Expected error") + in.appendBinaryOperation(typesys.OP_BINARY_SUB, st) + if err := checkExpression(be, in, fmt.Errorf("binary operator - not supported between Int64[4] and String[4]")); err != nil { + t.Error(err) } // reset in @@ -962,65 +567,36 @@ func Test_Operator_Sub(t *testing.T) { // FLOAT { // FLOAT - BOOL - fl.appendOperand(typesys.OP_BINARY_SUB, b1) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_SUB, b1) + if err := checkExpression(be, fl, []float64{4.0, 6.0, 6.0, 8.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 4.0 || v[1] != 6.0 || v[2] != 6.0 || v[3] != 8.0 { - t.Error("Expected [4.0, 6.0, 6.0, 8.0] got", v) - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT - INTEGER - fl.appendOperand(typesys.OP_BINARY_SUB, in) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_SUB, in) + if err := checkExpression(be, fl, []float64{4.0, 4.0, 4.0, 4.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 4.0 || v[1] != 4.0 || v[2] != 4.0 || v[3] != 4.0 { - t.Error("Expected [4.0, 4.0, 4.0, 4.0] got", v) - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT - FLOAT - fl.appendOperand(typesys.OP_BINARY_SUB, fl) - err = solveExpr(be, fl) - - if err != nil { + fl.appendBinaryOperation(typesys.OP_BINARY_SUB, fl) + if err := checkExpression(be, fl, []float64{0.0, 0.0, 0.0, 0.0}); err != nil { t.Error(err) - } else if !fl.isFloat64Vector() { - t.Error("Expected 
float vector type") - } else { - v, _ := fl.getFloat64Vector() - if v[0] != 0.0 || v[1] != 0.0 || v[2] != 0.0 || v[3] != 0.0 { - t.Error("Expected [0.0, 0.0, 0.0, 0.0] got", v) - } } // reset fl fl = be.newPInternTerm([]float64{5.0, 6.0, 7.0, 8.0}) // FLOAT - STRING - fl.appendOperand(typesys.OP_BINARY_SUB, st) - err = solveExpr(be, fl) - - if err == nil { - t.Error("Expected error") + fl.appendBinaryOperation(typesys.OP_BINARY_SUB, st) + if err := checkExpression(be, fl, fmt.Errorf("binary operator - not supported between Float64[4] and String[4]")); err != nil { + t.Error(err) } // reset fl @@ -1030,44 +606,36 @@ func Test_Operator_Sub(t *testing.T) { // STRING { // STRING - BOOL - st.appendOperand(typesys.OP_BINARY_SUB, b1) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator - not supported between String[4] and Bool[4]" { - t.Error("Expected error") + st.appendBinaryOperation(typesys.OP_BINARY_SUB, b1) + if err := checkExpression(be, st, fmt.Errorf("binary operator - not supported between String[4] and Bool[4]")); err != nil { + t.Error(err) } // reset st st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING - INTEGER - st.appendOperand(typesys.OP_BINARY_SUB, in) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator - not supported between String[4] and Int64[4]" { - t.Error("Expected error") + st.appendBinaryOperation(typesys.OP_BINARY_SUB, in) + if err := checkExpression(be, st, fmt.Errorf("binary operator - not supported between String[4] and Int64[4]")); err != nil { + t.Error(err) } // reset st st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING - FLOAT - st.appendOperand(typesys.OP_BINARY_SUB, fl) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator - not supported between String[4] and Float64[4]" { - t.Error("Expected error") + st.appendBinaryOperation(typesys.OP_BINARY_SUB, fl) + if err := checkExpression(be, st, fmt.Errorf("binary operator - not supported between String[4] 
and Float64[4]")); err != nil { + t.Error(err) } // reset st st = be.newPInternTerm([]string{"a", "b", "c", "d"}) // STRING - STRING - st.appendOperand(typesys.OP_BINARY_SUB, st) - err = solveExpr(be, st) - - if err == nil || err.Error() != "operator - not supported between String[4] and String[4]" { - t.Error("Expected error") + st.appendBinaryOperation(typesys.OP_BINARY_SUB, st) + if err := checkExpression(be, st, fmt.Errorf("binary operator - not supported between String[4] and String[4]")); err != nil { + t.Error(err) } } } diff --git a/core/std.go b/core/std.go index 07d9963..59a4a3c 100644 --- a/core/std.go +++ b/core/std.go @@ -21,8 +21,7 @@ func PreludioFunc_Derive(funcName string, vm *ByteEater) { return } - df, err = positional[0].getDataframe() - if err != nil { + if df, err = positional[0].getDataframe(); err != nil { vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) return } @@ -145,8 +144,7 @@ func PreludioFunc_WriteCSV(funcName string, vm *ByteEater) { var df gandalff.DataFrame var outputFile *os.File - df, err = positional[0].getDataframe() - if err != nil { + if df, err = positional[0].getDataframe(); err != nil { vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) return } @@ -218,8 +216,7 @@ func PreludioFunc_Filter(funcName string, vm *ByteEater) { return } - df, err = positional[0].getDataframe() - if err != nil { + if df, err = positional[0].getDataframe(); err != nil { vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) return } @@ -249,8 +246,7 @@ func PreludioFunc_From(funcName string, vm *ByteEater) { return } - df, err = positional[0].getDataframe() - if err != nil { + if df, err = positional[0].getDataframe(); err != nil { vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) return } @@ -342,8 +338,7 @@ func PreludioFunc_Names(funcName string, vm *ByteEater) { return } - df, err = positional[0].getDataframe() - if err != nil { + if df, err = positional[0].getDataframe(); err != nil { vm.setPanicMode(fmt.Sprintf("%s: %s", 
funcName, err)) return } @@ -420,8 +415,7 @@ func PreludioFunc_Select(funcName string, vm *ByteEater) { return } - df, err = positional[0].getDataframe() - if err != nil { + if df, err = positional[0].getDataframe(); err != nil { vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) return } @@ -447,9 +441,120 @@ func PreludioFunc_Select(funcName string, vm *ByteEater) { } } +// Group a Dataframe +func PreludioFunc_GroupBy(funcName string, vm *ByteEater) { + vm.printDebug(5, "STARTING", funcName, "") + + var err error + var df gandalff.DataFrame + positional, _, err := vm.GetFunctionParams(funcName, nil, false, false) + if err != nil { + vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) + return + } + + if df, err = positional[0].getDataframe(); err != nil { + vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) + return + } + + // The first value can be both a symbol or a list of symbols + switch v := positional[1].getValue().(type) { + case __p_symbol__: + vm.stackPush(vm.newPInternTerm(df.GroupBy(string(v)))) + vm.setCurrentDataFrame() + + case __p_list__: + list, err := positional[1].listToStringSlice() + if err != nil { + vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) + return + } + + vm.stackPush(vm.newPInternTerm(df.GroupBy(list...))) + vm.setCurrentDataFrame() + + default: + vm.setPanicMode(fmt.Sprintf("%s: expecting symbol or list of symbols, got %T", funcName, v)) + return + } +} + +// Ungroup a Dataframe +func PreludioFunc_Ungroup(funcName string, vm *ByteEater) { + vm.printDebug(5, "STARTING", funcName, "") + + var err error + var df gandalff.DataFrame + positional, _, err := vm.GetFunctionParams(funcName, nil, false, false) + if err != nil { + vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) + return + } + + if df, err = positional[0].getDataframe(); err != nil { + vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) + return + } + + vm.stackPush(vm.newPInternTerm(df.Ungroup())) + vm.setCurrentDataFrame() +} + // Sort all the values 
in the Dataframe -func PreludioFunc_Sort(funcName string, vm *ByteEater) { +func PreludioFunc_OrderBy(funcName string, vm *ByteEater) { vm.printDebug(5, "STARTING", funcName, "") + + var err error + var df gandalff.DataFrame + positional, _, err := vm.GetFunctionParams(funcName, nil, false, false) + if err != nil { + vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) + return + } + + if df, err = positional[0].getDataframe(); err != nil { + vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) + return + } + + if len(positional) != 2 { + vm.setPanicMode(fmt.Sprintf("%s: expecting 2 parameters, got %d", funcName, len(positional))) + return + } + + // The first value can be both a symbol or a list of symbols + sortParams := make([]gandalff.SortParam, 0) + switch v := positional[1].getValue().(type) { + case __p_symbol__: + if positional[1].isNeg() { + sortParams = append(sortParams, gandalff.Desc(string(v))) + } else { + sortParams = append(sortParams, gandalff.Asc(string(v))) + } + + case __p_list__: + for _, v1 := range positional[1].getValue().(__p_list__) { + switch v2 := v1.expr[0].(type) { + case __p_symbol__: + if v1.isNeg() { + sortParams = append(sortParams, gandalff.Desc(string(v2))) + } else { + sortParams = append(sortParams, gandalff.Asc(string(v2))) + } + default: + vm.setPanicMode(fmt.Sprintf("%s: expecting symbol, got %T", funcName, v)) + return + } + } + + default: + vm.setPanicMode(fmt.Sprintf("%s: expecting symbol or list of symbols, got %T", funcName, v)) + return + } + + vm.stackPush(vm.newPInternTerm(df.OrderBy(sortParams...))) + vm.setCurrentDataFrame() } // Take a subset of the Dataframe's rows @@ -464,8 +569,7 @@ func PreludioFunc_Take(funcName string, vm *ByteEater) { return } - df, err = positional[0].getDataframe() - if err != nil { + if df, err = positional[0].getDataframe(); err != nil { vm.setPanicMode(fmt.Sprintf("%s: %s", funcName, err)) return } diff --git a/core/utils.go b/core/utils.go index 64e28d3..280b93f 100644 --- 
a/core/utils.go +++ b/core/utils.go @@ -2,6 +2,7 @@ package preludiocore import ( "fmt" + "math" "typesys" ) @@ -14,6 +15,7 @@ func truncate(s string, n int) string { func operatorToString(op typesys.OPCODE) string { switch op { + case typesys.OP_BINARY_MUL: return "*" case typesys.OP_BINARY_DIV: @@ -42,8 +44,60 @@ func operatorToString(op typesys.OPCODE) string { return ">" case typesys.OP_BINARY_GE: return ">=" + + case typesys.OP_UNARY_ADD: + return "+" + case typesys.OP_UNARY_SUB: + return "-" + case typesys.OP_UNARY_NOT: + return "not" + + default: + return "UNKNOWN OPERATOR" + } +} + +func operatorToCode(op typesys.OPCODE) string { + switch op { + // Every binary operator is padded with one space on each side. + case typesys.OP_BINARY_MUL: + return " * " + case typesys.OP_BINARY_DIV: + return " / " + case typesys.OP_BINARY_MOD: + return " % " + case typesys.OP_BINARY_POW: + return " ** " + case typesys.OP_BINARY_ADD: + return " + " + case typesys.OP_BINARY_SUB: + return " - " + case typesys.OP_BINARY_AND: + return " and " + case typesys.OP_BINARY_OR: + return " or " + case typesys.OP_BINARY_EQ: + return " == " + case typesys.OP_BINARY_NE: + return " != " + case typesys.OP_BINARY_LT: + return " < " + case typesys.OP_BINARY_LE: + return " <= " + case typesys.OP_BINARY_GT: + return " > " + case typesys.OP_BINARY_GE: + return " >= " + + case typesys.OP_UNARY_ADD: + return " + " + case typesys.OP_UNARY_SUB: + return " - " + case typesys.OP_UNARY_NOT: + return "not " + default: - return "" + return "UNKNOWN OPERATOR" } } @@ -76,6 +130,13 @@ func float64SliceEqual(a, b []float64) bool { return false } for i, v := range a { + if math.IsNaN(b[i]) { + if !math.IsNaN(v) { + return false + } else { + continue + } + } if b[i] != v { return false } @@ -95,75 +156,63 @@ func stringSliceEqual(a, b []string) bool { return true } -func currentResultChecker(be *ByteEater, expected interface{}) error { +func checkCurrentResult(be *ByteEater, expected interface{}) error { + if be.__currentResult == nil { + return fmt.Errorf("expected result,
got nil") + } + switch v := expected.(type) { case bool: - if be.__currentResult == nil { - return fmt.Errorf("expected result, got nil") - } else if !be.__currentResult.isBoolScalar() { + if !be.__currentResult.isBoolScalar() { return fmt.Errorf("expected bool scalar, got %T", be.__currentResult) } else if b, err := be.__currentResult.getBoolScalar(); err != nil || b != v { return fmt.Errorf("expected %t, got %t: %T", v, b, err) } case []bool: - if be.__currentResult == nil { - return fmt.Errorf("expected result, got nil") - } else if !be.__currentResult.isBoolVector() { + if !be.__currentResult.isBoolVector() { return fmt.Errorf("expected bool vector, got %T", be.__currentResult) } else if b, err := be.__currentResult.getBoolVector(); err != nil || !boolSliceEqual(b, v) { return fmt.Errorf("expected %v, got %v: %T", v, b, err) } case int64: - if be.__currentResult == nil { - return fmt.Errorf("expected result, got nil") - } else if !be.__currentResult.isInt64Scalar() { + if !be.__currentResult.isInt64Scalar() { return fmt.Errorf("expected int64 scalar, got %T", be.__currentResult) } else if i, err := be.__currentResult.getInt64Scalar(); err != nil || i != v { return fmt.Errorf("expected %d, got %d: %T", v, i, err) } case []int64: - if be.__currentResult == nil { - return fmt.Errorf("expected result, got nil") - } else if !be.__currentResult.isInt64Vector() { + if !be.__currentResult.isInt64Vector() { return fmt.Errorf("expected int64 vector, got %T", be.__currentResult) } else if i, err := be.__currentResult.getInt64Vector(); err != nil || !int64SliceEqual(i, v) { return fmt.Errorf("expected %v, got %v: %T", v, i, err) } case float64: - if be.__currentResult == nil { - return fmt.Errorf("expected result, got nil") - } else if !be.__currentResult.isFloat64Scalar() { + if !be.__currentResult.isFloat64Scalar() { return fmt.Errorf("expected float64 scalar, got %T", be.__currentResult) - } else if f, err := be.__currentResult.getFloat64Scalar(); err != nil || f != v { 
+ } else if f, err := be.__currentResult.getFloat64Scalar(); err != nil || !float64SliceEqual([]float64{f}, []float64{v}) { return fmt.Errorf("expected %f, got %f: %T", v, f, err) } case []float64: - if be.__currentResult == nil { - return fmt.Errorf("expected result, got nil") - } else if !be.__currentResult.isFloat64Vector() { + if !be.__currentResult.isFloat64Vector() { return fmt.Errorf("expected float64 vector, got %T", be.__currentResult) } else if f, err := be.__currentResult.getFloat64Vector(); err != nil || !float64SliceEqual(f, v) { return fmt.Errorf("expected %v, got %v: %T", v, f, err) } case string: - if be.__currentResult == nil { - return fmt.Errorf("expected result, got nil") - } else if !be.__currentResult.isStringScalar() { + if !be.__currentResult.isStringScalar() { return fmt.Errorf("expected string scalar, got %T", be.__currentResult) } else if s, err := be.__currentResult.getStringScalar(); err != nil || s != v { return fmt.Errorf("expected %s, got %s: %T", v, s, err) } case []string: - if be.__currentResult == nil { - return fmt.Errorf("expected result, got nil") - } else if !be.__currentResult.isStringVector() { + if !be.__currentResult.isStringVector() { return fmt.Errorf("expected string vector, got %T", be.__currentResult) } else if s, err := be.__currentResult.getStringVector(); err != nil || !stringSliceEqual(s, v) { return fmt.Errorf("expected %v, got %v: %T", v, s, err) @@ -175,3 +224,109 @@ func currentResultChecker(be *ByteEater, expected interface{}) error { return nil } + +func checkExpression(be *ByteEater, operand *__p_intern__, expected interface{}) error { + err := be.solveExpr(operand) + + switch expectedTyped := expected.(type) { + case bool: + if err != nil { + return err + } else if v, err := operand.getBoolScalar(); err == nil { + if v != expectedTyped { + return fmt.Errorf("expected %t, got %t", expected, v) + } + } else { + return err + } + + case []bool: + if err != nil { + return err + } else if v, err := 
operand.getBoolVector(); err == nil { + if !boolSliceEqual(v, expectedTyped) { + return fmt.Errorf("expected %v, got %v", expected, v) + } + } else { + return err + } + + case int64: + if err != nil { + return err + } else if v, err := operand.getInt64Scalar(); err == nil { + if v != expectedTyped { + return fmt.Errorf("expected %d, got %d", expected, v) + } + } else { + return err + } + + case []int64: + if err != nil { + return err + } else if v, err := operand.getInt64Vector(); err == nil { + if !int64SliceEqual(v, expectedTyped) { + return fmt.Errorf("expected %v, got %v", expected, v) + } + } else { + return err + } + // float64 scalars are compared through float64SliceEqual so that an expected NaN matches a NaN result + case float64: + if err != nil { + return err + } else if v, err := operand.getFloat64Scalar(); err == nil { + if !float64SliceEqual([]float64{v}, []float64{expectedTyped}) { + return fmt.Errorf("expected %f, got %f", expected, v) + } + } else { + return err + } + + case []float64: + if err != nil { + return err + } else if v, err := operand.getFloat64Vector(); err == nil { + if !float64SliceEqual(v, expectedTyped) { + return fmt.Errorf("expected %v, got %v", expected, v) + } + } else { + return err + } + + case string: + if err != nil { + return err + } else if v, err := operand.getStringScalar(); err == nil { + if v != expectedTyped { + return fmt.Errorf("expected %s, got %s", expected, v) + } + } else { + return err + } + + case []string: + if err != nil { + return err + } else if v, err := operand.getStringVector(); err == nil { + if !stringSliceEqual(v, expectedTyped) { + return fmt.Errorf("expected %v, got %v", expected, v) + } + } else { + return err + } + + case error: + if err == nil { + return fmt.Errorf("expected error, got %v", operand) + } else if err.Error() != expectedTyped.Error() { + return fmt.Errorf("expected error string \"%v\", got \"%v\"", expected, err) + } + + default: + return fmt.Errorf("unknown type %T", expectedTyped) + } + + return nil +} diff --git a/core/vm.go b/core/vm.go index 7f35898..b9ed99c 100644 --- a/core/vm.go +++ b/core/vm.go @@ -281,7 +281,7
@@ func (vm *ByteEater) loadResults() { for !vm.stackIsEmpty() && vm.stackLast().tag != PRELUDIO_INTERNAL_TAG_BEGIN_FRAME { result := vm.stackPop() - if err := solveExpr(vm, result); err != nil { + if err := vm.solveExpr(result); err != nil { vm.setPanicMode(err.Error()) break } @@ -434,8 +434,12 @@ MAIN_LOOP: PreludioFunc_New("new", vm) case "select": PreludioFunc_Select("select", vm) - case "sort": - PreludioFunc_Sort("sort", vm) + case "groupBy": + PreludioFunc_GroupBy("groupBy", vm) + case "ungroup": + PreludioFunc_Ungroup("ungroup", vm) + case "orderBy": + PreludioFunc_OrderBy("orderBy", vm) case "take": PreludioFunc_Take("take", vm) @@ -583,37 +587,37 @@ MAIN_LOOP: vm.printDebug(10, "OP_BINARY_MUL", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_MUL, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_MUL, op2) case typesys.OP_BINARY_DIV: vm.printDebug(10, "OP_BINARY_DIV", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_DIV, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_DIV, op2) case typesys.OP_BINARY_MOD: vm.printDebug(10, "OP_BINARY_MOD", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_MOD, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_MOD, op2) case typesys.OP_BINARY_ADD: vm.printDebug(10, "OP_BINARY_ADD", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_ADD, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_ADD, op2) case typesys.OP_BINARY_SUB: vm.printDebug(10, "OP_BINARY_SUB", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_SUB, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_SUB, op2) case typesys.OP_BINARY_POW: vm.printDebug(10, "OP_BINARY_POW", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_POW, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_POW, op2) 
/////////////////////////////////////////////////////////////////////// /////////// LOGICAL OPERATIONS @@ -622,49 +626,49 @@ MAIN_LOOP: vm.printDebug(10, "OP_BINARY_EQ", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_EQ, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_EQ, op2) case typesys.OP_BINARY_NE: vm.printDebug(10, "OP_BINARY_NE", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_NE, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_NE, op2) case typesys.OP_BINARY_GE: vm.printDebug(10, "OP_BINARY_GE", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_GE, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_GE, op2) case typesys.OP_BINARY_LE: vm.printDebug(10, "OP_BINARY_LE", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_LE, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_LE, op2) case typesys.OP_BINARY_GT: vm.printDebug(10, "OP_BINARY_GT", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_GT, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_GT, op2) case typesys.OP_BINARY_LT: vm.printDebug(10, "OP_BINARY_LT", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_LT, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_LT, op2) case typesys.OP_BINARY_AND: vm.printDebug(10, "OP_BINARY_AND", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_AND, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_AND, op2) case typesys.OP_BINARY_OR: vm.printDebug(10, "OP_BINARY_OR", "", "") op2 := vm.stackPop() - vm.stackLast().appendOperand(typesys.OP_BINARY_OR, op2) + vm.stackLast().appendBinaryOperation(typesys.OP_BINARY_OR, op2) /////////////////////////////////////////////////////////////////////// /////////// OTHER OPERATIONS @@ -681,12 +685,18 @@ MAIN_LOOP: case typesys.OP_UNARY_SUB: 
vm.printDebug(10, "OP_UNARY_SUB", "", "") + vm.stackLast().appendUnaryOperation(typesys.OP_UNARY_SUB) + case typesys.OP_UNARY_ADD: vm.printDebug(10, "OP_UNARY_ADD", "", "") + vm.stackLast().appendUnaryOperation(typesys.OP_UNARY_ADD) + case typesys.OP_UNARY_NOT: vm.printDebug(10, "OP_UNARY_NOT", "", "") + vm.stackLast().appendUnaryOperation(typesys.OP_UNARY_NOT) + /////////////////////////////////////////////////////////////////////// /////////// NO OPERATION @@ -754,14 +764,14 @@ LOOP1: if solve { for _, p := range positionalParams { - if err := solveExpr(vm, p); err != nil { + if err := vm.solveExpr(p); err != nil { return positionalParams, assignments, err } } if namedParams != nil { for _, p := range *namedParams { - if err := solveExpr(vm, p); err != nil { + if err := vm.solveExpr(p); err != nil { return positionalParams, assignments, err } } @@ -769,7 +779,7 @@ LOOP1: if acceptingAssignments { for _, p := range assignments { - if err := solveExpr(vm, p); err != nil { + if err := vm.solveExpr(p); err != nil { return positionalParams, assignments, err } } diff --git a/main.go b/main.go index bf32da0..1e9e66c 100644 --- a/main.go +++ b/main.go @@ -15,7 +15,7 @@ import ( tea "github.com/charmbracelet/bubbletea" ) -const VERSION = "0.1.0-alpha" +const VERSION = "0.2.0" type CliArgs struct { SourceCode string `arg:"-s, --source" help:"source code to execute" default:""`