diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 92c248354..a085445c0 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -27,7 +27,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: bufbuild/buf-setup-action@v1.34.0 + - uses: bufbuild/buf-setup-action@v1.35.1 with: github_token: ${{ github.token }} - run: buf format --diff --exit-code @@ -36,7 +36,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: bufbuild/buf-setup-action@v1.34.0 + - uses: bufbuild/buf-setup-action@v1.35.1 with: github_token: ${{ github.token }} - uses: bufbuild/buf-lint-action@v1 @@ -72,7 +72,7 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: bufbuild/buf-setup-action@v1.34.0 + - uses: bufbuild/buf-setup-action@v1.35.1 - uses: actions/setup-node@v4 with: node-version: "20" @@ -93,7 +93,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: bufbuild/buf-setup-action@v1.34.0 + - uses: bufbuild/buf-setup-action@v1.35.1 - name: Run proto-prefix.py run: tools/proto_prefix.py output test proto go_package=github.com/test/proto - name: Modify buf config to build rewritten proto files diff --git a/.github/workflows/pr_breaking.yml b/.github/workflows/pr_breaking.yml index f39d3b21e..feb43cce2 100644 --- a/.github/workflows/pr_breaking.yml +++ b/.github/workflows/pr_breaking.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: bufbuild/buf-setup-action@v1.34.0 + - uses: bufbuild/buf-setup-action@v1.35.1 with: github_token: ${{ github.token }} - name: check for breaking changes diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 443d34eff..7cd5c05a6 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,7 +33,7 @@ jobs: with: node-version: "20" - - uses: bufbuild/buf-setup-action@v1.34.0 + - uses: bufbuild/buf-setup-action@v1.35.1 with: github_token: ${{ github.token 
}} diff --git a/CHANGELOG.md b/CHANGELOG.md index 02235beab..e70c9c095 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,26 @@ Release Notes --- +## [0.53.0](https://github.com/substrait-io/substrait/compare/v0.52.0...v0.53.0) (2024-08-04) + +### ⚠ BREAKING CHANGES + +* PrecisionTimestamp(Tz) literal's value is now int64 +instead of uint64 + +### Features + +* add aggregate count functions with decimal return type ([#670](https://github.com/substrait-io/substrait/issues/670)) ([2aa516b](https://github.com/substrait-io/substrait/commit/2aa516bff3b2cc3e5ad262152c98f1d9b15c6765)) +* add arithmetic function "sqrt" and "factorial" with decimal type ([#674](https://github.com/substrait-io/substrait/issues/674)) ([e4f5b68](https://github.com/substrait-io/substrait/commit/e4f5b68981953d3546835572ce566e9586d497be)) +* add arithmetic function for bitwise(AND/OR/XOR) operation with decimal arguments ([#675](https://github.com/substrait-io/substrait/issues/675)) ([a70cf72](https://github.com/substrait-io/substrait/commit/a70cf72425c3a0eed432238c2a8afedab1cc025b)) +* add logarithmic functions with decimal type args ([#669](https://github.com/substrait-io/substrait/issues/669)) ([d9fb1e3](https://github.com/substrait-io/substrait/commit/d9fb1e355e0b378e1b6460f256d724a3aae931d3)) +* add precision timestamp datetime fn variants ([#666](https://github.com/substrait-io/substrait/issues/666)) ([60c93d2](https://github.com/substrait-io/substrait/commit/60c93d28c8e4df3174ba6b3f687a30d256acdcae)) +* clarify the meaning of plans ([#616](https://github.com/substrait-io/substrait/issues/616)) ([c1553df](https://github.com/substrait-io/substrait/commit/c1553dfafa09de1b2441cdb1d22a251a675419a7)), closes [#612](https://github.com/substrait-io/substrait/issues/612) [#613](https://github.com/substrait-io/substrait/issues/613) + +### Bug Fixes + +* use int64 instead of uint64 for PrecisionTimestamp(Tz) literal value ([#668](https://github.com/substrait-io/substrait/issues/668)) 
([da3c74e](https://github.com/substrait-io/substrait/commit/da3c74eccc4978bdaeca4760e98a77aff560e19b)) + ## [0.52.0](https://github.com/substrait-io/substrait/compare/v0.51.0...v0.52.0) (2024-07-14) ### ⚠ BREAKING CHANGES diff --git a/extensions/functions_aggregate_decimal_output.yaml b/extensions/functions_aggregate_decimal_output.yaml new file mode 100644 index 000000000..13a3b2e23 --- /dev/null +++ b/extensions/functions_aggregate_decimal_output.yaml @@ -0,0 +1,41 @@ +%YAML 1.2 +--- +aggregate_functions: + - name: "count" + description: Count a set of values. Result is returned as a decimal instead of i64. + impls: + - args: + - name: x + value: any + options: + overflow: + values: [SILENT, SATURATE, ERROR] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: decimal<38,0> + return: decimal<38,0> + - name: "count" + description: "Count a set of records (not field referenced). Result is returned as a decimal instead of i64." + impls: + - options: + overflow: + values: [SILENT, SATURATE, ERROR] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: decimal<38,0> + return: decimal<38,0> + - name: "approx_count_distinct" + description: >- + Calculates the approximate number of rows that contain distinct values of the expression argument using + HyperLogLog. This function provides an alternative to the COUNT (DISTINCT expression) function, which + returns the exact number of rows that contain distinct values of an expression. APPROX_COUNT_DISTINCT + processes large amounts of data significantly faster than COUNT, with negligible deviation from the exact + result. Result is returned as a decimal instead of i64. 
+ impls: + - args: + - name: x + value: any + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: binary + return: decimal<38,0> diff --git a/extensions/functions_arithmetic_decimal.yaml b/extensions/functions_arithmetic_decimal.yaml index b62c63484..57cdbe396 100644 --- a/extensions/functions_arithmetic_decimal.yaml +++ b/extensions/functions_arithmetic_decimal.yaml @@ -102,7 +102,7 @@ scalar_functions: scale_after_borrow = max(init_scale - delta, min_scale) scale = init_prec > 38 ? scale_after_borrow : init_scale DECIMAL - - + - name: "abs" description: Calculate the absolute value of the argument. impls: @@ -110,6 +110,78 @@ scalar_functions: - name: x value: decimal return: decimal + - name: "bitwise_and" + description: > + Return the bitwise AND result for two decimal inputs. + In inputs scale must be 0 (i.e. only integer types are allowed) + impls: + - args: + - name: x + value: "DECIMAL<P1,0>" + - name: y + value: "DECIMAL<P2,0>" + return: |- + max_precision = max(P1, P2) + DECIMAL<max_precision,0> + - name: "bitwise_or" + description: > + Return the bitwise OR result for two given decimal inputs. + In inputs scale must be 0 (i.e. only integer types are allowed) + impls: + - args: + - name: x + value: "DECIMAL<P1,0>" + - name: y + value: "DECIMAL<P2,0>" + return: |- + max_precision = max(P1, P2) + DECIMAL<max_precision,0> + - name: "bitwise_xor" + description: > + Return the bitwise XOR result for two given decimal inputs. + In inputs scale must be 0 (i.e. only integer types are allowed) + impls: + - args: + - name: x + value: "DECIMAL<P1,0>" + - name: y + value: "DECIMAL<P2,0>" + return: |- + max_precision = max(P1, P2) + DECIMAL<max_precision,0> + - name: "sqrt" + description: Square root of the value. Sqrt of 0 is 0 and sqrt of negative values will raise an error. + impls: + - args: + - name: x + value: "DECIMAL" + return: fp64 + - name: "factorial" + description: > + Return the factorial of a given decimal input. Scale should be 0 for factorial decimal input. + The factorial of 0! is 1 by convention.
Negative inputs will raise an error. + Input which cause overflow of result will raise an error. + impls: + - args: + - name: "n" + value: "DECIMAL" + return: "DECIMAL<38,0>" + - + name: "power" + description: "Take the power with x as the base and y as exponent. + Behavior for complex number result is indicated by option complex_number_result" + impls: + - args: + - name: x + value: "DECIMAL" + - name: y + value: "DECIMAL" + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + complex_number_result: + values: [ NAN, ERROR ] + return: fp64 aggregate_functions: - name: "sum" diff --git a/extensions/functions_datetime.yaml b/extensions/functions_datetime.yaml index 0d575b5dd..cde3beb68 100644 --- a/extensions/functions_datetime.yaml +++ b/extensions/functions_datetime.yaml @@ -83,7 +83,7 @@ scalar_functions: MILLISECOND, MICROSECOND, NANOSECOND, SUBSECOND, UNIX_TIME, TIMEZONE_OFFSET ] description: The part of the value to extract. - name: x - value: precision_timestamp_tz + value: precision_timestamp_tz<P>
- name: timezone description: Timezone string from IANA tzdb. value: string @@ -102,7 +102,7 @@ scalar_functions: MILLISECOND, MICROSECOND, NANOSECOND, SUBSECOND, UNIX_TIME ] description: The part of the value to extract. - name: x - value: precision_timestamp + value: precision_timestamp<P>
return: i64 - args: - name: component @@ -141,7 +141,7 @@ scalar_functions: options: [ ONE, ZERO ] description: Start counting from 1 or 0. - name: x - value: precision_timestamp_tz + value: precision_timestamp_tz<P>
- name: timezone description: Timezone string from IANA tzdb. value: string @@ -166,7 +166,7 @@ scalar_functions: options: [ ONE, ZERO ] description: Start counting from 1 or 0. - name: x - value: precision_timestamp + value: precision_timestamp<P>
return: i64 - args: - name: component @@ -198,6 +198,13 @@ scalar_functions: - name: x value: timestamp return: boolean + - args: + - name: component + options: [ IS_LEAP_YEAR ] + description: The part of the value to extract. + - name: x + value: precision_timestamp<P>
+ return: boolean - args: - name: component options: [ IS_LEAP_YEAR, IS_DST ] @@ -208,6 +215,16 @@ scalar_functions: description: Timezone string from IANA tzdb. value: string return: boolean + - args: + - name: component + options: [ IS_LEAP_YEAR, IS_DST ] + description: The part of the value to extract. + - name: x + value: precision_timestamp_tz<P>
+ - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: boolean - args: - name: component options: [ IS_LEAP_YEAR ] @@ -230,6 +247,12 @@ scalar_functions: - name: y value: interval_year return: timestamp + - args: + - name: x + value: precision_timestamp<P>
+ - name: y + value: interval_year + return: precision_timestamp<P>
- args: - name: x value: timestamp_tz @@ -239,6 +262,15 @@ scalar_functions: description: Timezone string from IANA tzdb. value: string return: timestamp_tz + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: y + value: interval_year + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: precision_timestamp_tz<P>
- args: - name: x value: date @@ -251,12 +283,24 @@ scalar_functions: - name: y value: interval_day return: timestamp + - args: + - name: x + value: precision_timestamp<P>
+ - name: y + value: interval_day + return: precision_timestamp<P>
- args: - name: x value: timestamp_tz - name: y value: interval_day return: timestamp_tz + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: y + value: interval_day + return: precision_timestamp_tz<P>
- args: - name: x value: date @@ -346,12 +390,24 @@ scalar_functions: - name: y value: interval_year return: timestamp + - args: + - name: x + value: precision_timestamp<P>
+ - name: y + value: interval_year + return: precision_timestamp<P>
- args: - name: x value: timestamp_tz - name: y value: interval_year return: timestamp_tz + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: y + value: interval_year + return: precision_timestamp_tz<P>
- args: - name: x value: timestamp_tz @@ -361,6 +417,15 @@ scalar_functions: description: Timezone string from IANA tzdb. value: string return: timestamp_tz + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: y + value: interval_year + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: precision_timestamp_tz<P>
- args: - name: x value: date @@ -373,12 +438,24 @@ scalar_functions: - name: y value: interval_day return: timestamp + - args: + - name: x + value: precision_timestamp<P>
+ - name: y + value: interval_day + return: precision_timestamp<P>
- args: - name: x value: timestamp_tz - name: y value: interval_day return: timestamp_tz + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: y + value: interval_day + return: precision_timestamp_tz<P>
- args: - name: x value: date @@ -395,12 +472,24 @@ scalar_functions: - name: y value: timestamp return: boolean + - args: + - name: x + value: precision_timestamp<P>
+ - name: y + value: precision_timestamp<P>
+ return: boolean - args: - name: x value: timestamp_tz - name: y value: timestamp_tz return: boolean + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: y + value: precision_timestamp_tz<P>
+ return: boolean - args: - name: x value: date @@ -429,12 +518,24 @@ scalar_functions: - name: y value: timestamp return: boolean + - args: + - name: x + value: precision_timestamp<P>
+ - name: y + value: precision_timestamp<P>
+ return: boolean - args: - name: x value: timestamp_tz - name: y value: timestamp_tz return: boolean + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: y + value: precision_timestamp_tz<P>
+ return: boolean - args: - name: x value: date @@ -463,12 +564,24 @@ scalar_functions: - name: y value: timestamp return: boolean + - args: + - name: x + value: precision_timestamp<P>
+ - name: y + value: precision_timestamp<P>
+ return: boolean - args: - name: x value: timestamp_tz - name: y value: timestamp_tz return: boolean + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: y + value: precision_timestamp_tz<P>
+ return: boolean - args: - name: x value: date @@ -497,12 +610,24 @@ scalar_functions: - name: y value: timestamp return: boolean + - args: + - name: x + value: precision_timestamp<P>
+ - name: y + value: precision_timestamp<P>
+ return: boolean - args: - name: x value: timestamp_tz - name: y value: timestamp_tz return: boolean + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: y + value: precision_timestamp_tz<P>
+ return: boolean - args: - name: x value: date @@ -537,6 +662,13 @@ scalar_functions: description: Timezone string from IANA tzdb. value: string return: timestamp_tz + - args: + - name: x + value: precision_timestamp<P>
+ - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: precision_timestamp_tz<P>
- args: - name: x value: date @@ -560,6 +692,13 @@ scalar_functions: description: Timezone string from IANA tzdb. value: string return: timestamp + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: precision_timestamp<P>
- name: "strptime_time" description: >- @@ -627,6 +766,12 @@ scalar_functions: - name: format value: string return: string + - args: + - name: x + value: precision_timestamp<P>
+ - name: format + value: string + return: string - args: - name: x value: timestamp_tz @@ -636,6 +781,15 @@ scalar_functions: description: Timezone string from IANA tzdb. value: string return: string + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: format + value: string + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: string - args: - name: x value: date @@ -674,6 +828,18 @@ scalar_functions: - name: origin value: timestamp return: timestamp + - args: + - name: x + value: precision_timestamp<P>
+ - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: multiple + value: i64 + - name: origin + value: precision_timestamp<P>
+ return: precision_timestamp<P>
- args: - name: x value: timestamp_tz @@ -689,6 +855,21 @@ scalar_functions: - name: origin value: timestamp_tz return: timestamp_tz + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: multiple + value: i64 + - name: timezone + description: Timezone string from IANA tzdb. + value: string + - name: origin + value: precision_timestamp_tz<P>
+ return: precision_timestamp_tz<P>
- args: - name: x value: date @@ -741,6 +922,19 @@ scalar_functions: - name: multiple value: i64 return: timestamp + - args: + - name: x + value: precision_timestamp<P>
+ - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: origin + options: [ YEAR, MONTH, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, + US_WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND ] + - name: multiple + value: i64 + return: precision_timestamp<P>
- args: - name: x value: timestamp_tz @@ -757,6 +951,22 @@ scalar_functions: description: Timezone string from IANA tzdb. value: string return: timestamp_tz + - args: + - name: x + value: precision_timestamp_tz<P>
+ - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: origin + options: [ YEAR, MONTH, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, + US_WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND ] + - name: multiple + value: i64 + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: precision_timestamp_tz<P>
- args: - name: x value: date @@ -811,6 +1021,13 @@ aggregate_functions: decomposable: MANY intermediate: timestamp? return: timestamp? + - args: + - name: x + value: precision_timestamp<P>
+ nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: precision_timestamp?<P>
+ return: precision_timestamp?<P>
- args: - name: x value: timestamp_tz @@ -818,6 +1035,13 @@ aggregate_functions: decomposable: MANY intermediate: timestamp_tz? return: timestamp_tz? + - args: + - name: x + value: precision_timestamp_tz<P>
+ nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: precision_timestamp_tz?<P>
+ return: precision_timestamp_tz?<P>
- args: - name: x value: interval_day @@ -863,6 +1087,13 @@ aggregate_functions: decomposable: MANY intermediate: timestamp_tz? return: timestamp_tz? + - args: + - name: x + value: precision_timestamp_tz<P>
+ nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: precision_timestamp_tz?<P>
+ return: precision_timestamp_tz?<P>
- args: - name: x value: interval_day diff --git a/extensions/functions_logarithmic.yaml b/extensions/functions_logarithmic.yaml index 5925e6cb4..b46f3d308 100644 --- a/extensions/functions_logarithmic.yaml +++ b/extensions/functions_logarithmic.yaml @@ -38,6 +38,17 @@ scalar_functions: on_log_zero: values: [NAN, ERROR, MINUS_INFINITY] return: fp64 + - args: + - name: x + value: decimal + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, "NULL", ERROR ] + on_log_zero: + values: [ NAN, ERROR, MINUS_INFINITY ] + return: fp64 - name: "log10" description: "Logarithm to base 10 of the value" @@ -75,6 +86,17 @@ scalar_functions: on_log_zero: values: [NAN, ERROR, MINUS_INFINITY] return: fp64 + - args: + - name: x + value: decimal + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, "NULL", ERROR ] + on_log_zero: + values: [ NAN, ERROR, MINUS_INFINITY ] + return: fp64 - name: "log2" description: "Logarithm to base 2 of the value" @@ -112,6 +134,17 @@ scalar_functions: on_log_zero: values: [NAN, ERROR, MINUS_INFINITY] return: fp64 + - args: + - name: x + value: decimal + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, "NULL", ERROR ] + on_log_zero: + values: [ NAN, ERROR, MINUS_INFINITY ] + return: fp64 - name: "logb" description: > @@ -164,6 +197,21 @@ scalar_functions: on_log_zero: values: [NAN, ERROR, MINUS_INFINITY] return: fp64 + - args: + - value: decimal + name: "x" + description: "The number `x` to compute the logarithm of" + - value: decimal + name: "base" + description: "The logarithm base `b` to use" + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, "NULL", ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp64 - name: "log1p" 
description: > @@ -193,3 +241,14 @@ scalar_functions: on_log_zero: values: [NAN, ERROR, MINUS_INFINITY] return: fp64 + - args: + - name: x + value: decimal + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, "NULL", ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp64 diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index 75042ac58..f36e71bd0 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -868,7 +868,7 @@ message Expression { // Sub-second precision, 0 means the value given is in seconds, 3 is milliseconds, 6 microseconds, 9 is nanoseconds int32 precision = 1; // Time passed since 1970-01-01 00:00:00.000000 in UTC for PrecisionTimestampTZ and unspecified timezone for PrecisionTimestamp - uint64 value = 2; + int64 value = 2; } message Map { diff --git a/site/docs/relations/basics.md b/site/docs/relations/basics.md index 41e86f425..e940741a8 100644 --- a/site/docs/relations/basics.md +++ b/site/docs/relations/basics.md @@ -1,6 +1,14 @@ # Basics -Substrait is designed to allow a user to construct an arbitrarily complex data transformation plan. The plan is composed of one or more relational operations. Relational operations are well-defined transformation operations that work by taking zero or more input datasets and transforming them into zero or more output transformations. Substrait defines a core set of transformations, but users are also able to extend the operations with their own specialized operations. +Substrait is designed to allow a user to describe arbitrarily complex data transformations. These transformations are composed of one or more relational operations. Relational operations are well-defined transformation operations that work by taking zero or more input datasets and transforming them into zero or more output transformations. 
Substrait defines a core set of transformations, but users are also able to extend the operations with their own specialized operations. + +## Plans + +A plan is a tree of relations. The root of the tree is the final output of the plan. Each node in the tree is a relational operation. The children of a node are the inputs to the operation. The leaves of the tree are the input datasets to the plan. + +Plans can be composed together using reference relations. This allows for the construction of common plans that can be reused in multiple places. If a plan has no cycles (there is only one plan or each reference relation only references later plans) then the plan will form a DAG (Directed Acyclic Graph). + +## Relational Operators Each relational operation is composed of several properties. Common properties for relational operations include the following: @@ -10,8 +18,6 @@ Each relational operation is composed of several properties. Common properties f | Hints | A set of optionally provided, optionally consumed information about an operation that better informs execution. These might include estimated number of input and output records, estimated record size, likely filter reduction, estimated dictionary size, etc. These can also include implementation specific pieces of execution information. | Physical | | Constraint | A set of runtime constraints around the operation, limiting its consumption based on real-world resources (CPU, memory) as well as virtual resources like number of records produced, the largest record size, etc. | Physical | - - ## Relational Signatures In functions, function signatures are declared externally to the use of those signatures (function bindings). In the case of relational operations, signatures are declared directly in the specification. This is due to the speed of change and number of total operations. Relational operations in the specification are expected to be <100 for several years with additions being infrequent. 
On the other hand, there is an expectation of both a much larger number of functions (1,000s) and a much higher velocity of additions. diff --git a/site/docs/serialization/_config b/site/docs/serialization/_config index 30dd0ee96..8642e6b77 100644 --- a/site/docs/serialization/_config +++ b/site/docs/serialization/_config @@ -1,3 +1,5 @@ arrange: - - binary_serialization.md - - text_serialization.md + +- basics.md +- binary_serialization.md +- text_serialization.md diff --git a/site/docs/serialization/basics.md b/site/docs/serialization/basics.md new file mode 100644 index 000000000..093719df4 --- /dev/null +++ b/site/docs/serialization/basics.md @@ -0,0 +1,25 @@ +# Basics + +Substrait is designed to be serialized into various different formats. Currently we support a binary serialization for +transmission of plans between programs (e.g. IPC or network communication) and a text serialization for debugging and human readability. Other formats may be added in the future. + +These formats serialize a collection of plans. Substrait does not define how a collection of plans is to be interpreted. +For example, the following scenarios are all valid uses of a collection of plans: + +- A query engine receives a plan and executes it. It receives a collection of plans with a single root plan. The + top-level node of the root plan defines the output of the query. Non-root plans may be included as common subplans + which are referenced from the root plan. +- A transpiler may convert plans from one dialect to another. It could take, as input, a single root plan. Then + it could output a serialized binary containing multiple root plans. Each root plan is a representation of the + input plan in a different dialect. +- A distributed scheduler might expect 1+ root plans. Each root plan describes a different stage of computation. + +Libraries should make sure to thoroughly describe the way plan collections will be produced or consumed. 
+ +## Root plans + +We often refer to query plans as a graph of nodes (typically a DAG unless the query is recursive). However, we +encode this graph as a collection of trees with a single root tree that references other trees (which may also +transitively reference other trees). Plan serializations all have some way to indicate which plan(s) are "root" +plans. Any plan that is not a root plan and is not referenced (directly or transitively) by some root plan +can safely be ignored. diff --git a/site/docs/types/type_classes.md b/site/docs/types/type_classes.md index fda1cfade..26233493a 100644 --- a/site/docs/types/type_classes.md +++ b/site/docs/types/type_classes.md @@ -41,8 +41,8 @@ Compound type classes are type classes that need to be configured by means of a | NSTRUCT<N:T1,...,N:Tn> | **Pseudo-type**: A struct that maps unique names to value types. Each name is a UTF-8-encoded string. Each value can have a distinct type. Note that NSTRUCT is actually a pseudo-type, because Substrait's core type system is based entirely on ordinal positions, not named fields. Nonetheless, when working with systems outside Substrait, names are important. | n/a | LIST<T> | A list of values of type T. The list can be between [0..2,147,483,647] values in length. | `repeated Literal`, all types matching T | MAP<K, V> | An unordered list of type K keys with type V values. Keys may be repeated. While the key type could be nullable, keys may not be null. | `repeated KeyValue` (in turn two `Literal`s), all key types matching K and all value types matching V -| PRECISIONTIMESTAMP<P> | A timestamp with fractional second precision (P, number of digits) 0 <= P <= 9. Does not include timezone information and can thus not be unambiguously mapped to a moment on the timeline without context. Similar to naive datetime in Python. 
| `uint64` microseconds or nanoseconds since 1970-01-01 00:00:00.000000000 (in an unspecified timezone) -| PRECISIONTIMESTAMPTZ<P> | A timezone-aware timestamp, with fractional second precision (P, number of digits) 0 <= P <= 9. Similar to aware datetime in Python. | `uint64` microseconds or nanoseconds since 1970-01-01 00:00:00.000000000 UTC +| PRECISIONTIMESTAMP<P> | A timestamp with fractional second precision (P, number of digits) 0 <= P <= 9. Does not include timezone information and can thus not be unambiguously mapped to a moment on the timeline without context. Similar to naive datetime in Python. | `int64` seconds, milliseconds, microseconds or nanoseconds since 1970-01-01 00:00:00.000000000 (in an unspecified timezone) +| PRECISIONTIMESTAMPTZ<P> | A timezone-aware timestamp, with fractional second precision (P, number of digits) 0 <= P <= 9. Similar to aware datetime in Python. | `int64` seconds, milliseconds, microseconds or nanoseconds since 1970-01-01 00:00:00.000000000 UTC ## User-Defined Types