diff --git a/.github/deploy_manylinux.sh b/.github/deploy_manylinux.sh index 941543d9e7542..993f4b39f2f5c 100644 --- a/.github/deploy_manylinux.sh +++ b/.github/deploy_manylinux.sh @@ -8,7 +8,7 @@ ls -la rm py-polars/README.md cp README.md py-polars/README.md cd py-polars -rustup override set nightly-2023-06-23 +rustup override set nightly-2023-07-27 export RUSTFLAGS='-C target-feature=+fxsr,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+avx,+fma' # first the default release diff --git a/.github/release-drafter-python.yml b/.github/release-drafter-python.yml index 17dbcd1cc46db..d2e17c11a9059 100644 --- a/.github/release-drafter-python.yml +++ b/.github/release-drafter-python.yml @@ -6,3 +6,10 @@ tag-prefix: py- include-labels: - python + +version-resolver: + minor: + labels: + - breaking + - breaking python + default: patch diff --git a/.github/release-drafter-rust.yml b/.github/release-drafter-rust.yml index 1ca73d2a3490b..10c3b7ddf759f 100644 --- a/.github/release-drafter-rust.yml +++ b/.github/release-drafter-rust.yml @@ -6,3 +6,10 @@ tag-prefix: rs- include-labels: - rust + +version-resolver: + minor: + labels: + - breaking + - breaking rust + default: patch diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index d7fb09be71da1..3b0d9a9eece8c 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -18,40 +18,47 @@ categories: - internal exclude-labels: - - skip-changelog + - skip changelog - release change-template: '- $TITLE (#$NUMBER)' change-title-escapes: '\<*_&' replacers: # Remove conventional commits from titles - - search: '/- (build|chore|depr|docs|feat|fix|perf|release)(\(.*\))?(\!)?\: /g' + - search: '/- (build|chore|ci|depr|docs|feat|fix|perf|refactor|release|test)(\(.*\))?(\!)?\: /g' replace: '- ' -version-resolver: - minor: - labels: breaking - default: patch - autolabeler: - label: rust title: - - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*rust.*\))?\!?\: /' + # Example: feat(rust): ... 
+ - '/^(build|chore|ci|depr|docs|feat|fix|perf|refactor|release|test)(\(.*rust.*\))?\!?\: /' - label: python title: - - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*python.*\))?\!?\: /' + # Example: feat(python): ... + - '/^(build|chore|ci|depr|docs|feat|fix|perf|refactor|release|test)(\(.*python.*\))?\!?\: /' - label: cli title: - - '/^(build|chore|depr|docs|feat|fix|perf|release)\(.*cli.*\)\!?\: /' # CLI tag not in global scope + # Example: feat(cli): ... + - '/^(build|chore|ci|depr|docs|feat|fix|perf|refactor|release|test)\(.*cli.*\)\!?\: /' # CLI tag not in global scope - label: breaking title: - - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*\))?\!\: /' + # Example: feat!: ... + - '/^(build|chore|ci|depr|docs|feat|fix|perf|refactor|release|test)(\(.*\))?\!\: /' + - label: breaking rust + title: + # Example: feat(rust!, python): ... + - '/^(build|chore|ci|depr|docs|feat|fix|perf|refactor|release|test)\(.*rust\!.*\)\: /' + - label: breaking python + title: + # Example: feat(python!): ... 
+ - '/^(build|chore|ci|depr|docs|feat|fix|perf|refactor|release|test)\(.*python\!.*\)\: /' - label: build title: - '/^build/' - label: internal title: - - '/^chore/' + - '/^(chore|ci|refactor|test)/' - label: deprecation title: - '/^depr/' diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0f509f41f99e0..b467a0ef05a91 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -3,7 +3,7 @@ name: Benchmark on: pull_request: paths: - - polars/** + - crates/** - Cargo.toml - py-polars/tests/benchmark/** - .github/workflows/benchmark.yml @@ -11,7 +11,7 @@ on: branches: - main paths: - - polars/** + - crates/** - Cargo.toml - py-polars/tests/benchmark/** - .github/workflows/benchmark.yml diff --git a/.github/workflows/docs-rust.yml b/.github/workflows/docs-rust.yml index 0439b307b1e03..7af4196d7a7aa 100644 --- a/.github/workflows/docs-rust.yml +++ b/.github/workflows/docs-rust.yml @@ -3,13 +3,13 @@ name: Build Rust documentation on: pull_request: paths: - - polars/** + - crates/** - .github/workflows/docs-rust.yml push: branches: - main paths: - - polars/** + - crates/** - .github/workflows/docs-rust.yml concurrency: @@ -32,7 +32,7 @@ jobs: - name: Build Rust documentation env: RUSTDOCFLAGS: --cfg docsrs -D warnings - working-directory: polars + working-directory: crates run: make doctest - name: Create redirect to Polars crate and set no-jekyll diff --git a/.github/workflows/lint-global.yml b/.github/workflows/lint-global.yml index 4177255016960..c4b29df37954e 100644 --- a/.github/workflows/lint-global.yml +++ b/.github/workflows/lint-global.yml @@ -15,4 +15,4 @@ jobs: - name: Lint Markdown and TOML uses: dprint/check@v2.2 - name: Spell Check with Typos - uses: crate-ci/typos@v1.15.9 + uses: crate-ci/typos@v1.16.1 diff --git a/.github/workflows/lint-py-polars.yml b/.github/workflows/lint-py-polars.yml index 771136b868991..5265f3181b3b8 100644 --- a/.github/workflows/lint-py-polars.yml +++ 
b/.github/workflows/lint-py-polars.yml @@ -3,7 +3,7 @@ name: Lint py-polars crate on: pull_request: paths: - - polars/** + - crates/** - py-polars/src/** - py-polars/Cargo.toml - .github/workflows/lint-py-polars.yml @@ -11,7 +11,7 @@ on: branches: - main paths: - - polars/** + - crates/** - py-polars/src/** - py-polars/Cargo.toml - .github/workflows/lint-py-polars.yml diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml index 0f3c7e3852207..325cb1e833a51 100644 --- a/.github/workflows/lint-python.yml +++ b/.github/workflows/lint-python.yml @@ -30,9 +30,9 @@ jobs: - name: Lint Python run: | + ruff --exit-non-zero-on-fix . black --check . blackdoc --check . - ruff . --exit-non-zero-on-fix mypy: runs-on: ubuntu-latest diff --git a/.github/workflows/lint-rust.yml b/.github/workflows/lint-rust.yml index bc7c18399145c..933d3a46ea366 100644 --- a/.github/workflows/lint-rust.yml +++ b/.github/workflows/lint-rust.yml @@ -3,7 +3,7 @@ name: Lint Rust on: pull_request: paths: - - polars/** + - crates/** - polars-cli/** - examples/** - Cargo.toml @@ -12,7 +12,7 @@ on: branches: - main paths: - - polars/** + - crates/** - polars-cli/** - examples/** - Cargo.toml diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index df75c7aa41ea1..9eea7d2f36102 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -6,7 +6,7 @@ on: - py-* env: - RUST_TOOLCHAIN: nightly-2023-06-23 + RUST_TOOLCHAIN: nightly-2023-07-27 PYTHON_VERSION: '3.8' MATURIN_VERSION: '1.1.0' MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/test-bytecode-parser.yml b/.github/workflows/test-bytecode-parser.yml new file mode 100644 index 0000000000000..23e0a8e8ff25b --- /dev/null +++ b/.github/workflows/test-bytecode-parser.yml @@ -0,0 +1,49 @@ +name: Test Bytecode Parser + +on: + pull_request: + paths: + - py-polars/** + - .github/workflows/test-bytecode-parser.yml + push: + branches: + 
- main + paths: + - py-polars/** + - .github/workflows/test-bytecode-parser.yml + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +defaults: + run: + working-directory: py-polars + +jobs: + ubuntu: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11'] + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Create virtual environment + run: | + python -m venv .venv + echo "$GITHUB_WORKSPACE/py-polars/.venv/bin" >> $GITHUB_PATH + + - name: Install dependencies + run: pip install numpy pytest + + - name: Run tests + if: github.ref_name != 'main' + run: PYTHONPATH=polars/utils pytest tests/test_udfs.py diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml index a64414598fc0e..e6de23dca87a6 100644 --- a/.github/workflows/test-python.yml +++ b/.github/workflows/test-python.yml @@ -4,13 +4,13 @@ on: pull_request: paths: - py-polars/** - - polars/** + - crates/** - .github/workflows/test-python.yml push: branches: - main paths: - - polars/** + - crates/** - py-polars/** - .github/workflows/test-python.yml diff --git a/.github/workflows/test-rust.yml b/.github/workflows/test-rust.yml index 4bbf604355e6a..016a22e15aadc 100644 --- a/.github/workflows/test-rust.yml +++ b/.github/workflows/test-rust.yml @@ -3,7 +3,7 @@ name: Test Rust on: pull_request: paths: - - polars/** + - crates/** - examples/** - Cargo.toml - .github/workflows/test-rust.yml @@ -11,7 +11,7 @@ on: branches: - main paths: - - polars/** + - crates/** - examples/** - Cargo.toml - .github/workflows/test-rust.yml @@ -27,7 +27,7 @@ jobs: test: runs-on: ${{ matrix.os }} strategy: - fail-fast: true + fail-fast: false matrix: os: [ubuntu-latest, windows-latest] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 35621b486710b..f08dea64ec58a 100644 --- a/CONTRIBUTING.md +++ 
b/CONTRIBUTING.md @@ -36,7 +36,7 @@ Please describe the behavior you want and why, and provide examples of how Polar ### Picking an issue Pick an issue by going through the [issue tracker](https://github.com/pola-rs/polars/issues) and finding an issue you would like to work on. -Feel free to pick any issue that is not already assigned. +Feel free to pick any issue with an [accepted](https://github.com/pola-rs/polars/issues?q=is%3Aopen+is%3Aissue+label%3Aaccepted) label that is not already assigned. We use the [help wanted](https://github.com/pola-rs/polars/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) label to indicate issues that are high on our wishlist. If you are a first time contributor, you might want to look for issues labeled [good first issue](https://github.com/pola-rs/polars/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22). @@ -114,7 +114,7 @@ If this all runs correctly, you're ready to start contributing to the Polars cod Create a new git branch from the `main` branch in your local repository, and start coding! -The Rust codebase is located in the `polars` directory, while the Python codebase is located in the `py-polars` directory. +The Rust code is located in the `crates` directory, while the Python codebase is located in the `py-polars` directory. Both directories contain a `Makefile` with helpful commands. 
Most notably: - `make test` to run the test suite (see the [test suite docs](/py-polars/tests/README.md) for more info) diff --git a/Cargo.toml b/Cargo.toml index be81549ee883b..b120f74a44f12 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,9 @@ [workspace] resolver = "2" members = [ - "polars", + "crates/*", "polars-cli", - "polars/polars-*", - "polars/polars-lazy/polars-*", + "contribution/*", "examples/*", ] exclude = [ @@ -32,11 +31,11 @@ strum_macros = "0.25" [workspace.dependencies.arrow] package = "arrow2" -# git = "https://github.com/jorgecarleitao/arrow2" +git = "https://github.com/jorgecarleitao/arrow2" # git = "https://github.com/ritchie46/arrow2" -# rev = "2d2e7053f9a50810bfe9cecff25ab39089aef98e" +rev = "d5c78e7ba45fcebfbafd55a82ba2601ee3ea9617" # path = "../arrow2" -# branch = "polars_2023-06-26" +# branch = "duration_json" version = "0.17.2" default-features = false features = [ diff --git a/_typos.toml b/_typos.toml index 5ecc7341c68e3..12406b2f4ea84 100644 --- a/_typos.toml +++ b/_typos.toml @@ -15,6 +15,7 @@ width_strat = "width_strat" [default.extend-words] iif = "iif" '"r0ot"' = "r0ot" +wee = "wee" [type.csv] extend-glob = ["*.csv"] diff --git a/contribution/README.md b/contribution/README.md new file mode 100644 index 0000000000000..9800513f47fb0 --- /dev/null +++ b/contribution/README.md @@ -0,0 +1,14 @@ +# Contribution guideline examples + +This subdirectory is intended to provide examples that guide contributors. + +Naming conventions for variables: + +```rust +let s: Series = ... +let ca: ChunkedArray = ... +let arr: ArrayRef = ... +let arr: PrimitiveArray = ... +let dtype: DataType = ... +let data_type: ArrowDataType = ... 
+``` diff --git a/contribution/polars_ops_multiple_arguments/Cargo.toml b/contribution/polars_ops_multiple_arguments/Cargo.toml new file mode 100644 index 0000000000000..0e9229e736242 --- /dev/null +++ b/contribution/polars_ops_multiple_arguments/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "polars_ops_multiple_arguments" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +polars = { path = "../../crates/polars" } +polars-core = { path = "../../crates/polars-core" } diff --git a/contribution/polars_ops_multiple_arguments/src/lib.rs b/contribution/polars_ops_multiple_arguments/src/lib.rs new file mode 100644 index 0000000000000..cd4334665655e --- /dev/null +++ b/contribution/polars_ops_multiple_arguments/src/lib.rs @@ -0,0 +1,72 @@ +use std::ops::Add; + +use polars::export::arrow::array::*; +use polars::export::arrow::compute::arity::binary; +use polars::export::arrow::types::NativeType; +use polars::prelude::*; +use polars_core::utils::{align_chunks_binary, combine_validities_or}; +use polars_core::with_match_physical_numeric_polars_type; + +// Prefer to do the compute closest to the arrow arrays. 
+// this will tend to be faster as iterators can work directly on slices and don't have +// to go through boxed traits +fn compute_kernel(arr_1: &PrimitiveArray, arr_2: &PrimitiveArray) -> PrimitiveArray +where + T: Add + NativeType, +{ + // process the null data separately + // this saves an expensive branch and bitoperation when iterating + let validity_1 = arr_1.validity(); + let validity_2 = arr_2.validity(); + + let validity = combine_validities_or(validity_1, validity_2); + + // process the numerical data as if there were no validities + let values_1: &[T] = arr_1.values().as_slice(); + let values_2: &[T] = arr_2.values().as_slice(); + + let values = values_1 + .iter() + .zip(values_2) + .map(|(a, b)| *a + *b) + .collect::>(); + + PrimitiveArray::from_data_default(values.into(), validity) +} + +// Same kernel as above, but uses the `binary` abstraction. Prefer this, +#[allow(dead_code)] +fn compute_kernel2(arr_1: &PrimitiveArray, arr_2: &PrimitiveArray) -> PrimitiveArray +where + T: Add + NativeType, +{ + binary(arr_1, arr_2, arr_1.data_type().clone(), |a, b| a + b) +} + +fn compute_chunked_array_2_args( + ca_1: &ChunkedArray, + ca_2: &ChunkedArray, +) -> ChunkedArray { + // this ensures both ChunkedArrays have the same number of chunks with the same offset + // and the same length. 
+ let (ca_1, ca_2) = align_chunks_binary(ca_1, ca_2); + + let chunks = ca_1 + .downcast_iter() + .zip(ca_2.downcast_iter()) + .map(|(arr_1, arr_2)| compute_kernel(arr_1, arr_2).boxed()) + .collect::>(); + + // Safety: we are sure the `ArrayRef` holds type `T` + unsafe { ChunkedArray::from_chunks(ca_1.name(), chunks) } +} + +pub fn compute_expr_2_args(arg_1: &Series, arg_2: &Series) -> Series { + // dispatch the numerical series to `compute_chunked_array_2_args` + with_match_physical_numeric_polars_type!(arg_1.dtype(), |$T| { + let ca_1: &ChunkedArray<$T> = arg_1.as_ref().as_ref().as_ref(); + let ca_2: &ChunkedArray<$T> = arg_2.as_ref().as_ref().as_ref(); + + compute_chunked_array_2_args(ca_1, ca_2).into_series() + }) +} diff --git a/polars/Makefile b/crates/Makefile similarity index 99% rename from polars/Makefile rename to crates/Makefile index 8cdfaff4d96c9..5ee65b15d80e1 100644 --- a/polars/Makefile +++ b/crates/Makefile @@ -51,6 +51,7 @@ test: ## Run tests -p polars-utils \ -p polars-row \ -p polars-sql \ + -p polars-plan \ -- \ --test-threads=2 diff --git a/polars/clippy.toml b/crates/clippy.toml similarity index 100% rename from polars/clippy.toml rename to crates/clippy.toml diff --git a/polars/polars-algo/Cargo.toml b/crates/polars-algo/Cargo.toml similarity index 100% rename from polars/polars-algo/Cargo.toml rename to crates/polars-algo/Cargo.toml diff --git a/polars/polars-algo/LICENSE b/crates/polars-algo/LICENSE similarity index 100% rename from polars/polars-algo/LICENSE rename to crates/polars-algo/LICENSE diff --git a/polars/polars-algo/README.md b/crates/polars-algo/README.md similarity index 100% rename from polars/polars-algo/README.md rename to crates/polars-algo/README.md diff --git a/polars/polars-algo/src/algo.rs b/crates/polars-algo/src/algo.rs similarity index 100% rename from polars/polars-algo/src/algo.rs rename to crates/polars-algo/src/algo.rs diff --git a/polars/polars-algo/src/lib.rs b/crates/polars-algo/src/lib.rs similarity index 
100% rename from polars/polars-algo/src/lib.rs rename to crates/polars-algo/src/lib.rs diff --git a/polars/polars-algo/src/prelude.rs b/crates/polars-algo/src/prelude.rs similarity index 100% rename from polars/polars-algo/src/prelude.rs rename to crates/polars-algo/src/prelude.rs diff --git a/polars/polars-arrow/Cargo.toml b/crates/polars-arrow/Cargo.toml similarity index 100% rename from polars/polars-arrow/Cargo.toml rename to crates/polars-arrow/Cargo.toml diff --git a/polars/polars-arrow/LICENSE b/crates/polars-arrow/LICENSE similarity index 100% rename from polars/polars-arrow/LICENSE rename to crates/polars-arrow/LICENSE diff --git a/polars/polars-arrow/README.md b/crates/polars-arrow/README.md similarity index 100% rename from polars/polars-arrow/README.md rename to crates/polars-arrow/README.md diff --git a/polars/polars-arrow/src/array/default_arrays.rs b/crates/polars-arrow/src/array/default_arrays.rs similarity index 100% rename from polars/polars-arrow/src/array/default_arrays.rs rename to crates/polars-arrow/src/array/default_arrays.rs diff --git a/polars/polars-arrow/src/array/fixed_size_list.rs b/crates/polars-arrow/src/array/fixed_size_list.rs similarity index 100% rename from polars/polars-arrow/src/array/fixed_size_list.rs rename to crates/polars-arrow/src/array/fixed_size_list.rs diff --git a/polars/polars-arrow/src/array/get.rs b/crates/polars-arrow/src/array/get.rs similarity index 100% rename from polars/polars-arrow/src/array/get.rs rename to crates/polars-arrow/src/array/get.rs diff --git a/polars/polars-arrow/src/array/list.rs b/crates/polars-arrow/src/array/list.rs similarity index 100% rename from polars/polars-arrow/src/array/list.rs rename to crates/polars-arrow/src/array/list.rs diff --git a/polars/polars-arrow/src/array/mod.rs b/crates/polars-arrow/src/array/mod.rs similarity index 100% rename from polars/polars-arrow/src/array/mod.rs rename to crates/polars-arrow/src/array/mod.rs diff --git a/polars/polars-arrow/src/array/null.rs 
b/crates/polars-arrow/src/array/null.rs similarity index 100% rename from polars/polars-arrow/src/array/null.rs rename to crates/polars-arrow/src/array/null.rs diff --git a/polars/polars-arrow/src/array/slice.rs b/crates/polars-arrow/src/array/slice.rs similarity index 100% rename from polars/polars-arrow/src/array/slice.rs rename to crates/polars-arrow/src/array/slice.rs diff --git a/polars/polars-arrow/src/array/utf8.rs b/crates/polars-arrow/src/array/utf8.rs similarity index 100% rename from polars/polars-arrow/src/array/utf8.rs rename to crates/polars-arrow/src/array/utf8.rs diff --git a/polars/polars-arrow/src/bit_util.rs b/crates/polars-arrow/src/bit_util.rs similarity index 100% rename from polars/polars-arrow/src/bit_util.rs rename to crates/polars-arrow/src/bit_util.rs diff --git a/crates/polars-arrow/src/bitmap/arity.rs b/crates/polars-arrow/src/bitmap/arity.rs new file mode 100644 index 0000000000000..a6440e0ccbfcc --- /dev/null +++ b/crates/polars-arrow/src/bitmap/arity.rs @@ -0,0 +1,32 @@ +use arrow::bitmap::utils::{BitChunkIterExact, BitChunksExact}; +use arrow::bitmap::{chunk_iter_to_vec, Bitmap}; + +/// Apply a bitwise operation `op` to one input and return the result as a [`Bitmap`]. +pub fn unary_mut(lhs: &Bitmap, op: F) -> Bitmap +where + F: FnMut(u64) -> u64, +{ + let (slice, offset, length) = lhs.as_slice(); + if offset == 0 { + let iter = BitChunksExact::::new(slice, length); + unary_impl(iter, op, lhs.len()) + } else { + let iter = lhs.chunks::(); + unary_impl(iter, op, lhs.len()) + } +} + +fn unary_impl(iter: I, mut op: F, length: usize) -> Bitmap +where + I: BitChunkIterExact, + F: FnMut(u64) -> u64, +{ + let rem = op(iter.remainder()); + + // TODO! 
this can be done without chaining + let iterator = iter.map(op).chain(std::iter::once(rem)); + + let buffer = chunk_iter_to_vec(iterator); + + Bitmap::from_u8_vec(buffer, length) +} diff --git a/crates/polars-arrow/src/bitmap/mod.rs b/crates/polars-arrow/src/bitmap/mod.rs new file mode 100644 index 0000000000000..8226af0c0f17a --- /dev/null +++ b/crates/polars-arrow/src/bitmap/mod.rs @@ -0,0 +1,4 @@ +mod arity; +pub mod mutable; + +pub use arity::*; diff --git a/polars/polars-arrow/src/bitmap/mutable.rs b/crates/polars-arrow/src/bitmap/mutable.rs similarity index 100% rename from polars/polars-arrow/src/bitmap/mutable.rs rename to crates/polars-arrow/src/bitmap/mutable.rs diff --git a/polars/polars-arrow/src/compute/arithmetics/decimal/add.rs b/crates/polars-arrow/src/compute/arithmetics/decimal/add.rs similarity index 100% rename from polars/polars-arrow/src/compute/arithmetics/decimal/add.rs rename to crates/polars-arrow/src/compute/arithmetics/decimal/add.rs diff --git a/polars/polars-arrow/src/compute/arithmetics/decimal/commutative.rs b/crates/polars-arrow/src/compute/arithmetics/decimal/commutative.rs similarity index 100% rename from polars/polars-arrow/src/compute/arithmetics/decimal/commutative.rs rename to crates/polars-arrow/src/compute/arithmetics/decimal/commutative.rs diff --git a/polars/polars-arrow/src/compute/arithmetics/decimal/div.rs b/crates/polars-arrow/src/compute/arithmetics/decimal/div.rs similarity index 100% rename from polars/polars-arrow/src/compute/arithmetics/decimal/div.rs rename to crates/polars-arrow/src/compute/arithmetics/decimal/div.rs diff --git a/polars/polars-arrow/src/compute/arithmetics/decimal/mod.rs b/crates/polars-arrow/src/compute/arithmetics/decimal/mod.rs similarity index 100% rename from polars/polars-arrow/src/compute/arithmetics/decimal/mod.rs rename to crates/polars-arrow/src/compute/arithmetics/decimal/mod.rs diff --git a/polars/polars-arrow/src/compute/arithmetics/decimal/mul.rs 
b/crates/polars-arrow/src/compute/arithmetics/decimal/mul.rs similarity index 100% rename from polars/polars-arrow/src/compute/arithmetics/decimal/mul.rs rename to crates/polars-arrow/src/compute/arithmetics/decimal/mul.rs diff --git a/polars/polars-arrow/src/compute/arithmetics/decimal/sub.rs b/crates/polars-arrow/src/compute/arithmetics/decimal/sub.rs similarity index 100% rename from polars/polars-arrow/src/compute/arithmetics/decimal/sub.rs rename to crates/polars-arrow/src/compute/arithmetics/decimal/sub.rs diff --git a/polars/polars-arrow/src/compute/arithmetics/mod.rs b/crates/polars-arrow/src/compute/arithmetics/mod.rs similarity index 100% rename from polars/polars-arrow/src/compute/arithmetics/mod.rs rename to crates/polars-arrow/src/compute/arithmetics/mod.rs diff --git a/polars/polars-arrow/src/compute/arity.rs b/crates/polars-arrow/src/compute/arity.rs similarity index 100% rename from polars/polars-arrow/src/compute/arity.rs rename to crates/polars-arrow/src/compute/arity.rs diff --git a/polars/polars-arrow/src/compute/bitwise.rs b/crates/polars-arrow/src/compute/bitwise.rs similarity index 100% rename from polars/polars-arrow/src/compute/bitwise.rs rename to crates/polars-arrow/src/compute/bitwise.rs diff --git a/polars/polars-arrow/src/compute/cast.rs b/crates/polars-arrow/src/compute/cast.rs similarity index 100% rename from polars/polars-arrow/src/compute/cast.rs rename to crates/polars-arrow/src/compute/cast.rs diff --git a/polars/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs similarity index 100% rename from polars/polars-arrow/src/compute/decimal.rs rename to crates/polars-arrow/src/compute/decimal.rs diff --git a/polars/polars-arrow/src/compute/mod.rs b/crates/polars-arrow/src/compute/mod.rs similarity index 100% rename from polars/polars-arrow/src/compute/mod.rs rename to crates/polars-arrow/src/compute/mod.rs diff --git a/polars/polars-arrow/src/compute/take/bitmap.rs 
b/crates/polars-arrow/src/compute/take/bitmap.rs similarity index 100% rename from polars/polars-arrow/src/compute/take/bitmap.rs rename to crates/polars-arrow/src/compute/take/bitmap.rs diff --git a/polars/polars-arrow/src/compute/take/boolean.rs b/crates/polars-arrow/src/compute/take/boolean.rs similarity index 100% rename from polars/polars-arrow/src/compute/take/boolean.rs rename to crates/polars-arrow/src/compute/take/boolean.rs diff --git a/polars/polars-arrow/src/compute/take/fixed_size_list.rs b/crates/polars-arrow/src/compute/take/fixed_size_list.rs similarity index 98% rename from polars/polars-arrow/src/compute/take/fixed_size_list.rs rename to crates/polars-arrow/src/compute/take/fixed_size_list.rs index 87d86345f48ee..a29bb17fe976d 100644 --- a/polars/polars-arrow/src/compute/take/fixed_size_list.rs +++ b/crates/polars-arrow/src/compute/take/fixed_size_list.rs @@ -15,7 +15,9 @@ pub unsafe fn take_unchecked(values: &FixedSizeListArray, indices: &IdxArr) -> F ) { let idx = indices.values().as_slice(); let child_values = values.values(); - let DataType::FixedSizeList(_, width) = values.data_type() else {unreachable!()}; + let DataType::FixedSizeList(_, width) = values.data_type() else { + unreachable!() + }; with_match_primitive_type!(primitive, |$T| { let arr: &PrimitiveArray<$T> = child_values.as_any().downcast_ref().unwrap(); diff --git a/polars/polars-arrow/src/compute/take/mod.rs b/crates/polars-arrow/src/compute/take/mod.rs similarity index 100% rename from polars/polars-arrow/src/compute/take/mod.rs rename to crates/polars-arrow/src/compute/take/mod.rs diff --git a/polars/polars-arrow/src/compute/tile.rs b/crates/polars-arrow/src/compute/tile.rs similarity index 100% rename from polars/polars-arrow/src/compute/tile.rs rename to crates/polars-arrow/src/compute/tile.rs diff --git a/polars/polars-arrow/src/conversion.rs b/crates/polars-arrow/src/conversion.rs similarity index 100% rename from polars/polars-arrow/src/conversion.rs rename to 
crates/polars-arrow/src/conversion.rs diff --git a/polars/polars-arrow/src/data_types.rs b/crates/polars-arrow/src/data_types.rs similarity index 100% rename from polars/polars-arrow/src/data_types.rs rename to crates/polars-arrow/src/data_types.rs diff --git a/polars/polars-arrow/src/error.rs b/crates/polars-arrow/src/error.rs similarity index 100% rename from polars/polars-arrow/src/error.rs rename to crates/polars-arrow/src/error.rs diff --git a/polars/polars-arrow/src/export.rs b/crates/polars-arrow/src/export.rs similarity index 100% rename from polars/polars-arrow/src/export.rs rename to crates/polars-arrow/src/export.rs diff --git a/polars/polars-arrow/src/floats/mod.rs b/crates/polars-arrow/src/floats/mod.rs similarity index 100% rename from polars/polars-arrow/src/floats/mod.rs rename to crates/polars-arrow/src/floats/mod.rs diff --git a/polars/polars-arrow/src/floats/ord.rs b/crates/polars-arrow/src/floats/ord.rs similarity index 100% rename from polars/polars-arrow/src/floats/ord.rs rename to crates/polars-arrow/src/floats/ord.rs diff --git a/polars/polars-arrow/src/index.rs b/crates/polars-arrow/src/index.rs similarity index 100% rename from polars/polars-arrow/src/index.rs rename to crates/polars-arrow/src/index.rs diff --git a/polars/polars-arrow/src/is_valid.rs b/crates/polars-arrow/src/is_valid.rs similarity index 100% rename from polars/polars-arrow/src/is_valid.rs rename to crates/polars-arrow/src/is_valid.rs diff --git a/polars/polars-arrow/src/kernels/agg_mean.rs b/crates/polars-arrow/src/kernels/agg_mean.rs similarity index 80% rename from polars/polars-arrow/src/kernels/agg_mean.rs rename to crates/polars-arrow/src/kernels/agg_mean.rs index 26b04e5576673..8b451774a9436 100644 --- a/polars/polars-arrow/src/kernels/agg_mean.rs +++ b/crates/polars-arrow/src/kernels/agg_mean.rs @@ -1,4 +1,7 @@ -use std::simd::{Mask, Simd, SimdCast, SimdElement, SimdFloat, StdFloat, ToBitMask}; +use std::simd::{ + LaneCount, Mask, Simd, SimdCast, SimdElement, 
SimdFloat, SimdInt, SimdUint, StdFloat, + SupportedLaneCount, ToBitMask, +}; use arrow::array::{Array, PrimitiveArray}; use arrow::bitmap::utils::{BitChunkIterExact, BitChunksExact}; @@ -11,10 +14,43 @@ use num_traits::ToPrimitive; use crate::data_types::IsFloat; use crate::utils::with_match_primitive_type; +// TODO! try to remove this if we can cast again directly +pub trait SimdCastPl +where + LaneCount: SupportedLaneCount, +{ + fn cast_custom(self) -> Simd; +} + +macro_rules! impl_cast_custom { + ($_type:ty) => { + impl SimdCastPl for Simd<$_type, N> + where + LaneCount: SupportedLaneCount, + { + fn cast_custom(self) -> Simd { + self.cast::() + } + } + }; +} + +impl_cast_custom!(u8); +impl_cast_custom!(u16); +impl_cast_custom!(u32); +impl_cast_custom!(u64); +impl_cast_custom!(i8); +impl_cast_custom!(i16); +impl_cast_custom!(i32); +impl_cast_custom!(i64); +impl_cast_custom!(f32); +impl_cast_custom!(f64); + #[multiversion(targets = "simd")] fn nonnull_sum_as_f64(values: &[T]) -> f64 where T: NativeType + SimdElement + ToPrimitive + SimdCast, + Simd: SimdCastPl<8>, { // we choose 8 as that the maximum size of f64x8 -> 512bit wide const LANES: usize = 8; @@ -22,7 +58,7 @@ where let mut reduced: Simd = Simd::splat(0.0); for chunk in simd_vals { - reduced += chunk.cast::(); + reduced += chunk.cast_custom::(); } unsafe { @@ -43,6 +79,7 @@ fn null_sum_as_f64_impl(values: &[T], mut validity_masks: I) -> f64 where T: NativeType + SimdElement + ToPrimitive + IsFloat + SimdCast, I: BitChunkIterExact, + Simd: SimdCastPl<8>, { const LANES: usize = 8; let mut chunks = values.chunks_exact(LANES); @@ -54,7 +91,7 @@ where |acc, (chunk, validity_chunk)| { // safety: exact size chunks let chunk: [T; LANES] = unsafe { chunk.try_into().unwrap_unchecked() }; - let chunk = Simd::from(chunk).cast::(); + let chunk = Simd::from(chunk).cast_custom::(); // construct [bools] let mask = Mask::::from_bitmask(validity_chunk); @@ -107,6 +144,7 @@ where fn null_sum_as_f64(values: &[T], bitmap: 
&Bitmap) -> f64 where T: NativeType + SimdElement + ToPrimitive + IsFloat + SimdCast, + Simd: SimdCastPl<8>, { let (slice, offset, length) = bitmap.as_slice(); if offset == 0 { diff --git a/polars/polars-arrow/src/kernels/comparison.rs b/crates/polars-arrow/src/kernels/comparison.rs similarity index 100% rename from polars/polars-arrow/src/kernels/comparison.rs rename to crates/polars-arrow/src/kernels/comparison.rs diff --git a/polars/polars-arrow/src/kernels/concatenate.rs b/crates/polars-arrow/src/kernels/concatenate.rs similarity index 100% rename from polars/polars-arrow/src/kernels/concatenate.rs rename to crates/polars-arrow/src/kernels/concatenate.rs diff --git a/polars/polars-arrow/src/kernels/ewm/average.rs b/crates/polars-arrow/src/kernels/ewm/average.rs similarity index 100% rename from polars/polars-arrow/src/kernels/ewm/average.rs rename to crates/polars-arrow/src/kernels/ewm/average.rs diff --git a/polars/polars-arrow/src/kernels/ewm/mod.rs b/crates/polars-arrow/src/kernels/ewm/mod.rs similarity index 100% rename from polars/polars-arrow/src/kernels/ewm/mod.rs rename to crates/polars-arrow/src/kernels/ewm/mod.rs diff --git a/polars/polars-arrow/src/kernels/ewm/variance.rs b/crates/polars-arrow/src/kernels/ewm/variance.rs similarity index 99% rename from polars/polars-arrow/src/kernels/ewm/variance.rs rename to crates/polars-arrow/src/kernels/ewm/variance.rs index 64d04094e3262..fb7cae40c24b1 100644 --- a/polars/polars-arrow/src/kernels/ewm/variance.rs +++ b/crates/polars-arrow/src/kernels/ewm/variance.rs @@ -37,7 +37,7 @@ where let res = xs .into_iter() - .zip(ys.into_iter()) + .zip(ys) .enumerate() .map(|(i, (opt_x, opt_y))| { let is_observation = opt_x.is_some() && opt_y.is_some(); diff --git a/polars/polars-arrow/src/kernels/float.rs b/crates/polars-arrow/src/kernels/float.rs similarity index 100% rename from polars/polars-arrow/src/kernels/float.rs rename to crates/polars-arrow/src/kernels/float.rs diff --git 
a/polars/polars-arrow/src/kernels/list.rs b/crates/polars-arrow/src/kernels/list.rs similarity index 100% rename from polars/polars-arrow/src/kernels/list.rs rename to crates/polars-arrow/src/kernels/list.rs diff --git a/polars/polars-arrow/src/kernels/list_bytes_iter.rs b/crates/polars-arrow/src/kernels/list_bytes_iter.rs similarity index 100% rename from polars/polars-arrow/src/kernels/list_bytes_iter.rs rename to crates/polars-arrow/src/kernels/list_bytes_iter.rs diff --git a/polars/polars-arrow/src/kernels/mod.rs b/crates/polars-arrow/src/kernels/mod.rs similarity index 99% rename from polars/polars-arrow/src/kernels/mod.rs rename to crates/polars-arrow/src/kernels/mod.rs index 02ba262ec635f..7af6d99b6fcfe 100644 --- a/polars/polars-arrow/src/kernels/mod.rs +++ b/crates/polars-arrow/src/kernels/mod.rs @@ -23,7 +23,7 @@ pub mod take_agg; mod time; #[cfg(feature = "timezones")] -pub use time::replace_timezone; +pub use time::replace_time_zone; /// Internal state of [SlicesIterator] #[derive(Debug, PartialEq)] diff --git a/polars/polars-arrow/src/kernels/rolling/mod.rs b/crates/polars-arrow/src/kernels/rolling/mod.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/mod.rs rename to crates/polars-arrow/src/kernels/rolling/mod.rs diff --git a/polars/polars-arrow/src/kernels/rolling/no_nulls/mean.rs b/crates/polars-arrow/src/kernels/rolling/no_nulls/mean.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/no_nulls/mean.rs rename to crates/polars-arrow/src/kernels/rolling/no_nulls/mean.rs diff --git a/polars/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs b/crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs rename to crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs diff --git a/polars/polars-arrow/src/kernels/rolling/no_nulls/mod.rs b/crates/polars-arrow/src/kernels/rolling/no_nulls/mod.rs similarity 
index 100% rename from polars/polars-arrow/src/kernels/rolling/no_nulls/mod.rs rename to crates/polars-arrow/src/kernels/rolling/no_nulls/mod.rs diff --git a/polars/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs b/crates/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs similarity index 94% rename from polars/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs rename to crates/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs index c790e2a1706b0..b9ea8427bc59f 100644 --- a/polars/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs +++ b/crates/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs @@ -169,7 +169,7 @@ where // Once the cumulative weight crosses h, we've found our ind{ex/ices}. The definition may look // odd but it's the equivalent of taking h = p * (n - 1) + 1 if your data is indexed from 1. let h: f64 = p * (wsum - buf[0].1) + buf[0].1; - for &(v, w) in buf.iter().filter(|(_, w)| *w != 0.0) { + for &(v, w) in buf.iter() { if s > h { break; } @@ -219,17 +219,21 @@ where + PartialOrd, { assert_eq!(weights.len(), window_size); - let mut buf = vec![(T::zero(), 0.0); window_size]; + // Keep nonzero weights and their indices to know which values we need each iteration. + let nz_idx_wts: Vec<_> = weights.iter().enumerate().filter(|x| x.1 != &0.0).collect(); + let mut buf = vec![(T::zero(), 0.0); nz_idx_wts.len()]; let len = values.len(); let out = (0..len) .map(|idx| { - let (start, end) = det_offsets_fn(idx, window_size, len); - let vals = unsafe { values.get_unchecked(start..end) }; + // Don't need end. Window size is constant and we computed offsets from start above. 
+ let (start, _) = det_offsets_fn(idx, window_size, len); // Sorting is not ideal, see https://github.com/tobiasschoch/wquantile for something faster - buf.iter_mut() - .zip(vals.iter().zip(weights)) - .for_each(|(b, (v, w))| *b = (*v, *w)); + unsafe { + buf.iter_mut() + .zip(nz_idx_wts.iter()) + .for_each(|(b, (i, w))| *b = (*values.get_unchecked(i + start), **w)); + } buf.sort_unstable_by(|&a, &b| compare_fn_nan_max(&a.0, &b.0)); compute_wq(&buf, p, wsum, interpolation) }) diff --git a/polars/polars-arrow/src/kernels/rolling/no_nulls/sum.rs b/crates/polars-arrow/src/kernels/rolling/no_nulls/sum.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/no_nulls/sum.rs rename to crates/polars-arrow/src/kernels/rolling/no_nulls/sum.rs diff --git a/polars/polars-arrow/src/kernels/rolling/no_nulls/variance.rs b/crates/polars-arrow/src/kernels/rolling/no_nulls/variance.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/no_nulls/variance.rs rename to crates/polars-arrow/src/kernels/rolling/no_nulls/variance.rs diff --git a/polars/polars-arrow/src/kernels/rolling/nulls/mean.rs b/crates/polars-arrow/src/kernels/rolling/nulls/mean.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/nulls/mean.rs rename to crates/polars-arrow/src/kernels/rolling/nulls/mean.rs diff --git a/polars/polars-arrow/src/kernels/rolling/nulls/min_max.rs b/crates/polars-arrow/src/kernels/rolling/nulls/min_max.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/nulls/min_max.rs rename to crates/polars-arrow/src/kernels/rolling/nulls/min_max.rs diff --git a/polars/polars-arrow/src/kernels/rolling/nulls/mod.rs b/crates/polars-arrow/src/kernels/rolling/nulls/mod.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/nulls/mod.rs rename to crates/polars-arrow/src/kernels/rolling/nulls/mod.rs diff --git a/polars/polars-arrow/src/kernels/rolling/nulls/quantile.rs 
b/crates/polars-arrow/src/kernels/rolling/nulls/quantile.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/nulls/quantile.rs rename to crates/polars-arrow/src/kernels/rolling/nulls/quantile.rs diff --git a/polars/polars-arrow/src/kernels/rolling/nulls/sum.rs b/crates/polars-arrow/src/kernels/rolling/nulls/sum.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/nulls/sum.rs rename to crates/polars-arrow/src/kernels/rolling/nulls/sum.rs diff --git a/polars/polars-arrow/src/kernels/rolling/nulls/variance.rs b/crates/polars-arrow/src/kernels/rolling/nulls/variance.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/nulls/variance.rs rename to crates/polars-arrow/src/kernels/rolling/nulls/variance.rs diff --git a/polars/polars-arrow/src/kernels/rolling/window.rs b/crates/polars-arrow/src/kernels/rolling/window.rs similarity index 100% rename from polars/polars-arrow/src/kernels/rolling/window.rs rename to crates/polars-arrow/src/kernels/rolling/window.rs diff --git a/polars/polars-arrow/src/kernels/set.rs b/crates/polars-arrow/src/kernels/set.rs similarity index 100% rename from polars/polars-arrow/src/kernels/set.rs rename to crates/polars-arrow/src/kernels/set.rs diff --git a/polars/polars-arrow/src/kernels/sort_partition.rs b/crates/polars-arrow/src/kernels/sort_partition.rs similarity index 100% rename from polars/polars-arrow/src/kernels/sort_partition.rs rename to crates/polars-arrow/src/kernels/sort_partition.rs diff --git a/polars/polars-arrow/src/kernels/sorted_join/inner.rs b/crates/polars-arrow/src/kernels/sorted_join/inner.rs similarity index 100% rename from polars/polars-arrow/src/kernels/sorted_join/inner.rs rename to crates/polars-arrow/src/kernels/sorted_join/inner.rs diff --git a/polars/polars-arrow/src/kernels/sorted_join/left.rs b/crates/polars-arrow/src/kernels/sorted_join/left.rs similarity index 100% rename from polars/polars-arrow/src/kernels/sorted_join/left.rs rename 
to crates/polars-arrow/src/kernels/sorted_join/left.rs diff --git a/polars/polars-arrow/src/kernels/sorted_join/mod.rs b/crates/polars-arrow/src/kernels/sorted_join/mod.rs similarity index 100% rename from polars/polars-arrow/src/kernels/sorted_join/mod.rs rename to crates/polars-arrow/src/kernels/sorted_join/mod.rs diff --git a/polars/polars-arrow/src/kernels/string.rs b/crates/polars-arrow/src/kernels/string.rs similarity index 100% rename from polars/polars-arrow/src/kernels/string.rs rename to crates/polars-arrow/src/kernels/string.rs diff --git a/polars/polars-arrow/src/kernels/take_agg/boolean.rs b/crates/polars-arrow/src/kernels/take_agg/boolean.rs similarity index 100% rename from polars/polars-arrow/src/kernels/take_agg/boolean.rs rename to crates/polars-arrow/src/kernels/take_agg/boolean.rs diff --git a/polars/polars-arrow/src/kernels/take_agg/mod.rs b/crates/polars-arrow/src/kernels/take_agg/mod.rs similarity index 100% rename from polars/polars-arrow/src/kernels/take_agg/mod.rs rename to crates/polars-arrow/src/kernels/take_agg/mod.rs diff --git a/polars/polars-arrow/src/kernels/take_agg/var.rs b/crates/polars-arrow/src/kernels/take_agg/var.rs similarity index 100% rename from polars/polars-arrow/src/kernels/take_agg/var.rs rename to crates/polars-arrow/src/kernels/take_agg/var.rs diff --git a/polars/polars-arrow/src/kernels/time.rs b/crates/polars-arrow/src/kernels/time.rs similarity index 99% rename from polars/polars-arrow/src/kernels/time.rs rename to crates/polars-arrow/src/kernels/time.rs index b6ecaec4eeb84..8277b7fdcebd6 100644 --- a/polars/polars-arrow/src/kernels/time.rs +++ b/crates/polars-arrow/src/kernels/time.rs @@ -91,7 +91,7 @@ fn convert_to_timestamp( } #[cfg(feature = "timezones")] -pub fn replace_timezone( +pub fn replace_time_zone( arr: &PrimitiveArray, tu: TimeUnit, from: &str, diff --git a/polars/polars-arrow/src/lib.rs b/crates/polars-arrow/src/lib.rs similarity index 70% rename from polars/polars-arrow/src/lib.rs rename to 
crates/polars-arrow/src/lib.rs index fa943136bdb8e..184674368fccb 100644 --- a/polars/polars-arrow/src/lib.rs +++ b/crates/polars-arrow/src/lib.rs @@ -1,7 +1,11 @@ #![cfg_attr(feature = "simd", feature(portable_simd))] +#![cfg_attr( + feature = "nightly", + allow(clippy::incorrect_partial_ord_impl_on_ord_type) +)] // remove once stable pub mod array; pub mod bit_util; -mod bitmap; +pub mod bitmap; pub mod compute; pub mod conversion; pub mod data_types; diff --git a/polars/polars-arrow/src/prelude.rs b/crates/polars-arrow/src/prelude.rs similarity index 100% rename from polars/polars-arrow/src/prelude.rs rename to crates/polars-arrow/src/prelude.rs diff --git a/polars/polars-arrow/src/slice.rs b/crates/polars-arrow/src/slice.rs similarity index 100% rename from polars/polars-arrow/src/slice.rs rename to crates/polars-arrow/src/slice.rs diff --git a/polars/polars-arrow/src/time_zone.rs b/crates/polars-arrow/src/time_zone.rs similarity index 100% rename from polars/polars-arrow/src/time_zone.rs rename to crates/polars-arrow/src/time_zone.rs diff --git a/polars/polars-arrow/src/trusted_len/boolean.rs b/crates/polars-arrow/src/trusted_len/boolean.rs similarity index 100% rename from polars/polars-arrow/src/trusted_len/boolean.rs rename to crates/polars-arrow/src/trusted_len/boolean.rs diff --git a/polars/polars-arrow/src/trusted_len/mod.rs b/crates/polars-arrow/src/trusted_len/mod.rs similarity index 100% rename from polars/polars-arrow/src/trusted_len/mod.rs rename to crates/polars-arrow/src/trusted_len/mod.rs diff --git a/polars/polars-arrow/src/trusted_len/push_unchecked.rs b/crates/polars-arrow/src/trusted_len/push_unchecked.rs similarity index 100% rename from polars/polars-arrow/src/trusted_len/push_unchecked.rs rename to crates/polars-arrow/src/trusted_len/push_unchecked.rs diff --git a/polars/polars-arrow/src/trusted_len/rev.rs b/crates/polars-arrow/src/trusted_len/rev.rs similarity index 100% rename from polars/polars-arrow/src/trusted_len/rev.rs rename to 
crates/polars-arrow/src/trusted_len/rev.rs diff --git a/polars/polars-arrow/src/utils.rs b/crates/polars-arrow/src/utils.rs similarity index 100% rename from polars/polars-arrow/src/utils.rs rename to crates/polars-arrow/src/utils.rs diff --git a/polars/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml similarity index 100% rename from polars/polars-core/Cargo.toml rename to crates/polars-core/Cargo.toml diff --git a/polars/polars-core/LICENSE b/crates/polars-core/LICENSE similarity index 100% rename from polars/polars-core/LICENSE rename to crates/polars-core/LICENSE diff --git a/polars/polars-core/README.md b/crates/polars-core/README.md similarity index 100% rename from polars/polars-core/README.md rename to crates/polars-core/README.md diff --git a/polars/polars-core/src/chunked_array/arithmetic/decimal.rs b/crates/polars-core/src/chunked_array/arithmetic/decimal.rs similarity index 100% rename from polars/polars-core/src/chunked_array/arithmetic/decimal.rs rename to crates/polars-core/src/chunked_array/arithmetic/decimal.rs diff --git a/polars/polars-core/src/chunked_array/arithmetic/mod.rs b/crates/polars-core/src/chunked_array/arithmetic/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/arithmetic/mod.rs rename to crates/polars-core/src/chunked_array/arithmetic/mod.rs diff --git a/polars/polars-core/src/chunked_array/arithmetic/numeric.rs b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs similarity index 100% rename from polars/polars-core/src/chunked_array/arithmetic/numeric.rs rename to crates/polars-core/src/chunked_array/arithmetic/numeric.rs diff --git a/polars/polars-core/src/chunked_array/array/iterator.rs b/crates/polars-core/src/chunked_array/array/iterator.rs similarity index 100% rename from polars/polars-core/src/chunked_array/array/iterator.rs rename to crates/polars-core/src/chunked_array/array/iterator.rs diff --git a/polars/polars-core/src/chunked_array/array/mod.rs 
b/crates/polars-core/src/chunked_array/array/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/array/mod.rs rename to crates/polars-core/src/chunked_array/array/mod.rs diff --git a/polars/polars-core/src/chunked_array/bitwise.rs b/crates/polars-core/src/chunked_array/bitwise.rs similarity index 100% rename from polars/polars-core/src/chunked_array/bitwise.rs rename to crates/polars-core/src/chunked_array/bitwise.rs diff --git a/polars/polars-core/src/chunked_array/builder/binary.rs b/crates/polars-core/src/chunked_array/builder/binary.rs similarity index 100% rename from polars/polars-core/src/chunked_array/builder/binary.rs rename to crates/polars-core/src/chunked_array/builder/binary.rs diff --git a/polars/polars-core/src/chunked_array/builder/boolean.rs b/crates/polars-core/src/chunked_array/builder/boolean.rs similarity index 93% rename from polars/polars-core/src/chunked_array/builder/boolean.rs rename to crates/polars-core/src/chunked_array/builder/boolean.rs index f4404dd2d9b64..655d94ff1a7d9 100644 --- a/polars/polars-core/src/chunked_array/builder/boolean.rs +++ b/crates/polars-core/src/chunked_array/builder/boolean.rs @@ -23,13 +23,15 @@ impl ChunkedBuilder for BooleanChunkedBuilder { let arr = self.array_builder.as_box(); let length = arr.len() as IdxSize; - ChunkedArray { + let mut ca = ChunkedArray { field: Arc::new(self.field), chunks: vec![arr], phantom: PhantomData, bit_settings: Default::default(), length, - } + }; + ca.compute_len(); + ca } fn shrink_to_fit(&mut self) { diff --git a/polars/polars-core/src/chunked_array/builder/fixed_size_list.rs b/crates/polars-core/src/chunked_array/builder/fixed_size_list.rs similarity index 100% rename from polars/polars-core/src/chunked_array/builder/fixed_size_list.rs rename to crates/polars-core/src/chunked_array/builder/fixed_size_list.rs diff --git a/polars/polars-core/src/chunked_array/builder/from.rs b/crates/polars-core/src/chunked_array/builder/from.rs similarity index 100% 
rename from polars/polars-core/src/chunked_array/builder/from.rs rename to crates/polars-core/src/chunked_array/builder/from.rs diff --git a/polars/polars-core/src/chunked_array/builder/list/anonymous.rs b/crates/polars-core/src/chunked_array/builder/list/anonymous.rs similarity index 100% rename from polars/polars-core/src/chunked_array/builder/list/anonymous.rs rename to crates/polars-core/src/chunked_array/builder/list/anonymous.rs diff --git a/polars/polars-core/src/chunked_array/builder/list/binary.rs b/crates/polars-core/src/chunked_array/builder/list/binary.rs similarity index 100% rename from polars/polars-core/src/chunked_array/builder/list/binary.rs rename to crates/polars-core/src/chunked_array/builder/list/binary.rs diff --git a/polars/polars-core/src/chunked_array/builder/list/boolean.rs b/crates/polars-core/src/chunked_array/builder/list/boolean.rs similarity index 100% rename from polars/polars-core/src/chunked_array/builder/list/boolean.rs rename to crates/polars-core/src/chunked_array/builder/list/boolean.rs diff --git a/polars/polars-core/src/chunked_array/builder/list/categorical.rs b/crates/polars-core/src/chunked_array/builder/list/categorical.rs similarity index 89% rename from polars/polars-core/src/chunked_array/builder/list/categorical.rs rename to crates/polars-core/src/chunked_array/builder/list/categorical.rs index 7a4043a0bae92..fb46f7064ed5f 100644 --- a/polars/polars-core/src/chunked_array/builder/list/categorical.rs +++ b/crates/polars-core/src/chunked_array/builder/list/categorical.rs @@ -14,7 +14,9 @@ impl ListCategoricalChunkedBuilder { ) -> Self { let inner = ListPrimitiveChunkedBuilder::new(name, capacity, values_capacity, logical_type.clone()); - let DataType::Categorical(Some(rev_map)) = logical_type else { panic!("expected categorical type") }; + let DataType::Categorical(Some(rev_map)) = logical_type else { + panic!("expected categorical type") + }; Self { inner, inner_dtype: RevMapMerger::new(rev_map), @@ -24,7 +26,9 @@ 
impl ListCategoricalChunkedBuilder { impl ListBuilderTrait for ListCategoricalChunkedBuilder { fn append_series(&mut self, s: &Series) -> PolarsResult<()> { - let DataType::Categorical(Some(rev_map)) = s.dtype() else { polars_bail!(ComputeError: "expected categorical type")}; + let DataType::Categorical(Some(rev_map)) = s.dtype() else { + polars_bail!(ComputeError: "expected categorical type") + }; self.inner_dtype.merge_map(rev_map)?; self.inner.append_series(s) } diff --git a/polars/polars-core/src/chunked_array/builder/list/dtypes.rs b/crates/polars-core/src/chunked_array/builder/list/dtypes.rs similarity index 93% rename from polars/polars-core/src/chunked_array/builder/list/dtypes.rs rename to crates/polars-core/src/chunked_array/builder/list/dtypes.rs index 5ad2aec7cfb7c..50b26d68a297a 100644 --- a/polars/polars-core/src/chunked_array/builder/list/dtypes.rs +++ b/crates/polars-core/src/chunked_array/builder/list/dtypes.rs @@ -28,7 +28,9 @@ impl DtypeMerger { match self { #[cfg(feature = "dtype-categorical")] DtypeMerger::Categorical(merger) => { - let DataType::Categorical(Some(rev_map)) = dtype else { polars_bail!(ComputeError: "expected categorical rev-map") }; + let DataType::Categorical(Some(rev_map)) = dtype else { + polars_bail!(ComputeError: "expected categorical rev-map") + }; return merger.merge_map(rev_map); } DtypeMerger::Other(Some(set_dtype)) => { diff --git a/polars/polars-core/src/chunked_array/builder/list/mod.rs b/crates/polars-core/src/chunked_array/builder/list/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/builder/list/mod.rs rename to crates/polars-core/src/chunked_array/builder/list/mod.rs diff --git a/polars/polars-core/src/chunked_array/builder/list/primitive.rs b/crates/polars-core/src/chunked_array/builder/list/primitive.rs similarity index 100% rename from polars/polars-core/src/chunked_array/builder/list/primitive.rs rename to crates/polars-core/src/chunked_array/builder/list/primitive.rs diff --git 
a/polars/polars-core/src/chunked_array/builder/mod.rs b/crates/polars-core/src/chunked_array/builder/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/builder/mod.rs rename to crates/polars-core/src/chunked_array/builder/mod.rs diff --git a/polars/polars-core/src/chunked_array/builder/primitive.rs b/crates/polars-core/src/chunked_array/builder/primitive.rs similarity index 94% rename from polars/polars-core/src/chunked_array/builder/primitive.rs rename to crates/polars-core/src/chunked_array/builder/primitive.rs index a09ca25e310ee..f5314a5fb62ab 100644 --- a/polars/polars-core/src/chunked_array/builder/primitive.rs +++ b/crates/polars-core/src/chunked_array/builder/primitive.rs @@ -28,13 +28,15 @@ where fn finish(mut self) -> ChunkedArray { let arr = self.array_builder.as_box(); let length = arr.len() as IdxSize; - ChunkedArray { + let mut ca = ChunkedArray { field: Arc::new(self.field), chunks: vec![arr], phantom: PhantomData, bit_settings: Default::default(), length, - } + }; + ca.compute_len(); + ca } fn shrink_to_fit(&mut self) { diff --git a/polars/polars-core/src/chunked_array/builder/utf8.rs b/crates/polars-core/src/chunked_array/builder/utf8.rs similarity index 96% rename from polars/polars-core/src/chunked_array/builder/utf8.rs rename to crates/polars-core/src/chunked_array/builder/utf8.rs index b67a4cee34588..49f933c790edd 100644 --- a/polars/polars-core/src/chunked_array/builder/utf8.rs +++ b/crates/polars-core/src/chunked_array/builder/utf8.rs @@ -43,13 +43,15 @@ impl Utf8ChunkedBuilder { let arr = self.builder.as_box(); let length = arr.len() as IdxSize; - ChunkedArray { + let mut ca = ChunkedArray { field: Arc::new(self.field), chunks: vec![arr], phantom: PhantomData, bit_settings: Default::default(), length, - } + }; + ca.compute_len(); + ca } fn shrink_to_fit(&mut self) { diff --git a/polars/polars-core/src/chunked_array/cast.rs b/crates/polars-core/src/chunked_array/cast.rs similarity index 100% rename from 
polars/polars-core/src/chunked_array/cast.rs rename to crates/polars-core/src/chunked_array/cast.rs diff --git a/polars/polars-core/src/chunked_array/comparison/mod.rs b/crates/polars-core/src/chunked_array/comparison/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/comparison/mod.rs rename to crates/polars-core/src/chunked_array/comparison/mod.rs diff --git a/polars/polars-core/src/chunked_array/comparison/scalar.rs b/crates/polars-core/src/chunked_array/comparison/scalar.rs similarity index 98% rename from polars/polars-core/src/chunked_array/comparison/scalar.rs rename to crates/polars-core/src/chunked_array/comparison/scalar.rs index 6c121a242f47f..5c8e42715212b 100644 --- a/polars/polars-core/src/chunked_array/comparison/scalar.rs +++ b/crates/polars-core/src/chunked_array/comparison/scalar.rs @@ -93,7 +93,7 @@ where fn gt(&self, rhs: Rhs) -> BooleanChunked { match (self.is_sorted_flag(), self.null_count()) { - (IsSorted::Ascending, 0) => { + (IsSorted::Ascending, 0) if self.len() > 1 => { let rhs: T::Native = NumCast::from(rhs).unwrap(); let cmp_fn = |a: &T::Native| match compare_fn_nan_max(a, &rhs) { @@ -110,7 +110,7 @@ where fn gt_eq(&self, rhs: Rhs) -> BooleanChunked { match (self.is_sorted_flag(), self.null_count()) { - (IsSorted::Ascending, 0) => { + (IsSorted::Ascending, 0) if self.len() > 1 => { let rhs: T::Native = NumCast::from(rhs).unwrap(); let cmp_fn = |a: &T::Native| match compare_fn_nan_max(a, &rhs) { diff --git a/polars/polars-core/src/chunked_array/drop.rs b/crates/polars-core/src/chunked_array/drop.rs similarity index 100% rename from polars/polars-core/src/chunked_array/drop.rs rename to crates/polars-core/src/chunked_array/drop.rs diff --git a/polars/polars-core/src/chunked_array/float.rs b/crates/polars-core/src/chunked_array/float.rs similarity index 100% rename from polars/polars-core/src/chunked_array/float.rs rename to crates/polars-core/src/chunked_array/float.rs diff --git 
a/polars/polars-core/src/chunked_array/from.rs b/crates/polars-core/src/chunked_array/from.rs similarity index 100% rename from polars/polars-core/src/chunked_array/from.rs rename to crates/polars-core/src/chunked_array/from.rs diff --git a/polars/polars-core/src/chunked_array/iterator/mod.rs b/crates/polars-core/src/chunked_array/iterator/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/iterator/mod.rs rename to crates/polars-core/src/chunked_array/iterator/mod.rs diff --git a/polars/polars-core/src/chunked_array/iterator/par/list.rs b/crates/polars-core/src/chunked_array/iterator/par/list.rs similarity index 100% rename from polars/polars-core/src/chunked_array/iterator/par/list.rs rename to crates/polars-core/src/chunked_array/iterator/par/list.rs diff --git a/polars/polars-core/src/chunked_array/iterator/par/mod.rs b/crates/polars-core/src/chunked_array/iterator/par/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/iterator/par/mod.rs rename to crates/polars-core/src/chunked_array/iterator/par/mod.rs diff --git a/polars/polars-core/src/chunked_array/iterator/par/utf8.rs b/crates/polars-core/src/chunked_array/iterator/par/utf8.rs similarity index 100% rename from polars/polars-core/src/chunked_array/iterator/par/utf8.rs rename to crates/polars-core/src/chunked_array/iterator/par/utf8.rs diff --git a/polars/polars-core/src/chunked_array/kernels/mod.rs b/crates/polars-core/src/chunked_array/kernels/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/kernels/mod.rs rename to crates/polars-core/src/chunked_array/kernels/mod.rs diff --git a/polars/polars-core/src/chunked_array/kernels/take.rs b/crates/polars-core/src/chunked_array/kernels/take.rs similarity index 100% rename from polars/polars-core/src/chunked_array/kernels/take.rs rename to crates/polars-core/src/chunked_array/kernels/take.rs diff --git a/polars/polars-core/src/chunked_array/list/iterator.rs 
b/crates/polars-core/src/chunked_array/list/iterator.rs similarity index 94% rename from polars/polars-core/src/chunked_array/list/iterator.rs rename to crates/polars-core/src/chunked_array/list/iterator.rs index 81d0970e8e3e8..2537275312692 100644 --- a/polars/polars-core/src/chunked_array/list/iterator.rs +++ b/crates/polars-core/src/chunked_array/list/iterator.rs @@ -68,6 +68,8 @@ impl<'a, I: Iterator>> Iterator for AmortizedListIter<'a // update the inner state unsafe { *self.inner.as_mut() = array_ref }; + // last iteration could have set the sorted flag (e.g. in compute_len) + self.series_container.clear_settings(); // make sure that the length is correct self.series_container._get_inner_mut().compute_len(); @@ -135,11 +137,13 @@ impl ListChunked { // Safety: // inner type passed as physical type let series_container = unsafe { - Box::new(Series::from_chunks_and_dtype_unchecked( + let mut s = Series::from_chunks_and_dtype_unchecked( name, vec![inner_values.clone()], &iter_dtype, - )) + ); + s.clear_settings(); + Box::new(s) }; let ptr = series_container.array_ref(0) as *const ArrayRef as *mut ArrayRef; @@ -228,10 +232,8 @@ mod test { builder.append_series(&Series::new("", &[1, 1])).unwrap(); let ca = builder.finish(); - ca.amortized_iter() - .zip(ca.into_iter()) - .for_each(|(s1, s2)| { - assert!(s1.unwrap().as_ref().series_equal(&s2.unwrap())); - }); + ca.amortized_iter().zip(&ca).for_each(|(s1, s2)| { + assert!(s1.unwrap().as_ref().series_equal(&s2.unwrap())); + }); } } diff --git a/polars/polars-core/src/chunked_array/list/mod.rs b/crates/polars-core/src/chunked_array/list/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/list/mod.rs rename to crates/polars-core/src/chunked_array/list/mod.rs diff --git a/polars/polars-core/src/chunked_array/logical/categorical/builder.rs b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs similarity index 99% rename from 
polars/polars-core/src/chunked_array/logical/categorical/builder.rs rename to crates/polars-core/src/chunked_array/logical/categorical/builder.rs index 93b46dd9475d6..4656dd245d038 100644 --- a/polars/polars-core/src/chunked_array/logical/categorical/builder.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs @@ -380,7 +380,7 @@ impl<'a> CategoricalChunkedBuilder<'a> { let cache = &mut crate::STRING_CACHE.lock_map(); id = cache.uuid; - for (s, h) in values.values_iter().zip(hashes.into_iter()) { + for (s, h) in values.values_iter().zip(hashes) { let global_idx = cache.insert_from_hash(h, s); // safety: // we allocated enough @@ -463,7 +463,7 @@ impl CategoricalChunked { polars_ensure!( !oob, ComputeError: - "cannot construct Categorical from these categories, at least on of them is out of bounds" + "cannot construct Categorical from these categories; at least one of them is out of bounds" ); Ok(unsafe { Self::from_global_indices_unchecked(cats) }) } @@ -558,7 +558,7 @@ mod test { let mut builder1 = CategoricalChunkedBuilder::new("foo", 10); let mut builder2 = CategoricalChunkedBuilder::new("foo", 10); builder1.drain_iter(vec![None, Some("hello"), Some("vietnam")]); - builder2.drain_iter(vec![Some("hello"), None, Some("world")].into_iter()); + builder2.drain_iter(vec![Some("hello"), None, Some("world")]); let s = builder1.finish().into_series(); assert_eq!(s.str_value(0).unwrap(), "null"); diff --git a/polars/polars-core/src/chunked_array/logical/categorical/from.rs b/crates/polars-core/src/chunked_array/logical/categorical/from.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/categorical/from.rs rename to crates/polars-core/src/chunked_array/logical/categorical/from.rs diff --git a/polars/polars-core/src/chunked_array/logical/categorical/merge.rs b/crates/polars-core/src/chunked_array/logical/categorical/merge.rs similarity index 96% rename from 
polars/polars-core/src/chunked_array/logical/categorical/merge.rs rename to crates/polars-core/src/chunked_array/logical/categorical/merge.rs index 6612d0979de9d..7e526168f0519 100644 --- a/polars/polars-core/src/chunked_array/logical/categorical/merge.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/merge.rs @@ -48,7 +48,9 @@ pub(crate) struct RevMapMerger { impl RevMapMerger { pub(crate) fn new(rev_map: Arc) -> Self { - let RevMapping::Global(_, _, id) = rev_map.as_ref() else { panic!("impl error") }; + let RevMapping::Global(_, _, id) = rev_map.as_ref() else { + panic!("impl error") + }; RevMapMerger { state: None, id: *id, @@ -57,7 +59,9 @@ impl RevMapMerger { } fn init_state(&mut self) { - let RevMapping::Global(map, slots, _) = self.original.as_ref() else { unreachable!() }; + let RevMapping::Global(map, slots, _) = self.original.as_ref() else { + unreachable!() + }; self.state = Some(State { map: (*map).clone(), slots: slots_to_mut(slots), @@ -70,7 +74,9 @@ impl RevMapMerger { if Arc::ptr_eq(&self.original, rev_map) { return Ok(()); } - let RevMapping::Global(map, slots, id) = rev_map.as_ref() else { polars_bail!(ComputeError: "expected global rev-map") }; + let RevMapping::Global(map, slots, id) = rev_map.as_ref() else { + polars_bail!(ComputeError: "expected global rev-map") + }; polars_ensure!(*id == self.id, ComputeError: "categoricals don't originate from the same string cache\n\ try setting a global string cache or increase the scope of the local string cache"); diff --git a/polars/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/categorical/mod.rs rename to crates/polars-core/src/chunked_array/logical/categorical/mod.rs diff --git a/polars/polars-core/src/chunked_array/logical/categorical/ops/append.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/append.rs similarity 
index 100% rename from polars/polars-core/src/chunked_array/logical/categorical/ops/append.rs rename to crates/polars-core/src/chunked_array/logical/categorical/ops/append.rs diff --git a/polars/polars-core/src/chunked_array/logical/categorical/ops/full.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/full.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/categorical/ops/full.rs rename to crates/polars-core/src/chunked_array/logical/categorical/ops/full.rs diff --git a/polars/polars-core/src/chunked_array/logical/categorical/ops/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/categorical/ops/mod.rs rename to crates/polars-core/src/chunked_array/logical/categorical/ops/mod.rs diff --git a/polars/polars-core/src/chunked_array/logical/categorical/ops/take_random.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/take_random.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/categorical/ops/take_random.rs rename to crates/polars-core/src/chunked_array/logical/categorical/ops/take_random.rs diff --git a/polars/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/categorical/ops/unique.rs rename to crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs diff --git a/polars/polars-core/src/chunked_array/logical/categorical/ops/zip.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/zip.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/categorical/ops/zip.rs rename to crates/polars-core/src/chunked_array/logical/categorical/ops/zip.rs diff --git a/polars/polars-core/src/chunked_array/logical/categorical/stringcache.rs 
b/crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/categorical/stringcache.rs rename to crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs diff --git a/polars/polars-core/src/chunked_array/logical/date.rs b/crates/polars-core/src/chunked_array/logical/date.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/date.rs rename to crates/polars-core/src/chunked_array/logical/date.rs diff --git a/polars/polars-core/src/chunked_array/logical/datetime.rs b/crates/polars-core/src/chunked_array/logical/datetime.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/datetime.rs rename to crates/polars-core/src/chunked_array/logical/datetime.rs diff --git a/polars/polars-core/src/chunked_array/logical/decimal.rs b/crates/polars-core/src/chunked_array/logical/decimal.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/decimal.rs rename to crates/polars-core/src/chunked_array/logical/decimal.rs diff --git a/polars/polars-core/src/chunked_array/logical/duration.rs b/crates/polars-core/src/chunked_array/logical/duration.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/duration.rs rename to crates/polars-core/src/chunked_array/logical/duration.rs diff --git a/polars/polars-core/src/chunked_array/logical/mod.rs b/crates/polars-core/src/chunked_array/logical/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/mod.rs rename to crates/polars-core/src/chunked_array/logical/mod.rs diff --git a/polars/polars-core/src/chunked_array/logical/struct_/from.rs b/crates/polars-core/src/chunked_array/logical/struct_/from.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/struct_/from.rs rename to crates/polars-core/src/chunked_array/logical/struct_/from.rs diff --git 
a/polars/polars-core/src/chunked_array/logical/struct_/mod.rs b/crates/polars-core/src/chunked_array/logical/struct_/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/struct_/mod.rs rename to crates/polars-core/src/chunked_array/logical/struct_/mod.rs diff --git a/polars/polars-core/src/chunked_array/logical/time.rs b/crates/polars-core/src/chunked_array/logical/time.rs similarity index 100% rename from polars/polars-core/src/chunked_array/logical/time.rs rename to crates/polars-core/src/chunked_array/logical/time.rs diff --git a/polars/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs similarity index 99% rename from polars/polars-core/src/chunked_array/mod.rs rename to crates/polars-core/src/chunked_array/mod.rs index 94fb09ba1c03e..5d884e5f3ad7a 100644 --- a/polars/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -174,6 +174,10 @@ impl ChunkedArray { self.bit_settings.remove(Settings::FAST_EXPLODE_LIST) } + pub(crate) fn clear_settings(&mut self) { + self.bit_settings.bits = 0; + } + pub fn is_sorted_flag(&self) -> IsSorted { if self.is_sorted_ascending_flag() { IsSorted::Ascending diff --git a/polars/polars-core/src/chunked_array/ndarray.rs b/crates/polars-core/src/chunked_array/ndarray.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ndarray.rs rename to crates/polars-core/src/chunked_array/ndarray.rs diff --git a/polars/polars-core/src/chunked_array/object/builder.rs b/crates/polars-core/src/chunked_array/object/builder.rs similarity index 100% rename from polars/polars-core/src/chunked_array/object/builder.rs rename to crates/polars-core/src/chunked_array/object/builder.rs diff --git a/polars/polars-core/src/chunked_array/object/extension/drop.rs b/crates/polars-core/src/chunked_array/object/extension/drop.rs similarity index 96% rename from polars/polars-core/src/chunked_array/object/extension/drop.rs rename to 
crates/polars-core/src/chunked_array/object/extension/drop.rs index 9d904462f96de..1f678f0a946fc 100644 --- a/polars/polars-core/src/chunked_array/object/extension/drop.rs +++ b/crates/polars-core/src/chunked_array/object/extension/drop.rs @@ -2,7 +2,7 @@ use crate::chunked_array::object::extension::PolarsExtension; use crate::prelude::*; /// This will dereference a raw ptr when dropping the PolarsExtension, make sure that it's valid. -pub(crate) unsafe fn drop_list(ca: &mut ListChunked) { +pub(crate) unsafe fn drop_list(ca: &ListChunked) { let mut inner = ca.inner_dtype(); let mut nested_count = 0; diff --git a/polars/polars-core/src/chunked_array/object/extension/list.rs b/crates/polars-core/src/chunked_array/object/extension/list.rs similarity index 100% rename from polars/polars-core/src/chunked_array/object/extension/list.rs rename to crates/polars-core/src/chunked_array/object/extension/list.rs diff --git a/polars/polars-core/src/chunked_array/object/extension/mod.rs b/crates/polars-core/src/chunked_array/object/extension/mod.rs similarity index 98% rename from polars/polars-core/src/chunked_array/object/extension/mod.rs rename to crates/polars-core/src/chunked_array/object/extension/mod.rs index 41a6b74840e6b..592d0e65d0a80 100644 --- a/polars/polars-core/src/chunked_array/object/extension/mod.rs +++ b/crates/polars-core/src/chunked_array/object/extension/mod.rs @@ -50,10 +50,7 @@ unsafe fn any_as_u8_slice(p: &T) -> &[u8] { /// Create an extension Array that can be sent to arrow and (once wrapped in `[PolarsExtension]` will /// also call drop on `T`, when the array is dropped. 
-pub(crate) fn create_extension< - I: IntoIterator> + TrustedLen, - T: Sized + Default, ->( +pub(crate) fn create_extension> + TrustedLen, T: Sized + Default>( iter: I, ) -> PolarsExtension { let env = "POLARS_ALLOW_EXTENSION"; diff --git a/polars/polars-core/src/chunked_array/object/extension/polars_extension.rs b/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs similarity index 100% rename from polars/polars-core/src/chunked_array/object/extension/polars_extension.rs rename to crates/polars-core/src/chunked_array/object/extension/polars_extension.rs diff --git a/polars/polars-core/src/chunked_array/object/is_valid.rs b/crates/polars-core/src/chunked_array/object/is_valid.rs similarity index 100% rename from polars/polars-core/src/chunked_array/object/is_valid.rs rename to crates/polars-core/src/chunked_array/object/is_valid.rs diff --git a/polars/polars-core/src/chunked_array/object/iterator.rs b/crates/polars-core/src/chunked_array/object/iterator.rs similarity index 100% rename from polars/polars-core/src/chunked_array/object/iterator.rs rename to crates/polars-core/src/chunked_array/object/iterator.rs diff --git a/polars/polars-core/src/chunked_array/object/mod.rs b/crates/polars-core/src/chunked_array/object/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/object/mod.rs rename to crates/polars-core/src/chunked_array/object/mod.rs diff --git a/polars/polars-core/src/chunked_array/object/registry.rs b/crates/polars-core/src/chunked_array/object/registry.rs similarity index 100% rename from polars/polars-core/src/chunked_array/object/registry.rs rename to crates/polars-core/src/chunked_array/object/registry.rs diff --git a/polars/polars-core/src/chunked_array/ops/abs.rs b/crates/polars-core/src/chunked_array/ops/abs.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/abs.rs rename to crates/polars-core/src/chunked_array/ops/abs.rs diff --git 
a/polars/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs similarity index 99% rename from polars/polars-core/src/chunked_array/ops/aggregate/mod.rs rename to crates/polars-core/src/chunked_array/ops/aggregate/mod.rs index ce948d1d79daa..3db7a22fee53b 100644 --- a/polars/polars-core/src/chunked_array/ops/aggregate/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs @@ -99,6 +99,9 @@ where } fn min(&self) -> Option { + if self.is_empty() { + return None; + } match self.is_sorted_flag() { IsSorted::Ascending => { self.first_non_null().and_then(|idx| { @@ -128,6 +131,9 @@ where } fn max(&self) -> Option { + if self.is_empty() { + return None; + } match self.is_sorted_flag() { IsSorted::Ascending => { self.last_non_null().and_then(|idx| { @@ -217,7 +223,6 @@ where /// Booleans are casted to 1 or 0. impl BooleanChunked { - /// Returns `None` if the array is empty or only contains null values. pub fn sum(&self) -> Option { Some(if self.is_empty() { 0 @@ -441,6 +446,9 @@ impl ChunkAggSeries for BooleanChunked { impl Utf8Chunked { pub(crate) fn max_str(&self) -> Option<&str> { + if self.is_empty() { + return None; + } match self.is_sorted_flag() { IsSorted::Ascending => self.get(self.len() - 1), IsSorted::Descending => self.get(0), @@ -451,6 +459,9 @@ impl Utf8Chunked { } } pub(crate) fn min_str(&self) -> Option<&str> { + if self.is_empty() { + return None; + } match self.is_sorted_flag() { IsSorted::Ascending => self.get(0), IsSorted::Descending => self.get(self.len() - 1), diff --git a/polars/polars-core/src/chunked_array/ops/aggregate/quantile.rs b/crates/polars-core/src/chunked_array/ops/aggregate/quantile.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/aggregate/quantile.rs rename to crates/polars-core/src/chunked_array/ops/aggregate/quantile.rs diff --git a/polars/polars-core/src/chunked_array/ops/aggregate/var.rs 
b/crates/polars-core/src/chunked_array/ops/aggregate/var.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/aggregate/var.rs rename to crates/polars-core/src/chunked_array/ops/aggregate/var.rs diff --git a/polars/polars-core/src/chunked_array/ops/any_value.rs b/crates/polars-core/src/chunked_array/ops/any_value.rs similarity index 98% rename from polars/polars-core/src/chunked_array/ops/any_value.rs rename to crates/polars-core/src/chunked_array/ops/any_value.rs index 5519abe04a164..ae5740f33ae44 100644 --- a/polars/polars-core/src/chunked_array/ops/any_value.rs +++ b/crates/polars-core/src/chunked_array/ops/any_value.rs @@ -134,7 +134,7 @@ impl<'a> AnyValue<'a> { // so we set the array pointer with values of the dictionary array. #[cfg(feature = "dtype-categorical")] { - use polars_arrow::is_valid::{IsValid as _}; + use polars_arrow::is_valid::IsValid as _; if let Some(arr) = arr.as_any().downcast_ref::>() { let keys = arr.keys(); let values = arr.values(); @@ -144,14 +144,14 @@ impl<'a> AnyValue<'a> { if arr.is_valid_unchecked(idx) { let v = arr.value_unchecked(idx); - let DataType::Categorical(Some(rev_map)) = fld.data_type() else { + let DataType::Categorical(Some(rev_map)) = fld.data_type() + else { unimplemented!() }; AnyValue::Categorical(v, rev_map, SyncPtr::from_const(values)) } else { AnyValue::Null } - } else { arr_to_any_value(&**arr, idx, fld.data_type()) } diff --git a/polars/polars-core/src/chunked_array/ops/append.rs b/crates/polars-core/src/chunked_array/ops/append.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/append.rs rename to crates/polars-core/src/chunked_array/ops/append.rs diff --git a/polars/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs similarity index 92% rename from polars/polars-core/src/chunked_array/ops/apply.rs rename to crates/polars-core/src/chunked_array/ops/apply.rs index a43cfa5c94f36..ac166a76799df 100644 --- 
a/polars/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -3,7 +3,9 @@ use std::borrow::Cow; use std::convert::TryFrom; use arrow::array::{BooleanArray, PrimitiveArray}; +use arrow::bitmap::utils::{get_bit_unchecked, set_bit_unchecked}; use polars_arrow::array::PolarsArray; +use polars_arrow::bitmap::unary_mut; use polars_arrow::trusted_len::TrustedLenPush; use crate::prelude::*; @@ -319,14 +321,64 @@ impl<'a> ChunkApply<'a, bool, bool> for BooleanChunked { where F: Fn(bool) -> bool + Copy, { - apply!(self, f) + self.apply_kernel(&|arr| { + let values = arrow::bitmap::unary(arr.values(), |chunk| { + let bytes = chunk.to_ne_bytes(); + + // different output as that might lead + // to better internal parallelism + let mut out = 0u64.to_ne_bytes(); + for i in 0..64 { + unsafe { + let val = get_bit_unchecked(&bytes, i); + let res = f(val); + set_bit_unchecked(&mut out, i, res) + }; + } + u64::from_ne_bytes(out) + }); + BooleanArray::from_data_default(values, arr.validity().cloned()).boxed() + }) } fn try_apply(&self, f: F) -> PolarsResult where F: Fn(bool) -> PolarsResult + Copy, { - try_apply!(self, f) + let mut failed: Option = None; + let chunks = self + .downcast_iter() + .map(|arr| { + let values = unary_mut(arr.values(), |chunk| { + let bytes = chunk.to_ne_bytes(); + + // different output as that might lead + // to better internal parallelism + let mut out = 0u64.to_ne_bytes(); + for i in 0..64 { + unsafe { + let val = get_bit_unchecked(&bytes, i); + match f(val) { + Ok(res) => set_bit_unchecked(&mut out, i, res), + Err(e) => { + if failed.is_none() { + failed = Some(e) + } + } + } + }; + } + u64::from_ne_bytes(out) + }); + Ok(BooleanArray::from_data_default(values, arr.validity().cloned()).boxed()) + }) + .collect::>>()?; + + if let Some(e) = failed { + return Err(e); + } + + Ok(unsafe { BooleanChunked::from_chunks(self.name(), chunks) }) } fn apply_on_opt(&'a self, f: F) -> Self diff --git 
a/polars/polars-core/src/chunked_array/ops/bit_repr.rs b/crates/polars-core/src/chunked_array/ops/bit_repr.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/bit_repr.rs rename to crates/polars-core/src/chunked_array/ops/bit_repr.rs diff --git a/polars/polars-core/src/chunked_array/ops/chunkops.rs b/crates/polars-core/src/chunked_array/ops/chunkops.rs similarity index 98% rename from polars/polars-core/src/chunked_array/ops/chunkops.rs rename to crates/polars-core/src/chunked_array/ops/chunkops.rs index 69e21a974ca27..468e9a2d5c804 100644 --- a/polars/polars-core/src/chunked_array/ops/chunkops.rs +++ b/crates/polars-core/src/chunked_array/ops/chunkops.rs @@ -73,6 +73,11 @@ impl ChunkedArray { } } self.length = inner(&self.chunks) as IdxSize; + + if self.length <= 1 { + self.set_sorted_flag(IsSorted::Ascending) + } + #[cfg(feature = "python")] assert!( self.length < IdxSize::MAX, diff --git a/polars/polars-core/src/chunked_array/ops/compare_inner.rs b/crates/polars-core/src/chunked_array/ops/compare_inner.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/compare_inner.rs rename to crates/polars-core/src/chunked_array/ops/compare_inner.rs diff --git a/polars/polars-core/src/chunked_array/ops/concat_str.rs b/crates/polars-core/src/chunked_array/ops/concat_str.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/concat_str.rs rename to crates/polars-core/src/chunked_array/ops/concat_str.rs diff --git a/polars/polars-core/src/chunked_array/ops/cum_agg.rs b/crates/polars-core/src/chunked_array/ops/cum_agg.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/cum_agg.rs rename to crates/polars-core/src/chunked_array/ops/cum_agg.rs diff --git a/polars/polars-core/src/chunked_array/ops/decimal.rs b/crates/polars-core/src/chunked_array/ops/decimal.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/decimal.rs rename to 
crates/polars-core/src/chunked_array/ops/decimal.rs diff --git a/polars/polars-core/src/chunked_array/ops/downcast.rs b/crates/polars-core/src/chunked_array/ops/downcast.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/downcast.rs rename to crates/polars-core/src/chunked_array/ops/downcast.rs diff --git a/polars/polars-core/src/chunked_array/ops/explode.rs b/crates/polars-core/src/chunked_array/ops/explode.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/explode.rs rename to crates/polars-core/src/chunked_array/ops/explode.rs diff --git a/polars/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/explode_and_offsets.rs rename to crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs diff --git a/polars/polars-core/src/chunked_array/ops/extend.rs b/crates/polars-core/src/chunked_array/ops/extend.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/extend.rs rename to crates/polars-core/src/chunked_array/ops/extend.rs diff --git a/polars/polars-core/src/chunked_array/ops/fill_null.rs b/crates/polars-core/src/chunked_array/ops/fill_null.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/fill_null.rs rename to crates/polars-core/src/chunked_array/ops/fill_null.rs diff --git a/polars/polars-core/src/chunked_array/ops/filter.rs b/crates/polars-core/src/chunked_array/ops/filter.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/filter.rs rename to crates/polars-core/src/chunked_array/ops/filter.rs diff --git a/polars/polars-core/src/chunked_array/ops/full.rs b/crates/polars-core/src/chunked_array/ops/full.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/full.rs rename to crates/polars-core/src/chunked_array/ops/full.rs diff --git 
a/polars/polars-core/src/chunked_array/ops/interpolate.rs b/crates/polars-core/src/chunked_array/ops/interpolate.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/interpolate.rs rename to crates/polars-core/src/chunked_array/ops/interpolate.rs diff --git a/polars/polars-core/src/chunked_array/ops/is_in.rs b/crates/polars-core/src/chunked_array/ops/is_in.rs similarity index 98% rename from polars/polars-core/src/chunked_array/ops/is_in.rs rename to crates/polars-core/src/chunked_array/ops/is_in.rs index 7c408f977ebf1..d6979b3476db6 100644 --- a/polars/polars-core/src/chunked_array/ops/is_in.rs +++ b/crates/polars-core/src/chunked_array/ops/is_in.rs @@ -269,7 +269,13 @@ impl IsIn for BooleanChunked { DataType::Boolean => { let other = other.bool().unwrap(); let has_true = other.any(); - let has_false = !other.all(); + let nc = other.null_count(); + + let has_false = if nc == 0 { + !other.all() + } else { + !(other.sum().unwrap() as usize + nc) == other.len() + }; Ok(self.apply(|v| if v { has_true } else { has_false })) } _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), diff --git a/polars/polars-core/src/chunked_array/ops/len.rs b/crates/polars-core/src/chunked_array/ops/len.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/len.rs rename to crates/polars-core/src/chunked_array/ops/len.rs diff --git a/polars/polars-core/src/chunked_array/ops/min_max_binary.rs b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/min_max_binary.rs rename to crates/polars-core/src/chunked_array/ops/min_max_binary.rs diff --git a/polars/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/mod.rs rename to crates/polars-core/src/chunked_array/ops/mod.rs diff --git a/polars/polars-core/src/chunked_array/ops/nulls.rs 
b/crates/polars-core/src/chunked_array/ops/nulls.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/nulls.rs rename to crates/polars-core/src/chunked_array/ops/nulls.rs diff --git a/polars/polars-core/src/chunked_array/ops/peaks.rs b/crates/polars-core/src/chunked_array/ops/peaks.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/peaks.rs rename to crates/polars-core/src/chunked_array/ops/peaks.rs diff --git a/polars/polars-core/src/chunked_array/ops/repeat_by.rs b/crates/polars-core/src/chunked_array/ops/repeat_by.rs similarity index 96% rename from polars/polars-core/src/chunked_array/ops/repeat_by.rs rename to crates/polars-core/src/chunked_array/ops/repeat_by.rs index 261bf473a00bb..8c58e264cc893 100644 --- a/polars/polars-core/src/chunked_array/ops/repeat_by.rs +++ b/crates/polars-core/src/chunked_array/ops/repeat_by.rs @@ -32,7 +32,7 @@ where } let iter = self .into_iter() - .zip(by.into_iter()) + .zip(by) .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); // Safety: @@ -64,7 +64,7 @@ impl RepeatBy for BooleanChunked { let iter = self .into_iter() - .zip(by.into_iter()) + .zip(by) .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); // Safety: @@ -93,7 +93,7 @@ impl RepeatBy for Utf8Chunked { let iter = self .into_iter() - .zip(by.into_iter()) + .zip(by) .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); // Safety: @@ -124,7 +124,7 @@ impl RepeatBy for BinaryChunked { } let iter = self .into_iter() - .zip(by.into_iter()) + .zip(by) .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); // Safety: diff --git a/polars/polars-core/src/chunked_array/ops/reverse.rs b/crates/polars-core/src/chunked_array/ops/reverse.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/reverse.rs rename to crates/polars-core/src/chunked_array/ops/reverse.rs diff --git 
a/polars/polars-core/src/chunked_array/ops/rolling_window.rs b/crates/polars-core/src/chunked_array/ops/rolling_window.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/rolling_window.rs rename to crates/polars-core/src/chunked_array/ops/rolling_window.rs diff --git a/polars/polars-core/src/chunked_array/ops/set.rs b/crates/polars-core/src/chunked_array/ops/set.rs similarity index 98% rename from polars/polars-core/src/chunked_array/ops/set.rs rename to crates/polars-core/src/chunked_array/ops/set.rs index 9bbb92f47bcb4..869604beaca44 100644 --- a/polars/polars-core/src/chunked_array/ops/set.rs +++ b/crates/polars-core/src/chunked_array/ops/set.rs @@ -55,7 +55,7 @@ where if self.chunks.len() == 1 { let arr = set_at_idx_no_null( self.downcast_iter().next().unwrap(), - idx.into_iter(), + idx, value, T::get_dtype().to_arrow(), )?; @@ -113,7 +113,7 @@ where // slow path, could be optimized. let ca = mask .into_iter() - .zip(self.into_iter()) + .zip(self) .map(|(mask_val, opt_val)| match mask_val { Some(true) => value, _ => opt_val, @@ -166,7 +166,7 @@ impl<'a> ChunkSet<'a, bool, bool> for BooleanChunked { check_bounds!(self, mask); let ca = mask .into_iter() - .zip(self.into_iter()) + .zip(self) .map(|(mask_val, opt_val)| match mask_val { Some(true) => value, _ => opt_val, @@ -229,7 +229,7 @@ impl<'a> ChunkSet<'a, &'a str, String> for Utf8Chunked { check_bounds!(self, mask); let ca = mask .into_iter() - .zip(self.into_iter()) + .zip(self) .map(|(mask_val, opt_val)| match mask_val { Some(true) => value, _ => opt_val, @@ -293,7 +293,7 @@ impl<'a> ChunkSet<'a, &'a [u8], Vec> for BinaryChunked { check_bounds!(self, mask); let ca = mask .into_iter() - .zip(self.into_iter()) + .zip(self) .map(|(mask_val, opt_val)| match mask_val { Some(true) => value, _ => opt_val, diff --git a/polars/polars-core/src/chunked_array/ops/shift.rs b/crates/polars-core/src/chunked_array/ops/shift.rs similarity index 100% rename from 
polars/polars-core/src/chunked_array/ops/shift.rs rename to crates/polars-core/src/chunked_array/ops/shift.rs diff --git a/polars/polars-core/src/chunked_array/ops/sort/arg_sort.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/sort/arg_sort.rs rename to crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs diff --git a/polars/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs rename to crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs diff --git a/polars/polars-core/src/chunked_array/ops/sort/categorical.rs b/crates/polars-core/src/chunked_array/ops/sort/categorical.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/sort/categorical.rs rename to crates/polars-core/src/chunked_array/ops/sort/categorical.rs diff --git a/polars/polars-core/src/chunked_array/ops/sort/mod.rs b/crates/polars-core/src/chunked_array/ops/sort/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/sort/mod.rs rename to crates/polars-core/src/chunked_array/ops/sort/mod.rs diff --git a/polars/polars-core/src/chunked_array/ops/sort/slice.rs b/crates/polars-core/src/chunked_array/ops/sort/slice.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/sort/slice.rs rename to crates/polars-core/src/chunked_array/ops/sort/slice.rs diff --git a/polars/polars-core/src/chunked_array/ops/take/mod.rs b/crates/polars-core/src/chunked_array/ops/take/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/take/mod.rs rename to crates/polars-core/src/chunked_array/ops/take/mod.rs diff --git a/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs b/crates/polars-core/src/chunked_array/ops/take/take_chunked.rs 
similarity index 100% rename from polars/polars-core/src/chunked_array/ops/take/take_chunked.rs rename to crates/polars-core/src/chunked_array/ops/take/take_chunked.rs diff --git a/polars/polars-core/src/chunked_array/ops/take/take_every.rs b/crates/polars-core/src/chunked_array/ops/take/take_every.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/take/take_every.rs rename to crates/polars-core/src/chunked_array/ops/take/take_every.rs diff --git a/polars/polars-core/src/chunked_array/ops/take/take_random.rs b/crates/polars-core/src/chunked_array/ops/take/take_random.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/take/take_random.rs rename to crates/polars-core/src/chunked_array/ops/take/take_random.rs diff --git a/polars/polars-core/src/chunked_array/ops/take/take_single.rs b/crates/polars-core/src/chunked_array/ops/take/take_single.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/take/take_single.rs rename to crates/polars-core/src/chunked_array/ops/take/take_single.rs diff --git a/polars/polars-core/src/chunked_array/ops/take/traits.rs b/crates/polars-core/src/chunked_array/ops/take/traits.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/take/traits.rs rename to crates/polars-core/src/chunked_array/ops/take/traits.rs diff --git a/polars/polars-core/src/chunked_array/ops/tile.rs b/crates/polars-core/src/chunked_array/ops/tile.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/tile.rs rename to crates/polars-core/src/chunked_array/ops/tile.rs diff --git a/polars/polars-core/src/chunked_array/ops/unique/mod.rs b/crates/polars-core/src/chunked_array/ops/unique/mod.rs similarity index 91% rename from polars/polars-core/src/chunked_array/ops/unique/mod.rs rename to crates/polars-core/src/chunked_array/ops/unique/mod.rs index 8f34e149a04ad..b4c3224e687e3 100644 --- a/polars/polars-core/src/chunked_array/ops/unique/mod.rs +++ 
b/crates/polars-core/src/chunked_array/ops/unique/mod.rs @@ -78,7 +78,38 @@ where } #[cfg(feature = "mode")] -#[allow(clippy::needless_collect)] +fn mode_indices(groups: GroupsProxy) -> Vec { + match groups { + GroupsProxy::Idx(groups) => { + let mut groups = groups.into_iter().collect_trusted::>(); + groups.sort_unstable_by_key(|k| k.1.len()); + let last = &groups.last().unwrap(); + let max_occur = last.1.len(); + groups + .iter() + .rev() + .take_while(|v| v.1.len() == max_occur) + .map(|v| v.0) + .collect() + } + GroupsProxy::Slice { groups, .. } => { + let last = groups.last().unwrap(); + let max_occur = last[1]; + + groups + .iter() + .rev() + .take_while(|v| { + let len = v[1]; + len == max_occur + }) + .map(|v| v[0]) + .collect() + } + } +} + +#[cfg(feature = "mode")] fn mode(ca: &ChunkedArray) -> ChunkedArray where ChunkedArray: IntoGroupsProxy + ChunkTake, @@ -86,28 +117,12 @@ where if ca.is_empty() { return ca.clone(); } - let mut groups = ca - .group_tuples(true, false) - .unwrap() - .into_idx() - .into_iter() - .collect_trusted::>(); - groups.sort_unstable_by_key(|k| k.1.len()); - let last = &groups.last().unwrap(); - - let max_occur = last.1.len(); - - // collect until we don't take with trusted len anymore - // TODO! take directly from iter, but first remove standard trusted-length collect. - let idx = groups - .iter() - .rev() - .take_while(|v| v.1.len() == max_occur) - .map(|v| v.0) - .collect::>(); + let groups = ca.group_tuples(true, false).unwrap(); + let idx = mode_indices(groups); + // Safety: // group indices are in bounds - unsafe { ca.take_unchecked(idx.into_iter().map(|i| i as usize).into()) } + unsafe { ca.take_unchecked(idx.as_slice().into()) } } macro_rules! 
arg_unique_ca { diff --git a/polars/polars-core/src/chunked_array/ops/unique/rank.rs b/crates/polars-core/src/chunked_array/ops/unique/rank.rs similarity index 100% rename from polars/polars-core/src/chunked_array/ops/unique/rank.rs rename to crates/polars-core/src/chunked_array/ops/unique/rank.rs diff --git a/polars/polars-core/src/chunked_array/ops/zip.rs b/crates/polars-core/src/chunked_array/ops/zip.rs similarity index 98% rename from polars/polars-core/src/chunked_array/ops/zip.rs rename to crates/polars-core/src/chunked_array/ops/zip.rs index 19cb41ada6925..ae91346f1beb3 100644 --- a/polars/polars-core/src/chunked_array/ops/zip.rs +++ b/crates/polars-core/src/chunked_array/ops/zip.rs @@ -167,8 +167,8 @@ impl ChunkZip> for ObjectChunked { let mut ca: Self = left .as_ref() .into_iter() - .zip(right.into_iter()) - .zip(mask.into_iter()) + .zip(right.as_ref()) + .zip(mask.as_ref()) .map(|((left_c, right_c), mask_c)| match mask_c { Some(true) => left_c.cloned(), Some(false) => right_c.cloned(), diff --git a/polars/polars-core/src/chunked_array/random.rs b/crates/polars-core/src/chunked_array/random.rs similarity index 100% rename from polars/polars-core/src/chunked_array/random.rs rename to crates/polars-core/src/chunked_array/random.rs diff --git a/polars/polars-core/src/chunked_array/temporal/conversion.rs b/crates/polars-core/src/chunked_array/temporal/conversion.rs similarity index 100% rename from polars/polars-core/src/chunked_array/temporal/conversion.rs rename to crates/polars-core/src/chunked_array/temporal/conversion.rs diff --git a/polars/polars-core/src/chunked_array/temporal/date.rs b/crates/polars-core/src/chunked_array/temporal/date.rs similarity index 100% rename from polars/polars-core/src/chunked_array/temporal/date.rs rename to crates/polars-core/src/chunked_array/temporal/date.rs diff --git a/polars/polars-core/src/chunked_array/temporal/datetime.rs b/crates/polars-core/src/chunked_array/temporal/datetime.rs similarity index 90% rename from 
polars/polars-core/src/chunked_array/temporal/datetime.rs rename to crates/polars-core/src/chunked_array/temporal/datetime.rs index b118e2b3773fb..fc987ceb95207 100644 --- a/polars/polars-core/src/chunked_array/temporal/datetime.rs +++ b/crates/polars-core/src/chunked_array/temporal/datetime.rs @@ -8,8 +8,6 @@ use chrono::format::{DelayedFormat, StrftimeItems}; use chrono::TimeZone as TimeZoneTrait; #[cfg(feature = "timezones")] use chrono_tz::Tz; -#[cfg(feature = "timezones")] -use polars_arrow::kernels::replace_timezone; use super::conversion::{datetime_to_timestamp_ms, datetime_to_timestamp_ns}; use super::*; @@ -94,29 +92,6 @@ impl DatetimeChunked { } } - #[cfg(feature = "timezones")] - pub fn replace_time_zone( - &self, - time_zone: Option<&str>, - use_earliest: Option, - ) -> PolarsResult { - let out: PolarsResult<_> = { - let from = self.time_zone().as_deref().unwrap_or("UTC"); - let to = time_zone.unwrap_or("UTC"); - let chunks = self - .downcast_iter() - .map(|arr| { - replace_timezone(arr, self.time_unit().to_arrow(), from, to, use_earliest) - }) - .collect::>()?; - let out = unsafe { ChunkedArray::from_chunks(self.name(), chunks) }; - Ok(out.into_datetime(self.time_unit(), time_zone.map(|x| x.to_string()))) - }; - let mut out = out?; - out.set_sorted_flag(self.is_sorted_flag()); - Ok(out) - } - /// Convert from Datetime into Utf8 with the given format. /// See [chrono strftime/strptime](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html). 
pub fn to_string(&self, format: &str) -> PolarsResult { diff --git a/polars/polars-core/src/chunked_array/temporal/duration.rs b/crates/polars-core/src/chunked_array/temporal/duration.rs similarity index 100% rename from polars/polars-core/src/chunked_array/temporal/duration.rs rename to crates/polars-core/src/chunked_array/temporal/duration.rs diff --git a/polars/polars-core/src/chunked_array/temporal/mod.rs b/crates/polars-core/src/chunked_array/temporal/mod.rs similarity index 100% rename from polars/polars-core/src/chunked_array/temporal/mod.rs rename to crates/polars-core/src/chunked_array/temporal/mod.rs diff --git a/polars/polars-core/src/chunked_array/temporal/time.rs b/crates/polars-core/src/chunked_array/temporal/time.rs similarity index 100% rename from polars/polars-core/src/chunked_array/temporal/time.rs rename to crates/polars-core/src/chunked_array/temporal/time.rs diff --git a/polars/polars-core/src/chunked_array/to_vec.rs b/crates/polars-core/src/chunked_array/to_vec.rs similarity index 100% rename from polars/polars-core/src/chunked_array/to_vec.rs rename to crates/polars-core/src/chunked_array/to_vec.rs diff --git a/polars/polars-core/src/chunked_array/trusted_len.rs b/crates/polars-core/src/chunked_array/trusted_len.rs similarity index 100% rename from polars/polars-core/src/chunked_array/trusted_len.rs rename to crates/polars-core/src/chunked_array/trusted_len.rs diff --git a/polars/polars-core/src/chunked_array/upstream_traits.rs b/crates/polars-core/src/chunked_array/upstream_traits.rs similarity index 100% rename from polars/polars-core/src/chunked_array/upstream_traits.rs rename to crates/polars-core/src/chunked_array/upstream_traits.rs diff --git a/polars/polars-core/src/cloud.rs b/crates/polars-core/src/cloud.rs similarity index 100% rename from polars/polars-core/src/cloud.rs rename to crates/polars-core/src/cloud.rs diff --git a/polars/polars-core/src/config.rs b/crates/polars-core/src/config.rs similarity index 100% rename from 
polars/polars-core/src/config.rs rename to crates/polars-core/src/config.rs diff --git a/polars/polars-core/src/datatypes/_serde.rs b/crates/polars-core/src/datatypes/_serde.rs similarity index 100% rename from polars/polars-core/src/datatypes/_serde.rs rename to crates/polars-core/src/datatypes/_serde.rs diff --git a/polars/polars-core/src/datatypes/aliases.rs b/crates/polars-core/src/datatypes/aliases.rs similarity index 100% rename from polars/polars-core/src/datatypes/aliases.rs rename to crates/polars-core/src/datatypes/aliases.rs diff --git a/polars/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs similarity index 99% rename from polars/polars-core/src/datatypes/any_value.rs rename to crates/polars-core/src/datatypes/any_value.rs index 42cbf25753b05..16a4db50c9a2a 100644 --- a/polars/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -4,6 +4,7 @@ use arrow::temporal_conversions::{ use arrow::types::PrimitiveType; #[cfg(feature = "dtype-struct")] use polars_arrow::trusted_len::TrustedLenPush; +use polars_utils::format_smartstring; #[cfg(feature = "dtype-struct")] use polars_utils::slice::GetSaferUnchecked; #[cfg(feature = "dtype-categorical")] @@ -456,6 +457,7 @@ impl<'a> AnyValue<'a> { DataType::Duration(tu) => AnyValue::Duration($av as i64, *tu), #[cfg(feature="dtype-time")] DataType::Time => AnyValue::Time($av as i64), + DataType::Utf8 => AnyValue::Utf8Owned(format_smartstring!("{}", $av)), _ => polars_bail!( ComputeError: "cannot cast any-value {:?} to dtype '{}'", self, dtype, ), diff --git a/polars/polars-core/src/datatypes/dtype.rs b/crates/polars-core/src/datatypes/dtype.rs similarity index 100% rename from polars/polars-core/src/datatypes/dtype.rs rename to crates/polars-core/src/datatypes/dtype.rs diff --git a/polars/polars-core/src/datatypes/field.rs b/crates/polars-core/src/datatypes/field.rs similarity index 100% rename from polars/polars-core/src/datatypes/field.rs 
rename to crates/polars-core/src/datatypes/field.rs diff --git a/polars/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs similarity index 100% rename from polars/polars-core/src/datatypes/mod.rs rename to crates/polars-core/src/datatypes/mod.rs diff --git a/polars/polars-core/src/datatypes/time_unit.rs b/crates/polars-core/src/datatypes/time_unit.rs similarity index 100% rename from polars/polars-core/src/datatypes/time_unit.rs rename to crates/polars-core/src/datatypes/time_unit.rs diff --git a/polars/polars-core/src/doc/changelog/mod.rs b/crates/polars-core/src/doc/changelog/mod.rs similarity index 100% rename from polars/polars-core/src/doc/changelog/mod.rs rename to crates/polars-core/src/doc/changelog/mod.rs diff --git a/polars/polars-core/src/doc/changelog/v0_10_0_11.rs b/crates/polars-core/src/doc/changelog/v0_10_0_11.rs similarity index 100% rename from polars/polars-core/src/doc/changelog/v0_10_0_11.rs rename to crates/polars-core/src/doc/changelog/v0_10_0_11.rs diff --git a/polars/polars-core/src/doc/changelog/v0_3.rs b/crates/polars-core/src/doc/changelog/v0_3.rs similarity index 100% rename from polars/polars-core/src/doc/changelog/v0_3.rs rename to crates/polars-core/src/doc/changelog/v0_3.rs diff --git a/polars/polars-core/src/doc/changelog/v0_4.rs b/crates/polars-core/src/doc/changelog/v0_4.rs similarity index 100% rename from polars/polars-core/src/doc/changelog/v0_4.rs rename to crates/polars-core/src/doc/changelog/v0_4.rs diff --git a/polars/polars-core/src/doc/changelog/v0_5.rs b/crates/polars-core/src/doc/changelog/v0_5.rs similarity index 100% rename from polars/polars-core/src/doc/changelog/v0_5.rs rename to crates/polars-core/src/doc/changelog/v0_5.rs diff --git a/polars/polars-core/src/doc/changelog/v0_6.rs b/crates/polars-core/src/doc/changelog/v0_6.rs similarity index 100% rename from polars/polars-core/src/doc/changelog/v0_6.rs rename to crates/polars-core/src/doc/changelog/v0_6.rs diff --git 
a/polars/polars-core/src/doc/changelog/v0_7.rs b/crates/polars-core/src/doc/changelog/v0_7.rs similarity index 100% rename from polars/polars-core/src/doc/changelog/v0_7.rs rename to crates/polars-core/src/doc/changelog/v0_7.rs diff --git a/polars/polars-core/src/doc/changelog/v0_8.rs b/crates/polars-core/src/doc/changelog/v0_8.rs similarity index 100% rename from polars/polars-core/src/doc/changelog/v0_8.rs rename to crates/polars-core/src/doc/changelog/v0_8.rs diff --git a/polars/polars-core/src/doc/changelog/v0_9.rs b/crates/polars-core/src/doc/changelog/v0_9.rs similarity index 100% rename from polars/polars-core/src/doc/changelog/v0_9.rs rename to crates/polars-core/src/doc/changelog/v0_9.rs diff --git a/polars/polars-core/src/doc/mod.rs b/crates/polars-core/src/doc/mod.rs similarity index 100% rename from polars/polars-core/src/doc/mod.rs rename to crates/polars-core/src/doc/mod.rs diff --git a/polars/polars-core/src/error.rs b/crates/polars-core/src/error.rs similarity index 100% rename from polars/polars-core/src/error.rs rename to crates/polars-core/src/error.rs diff --git a/polars/polars-core/src/export.rs b/crates/polars-core/src/export.rs similarity index 100% rename from polars/polars-core/src/export.rs rename to crates/polars-core/src/export.rs diff --git a/polars/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs similarity index 100% rename from polars/polars-core/src/fmt.rs rename to crates/polars-core/src/fmt.rs diff --git a/polars/polars-core/src/frame/arithmetic.rs b/crates/polars-core/src/frame/arithmetic.rs similarity index 96% rename from polars/polars-core/src/frame/arithmetic.rs rename to crates/polars-core/src/frame/arithmetic.rs index c0c2150e694ab..4488640a0b456 100644 --- a/polars/polars-core/src/frame/arithmetic.rs +++ b/crates/polars-core/src/frame/arithmetic.rs @@ -8,12 +8,9 @@ use crate::utils::try_get_supertype; /// Get the supertype that is valid for all columns in the DataFrame. 
/// This reduces casting of the rhs in arithmetic. fn get_supertype_all(df: &DataFrame, rhs: &Series) -> PolarsResult { - df.columns - .iter() - .fold(Ok(rhs.dtype().clone()), |dt, s| match dt { - Ok(dt) => try_get_supertype(s.dtype(), &dt), - e => e, - }) + df.columns.iter().try_fold(rhs.dtype().clone(), |dt, s| { + try_get_supertype(s.dtype(), &dt) + }) } macro_rules! impl_arithmetic { diff --git a/polars/polars-core/src/frame/asof_join/asof.rs b/crates/polars-core/src/frame/asof_join/asof.rs similarity index 100% rename from polars/polars-core/src/frame/asof_join/asof.rs rename to crates/polars-core/src/frame/asof_join/asof.rs diff --git a/polars/polars-core/src/frame/asof_join/groups.rs b/crates/polars-core/src/frame/asof_join/groups.rs similarity index 98% rename from polars/polars-core/src/frame/asof_join/groups.rs rename to crates/polars-core/src/frame/asof_join/groups.rs index 1869513d8a716..2ab7490505b00 100644 --- a/polars/polars-core/src/frame/asof_join/groups.rs +++ b/crates/polars-core/src/frame/asof_join/groups.rs @@ -152,26 +152,26 @@ pub(super) unsafe fn join_asof_nearest_with_indirection< if offsets.is_empty() { return (None, 0); } - let max_value = ::max_value(); - let mut dist: T = max_value; + let max_possible_dist = ::max_value(); + let mut dist: T = max_possible_dist; + let mut prev_offset: IdxSize = 0; for (idx, &offset) in offsets.iter().enumerate() { let val_r = *right.get_unchecked(offset as usize); - if val_r >= val_l { - // This is (val_r - val_l).abs(), but works on strings/dates - let dist_curr = if val_r > val_l { - val_r - val_l - } else { - val_l - val_r - }; - if dist_curr <= dist { - // candidate for match - dist = dist_curr; - } else { - // note for a nearest-match, we can re-match on the same val_r next time, - // so we need to rewind the idx by 1 - return (Some(offset - 1), idx - 1); - } + // This is (val_r - val_l).abs(), but works on strings/dates + let dist_curr = if val_r > val_l { + val_r - val_l + } else { + val_l - val_r 
+ }; + if dist_curr <= dist { + // candidate for match + dist = dist_curr; + } else { + // note for a nearest-match, we can re-match on the same val_r next time, + // so we need to rewind the idx by 1 + return (Some(prev_offset), idx - 1); } + prev_offset = offset; } // if we've reached the end with nearest and haven't returned, it means that the last item was the closest diff --git a/polars/polars-core/src/frame/asof_join/mod.rs b/crates/polars-core/src/frame/asof_join/mod.rs similarity index 100% rename from polars/polars-core/src/frame/asof_join/mod.rs rename to crates/polars-core/src/frame/asof_join/mod.rs diff --git a/polars/polars-core/src/frame/chunks.rs b/crates/polars-core/src/frame/chunks.rs similarity index 100% rename from polars/polars-core/src/frame/chunks.rs rename to crates/polars-core/src/frame/chunks.rs diff --git a/polars/polars-core/src/frame/cross_join.rs b/crates/polars-core/src/frame/cross_join.rs similarity index 100% rename from polars/polars-core/src/frame/cross_join.rs rename to crates/polars-core/src/frame/cross_join.rs diff --git a/polars/polars-core/src/frame/explode.rs b/crates/polars-core/src/frame/explode.rs similarity index 99% rename from polars/polars-core/src/frame/explode.rs rename to crates/polars-core/src/frame/explode.rs index 0d8a46fd2d77e..29e940021c68a 100644 --- a/polars/polars-core/src/frame/explode.rs +++ b/crates/polars-core/src/frame/explode.rs @@ -37,6 +37,7 @@ pub struct MeltArgs { impl DataFrame { pub fn explode_impl(&self, mut columns: Vec) -> PolarsResult { + polars_ensure!(!columns.is_empty(), InvalidOperation: "no columns provided in explode"); let mut df = self.clone(); if self.height() == 0 { for s in &columns { diff --git a/polars/polars-core/src/frame/from.rs b/crates/polars-core/src/frame/from.rs similarity index 100% rename from polars/polars-core/src/frame/from.rs rename to crates/polars-core/src/frame/from.rs diff --git a/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs 
b/crates/polars-core/src/frame/groupby/aggregations/agg_list.rs similarity index 100% rename from polars/polars-core/src/frame/groupby/aggregations/agg_list.rs rename to crates/polars-core/src/frame/groupby/aggregations/agg_list.rs diff --git a/polars/polars-core/src/frame/groupby/aggregations/boolean.rs b/crates/polars-core/src/frame/groupby/aggregations/boolean.rs similarity index 100% rename from polars/polars-core/src/frame/groupby/aggregations/boolean.rs rename to crates/polars-core/src/frame/groupby/aggregations/boolean.rs diff --git a/polars/polars-core/src/frame/groupby/aggregations/dispatch.rs b/crates/polars-core/src/frame/groupby/aggregations/dispatch.rs similarity index 100% rename from polars/polars-core/src/frame/groupby/aggregations/dispatch.rs rename to crates/polars-core/src/frame/groupby/aggregations/dispatch.rs diff --git a/polars/polars-core/src/frame/groupby/aggregations/mod.rs b/crates/polars-core/src/frame/groupby/aggregations/mod.rs similarity index 100% rename from polars/polars-core/src/frame/groupby/aggregations/mod.rs rename to crates/polars-core/src/frame/groupby/aggregations/mod.rs diff --git a/polars/polars-core/src/frame/groupby/aggregations/utf8.rs b/crates/polars-core/src/frame/groupby/aggregations/utf8.rs similarity index 100% rename from polars/polars-core/src/frame/groupby/aggregations/utf8.rs rename to crates/polars-core/src/frame/groupby/aggregations/utf8.rs diff --git a/polars/polars-core/src/frame/groupby/expr.rs b/crates/polars-core/src/frame/groupby/expr.rs similarity index 100% rename from polars/polars-core/src/frame/groupby/expr.rs rename to crates/polars-core/src/frame/groupby/expr.rs diff --git a/polars/polars-core/src/frame/groupby/hashing.rs b/crates/polars-core/src/frame/groupby/hashing.rs similarity index 99% rename from polars/polars-core/src/frame/groupby/hashing.rs rename to crates/polars-core/src/frame/groupby/hashing.rs index f73bf22ba99c3..d145f285d1295 100644 --- 
a/polars/polars-core/src/frame/groupby/hashing.rs +++ b/crates/polars-core/src/frame/groupby/hashing.rs @@ -68,7 +68,7 @@ fn finish_group_order(mut out: Vec>, sorted: bool) -> GroupsProxy { items }; out.sort_unstable_by_key(|g| g.0); - let mut idx = GroupsIdx::from_iter(out.into_iter()); + let mut idx = GroupsIdx::from_iter(out); idx.sorted = true; GroupsProxy::Idx(idx) } else { @@ -123,8 +123,7 @@ fn finish_group_order_vecs( // give the compiler some info // maybe it may elide some loop counters assert_eq!(first.len(), all.len()); - for (i, (first, all)) in first.into_iter().zip(all.into_iter()).enumerate() - { + for (i, (first, all)) in first.into_iter().zip(all).enumerate() { std::ptr::write(items_ptr.add(i), (first, all)) } } @@ -136,7 +135,7 @@ fn finish_group_order_vecs( // sort again items.sort_unstable_by_key(|g| g.0); - let mut idx = GroupsIdx::from_iter(items.into_iter()); + let mut idx = GroupsIdx::from_iter(items); idx.sorted = true; GroupsProxy::Idx(idx) } else { diff --git a/polars/polars-core/src/frame/groupby/into_groups.rs b/crates/polars-core/src/frame/groupby/into_groups.rs similarity index 100% rename from polars/polars-core/src/frame/groupby/into_groups.rs rename to crates/polars-core/src/frame/groupby/into_groups.rs diff --git a/polars/polars-core/src/frame/groupby/mod.rs b/crates/polars-core/src/frame/groupby/mod.rs similarity index 99% rename from polars/polars-core/src/frame/groupby/mod.rs rename to crates/polars-core/src/frame/groupby/mod.rs index 47a71cc7b99ad..1617608cfc3a7 100644 --- a/polars/polars-core/src/frame/groupby/mod.rs +++ b/crates/polars-core/src/frame/groupby/mod.rs @@ -782,7 +782,7 @@ impl<'df> GroupBy<'df> { let mut new_cols = Vec::with_capacity(self.selected_keys.len() + agg.len()); new_cols.extend_from_slice(&self.selected_keys); let cols = self.df.select_series(agg)?; - new_cols.extend(cols.into_iter()); + new_cols.extend(cols); Ok(DataFrame::new_no_checks(new_cols)) } } else { diff --git 
a/polars/polars-core/src/frame/groupby/perfect.rs b/crates/polars-core/src/frame/groupby/perfect.rs similarity index 99% rename from polars/polars-core/src/frame/groupby/perfect.rs rename to crates/polars-core/src/frame/groupby/perfect.rs index abfdd8a765ac8..31165cd887a3d 100644 --- a/polars/polars-core/src/frame/groupby/perfect.rs +++ b/crates/polars-core/src/frame/groupby/perfect.rs @@ -190,7 +190,9 @@ where impl CategoricalChunked { // Use the indexes as perfect groups pub fn group_tuples_perfect(&self, multithreaded: bool, sorted: bool) -> GroupsProxy { - let DataType::Categorical(Some(rev_map)) = self.dtype() else { unreachable!()}; + let DataType::Categorical(Some(rev_map)) = self.dtype() else { + unreachable!() + }; if self.is_empty() { return GroupsProxy::Idx(GroupsIdx::new(vec![], vec![], true)); } diff --git a/polars/polars-core/src/frame/groupby/proxy.rs b/crates/polars-core/src/frame/groupby/proxy.rs similarity index 99% rename from polars/polars-core/src/frame/groupby/proxy.rs rename to crates/polars-core/src/frame/groupby/proxy.rs index 3b30c47327c4e..3f1a6eeced65a 100644 --- a/polars/polars-core/src/frame/groupby/proxy.rs +++ b/crates/polars-core/src/frame/groupby/proxy.rs @@ -240,7 +240,7 @@ impl IntoIterator for GroupsIdx { fn into_iter(mut self) -> Self::IntoIter { let first = std::mem::take(&mut self.first); let all = std::mem::take(&mut self.all); - first.into_iter().zip(all.into_iter()) + first.into_iter().zip(all) } } @@ -312,7 +312,7 @@ impl GroupsProxy { match self { GroupsProxy::Idx(groups) => groups, GroupsProxy::Slice { groups, .. } => { - eprintln!("Had to reallocate groups, missed an optimization opportunity. Please open an issue."); + polars_warn!("Had to reallocate groups, missed an optimization opportunity. 
Please open an issue."); groups .iter() .map(|&[first, len]| (first, (first..first + len).collect_trusted::>())) diff --git a/polars/polars-core/src/frame/hash_join/args.rs b/crates/polars-core/src/frame/hash_join/args.rs similarity index 100% rename from polars/polars-core/src/frame/hash_join/args.rs rename to crates/polars-core/src/frame/hash_join/args.rs diff --git a/polars/polars-core/src/frame/hash_join/mod.rs b/crates/polars-core/src/frame/hash_join/mod.rs similarity index 98% rename from polars/polars-core/src/frame/hash_join/mod.rs rename to crates/polars-core/src/frame/hash_join/mod.rs index ec31166685c42..680cceb9f9e43 100644 --- a/polars/polars-core/src/frame/hash_join/mod.rs +++ b/crates/polars-core/src/frame/hash_join/mod.rs @@ -176,19 +176,19 @@ impl DataFrame { &self, join_tuples: &[IdxSize], left_join: bool, - sorted: bool, + sorted_tuple_idx: bool, ) -> DataFrame { if left_join && join_tuples.len() == self.height() { self.clone() } else { // left join tuples are always in ascending order - let sorted = if left_join || sorted { + let sorted = if left_join || sorted_tuple_idx { IsSorted::Ascending } else { IsSorted::Not }; - self._take_unchecked_slice2(join_tuples, true, sorted) + self._take_unchecked_slice_sorted(join_tuples, true, sorted) } } @@ -316,7 +316,7 @@ impl DataFrame { idx = slice_slice(idx, offset, len); } // idx from anti-semi join should always be sorted - self._take_unchecked_slice2(idx, true, IsSorted::Ascending) + self._take_unchecked_slice_sorted(idx, true, IsSorted::Ascending) } #[cfg(feature = "semi_anti_join")] diff --git a/polars/polars-core/src/frame/hash_join/multiple_keys.rs b/crates/polars-core/src/frame/hash_join/multiple_keys.rs similarity index 100% rename from polars/polars-core/src/frame/hash_join/multiple_keys.rs rename to crates/polars-core/src/frame/hash_join/multiple_keys.rs diff --git a/polars/polars-core/src/frame/hash_join/single_keys.rs b/crates/polars-core/src/frame/hash_join/single_keys.rs similarity index 
100% rename from polars/polars-core/src/frame/hash_join/single_keys.rs rename to crates/polars-core/src/frame/hash_join/single_keys.rs diff --git a/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs b/crates/polars-core/src/frame/hash_join/single_keys_dispatch.rs similarity index 100% rename from polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs rename to crates/polars-core/src/frame/hash_join/single_keys_dispatch.rs diff --git a/polars/polars-core/src/frame/hash_join/single_keys_inner.rs b/crates/polars-core/src/frame/hash_join/single_keys_inner.rs similarity index 100% rename from polars/polars-core/src/frame/hash_join/single_keys_inner.rs rename to crates/polars-core/src/frame/hash_join/single_keys_inner.rs diff --git a/polars/polars-core/src/frame/hash_join/single_keys_left.rs b/crates/polars-core/src/frame/hash_join/single_keys_left.rs similarity index 100% rename from polars/polars-core/src/frame/hash_join/single_keys_left.rs rename to crates/polars-core/src/frame/hash_join/single_keys_left.rs diff --git a/polars/polars-core/src/frame/hash_join/single_keys_outer.rs b/crates/polars-core/src/frame/hash_join/single_keys_outer.rs similarity index 100% rename from polars/polars-core/src/frame/hash_join/single_keys_outer.rs rename to crates/polars-core/src/frame/hash_join/single_keys_outer.rs diff --git a/polars/polars-core/src/frame/hash_join/single_keys_semi_anti.rs b/crates/polars-core/src/frame/hash_join/single_keys_semi_anti.rs similarity index 100% rename from polars/polars-core/src/frame/hash_join/single_keys_semi_anti.rs rename to crates/polars-core/src/frame/hash_join/single_keys_semi_anti.rs diff --git a/polars/polars-core/src/frame/hash_join/sort_merge.rs b/crates/polars-core/src/frame/hash_join/sort_merge.rs similarity index 100% rename from polars/polars-core/src/frame/hash_join/sort_merge.rs rename to crates/polars-core/src/frame/hash_join/sort_merge.rs diff --git a/polars/polars-core/src/frame/hash_join/zip_outer.rs 
b/crates/polars-core/src/frame/hash_join/zip_outer.rs similarity index 100% rename from polars/polars-core/src/frame/hash_join/zip_outer.rs rename to crates/polars-core/src/frame/hash_join/zip_outer.rs diff --git a/polars/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs similarity index 99% rename from polars/polars-core/src/frame/mod.rs rename to crates/polars-core/src/frame/mod.rs index a2f24136df32d..626e44496ae92 100644 --- a/polars/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -3282,15 +3282,23 @@ impl DataFrame { DataFrame::new_no_checks(cols) } + /// Take by index values given by the slice `idx`. + /// # Warning /// Be careful with allowing threads when calling this in a large hot loop /// every thread split may be on rayon stack and lead to SO #[doc(hidden)] pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self { - self._take_unchecked_slice2(idx, allow_threads, IsSorted::Not) + self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not) } + /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice` + /// if the index value in `idx` are sorted. This will maintain sorted flags. + /// + /// # Warning + /// Be careful with allowing threads when calling this in a large hot loop + /// every thread split may be on rayon stack and lead to SO #[doc(hidden)] - pub unsafe fn _take_unchecked_slice2( + pub unsafe fn _take_unchecked_slice_sorted( &self, idx: &[IdxSize], allow_threads: bool, @@ -3310,26 +3318,9 @@ impl DataFrame { } } } - let ptr = idx.as_ptr() as *mut IdxSize; - let len = idx.len(); - - // create a temporary vec. we will not drop it. 
- let mut ca = IdxCa::from_vec("", Vec::from_raw_parts(ptr, len, len)); + let mut ca = IdxCa::mmap_slice("", idx); ca.set_sorted_flag(sorted); - let out = self.take_unchecked_impl(&ca, allow_threads); - - // ref count of buffers should be one because we dropped all allocations - let arr = { - let arr_ref = std::mem::take(&mut ca.chunks).pop().unwrap(); - arr_ref - .as_any() - .downcast_ref::>() - .unwrap() - .clone() - }; - // the only owned heap allocation is the `Vec` we created and must not be dropped - let _ = std::mem::ManuallyDrop::new(arr.into_mut().right().unwrap()); - out + self.take_unchecked_impl(&ca, allow_threads) } #[cfg(feature = "partition_by")] @@ -3362,7 +3353,9 @@ impl DataFrame { .into_par_iter() .map(|(_, group)| { // groups are in bounds - unsafe { df._take_unchecked_slice(&group, false) } + unsafe { + df._take_unchecked_slice_sorted(&group, false, IsSorted::Ascending) + } }) .collect()) } diff --git a/polars/polars-core/src/frame/row/av_buffer.rs b/crates/polars-core/src/frame/row/av_buffer.rs similarity index 93% rename from polars/polars-core/src/frame/row/av_buffer.rs rename to crates/polars-core/src/frame/row/av_buffer.rs index ffe67cd05511f..84da014f4b239 100644 --- a/polars/polars-core/src/frame/row/av_buffer.rs +++ b/crates/polars-core/src/frame/row/av_buffer.rs @@ -358,51 +358,73 @@ impl<'a> AnyValueBufferTrusted<'a> { use AnyValueBufferTrusted::*; match self { Boolean(builder) => { - let AnyValue::Boolean(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::Boolean(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } #[cfg(feature = "dtype-i8")] Int8(builder) => { - let AnyValue::Int8(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::Int8(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } #[cfg(feature = "dtype-i16")] Int16(builder) => { - let AnyValue::Int16(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::Int16(v) = val 
else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } Int32(builder) => { - let AnyValue::Int32(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::Int32(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } Int64(builder) => { - let AnyValue::Int64(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::Int64(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } #[cfg(feature = "dtype-u8")] UInt8(builder) => { - let AnyValue::UInt8(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::UInt8(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } #[cfg(feature = "dtype-u16")] UInt16(builder) => { - let AnyValue::UInt16(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::UInt16(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } UInt32(builder) => { - let AnyValue::UInt32(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::UInt32(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } UInt64(builder) => { - let AnyValue::UInt64(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::UInt64(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } Float32(builder) => { - let AnyValue::Float32(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::Float32(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } Float64(builder) => { - let AnyValue::Float64(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::Float64(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(*v) } _ => { @@ -426,12 +448,16 @@ impl<'a> AnyValueBufferTrusted<'a> { _ => { match self { Utf8(builder) => { - let AnyValue::Utf8Owned(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::Utf8Owned(v) = val else { + 
unreachable_unchecked_release!() + }; builder.append_value(v) } #[cfg(feature = "dtype-struct")] Struct(builders) => { - let AnyValue::StructOwned(payload) = val else { unreachable_unchecked_release!() }; + let AnyValue::StructOwned(payload) = val else { + unreachable_unchecked_release!() + }; let avs = &*payload.0; // amortize loop counter for i in 0..avs.len() { @@ -461,12 +487,16 @@ impl<'a> AnyValueBufferTrusted<'a> { _ => { match self { Utf8(builder) => { - let AnyValue::Utf8(v) = val else { unreachable_unchecked_release!() }; + let AnyValue::Utf8(v) = val else { + unreachable_unchecked_release!() + }; builder.append_value(v) } #[cfg(feature = "dtype-struct")] Struct(builders) => { - let AnyValue::Struct(idx, arr, fields) = val else { unreachable_unchecked_release!() }; + let AnyValue::Struct(idx, arr, fields) = val else { + unreachable_unchecked_release!() + }; let arrays = arr.values(); // amortize loop counter for i in 0..fields.len() { diff --git a/polars/polars-core/src/frame/row/dataframe.rs b/crates/polars-core/src/frame/row/dataframe.rs similarity index 100% rename from polars/polars-core/src/frame/row/dataframe.rs rename to crates/polars-core/src/frame/row/dataframe.rs diff --git a/polars/polars-core/src/frame/row/mod.rs b/crates/polars-core/src/frame/row/mod.rs similarity index 100% rename from polars/polars-core/src/frame/row/mod.rs rename to crates/polars-core/src/frame/row/mod.rs diff --git a/polars/polars-core/src/frame/row/transpose.rs b/crates/polars-core/src/frame/row/transpose.rs similarity index 66% rename from polars/polars-core/src/frame/row/transpose.rs rename to crates/polars-core/src/frame/row/transpose.rs index 07c79e1cb7c37..79d7a146c8fd2 100644 --- a/polars/polars-core/src/frame/row/transpose.rs +++ b/crates/polars-core/src/frame/row/transpose.rs @@ -1,25 +1,49 @@ +use std::borrow::Cow; + +use either::Either; + use super::*; impl DataFrame { - pub(crate) fn transpose_from_dtype(&self, dtype: &DataType) -> PolarsResult { + pub(crate) 
fn transpose_from_dtype( + &self, + dtype: &DataType, + keep_names_as: Option<&str>, + names_out: &[String], + ) -> PolarsResult { let new_width = self.height(); let new_height = self.width(); + // Allocate space for the transposed columns, putting the "row names" first if needed + let mut cols_t = match keep_names_as { + None => Vec::::with_capacity(new_width), + Some(name) => { + let mut tmp = Vec::::with_capacity(new_width + 1); + tmp.push(Utf8Chunked::new(name, self.get_column_names()).into()); + tmp + } + }; + let cols = &self.columns; match dtype { #[cfg(feature = "dtype-i8")] - DataType::Int8 => numeric_transpose::(&self.columns), + DataType::Int8 => numeric_transpose::(cols, names_out, &mut cols_t), #[cfg(feature = "dtype-i16")] - DataType::Int16 => numeric_transpose::(&self.columns), - DataType::Int32 => numeric_transpose::(&self.columns), - DataType::Int64 => numeric_transpose::(&self.columns), + DataType::Int16 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::Int32 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::Int64 => numeric_transpose::(cols, names_out, &mut cols_t), #[cfg(feature = "dtype-u8")] - DataType::UInt8 => numeric_transpose::(&self.columns), + DataType::UInt8 => numeric_transpose::(cols, names_out, &mut cols_t), #[cfg(feature = "dtype-u16")] - DataType::UInt16 => numeric_transpose::(&self.columns), - DataType::UInt32 => numeric_transpose::(&self.columns), - DataType::UInt64 => numeric_transpose::(&self.columns), - DataType::Float32 => numeric_transpose::(&self.columns), - DataType::Float64 => numeric_transpose::(&self.columns), + DataType::UInt16 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::UInt32 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::UInt64 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::Float32 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::Float64 => numeric_transpose::(cols, names_out, &mut cols_t), + #[cfg(feature = 
"object")] + DataType::Object(_) => { + // this requires to support `Object` in Series::iter which we don't yet + polars_bail!(InvalidOperation: "Object dtype not supported in 'transpose'") + } _ => { let phys_dtype = dtype.to_physical(); let mut buffers = (0..new_width) @@ -47,27 +71,51 @@ impl DataFrame { } }); } - let cols = buffers - .into_iter() - .enumerate() - .map(|(i, buf)| { - let mut s = buf.into_series().cast(dtype).unwrap(); - s.rename(&format!("column_{i}")); - s - }) - .collect::>(); - Ok(DataFrame::new_no_checks(cols)) + cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| { + let mut s = buf.into_series().cast(dtype).unwrap(); + s.rename(name); + s + })); } - } + }; + Ok(DataFrame::new_no_checks(cols_t)) } /// Transpose a DataFrame. This is a very expensive operation. - pub fn transpose(&self) -> PolarsResult { + pub fn transpose( + &self, + keep_names_as: Option<&str>, + new_col_names: Option>>, + ) -> PolarsResult { + let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column + let names_out = match new_col_names { + None => (0..self.height()).map(|i| format!("column_{i}")).collect(), + Some(cn) => match cn { + Either::Left(name) => { + let new_names = self.column(&name).and_then(|x| x.utf8())?; + polars_ensure!(!new_names.has_validity(), ComputeError: "Column with new names can't have null values"); + df = Cow::Owned(self.drop(&name)?); + new_names + .into_no_null_iter() + .map(|s| s.to_owned()) + .collect() + } + Either::Right(names) => { + polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count"); + names + } + }, + }; + if let Some(cn) = keep_names_as { + // Check that the column name we're using for the original column names is unique before + // wasting time transposing + polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn) + } polars_ensure!( - self.height() != 0 && 
self.width() != 0, + df.height() != 0 && df.width() != 0, NoData: "unable to transpose an empty dataframe" ); - let dtype = self.get_supertype().unwrap()?; + let dtype = df.get_supertype().unwrap()?; match dtype { #[cfg(feature = "dtype-categorical")] DataType::Categorical(_) => { @@ -92,7 +140,7 @@ impl DataFrame { } _ => {} } - self.transpose_from_dtype(&dtype) + df.transpose_from_dtype(&dtype, keep_names_as, &names_out) } } @@ -108,9 +156,12 @@ unsafe fn add_value( *el_ptr.add(row_idx) = value; } -pub(super) fn numeric_transpose(cols: &[Series]) -> PolarsResult +// This just fills a pre-allocated mutable series vector, which may have a name column. +// Nothing is returned and the actual DataFrame is constructed above. +pub(super) fn numeric_transpose(cols: &[Series], names_out: &[String], cols_t: &mut Vec) where T: PolarsNumericType, + //S: AsRef, ChunkedArray: IntoSeries, { let new_width = cols[0].len(); @@ -172,12 +223,12 @@ where }) }); - let series = POOL.install(|| { + cols_t.par_extend(POOL.install(|| { values_buf .into_par_iter() .zip(validity_buf) - .enumerate() - .map(|(i, (mut values, validity))| { + .zip(names_out) + .map(|((mut values, validity), name)| { // Safety: // all values are written we can now set len unsafe { @@ -200,16 +251,12 @@ where values.into(), validity, ); - let name = format!("column_{i}"); unsafe { - ChunkedArray::::from_chunks(&name, vec![Box::new(arr) as ArrayRef]) + ChunkedArray::::from_chunks(name, vec![Box::new(arr) as ArrayRef]) .into_series() } }) - .collect() - }); - - Ok(DataFrame::new_no_checks(series)) + })); } #[cfg(test)] @@ -223,7 +270,7 @@ mod test { "b" => [10, 20, 30], ]?; - let out = df.transpose()?; + let out = df.transpose(None, None)?; let expected = df![ "column_0" => [1, 10], "column_1" => [2, 20], @@ -236,7 +283,7 @@ mod test { "a" => [Some(1), None, Some(3)], "b" => [Some(10), Some(20), None], ]?; - let out = df.transpose()?; + let out = df.transpose(None, None)?; let expected = df![ "column_0" => [1, 10], 
"column_1" => [None, Some(20)], @@ -249,7 +296,7 @@ mod test { "a" => ["a", "b", "c"], "b" => [Some(10), Some(20), None], ]?; - let out = df.transpose()?; + let out = df.transpose(None, None)?; let expected = df![ "column_0" => ["a", "10"], "column_1" => ["b", "20"], diff --git a/polars/polars-core/src/frame/top_k.rs b/crates/polars-core/src/frame/top_k.rs similarity index 98% rename from polars/polars-core/src/frame/top_k.rs rename to crates/polars-core/src/frame/top_k.rs index fbcb8941f918d..b72116821dc9e 100644 --- a/polars/polars-core/src/frame/top_k.rs +++ b/crates/polars-core/src/frame/top_k.rs @@ -33,7 +33,7 @@ impl Ord for CompareRow<'_> { impl PartialOrd for CompareRow<'_> { fn partial_cmp(&self, other: &Self) -> Option { - self.bytes.partial_cmp(other.bytes) + Some(self.cmp(other)) } } diff --git a/polars/polars-core/src/frame/upstream_traits.rs b/crates/polars-core/src/frame/upstream_traits.rs similarity index 100% rename from polars/polars-core/src/frame/upstream_traits.rs rename to crates/polars-core/src/frame/upstream_traits.rs diff --git a/polars/polars-core/src/functions.rs b/crates/polars-core/src/functions.rs similarity index 100% rename from polars/polars-core/src/functions.rs rename to crates/polars-core/src/functions.rs diff --git a/polars/polars-core/src/hashing/fx.rs b/crates/polars-core/src/hashing/fx.rs similarity index 100% rename from polars/polars-core/src/hashing/fx.rs rename to crates/polars-core/src/hashing/fx.rs diff --git a/polars/polars-core/src/hashing/identity.rs b/crates/polars-core/src/hashing/identity.rs similarity index 100% rename from polars/polars-core/src/hashing/identity.rs rename to crates/polars-core/src/hashing/identity.rs diff --git a/polars/polars-core/src/hashing/mod.rs b/crates/polars-core/src/hashing/mod.rs similarity index 100% rename from polars/polars-core/src/hashing/mod.rs rename to crates/polars-core/src/hashing/mod.rs diff --git a/polars/polars-core/src/hashing/partition.rs 
b/crates/polars-core/src/hashing/partition.rs similarity index 100% rename from polars/polars-core/src/hashing/partition.rs rename to crates/polars-core/src/hashing/partition.rs diff --git a/polars/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs similarity index 100% rename from polars/polars-core/src/hashing/vector_hasher.rs rename to crates/polars-core/src/hashing/vector_hasher.rs diff --git a/polars/polars-core/src/lib.rs b/crates/polars-core/src/lib.rs similarity index 94% rename from polars/polars-core/src/lib.rs rename to crates/polars-core/src/lib.rs index c3dfaf9b705d7..42aeb726750f2 100644 --- a/polars/polars-core/src/lib.rs +++ b/crates/polars-core/src/lib.rs @@ -1,6 +1,10 @@ #![cfg_attr(docsrs, feature(doc_auto_cfg))] #![cfg_attr(feature = "simd", feature(portable_simd))] #![allow(ambiguous_glob_reexports)] +#![cfg_attr( + feature = "nightly", + allow(clippy::incorrect_partial_ord_impl_on_ord_type) +)] // remove once stable extern crate core; #[macro_use] diff --git a/polars/polars-core/src/named_from.rs b/crates/polars-core/src/named_from.rs similarity index 100% rename from polars/polars-core/src/named_from.rs rename to crates/polars-core/src/named_from.rs diff --git a/polars/polars-core/src/prelude.rs b/crates/polars-core/src/prelude.rs similarity index 100% rename from polars/polars-core/src/prelude.rs rename to crates/polars-core/src/prelude.rs diff --git a/polars/polars-core/src/schema.rs b/crates/polars-core/src/schema.rs similarity index 100% rename from polars/polars-core/src/schema.rs rename to crates/polars-core/src/schema.rs diff --git a/polars/polars-core/src/serde/chunked_array.rs b/crates/polars-core/src/serde/chunked_array.rs similarity index 98% rename from polars/polars-core/src/serde/chunked_array.rs rename to crates/polars-core/src/serde/chunked_array.rs index bdd6634826f13..24cd6064a0a84 100644 --- a/polars/polars-core/src/serde/chunked_array.rs +++ 
b/crates/polars-core/src/serde/chunked_array.rs @@ -38,7 +38,7 @@ where S: Serializer, { let iter: I = self.iter.borrow_mut().take().unwrap(); - serializer.collect_seq(iter.into_iter()) + serializer.collect_seq(iter) } } diff --git a/polars/polars-core/src/serde/df.rs b/crates/polars-core/src/serde/df.rs similarity index 100% rename from polars/polars-core/src/serde/df.rs rename to crates/polars-core/src/serde/df.rs diff --git a/polars/polars-core/src/serde/mod.rs b/crates/polars-core/src/serde/mod.rs similarity index 100% rename from polars/polars-core/src/serde/mod.rs rename to crates/polars-core/src/serde/mod.rs diff --git a/polars/polars-core/src/serde/series.rs b/crates/polars-core/src/serde/series.rs similarity index 100% rename from polars/polars-core/src/serde/series.rs rename to crates/polars-core/src/serde/series.rs diff --git a/polars/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs similarity index 99% rename from polars/polars-core/src/series/any_value.rs rename to crates/polars-core/src/series/any_value.rs index d0d494650a4be..c9cc0bed24517 100644 --- a/polars/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -65,10 +65,8 @@ fn any_values_to_decimal( } let Some((s_min, s_max)) = scale_range else { // empty array or all nulls, return a decimal array with given scale (or 0 if inferring) - return Ok( - Int128Chunked::full_null("", avs.len()) - .into_decimal_unchecked(precision, scale.unwrap_or(0)) - ); + return Ok(Int128Chunked::full_null("", avs.len()) + .into_decimal_unchecked(precision, scale.unwrap_or(0))); }; let scale = scale.unwrap_or(s_max); if s_max > scale { diff --git a/polars/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs similarity index 100% rename from polars/polars-core/src/series/arithmetic/borrowed.rs rename to crates/polars-core/src/series/arithmetic/borrowed.rs diff --git 
a/polars/polars-core/src/series/arithmetic/mod.rs b/crates/polars-core/src/series/arithmetic/mod.rs similarity index 100% rename from polars/polars-core/src/series/arithmetic/mod.rs rename to crates/polars-core/src/series/arithmetic/mod.rs diff --git a/polars/polars-core/src/series/arithmetic/owned.rs b/crates/polars-core/src/series/arithmetic/owned.rs similarity index 100% rename from polars/polars-core/src/series/arithmetic/owned.rs rename to crates/polars-core/src/series/arithmetic/owned.rs diff --git a/polars/polars-core/src/series/comparison.rs b/crates/polars-core/src/series/comparison.rs similarity index 98% rename from polars/polars-core/src/series/comparison.rs rename to crates/polars-core/src/series/comparison.rs index 7c4a6a6706612..00dfcb9bea768 100644 --- a/polars/polars-core/src/series/comparison.rs +++ b/crates/polars-core/src/series/comparison.rs @@ -1,12 +1,6 @@ //! Comparison operations on Series. -#[cfg(any( - feature = "dtype-duration", - feature = "dtype-datetime", - feature = "dtype-date", - feature = "dtype-time", - feature = "dtype-struct" -))] +#[cfg(feature = "dtype-struct")] use std::ops::Deref; use super::Series; @@ -33,11 +27,6 @@ macro_rules! 
impl_compare { DataType::Int64 => lhs.i64().unwrap().$method(rhs.i64().unwrap()), DataType::Float32 => lhs.f32().unwrap().$method(rhs.f32().unwrap()), DataType::Float64 => lhs.f64().unwrap().$method(rhs.f64().unwrap()), - #[cfg(feature = "dtype-duration")] - DataType::Duration(_) => lhs - .duration() - .unwrap() - .$method(rhs.duration().unwrap().deref()), DataType::List(_) => lhs.list().unwrap().$method(rhs.list().unwrap()), #[cfg(feature = "dtype-struct")] DataType::Struct(_) => lhs diff --git a/polars/polars-core/src/series/from.rs b/crates/polars-core/src/series/from.rs similarity index 99% rename from polars/polars-core/src/series/from.rs rename to crates/polars-core/src/series/from.rs index 6816a2c79bba9..48cf0b5eaf021 100644 --- a/polars/polars-core/src/series/from.rs +++ b/crates/polars-core/src/series/from.rs @@ -603,7 +603,7 @@ fn to_physical_and_dtype(arrays: Vec) -> (Vec, DataType) { )) as ArrayRef; let polars_fields = _fields .iter() - .zip(dtypes.into_iter()) + .zip(dtypes) .map(|(field, dtype)| Field::new(&field.name, dtype)) .collect(); (vec![arrow_array], DataType::Struct(polars_fields)) diff --git a/polars/polars-core/src/series/implementations/array.rs b/crates/polars-core/src/series/implementations/array.rs similarity index 98% rename from polars/polars-core/src/series/implementations/array.rs rename to crates/polars-core/src/series/implementations/array.rs index 1b45dec74e39a..5d0d5c4976709 100644 --- a/polars/polars-core/src/series/implementations/array.rs +++ b/crates/polars-core/src/series/implementations/array.rs @@ -20,6 +20,9 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().data_type() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0.explode_by_offsets(offsets) } diff --git a/polars/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs similarity index 99% 
rename from polars/polars-core/src/series/implementations/binary.rs rename to crates/polars-core/src/series/implementations/binary.rs index 18a5f230c7f8f..deaceeec4c260 100644 --- a/polars/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -24,6 +24,9 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().data_type() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0.explode_by_offsets(offsets) } diff --git a/polars/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs similarity index 99% rename from polars/polars-core/src/series/implementations/boolean.rs rename to crates/polars-core/src/series/implementations/boolean.rs index 979a7cf6b81c7..20c8911189b44 100644 --- a/polars/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -25,6 +25,9 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().data_type() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0.explode_by_offsets(offsets) diff --git a/polars/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs similarity index 99% rename from polars/polars-core/src/series/implementations/categorical.rs rename to crates/polars-core/src/series/implementations/categorical.rs index 939331d075a52..3ca6044e63e4c 100644 --- a/polars/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -64,6 +64,9 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.dtype() } + fn _clear_settings(&mut self) { + self.0.logical_mut().clear_settings() + } fn 
explode_by_offsets(&self, offsets: &[i64]) -> Series { // TODO! explode by offset should return concrete type diff --git a/polars/polars-core/src/series/implementations/dates_time.rs b/crates/polars-core/src/series/implementations/dates_time.rs similarity index 99% rename from polars/polars-core/src/series/implementations/dates_time.rs rename to crates/polars-core/src/series/implementations/dates_time.rs index ebc4bed623fdc..d46e2eafbccfa 100644 --- a/polars/polars-core/src/series/implementations/dates_time.rs +++ b/crates/polars-core/src/series/implementations/dates_time.rs @@ -39,6 +39,9 @@ macro_rules! impl_dyn_series { fn _dtype(&self) -> &DataType { self.0.dtype() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0 diff --git a/polars/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs similarity index 99% rename from polars/polars-core/src/series/implementations/datetime.rs rename to crates/polars-core/src/series/implementations/datetime.rs index 82bdfac5fdbfe..ec27373898838 100644 --- a/polars/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -35,6 +35,9 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.dtype() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0 diff --git a/polars/polars-core/src/series/implementations/decimal.rs b/crates/polars-core/src/series/implementations/decimal.rs similarity index 99% rename from polars/polars-core/src/series/implementations/decimal.rs rename to crates/polars-core/src/series/implementations/decimal.rs index 2f054673d8646..bb85cb60a241d 100644 --- a/polars/polars-core/src/series/implementations/decimal.rs +++ b/crates/polars-core/src/series/implementations/decimal.rs @@ -51,6 +51,9 @@ impl 
private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.dtype() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } #[cfg(feature = "zip_with")] fn zip_with_same_type(&self, mask: &BooleanChunked, other: &Series) -> PolarsResult { diff --git a/polars/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs similarity index 99% rename from polars/polars-core/src/series/implementations/duration.rs rename to crates/polars-core/src/series/implementations/duration.rs index 0e0ccfc542ca1..c827286531323 100644 --- a/polars/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -36,6 +36,9 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.dtype() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0 diff --git a/polars/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs similarity index 99% rename from polars/polars-core/src/series/implementations/floats.rs rename to crates/polars-core/src/series/implementations/floats.rs index 1b3d454907696..b3d231abeb333 100644 --- a/polars/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -29,6 +29,9 @@ macro_rules! 
impl_dyn_series { fn _dtype(&self) -> &DataType { self.0.ref_field().data_type() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0.explode_by_offsets(offsets) diff --git a/polars/polars-core/src/series/implementations/list.rs b/crates/polars-core/src/series/implementations/list.rs similarity index 98% rename from polars/polars-core/src/series/implementations/list.rs rename to crates/polars-core/src/series/implementations/list.rs index 17391d6e70504..8e104d7e17e0d 100644 --- a/polars/polars-core/src/series/implementations/list.rs +++ b/crates/polars-core/src/series/implementations/list.rs @@ -20,6 +20,9 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().data_type() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0.explode_by_offsets(offsets) } diff --git a/polars/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs similarity index 99% rename from polars/polars-core/src/series/implementations/mod.rs rename to crates/polars-core/src/series/implementations/mod.rs index 0c6a105dfbabf..bf45273a0e963 100644 --- a/polars/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -91,6 +91,10 @@ macro_rules! 
impl_dyn_series { self.0.ref_field().data_type() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } + fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0.explode_by_offsets(offsets) } diff --git a/polars/polars-core/src/series/implementations/null.rs b/crates/polars-core/src/series/implementations/null.rs similarity index 98% rename from polars/polars-core/src/series/implementations/null.rs rename to crates/polars-core/src/series/implementations/null.rs index 41d115663ca73..bb70f7187a240 100644 --- a/polars/polars-core/src/series/implementations/null.rs +++ b/crates/polars-core/src/series/implementations/null.rs @@ -48,6 +48,9 @@ impl PrivateSeries for NullChunked { fn _field(&self) -> Cow { Cow::Owned(Field::new(self.name(), DataType::Null)) } + fn _clear_settings(&mut self) { + // no-op + } fn _dtype(&self) -> &DataType { &DataType::Null diff --git a/polars/polars-core/src/series/implementations/object.rs b/crates/polars-core/src/series/implementations/object.rs similarity index 99% rename from polars/polars-core/src/series/implementations/object.rs rename to crates/polars-core/src/series/implementations/object.rs index 7b6c069ab5ed9..227c92db17c0d 100644 --- a/polars/polars-core/src/series/implementations/object.rs +++ b/crates/polars-core/src/series/implementations/object.rs @@ -38,6 +38,9 @@ where fn _dtype(&self) -> &DataType { self.0.dtype() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { self.0.agg_list(groups) diff --git a/polars/polars-core/src/series/implementations/struct_.rs b/crates/polars-core/src/series/implementations/struct_.rs similarity index 99% rename from polars/polars-core/src/series/implementations/struct_.rs rename to crates/polars-core/src/series/implementations/struct_.rs index 87cedae6d3ec4..a26ba0e23db7f 100644 --- a/polars/polars-core/src/series/implementations/struct_.rs +++ 
b/crates/polars-core/src/series/implementations/struct_.rs @@ -25,6 +25,9 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().data_type() } + fn _clear_settings(&mut self) { + // no-op + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0 .apply_fields(|s| s.explode_by_offsets(offsets)) diff --git a/polars/polars-core/src/series/implementations/utf8.rs b/crates/polars-core/src/series/implementations/utf8.rs similarity index 99% rename from polars/polars-core/src/series/implementations/utf8.rs rename to crates/polars-core/src/series/implementations/utf8.rs index a0fa2aac4cf1b..11e341b8d4566 100644 --- a/polars/polars-core/src/series/implementations/utf8.rs +++ b/crates/polars-core/src/series/implementations/utf8.rs @@ -24,6 +24,9 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().data_type() } + fn _clear_settings(&mut self) { + self.0.clear_settings() + } fn explode_by_offsets(&self, offsets: &[i64]) -> Series { self.0.explode_by_offsets(offsets) } diff --git a/polars/polars-core/src/series/into.rs b/crates/polars-core/src/series/into.rs similarity index 100% rename from polars/polars-core/src/series/into.rs rename to crates/polars-core/src/series/into.rs diff --git a/polars/polars-core/src/series/iterator.rs b/crates/polars-core/src/series/iterator.rs similarity index 94% rename from polars/polars-core/src/series/iterator.rs rename to crates/polars-core/src/series/iterator.rs index c774479654351..b11bc45bf8d50 100644 --- a/polars/polars-core/src/series/iterator.rs +++ b/crates/polars-core/src/series/iterator.rs @@ -66,6 +66,11 @@ impl Series { /// This will panic if the array is not rechunked first. 
pub fn iter(&self) -> SeriesIter<'_> { let dtype = self.dtype(); + #[cfg(feature = "object")] + assert!( + !matches!(dtype, DataType::Object(_)), + "object dtype not supported in Series.iter" + ); assert_eq!(self.chunks().len(), 1, "impl error"); let arr = &*self.chunks()[0]; let len = arr.len(); @@ -83,6 +88,11 @@ impl Series { assert_eq!(dtype, &phys_dtype, "impl error"); assert_eq!(self.chunks().len(), 1, "impl error"); + #[cfg(feature = "object")] + assert!( + !matches!(dtype, DataType::Object(_)), + "object dtype not supported in Series.iter" + ); let arr = &*self.chunks()[0]; if phys_dtype.is_numeric() { diff --git a/polars/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs similarity index 99% rename from polars/polars-core/src/series/mod.rs rename to crates/polars-core/src/series/mod.rs index 3638fcc1d3433..5ecf831844e59 100644 --- a/polars/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -197,6 +197,11 @@ impl Series { inner._set_sorted_flag(sorted) } + pub(crate) fn clear_settings(&mut self) { + let inner = self._get_inner_mut(); + inner._clear_settings() + } + pub fn into_frame(self) -> DataFrame { DataFrame::new_no_checks(vec![self]) } diff --git a/polars/polars-core/src/series/ops/diff.rs b/crates/polars-core/src/series/ops/diff.rs similarity index 100% rename from polars/polars-core/src/series/ops/diff.rs rename to crates/polars-core/src/series/ops/diff.rs diff --git a/polars/polars-core/src/series/ops/downcast.rs b/crates/polars-core/src/series/ops/downcast.rs similarity index 100% rename from polars/polars-core/src/series/ops/downcast.rs rename to crates/polars-core/src/series/ops/downcast.rs diff --git a/polars/polars-core/src/series/ops/ewm.rs b/crates/polars-core/src/series/ops/ewm.rs similarity index 100% rename from polars/polars-core/src/series/ops/ewm.rs rename to crates/polars-core/src/series/ops/ewm.rs diff --git a/polars/polars-core/src/series/ops/extend.rs 
b/crates/polars-core/src/series/ops/extend.rs similarity index 100% rename from polars/polars-core/src/series/ops/extend.rs rename to crates/polars-core/src/series/ops/extend.rs diff --git a/polars/polars-core/src/series/ops/mod.rs b/crates/polars-core/src/series/ops/mod.rs similarity index 100% rename from polars/polars-core/src/series/ops/mod.rs rename to crates/polars-core/src/series/ops/mod.rs diff --git a/polars/polars-core/src/series/ops/moment.rs b/crates/polars-core/src/series/ops/moment.rs similarity index 100% rename from polars/polars-core/src/series/ops/moment.rs rename to crates/polars-core/src/series/ops/moment.rs diff --git a/polars/polars-core/src/series/ops/null.rs b/crates/polars-core/src/series/ops/null.rs similarity index 100% rename from polars/polars-core/src/series/ops/null.rs rename to crates/polars-core/src/series/ops/null.rs diff --git a/polars/polars-core/src/series/ops/pct_change.rs b/crates/polars-core/src/series/ops/pct_change.rs similarity index 100% rename from polars/polars-core/src/series/ops/pct_change.rs rename to crates/polars-core/src/series/ops/pct_change.rs diff --git a/polars/polars-core/src/series/ops/round.rs b/crates/polars-core/src/series/ops/round.rs similarity index 100% rename from polars/polars-core/src/series/ops/round.rs rename to crates/polars-core/src/series/ops/round.rs diff --git a/polars/polars-core/src/series/ops/to_list.rs b/crates/polars-core/src/series/ops/to_list.rs similarity index 100% rename from polars/polars-core/src/series/ops/to_list.rs rename to crates/polars-core/src/series/ops/to_list.rs diff --git a/polars/polars-core/src/series/ops/unique.rs b/crates/polars-core/src/series/ops/unique.rs similarity index 100% rename from polars/polars-core/src/series/ops/unique.rs rename to crates/polars-core/src/series/ops/unique.rs diff --git a/polars/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs similarity index 99% rename from 
polars/polars-core/src/series/series_trait.rs rename to crates/polars-core/src/series/series_trait.rs index 5a5871720861a..d53c0fabd6ade 100644 --- a/polars/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -77,6 +77,8 @@ pub(crate) mod private { fn _dtype(&self) -> &DataType; + fn _clear_settings(&mut self); + fn compute_len(&mut self); fn explode_by_offsets(&self, _offsets: &[i64]) -> Series { diff --git a/polars/polars-core/src/series/unstable.rs b/crates/polars-core/src/series/unstable.rs similarity index 100% rename from polars/polars-core/src/series/unstable.rs rename to crates/polars-core/src/series/unstable.rs diff --git a/polars/polars-core/src/testing.rs b/crates/polars-core/src/testing.rs similarity index 100% rename from polars/polars-core/src/testing.rs rename to crates/polars-core/src/testing.rs diff --git a/polars/polars-core/src/tests.rs b/crates/polars-core/src/tests.rs similarity index 100% rename from polars/polars-core/src/tests.rs rename to crates/polars-core/src/tests.rs diff --git a/polars/polars-core/src/utils/flatten.rs b/crates/polars-core/src/utils/flatten.rs similarity index 100% rename from polars/polars-core/src/utils/flatten.rs rename to crates/polars-core/src/utils/flatten.rs diff --git a/polars/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs similarity index 100% rename from polars/polars-core/src/utils/mod.rs rename to crates/polars-core/src/utils/mod.rs diff --git a/polars/polars-core/src/utils/series.rs b/crates/polars-core/src/utils/series.rs similarity index 100% rename from polars/polars-core/src/utils/series.rs rename to crates/polars-core/src/utils/series.rs diff --git a/polars/polars-core/src/utils/supertype.rs b/crates/polars-core/src/utils/supertype.rs similarity index 100% rename from polars/polars-core/src/utils/supertype.rs rename to crates/polars-core/src/utils/supertype.rs diff --git a/polars/polars-error/Cargo.toml b/crates/polars-error/Cargo.toml 
similarity index 100% rename from polars/polars-error/Cargo.toml rename to crates/polars-error/Cargo.toml diff --git a/polars/polars-error/LICENSE b/crates/polars-error/LICENSE similarity index 100% rename from polars/polars-error/LICENSE rename to crates/polars-error/LICENSE diff --git a/polars/polars-error/README.md b/crates/polars-error/README.md similarity index 100% rename from polars/polars-error/README.md rename to crates/polars-error/README.md diff --git a/polars/polars-error/src/lib.rs b/crates/polars-error/src/lib.rs similarity index 100% rename from polars/polars-error/src/lib.rs rename to crates/polars-error/src/lib.rs diff --git a/polars/polars-error/src/warning.rs b/crates/polars-error/src/warning.rs similarity index 100% rename from polars/polars-error/src/warning.rs rename to crates/polars-error/src/warning.rs diff --git a/polars/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml similarity index 100% rename from polars/polars-io/Cargo.toml rename to crates/polars-io/Cargo.toml diff --git a/polars/polars-io/LICENSE b/crates/polars-io/LICENSE similarity index 100% rename from polars/polars-io/LICENSE rename to crates/polars-io/LICENSE diff --git a/polars/polars-io/README.md b/crates/polars-io/README.md similarity index 100% rename from polars/polars-io/README.md rename to crates/polars-io/README.md diff --git a/polars/polars-io/src/avro/mod.rs b/crates/polars-io/src/avro/mod.rs similarity index 100% rename from polars/polars-io/src/avro/mod.rs rename to crates/polars-io/src/avro/mod.rs diff --git a/polars/polars-io/src/avro/read.rs b/crates/polars-io/src/avro/read.rs similarity index 100% rename from polars/polars-io/src/avro/read.rs rename to crates/polars-io/src/avro/read.rs diff --git a/polars/polars-io/src/avro/write.rs b/crates/polars-io/src/avro/write.rs similarity index 100% rename from polars/polars-io/src/avro/write.rs rename to crates/polars-io/src/avro/write.rs diff --git a/polars/polars-io/src/cloud/adaptors.rs 
b/crates/polars-io/src/cloud/adaptors.rs similarity index 100% rename from polars/polars-io/src/cloud/adaptors.rs rename to crates/polars-io/src/cloud/adaptors.rs diff --git a/polars/polars-io/src/cloud/glob.rs b/crates/polars-io/src/cloud/glob.rs similarity index 100% rename from polars/polars-io/src/cloud/glob.rs rename to crates/polars-io/src/cloud/glob.rs diff --git a/polars/polars-io/src/cloud/mod.rs b/crates/polars-io/src/cloud/mod.rs similarity index 100% rename from polars/polars-io/src/cloud/mod.rs rename to crates/polars-io/src/cloud/mod.rs diff --git a/polars/polars-io/src/csv/buffer.rs b/crates/polars-io/src/csv/buffer.rs similarity index 100% rename from polars/polars-io/src/csv/buffer.rs rename to crates/polars-io/src/csv/buffer.rs diff --git a/polars/polars-io/src/csv/mod.rs b/crates/polars-io/src/csv/mod.rs similarity index 100% rename from polars/polars-io/src/csv/mod.rs rename to crates/polars-io/src/csv/mod.rs diff --git a/polars/polars-io/src/csv/parser.rs b/crates/polars-io/src/csv/parser.rs similarity index 100% rename from polars/polars-io/src/csv/parser.rs rename to crates/polars-io/src/csv/parser.rs diff --git a/polars/polars-io/src/csv/read.rs b/crates/polars-io/src/csv/read.rs similarity index 99% rename from polars/polars-io/src/csv/read.rs rename to crates/polars-io/src/csv/read.rs index dfbb43d4ec9ff..2c3fd81128b1a 100644 --- a/polars/polars-io/src/csv/read.rs +++ b/crates/polars-io/src/csv/read.rs @@ -323,7 +323,7 @@ impl<'a> CsvReader<'a, File> { /// This is the recommended way to create a csv reader as this allows for fastest parsing. 
pub fn from_path>(path: P) -> PolarsResult { let path = resolve_homedir(&path.into()); - let f = std::fs::File::open(&path)?; + let f = polars_utils::open_file(&path)?; Ok(Self::new(f).with_path(Some(path))) } } diff --git a/polars/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs similarity index 100% rename from polars/polars-io/src/csv/read_impl/batched_mmap.rs rename to crates/polars-io/src/csv/read_impl/batched_mmap.rs diff --git a/polars/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs similarity index 99% rename from polars/polars-io/src/csv/read_impl/batched_read.rs rename to crates/polars-io/src/csv/read_impl/batched_read.rs index f919bafd191a8..47bef7240ec74 100644 --- a/polars/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -192,7 +192,9 @@ impl<'a> CoreReader<'a> { pub fn batched_read(mut self, _has_cat: bool) -> PolarsResult> { let reader_bytes = self.reader_bytes.take().unwrap(); - let ReaderBytes::Mapped(bytes, mut file) = &reader_bytes else { unreachable!() }; + let ReaderBytes::Mapped(bytes, mut file) = &reader_bytes else { + unreachable!() + }; let (_, starting_point_offset) = self.find_starting_point(bytes, self.quote_char, self.eol_char)?; if let Some(starting_point_offset) = starting_point_offset { diff --git a/polars/polars-io/src/csv/read_impl/mod.rs b/crates/polars-io/src/csv/read_impl/mod.rs similarity index 100% rename from polars/polars-io/src/csv/read_impl/mod.rs rename to crates/polars-io/src/csv/read_impl/mod.rs diff --git a/polars/polars-io/src/csv/splitfields.rs b/crates/polars-io/src/csv/splitfields.rs similarity index 100% rename from polars/polars-io/src/csv/splitfields.rs rename to crates/polars-io/src/csv/splitfields.rs diff --git a/polars/polars-io/src/csv/utils.rs b/crates/polars-io/src/csv/utils.rs similarity index 95% rename from polars/polars-io/src/csv/utils.rs rename to 
crates/polars-io/src/csv/utils.rs index 84e8a920e5854..ba5c3259a91ee 100644 --- a/polars/polars-io/src/csv/utils.rs +++ b/crates/polars-io/src/csv/utils.rs @@ -175,17 +175,8 @@ pub(crate) fn parse_bytes_with_encoding( }) } -/// Infer the schema of a CSV file by reading through the first n rows of the file, -/// with `max_read_rows` controlling the maximum number of rows to read. -/// -/// If `max_read_rows` is not set, the whole file is read to infer its schema. -/// -/// Returns -/// - inferred schema -/// - number of rows used for inference. -/// - bytes read #[allow(clippy::too_many_arguments)] -pub fn infer_file_schema( +pub fn infer_file_schema_inner( reader_bytes: &ReaderBytes, delimiter: u8, max_read_rows: Option, @@ -200,6 +191,7 @@ pub fn infer_file_schema( eol_char: u8, null_values: Option<&NullValues>, try_parse_dates: bool, + recursion_count: u8, ) -> PolarsResult<(Schema, usize, usize)> { // keep track so that we can determine the amount of bytes read let start_ptr = reader_bytes.as_ptr() as usize; @@ -288,14 +280,14 @@ pub fn infer_file_schema( } column_names } - } else if has_header && !bytes.is_empty() { + } else if has_header && !bytes.is_empty() && recursion_count == 0 { // there was no new line char. So we copy the whole buf and add one // this is likely to be cheap as there are no rows. 
let mut buf = Vec::with_capacity(bytes.len() + 2); buf.extend_from_slice(bytes); buf.push(eol_char); - return infer_file_schema( + return infer_file_schema_inner( &ReaderBytes::Owned(buf), delimiter, max_read_rows, @@ -308,6 +300,7 @@ pub fn infer_file_schema( eol_char, null_values, try_parse_dates, + recursion_count + 1, ); } else { polars_bail!(NoData: "empty CSV"); @@ -473,11 +466,11 @@ pub fn infer_file_schema( // if there is a single line after the header without an eol // we copy the bytes add an eol and rerun this function // so that the inference is consistent with and without eol char - if rows_count == 0 && reader_bytes[reader_bytes.len() - 1] != eol_char { + if rows_count == 0 && reader_bytes[reader_bytes.len() - 1] != eol_char && recursion_count == 0 { let mut rb = Vec::with_capacity(reader_bytes.len() + 1); rb.extend_from_slice(reader_bytes); rb.push(eol_char); - return infer_file_schema( + return infer_file_schema_inner( &ReaderBytes::Owned(rb), delimiter, max_read_rows, @@ -490,12 +483,56 @@ pub fn infer_file_schema( eol_char, null_values, try_parse_dates, + recursion_count + 1, ); } Ok((Schema::from_iter(fields), rows_count, end_ptr - start_ptr)) } +/// Infer the schema of a CSV file by reading through the first n rows of the file, +/// with `max_read_rows` controlling the maximum number of rows to read. +/// +/// If `max_read_rows` is not set, the whole file is read to infer its schema. +/// +/// Returns +/// - inferred schema +/// - number of rows used for inference. 
+/// - bytes read +#[allow(clippy::too_many_arguments)] +pub fn infer_file_schema( + reader_bytes: &ReaderBytes, + delimiter: u8, + max_read_rows: Option, + has_header: bool, + schema_overwrite: Option<&Schema>, + // we take &mut because we maybe need to skip more rows dependent + // on the schema inference + skip_rows: &mut usize, + skip_rows_after_header: usize, + comment_char: Option, + quote_char: Option, + eol_char: u8, + null_values: Option<&NullValues>, + try_parse_dates: bool, +) -> PolarsResult<(Schema, usize, usize)> { + infer_file_schema_inner( + reader_bytes, + delimiter, + max_read_rows, + has_header, + schema_overwrite, + skip_rows, + skip_rows_after_header, + comment_char, + quote_char, + eol_char, + null_values, + try_parse_dates, + 0, + ) +} + // magic numbers const GZIP: [u8; 2] = [31, 139]; const ZLIB0: [u8; 2] = [0x78, 0x01]; diff --git a/polars/polars-io/src/csv/write.rs b/crates/polars-io/src/csv/write.rs similarity index 100% rename from polars/polars-io/src/csv/write.rs rename to crates/polars-io/src/csv/write.rs diff --git a/polars/polars-io/src/csv/write_impl.rs b/crates/polars-io/src/csv/write_impl.rs similarity index 100% rename from polars/polars-io/src/csv/write_impl.rs rename to crates/polars-io/src/csv/write_impl.rs diff --git a/polars/polars-io/src/export.rs b/crates/polars-io/src/export.rs similarity index 100% rename from polars/polars-io/src/export.rs rename to crates/polars-io/src/export.rs diff --git a/polars/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs similarity index 100% rename from polars/polars-io/src/ipc/ipc_file.rs rename to crates/polars-io/src/ipc/ipc_file.rs diff --git a/polars/polars-io/src/ipc/ipc_stream.rs b/crates/polars-io/src/ipc/ipc_stream.rs similarity index 100% rename from polars/polars-io/src/ipc/ipc_stream.rs rename to crates/polars-io/src/ipc/ipc_stream.rs diff --git a/polars/polars-io/src/ipc/mmap.rs b/crates/polars-io/src/ipc/mmap.rs similarity index 100% rename from 
polars/polars-io/src/ipc/mmap.rs rename to crates/polars-io/src/ipc/mmap.rs diff --git a/polars/polars-io/src/ipc/mod.rs b/crates/polars-io/src/ipc/mod.rs similarity index 100% rename from polars/polars-io/src/ipc/mod.rs rename to crates/polars-io/src/ipc/mod.rs diff --git a/polars/polars-io/src/ipc/write.rs b/crates/polars-io/src/ipc/write.rs similarity index 100% rename from polars/polars-io/src/ipc/write.rs rename to crates/polars-io/src/ipc/write.rs diff --git a/polars/polars-io/src/ipc/write_async.rs b/crates/polars-io/src/ipc/write_async.rs similarity index 100% rename from polars/polars-io/src/ipc/write_async.rs rename to crates/polars-io/src/ipc/write_async.rs diff --git a/polars/polars-io/src/json/mod.rs b/crates/polars-io/src/json/mod.rs similarity index 98% rename from polars/polars-io/src/json/mod.rs rename to crates/polars-io/src/json/mod.rs index 79a246d5637be..37055dedd2be8 100644 --- a/polars/polars-io/src/json/mod.rs +++ b/crates/polars-io/src/json/mod.rs @@ -243,8 +243,8 @@ where let dtype = infer(&json_value)?; if let Some(overwrite) = self.schema_overwrite { let ArrowDataType::Struct(fields) = dtype else { - polars_bail!(ComputeError: "can only deserialize json objects") - }; + polars_bail!(ComputeError: "can only deserialize json objects") + }; let mut schema = Schema::from_iter(fields.iter()); overwrite_schema(&mut schema, overwrite)?; diff --git a/polars/polars-io/src/lib.rs b/crates/polars-io/src/lib.rs similarity index 100% rename from polars/polars-io/src/lib.rs rename to crates/polars-io/src/lib.rs diff --git a/polars/polars-io/src/mmap.rs b/crates/polars-io/src/mmap.rs similarity index 100% rename from polars/polars-io/src/mmap.rs rename to crates/polars-io/src/mmap.rs diff --git a/polars/polars-io/src/ndjson/buffer.rs b/crates/polars-io/src/ndjson/buffer.rs similarity index 99% rename from polars/polars-io/src/ndjson/buffer.rs rename to crates/polars-io/src/ndjson/buffer.rs index 654190777ea5c..c4bf434ce1d39 100644 --- 
a/polars/polars-io/src/ndjson/buffer.rs +++ b/crates/polars-io/src/ndjson/buffer.rs @@ -190,7 +190,7 @@ fn deserialize_all<'a>( Value::Array(arr) => { let Some(inner_dtype) = dtype.inner_dtype() else { if ignore_errors { - return Ok(AnyValue::Null) + return Ok(AnyValue::Null); } polars_bail!(ComputeError: "expected list/array in json value, got {}", dtype); }; diff --git a/polars/polars-io/src/ndjson/core.rs b/crates/polars-io/src/ndjson/core.rs similarity index 96% rename from polars/polars-io/src/ndjson/core.rs rename to crates/polars-io/src/ndjson/core.rs index 9417967e1e0ed..e575264d901a0 100644 --- a/polars/polars-io/src/ndjson/core.rs +++ b/crates/polars-io/src/ndjson/core.rs @@ -92,7 +92,7 @@ impl<'a> JsonLineReader<'a, File> { /// This is the recommended way to create a json reader as this allows for fastest parsing. pub fn from_path>(path: P) -> PolarsResult { let path = resolve_homedir(&path.into()); - let f = std::fs::File::open(&path)?; + let f = polars_utils::open_file(&path)?; Ok(Self::new(f).with_path(Some(path))) } } @@ -305,11 +305,18 @@ fn parse_lines(bytes: &[u8], buffers: &mut PlIndexMap) -> Pol // The `RawValue` is a pointer to the original JSON string and does not perform any deserialization. // It is used to properly iterate over the lines without re-implementing the splitlines logic when this does the same thing. 
- let mut iter = + let iter = serde_json::Deserializer::from_slice(bytes).into_iter::>(); - while let Some(Ok(value)) = iter.next() { - let bytes = value.get().as_bytes(); - parse_impl(bytes, buffers, &mut buf)?; + for value_result in iter { + match value_result { + Ok(value) => { + let bytes = value.get().as_bytes(); + parse_impl(bytes, buffers, &mut buf)?; + } + Err(e) => { + polars_bail!(ComputeError: "error parsing ndjson {}", e) + } + } } Ok(()) } diff --git a/polars/polars-io/src/ndjson/mod.rs b/crates/polars-io/src/ndjson/mod.rs similarity index 100% rename from polars/polars-io/src/ndjson/mod.rs rename to crates/polars-io/src/ndjson/mod.rs diff --git a/polars/polars-io/src/options.rs b/crates/polars-io/src/options.rs similarity index 100% rename from polars/polars-io/src/options.rs rename to crates/polars-io/src/options.rs diff --git a/polars/polars-io/src/parquet/async_impl.rs b/crates/polars-io/src/parquet/async_impl.rs similarity index 100% rename from polars/polars-io/src/parquet/async_impl.rs rename to crates/polars-io/src/parquet/async_impl.rs diff --git a/polars/polars-io/src/parquet/mmap.rs b/crates/polars-io/src/parquet/mmap.rs similarity index 100% rename from polars/polars-io/src/parquet/mmap.rs rename to crates/polars-io/src/parquet/mmap.rs diff --git a/polars/polars-io/src/parquet/mod.rs b/crates/polars-io/src/parquet/mod.rs similarity index 97% rename from polars/polars-io/src/parquet/mod.rs rename to crates/polars-io/src/parquet/mod.rs index bb96c55da8bcf..78ac26ea78bab 100644 --- a/polars/polars-io/src/parquet/mod.rs +++ b/crates/polars-io/src/parquet/mod.rs @@ -29,7 +29,6 @@ use super::*; #[cfg(test)] mod test { - use std::fs::File; use std::io::Cursor; use polars_core::df; @@ -40,7 +39,7 @@ mod test { #[test] fn test_parquet() { // In CI: This test will be skipped because the file does not exist. 
- if let Ok(r) = File::open("data/simple.parquet") { + if let Ok(r) = polars_utils::open_file("data/simple.parquet") { let reader = ParquetReader::new(r); let df = reader.finish().unwrap(); assert_eq!(df.get_column_names(), ["a", "b"]); diff --git a/polars/polars-io/src/parquet/predicates.rs b/crates/polars-io/src/parquet/predicates.rs similarity index 100% rename from polars/polars-io/src/parquet/predicates.rs rename to crates/polars-io/src/parquet/predicates.rs diff --git a/polars/polars-io/src/parquet/read.rs b/crates/polars-io/src/parquet/read.rs similarity index 100% rename from polars/polars-io/src/parquet/read.rs rename to crates/polars-io/src/parquet/read.rs diff --git a/polars/polars-io/src/parquet/read_impl.rs b/crates/polars-io/src/parquet/read_impl.rs similarity index 99% rename from polars/polars-io/src/parquet/read_impl.rs rename to crates/polars-io/src/parquet/read_impl.rs index 683e48fe7b63a..cac8d13a5805c 100644 --- a/polars/polars-io/src/parquet/read_impl.rs +++ b/crates/polars-io/src/parquet/read_impl.rs @@ -324,7 +324,7 @@ pub fn read_parquet( }; Ok(arrow_schema_to_empty_df(&schema)) } else { - accumulate_dataframes_vertical(dfs.into_iter()) + accumulate_dataframes_vertical(dfs) } } diff --git a/polars/polars-io/src/parquet/write.rs b/crates/polars-io/src/parquet/write.rs similarity index 100% rename from polars/polars-io/src/parquet/write.rs rename to crates/polars-io/src/parquet/write.rs diff --git a/polars/polars-io/src/partition.rs b/crates/polars-io/src/partition.rs similarity index 93% rename from polars/polars-io/src/partition.rs rename to crates/polars-io/src/partition.rs index 7270f4fcdcc54..1af01cedf466e 100644 --- a/polars/polars-io/src/partition.rs +++ b/crates/polars-io/src/partition.rs @@ -3,6 +3,7 @@ use std::io::BufWriter; use std::path::{Path, PathBuf}; use polars_core::prelude::*; +use polars_core::series::IsSorted; use polars_core::POOL; use rayon::prelude::*; @@ -102,7 +103,10 @@ where .enumerate() .map(|(i, (_, group))| { // 
groups are in bounds - let mut part_df = unsafe { df._take_unchecked_slice(group, false) }; + // and sorted + let mut part_df = unsafe { + df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending) + }; self.write_partition_df(&mut part_df, i) }) .collect::>>() @@ -156,7 +160,7 @@ mod test { let expected: Vec<(PathBuf, DataFrame)> = ["a=1/b=2", "a=2/b=3", "a=3/b=4"] .into_iter() - .zip(expected_dfs.into_iter()) + .zip(expected_dfs) .map(|(p, df)| (rootdir.join(p), df)) .collect(); @@ -171,7 +175,7 @@ mod test { .collect::>>()?; assert_eq!(ipc_paths.len(), 1); - let reader = BufReader::new(std::fs::File::open(&ipc_paths[0])?); + let reader = BufReader::new(polars_utils::open_file(&ipc_paths[0])?); let df = IpcReader::new(reader).finish()?; assert!(expected_df.frame_equal(&df)); } diff --git a/polars/polars-io/src/predicates.rs b/crates/polars-io/src/predicates.rs similarity index 100% rename from polars/polars-io/src/predicates.rs rename to crates/polars-io/src/predicates.rs diff --git a/polars/polars-io/src/prelude.rs b/crates/polars-io/src/prelude.rs similarity index 100% rename from polars/polars-io/src/prelude.rs rename to crates/polars-io/src/prelude.rs diff --git a/polars/polars-io/src/tests.rs b/crates/polars-io/src/tests.rs similarity index 100% rename from polars/polars-io/src/tests.rs rename to crates/polars-io/src/tests.rs diff --git a/polars/polars-io/src/utils.rs b/crates/polars-io/src/utils.rs similarity index 100% rename from polars/polars-io/src/utils.rs rename to crates/polars-io/src/utils.rs diff --git a/polars/polars-json/Cargo.toml b/crates/polars-json/Cargo.toml similarity index 100% rename from polars/polars-json/Cargo.toml rename to crates/polars-json/Cargo.toml diff --git a/polars/polars-json/LICENSE b/crates/polars-json/LICENSE similarity index 100% rename from polars/polars-json/LICENSE rename to crates/polars-json/LICENSE diff --git a/polars/polars-json/src/json/deserialize.rs b/crates/polars-json/src/json/deserialize.rs similarity 
index 100% rename from polars/polars-json/src/json/deserialize.rs rename to crates/polars-json/src/json/deserialize.rs diff --git a/polars/polars-json/src/json/infer_schema.rs b/crates/polars-json/src/json/infer_schema.rs similarity index 100% rename from polars/polars-json/src/json/infer_schema.rs rename to crates/polars-json/src/json/infer_schema.rs diff --git a/polars/polars-json/src/json/mod.rs b/crates/polars-json/src/json/mod.rs similarity index 100% rename from polars/polars-json/src/json/mod.rs rename to crates/polars-json/src/json/mod.rs diff --git a/polars/polars-json/src/lib.rs b/crates/polars-json/src/lib.rs similarity index 100% rename from polars/polars-json/src/lib.rs rename to crates/polars-json/src/lib.rs diff --git a/polars/polars-json/src/ndjson/deserialize.rs b/crates/polars-json/src/ndjson/deserialize.rs similarity index 100% rename from polars/polars-json/src/ndjson/deserialize.rs rename to crates/polars-json/src/ndjson/deserialize.rs diff --git a/polars/polars-json/src/ndjson/file.rs b/crates/polars-json/src/ndjson/file.rs similarity index 100% rename from polars/polars-json/src/ndjson/file.rs rename to crates/polars-json/src/ndjson/file.rs diff --git a/polars/polars-json/src/ndjson/mod.rs b/crates/polars-json/src/ndjson/mod.rs similarity index 100% rename from polars/polars-json/src/ndjson/mod.rs rename to crates/polars-json/src/ndjson/mod.rs diff --git a/polars/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml similarity index 97% rename from polars/polars-lazy/Cargo.toml rename to crates/polars-lazy/Cargo.toml index e43dba048ce87..fe0eb9e029c7c 100644 --- a/polars/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -22,8 +22,8 @@ polars-core = { version = "0.31.1", path = "../polars-core", features = ["lazy", polars-io = { version = "0.31.1", path = "../polars-io", features = ["lazy", "csv"], default-features = false } polars-json = { version = "0.31.1", path = "../polars-json", optional = true } polars-ops = { version = 
"0.31.1", path = "../polars-ops", default-features = false } -polars-pipe = { version = "0.31.1", path = "./polars-pipe", optional = true } -polars-plan = { version = "0.31.1", path = "./polars-plan" } +polars-pipe = { version = "0.31.1", path = "../polars-pipe", optional = true } +polars-plan = { version = "0.31.1", path = "../polars-plan" } polars-time = { version = "0.31.1", path = "../polars-time", optional = true } polars-utils = { version = "0.31.1", path = "../polars-utils" } pyo3 = { version = "0.19", optional = true } diff --git a/polars/polars-lazy/LICENSE b/crates/polars-lazy/LICENSE similarity index 100% rename from polars/polars-lazy/LICENSE rename to crates/polars-lazy/LICENSE diff --git a/polars/polars-lazy/README.md b/crates/polars-lazy/README.md similarity index 100% rename from polars/polars-lazy/README.md rename to crates/polars-lazy/README.md diff --git a/polars/polars-lazy/src/dot.rs b/crates/polars-lazy/src/dot.rs similarity index 100% rename from polars/polars-lazy/src/dot.rs rename to crates/polars-lazy/src/dot.rs diff --git a/polars/polars-lazy/src/dsl/eval.rs b/crates/polars-lazy/src/dsl/eval.rs similarity index 100% rename from polars/polars-lazy/src/dsl/eval.rs rename to crates/polars-lazy/src/dsl/eval.rs diff --git a/polars/polars-lazy/src/dsl/functions.rs b/crates/polars-lazy/src/dsl/functions.rs similarity index 97% rename from polars/polars-lazy/src/dsl/functions.rs rename to crates/polars-lazy/src/dsl/functions.rs index 0e8a778db0c61..44710a2bb4643 100644 --- a/polars/polars-lazy/src/dsl/functions.rs +++ b/crates/polars-lazy/src/dsl/functions.rs @@ -68,7 +68,13 @@ pub(crate) fn concat_impl>( }; if convert_supertypes { - let LogicalPlan::Union {mut inputs, options} = lf.logical_plan else { unreachable!()} ; + let LogicalPlan::Union { + mut inputs, + options, + } = lf.logical_plan + else { + unreachable!() + }; let mut schema = inputs[0].schema()?.as_ref().as_ref().clone(); let mut changed = false; @@ -141,7 +147,7 @@ pub fn 
diag_concat_lf>( let lfs_with_all_columns = lfs .into_iter() // Zip Frames with their Schemas - .zip(schemas.into_iter()) + .zip(schemas) .map(|(mut lf, lf_schema)| { for (name, dtype) in total_schema.iter() { // If a name from Total Schema is not present - append diff --git a/polars/polars-lazy/src/dsl/into.rs b/crates/polars-lazy/src/dsl/into.rs similarity index 100% rename from polars/polars-lazy/src/dsl/into.rs rename to crates/polars-lazy/src/dsl/into.rs diff --git a/polars/polars-lazy/src/dsl/list.rs b/crates/polars-lazy/src/dsl/list.rs similarity index 100% rename from polars/polars-lazy/src/dsl/list.rs rename to crates/polars-lazy/src/dsl/list.rs diff --git a/polars/polars-lazy/src/dsl/mod.rs b/crates/polars-lazy/src/dsl/mod.rs similarity index 100% rename from polars/polars-lazy/src/dsl/mod.rs rename to crates/polars-lazy/src/dsl/mod.rs diff --git a/polars/polars-lazy/src/frame/anonymous_scan.rs b/crates/polars-lazy/src/frame/anonymous_scan.rs similarity index 100% rename from polars/polars-lazy/src/frame/anonymous_scan.rs rename to crates/polars-lazy/src/frame/anonymous_scan.rs diff --git a/polars/polars-lazy/src/frame/csv.rs b/crates/polars-lazy/src/frame/csv.rs similarity index 99% rename from polars/polars-lazy/src/frame/csv.rs rename to crates/polars-lazy/src/frame/csv.rs index ae93643e01a80..788798f2156ba 100644 --- a/polars/polars-lazy/src/frame/csv.rs +++ b/crates/polars-lazy/src/frame/csv.rs @@ -210,9 +210,9 @@ impl<'a> LazyCsvReader<'a> { Some(globresult) => globresult?, None => polars_bail!(ComputeError: "globbing pattern did not match any files"), }; - std::fs::File::open(&path) + polars_utils::open_file(&path) } else { - std::fs::File::open(&self.path) + polars_utils::open_file(&self.path) }?; let reader_bytes = get_reader_bytes(&mut file).expect("could not mmap file"); let mut skip_rows = self.skip_rows; diff --git a/polars/polars-lazy/src/frame/err.rs b/crates/polars-lazy/src/frame/err.rs similarity index 100% rename from 
polars/polars-lazy/src/frame/err.rs rename to crates/polars-lazy/src/frame/err.rs diff --git a/polars/polars-lazy/src/frame/file_list_reader.rs b/crates/polars-lazy/src/frame/file_list_reader.rs similarity index 100% rename from polars/polars-lazy/src/frame/file_list_reader.rs rename to crates/polars-lazy/src/frame/file_list_reader.rs diff --git a/polars/polars-lazy/src/frame/ipc.rs b/crates/polars-lazy/src/frame/ipc.rs similarity index 100% rename from polars/polars-lazy/src/frame/ipc.rs rename to crates/polars-lazy/src/frame/ipc.rs diff --git a/polars/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs similarity index 97% rename from polars/polars-lazy/src/frame/mod.rs rename to crates/polars-lazy/src/frame/mod.rs index ed3641d0b0017..ba3328df6d8a8 100644 --- a/polars/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -132,7 +132,9 @@ impl LazyFrame { // will be toggled by a scan operation such as csv scan or parquet scan file_caching: false, #[cfg(feature = "cse")] - common_subplan_elimination: false, + comm_subplan_elim: false, + #[cfg(feature = "cse")] + comm_subexpr_elim: false, streaming: false, }) } @@ -163,8 +165,15 @@ impl LazyFrame { /// Toggle common subplan elimination optimization on or off #[cfg(feature = "cse")] - pub fn with_common_subplan_elimination(mut self, toggle: bool) -> Self { - self.opt_state.common_subplan_elimination = toggle; + pub fn with_comm_subplan_elim(mut self, toggle: bool) -> Self { + self.opt_state.comm_subplan_elim = toggle; + self + } + + /// Toggle common subexpression elimination optimization on or off + #[cfg(feature = "cse")] + pub fn with_comm_subexpr_elim(mut self, toggle: bool) -> Self { + self.opt_state.comm_subexpr_elim = toggle; self } @@ -365,7 +374,7 @@ impl LazyFrame { let mut existing_vec: Vec = Vec::with_capacity(cap); let mut new_vec: Vec = Vec::with_capacity(cap); - for (existing, new) in iter.zip(new.into_iter()) { + for (existing, new) in iter.zip(new) { let existing = 
existing.as_ref(); let new = new.as_ref(); @@ -475,6 +484,18 @@ impl LazyFrame { self.optimize_with_scratch(lp_arena, expr_arena, &mut vec![], false) } + pub fn to_alp_optimized(self) -> PolarsResult<(Node, Arena, Arena)> { + let mut lp_arena = Arena::with_capacity(16); + let mut expr_arena = Arena::with_capacity(16); + let node = + self.optimize_with_scratch(&mut lp_arena, &mut expr_arena, &mut vec![], false)?; + Ok((node, lp_arena, expr_arena)) + } + + pub fn to_alp(self) -> PolarsResult<(Node, Arena, Arena)> { + self.logical_plan.to_alp() + } + pub(crate) fn optimize_with_scratch( self, lp_arena: &mut Arena, @@ -486,9 +507,11 @@ impl LazyFrame { let mut opt_state = self.opt_state; let streaming = self.opt_state.streaming; #[cfg(feature = "cse")] - if streaming && self.opt_state.common_subplan_elimination { - polars_warn!("Cannot combine 'streaming' with 'common_subplan_elimination'. CSE will be turned off."); - opt_state.common_subplan_elimination = false; + if streaming && self.opt_state.comm_subplan_elim { + polars_warn!( + "Cannot combine 'streaming' with 'comm_subplan_elim'. CSE will be turned off." 
+ ); + opt_state.comm_subplan_elim = false; } let lp_top = optimize(self.logical_plan, opt_state, lp_arena, expr_arena, scratch)?; @@ -1385,7 +1408,7 @@ impl LazyGroupBy { schema, apply: Some(Arc::new(f)), maintain_order: self.maintain_order, - options, + options: Arc::new(options), }; LazyFrame::from_logical_plan(lp, self.opt_state) } @@ -1500,7 +1523,8 @@ impl JoinBuilder { force_parallel: self.force_parallel, args, ..Default::default() - }, + } + .into(), ) .build(); LazyFrame::from_logical_plan(lp, opt_state) diff --git a/polars/polars-lazy/src/frame/ndjson.rs b/crates/polars-lazy/src/frame/ndjson.rs similarity index 100% rename from polars/polars-lazy/src/frame/ndjson.rs rename to crates/polars-lazy/src/frame/ndjson.rs diff --git a/polars/polars-lazy/src/frame/parquet.rs b/crates/polars-lazy/src/frame/parquet.rs similarity index 100% rename from polars/polars-lazy/src/frame/parquet.rs rename to crates/polars-lazy/src/frame/parquet.rs diff --git a/polars/polars-lazy/src/frame/pivot.rs b/crates/polars-lazy/src/frame/pivot.rs similarity index 100% rename from polars/polars-lazy/src/frame/pivot.rs rename to crates/polars-lazy/src/frame/pivot.rs diff --git a/polars/polars-lazy/src/frame/python.rs b/crates/polars-lazy/src/frame/python.rs similarity index 100% rename from polars/polars-lazy/src/frame/python.rs rename to crates/polars-lazy/src/frame/python.rs diff --git a/polars/polars-lazy/src/lib.rs b/crates/polars-lazy/src/lib.rs similarity index 98% rename from polars/polars-lazy/src/lib.rs rename to crates/polars-lazy/src/lib.rs index 38e9a3bc14fd0..ce5f2d7209efe 100644 --- a/polars/polars-lazy/src/lib.rs +++ b/crates/polars-lazy/src/lib.rs @@ -101,8 +101,8 @@ //! df.lazy() //! .groupby([col("date")]) //! .agg([ -//! col("rain").min(), -//! col("rain").sum(), +//! col("rain").min().alias("min_rain"), +//! col("rain").sum().alias("sum_rain"), //! col("rain").quantile(lit(0.5), QuantileInterpolOptions::Nearest).alias("median_rain"), //! ]) //! 
.sort("date", Default::default()) diff --git a/polars/polars-lazy/src/physical_plan/executors/cache.rs b/crates/polars-lazy/src/physical_plan/executors/cache.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/cache.rs rename to crates/polars-lazy/src/physical_plan/executors/cache.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/executor.rs b/crates/polars-lazy/src/physical_plan/executors/executor.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/executor.rs rename to crates/polars-lazy/src/physical_plan/executors/executor.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/ext_context.rs b/crates/polars-lazy/src/physical_plan/executors/ext_context.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/ext_context.rs rename to crates/polars-lazy/src/physical_plan/executors/ext_context.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/filter.rs b/crates/polars-lazy/src/physical_plan/executors/filter.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/filter.rs rename to crates/polars-lazy/src/physical_plan/executors/filter.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/groupby.rs b/crates/polars-lazy/src/physical_plan/executors/groupby.rs similarity index 79% rename from polars/polars-lazy/src/physical_plan/executors/groupby.rs rename to crates/polars-lazy/src/physical_plan/executors/groupby.rs index 38872dc886bb7..b2cb72d32e36d 100644 --- a/polars/polars-lazy/src/physical_plan/executors/groupby.rs +++ b/crates/polars-lazy/src/physical_plan/executors/groupby.rs @@ -2,6 +2,25 @@ use rayon::prelude::*; use super::*; +pub(super) fn evaluate_aggs( + df: &DataFrame, + aggs: &[Arc], + groups: &GroupsProxy, + state: &ExecutionState, +) -> PolarsResult> { + POOL.install(|| { + aggs.par_iter() + .map(|expr| { + let mut agg = expr.evaluate_on_groups(df, groups, state)?.finalize(); + 
polars_ensure!(agg.len() == groups.len(), agg_len = agg.len(), groups.len()); + + rename_cse_tmp_series(&mut agg); + Ok(agg) + }) + .collect::>>() + }) +} + /// Take an input Executor and a multiple expressions pub struct GroupByExec { input: Box, @@ -14,6 +33,7 @@ pub struct GroupByExec { } impl GroupByExec { + #[allow(clippy::too_many_arguments)] pub(crate) fn new( input: Box, keys: Vec>, @@ -35,12 +55,13 @@ impl GroupByExec { } } +#[allow(clippy::too_many_arguments)] pub(super) fn groupby_helper( mut df: DataFrame, keys: Vec, aggs: &[Arc], apply: Option>, - state: &mut ExecutionState, + state: &ExecutionState, maintain_order: bool, slice: Option<(i64, usize)>, ) -> PolarsResult { @@ -62,35 +83,21 @@ pub(super) fn groupby_helper( groups = sliced_groups.as_deref().unwrap(); } - state.expr_cache = Some(Default::default()); let (mut columns, agg_columns) = POOL.install(|| { let get_columns = || gb.keys_sliced(slice); - let get_agg = || { - aggs.par_iter() - .map(|expr| { - let agg = expr.evaluate_on_groups(&df, groups, state)?.finalize(); - polars_ensure!(agg.len() == groups.len(), agg_len = agg.len(), groups.len()); - Ok(agg) - }) - .collect::>>() - }; + let get_agg = || evaluate_aggs(&df, aggs, groups, state); rayon::join(get_columns, get_agg) }); let agg_columns = agg_columns?; - state.expr_cache = None; columns.extend_from_slice(&agg_columns); DataFrame::new(columns) } impl GroupByExec { - fn execute_impl( - &mut self, - state: &mut ExecutionState, - df: DataFrame, - ) -> PolarsResult { + fn execute_impl(&mut self, state: &ExecutionState, df: DataFrame) -> PolarsResult { let keys = self .keys .iter() diff --git a/polars/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs b/crates/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs similarity index 86% rename from polars/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs rename to crates/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs index 7d80af14beca1..651ee63716e16 100644 --- 
a/polars/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs +++ b/crates/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs @@ -23,11 +23,10 @@ impl GroupByDynamicExec { #[cfg(feature = "dynamic_groupby")] fn execute_impl( &mut self, - state: &mut ExecutionState, + state: &ExecutionState, mut df: DataFrame, ) -> PolarsResult { df.as_single_chunk_par(); - let keys = self .keys .iter() @@ -64,18 +63,7 @@ impl GroupByDynamicExec { } } - state.expr_cache = Some(Default::default()); - let agg_columns = POOL.install(|| { - self.aggs - .par_iter() - .map(|expr| { - let agg = expr.evaluate_on_groups(&df, groups, state)?.finalize(); - polars_ensure!(agg.len() == groups.len(), agg_len = agg.len(), groups.len()); - Ok(agg) - }) - .collect::>>() - })?; - state.expr_cache = None; + let agg_columns = evaluate_aggs(&df, &self.aggs, groups, state)?; let mut columns = Vec::with_capacity(agg_columns.len() + 1 + keys.len()); columns.extend_from_slice(&keys); diff --git a/polars/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs b/crates/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs similarity index 97% rename from polars/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs rename to crates/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs index eb6d39ab9d26b..9184294c5f28d 100644 --- a/polars/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs +++ b/crates/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs @@ -233,7 +233,8 @@ impl PartitionGroupByExec { let groupby_options = GroupbyOptions { slice: self.slice, ..Default::default() - }; + } + .into(); let lp = LogicalPlan::Aggregate { input: Box::new(original_df.lazy().logical_plan), keys: Arc::new(std::mem::take(&mut self.keys)), @@ -345,7 +346,12 @@ impl PartitionGroupByExec { .zip(&df.get_columns()[self.phys_keys.len()..]) .map(|(expr, partitioned_s)| { let agg_expr = expr.as_partitioned_aggregator().unwrap(); - 
agg_expr.finalize(partitioned_s.clone(), groups, state) + agg_expr + .finalize(partitioned_s.clone(), groups, state) + .map(|mut s| { + rename_cse_tmp_series(&mut s); + s + }) }) .collect(); diff --git a/polars/polars-lazy/src/physical_plan/executors/groupby_rolling.rs b/crates/polars-lazy/src/physical_plan/executors/groupby_rolling.rs similarity index 88% rename from polars/polars-lazy/src/physical_plan/executors/groupby_rolling.rs rename to crates/polars-lazy/src/physical_plan/executors/groupby_rolling.rs index d3f80601cafaf..88f2f1dc39f03 100644 --- a/polars/polars-lazy/src/physical_plan/executors/groupby_rolling.rs +++ b/crates/polars-lazy/src/physical_plan/executors/groupby_rolling.rs @@ -21,7 +21,7 @@ impl GroupByRollingExec { #[cfg(feature = "dynamic_groupby")] fn execute_impl( &mut self, - state: &mut ExecutionState, + state: &ExecutionState, mut df: DataFrame, ) -> PolarsResult { df.as_single_chunk_par(); @@ -80,18 +80,7 @@ impl GroupByRollingExec { } }; - state.expr_cache = Some(Default::default()); - let agg_columns = POOL.install(|| { - self.aggs - .par_iter() - .map(|expr| { - let agg = expr.evaluate_on_groups(&df, groups, state)?.aggregated(); - polars_ensure!(agg.len() == groups.len(), agg_len = agg.len(), groups.len()); - Ok(agg) - }) - .collect::>>() - })?; - state.expr_cache = None; + let agg_columns = evaluate_aggs(&df, &self.aggs, groups, state)?; let mut columns = Vec::with_capacity(agg_columns.len() + 1 + keys.len()); columns.extend_from_slice(&keys); diff --git a/polars/polars-lazy/src/physical_plan/executors/join.rs b/crates/polars-lazy/src/physical_plan/executors/join.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/join.rs rename to crates/polars-lazy/src/physical_plan/executors/join.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/mod.rs b/crates/polars-lazy/src/physical_plan/executors/mod.rs similarity index 83% rename from polars/polars-lazy/src/physical_plan/executors/mod.rs rename to 
crates/polars-lazy/src/physical_plan/executors/mod.rs index b3be159ecaac9..a9813ffc6ab97 100644 --- a/polars/polars-lazy/src/physical_plan/executors/mod.rs +++ b/crates/polars-lazy/src/physical_plan/executors/mod.rs @@ -137,31 +137,59 @@ fn execute_projection_cached_window_fns( Ok(selected_columns) } -pub(crate) fn evaluate_physical_expressions( +fn run_exprs_par( df: &DataFrame, exprs: &[Arc], - state: &mut ExecutionState, + state: &ExecutionState, +) -> PolarsResult> { + POOL.install(|| { + exprs + .par_iter() + .map(|expr| expr.evaluate(df, state)) + .collect() + }) +} + +pub(super) fn evaluate_physical_expressions( + df: &mut DataFrame, + cse_exprs: &[Arc], + exprs: &[Arc], + state: &ExecutionState, has_windows: bool, -) -> PolarsResult { - state.expr_cache = Some(Default::default()); - let zero_length = df.height() == 0; - let selected_columns = if has_windows { +) -> PolarsResult> { + let selected_columns = if !cse_exprs.is_empty() { + let tmp_cols = run_exprs_par(df, cse_exprs, state)?; + let width = df.width(); + + // put the cse expressions at the end + unsafe { + df.hstack_mut_unchecked(&tmp_cols); + } + let mut result = run_exprs_par(df, exprs, state)?; + // restore original df + unsafe { + df.get_columns_mut().truncate(width); + } + + // the replace CSE has a temporary name + // we don't want this name in the result + for s in result.iter_mut() { + rename_cse_tmp_series(s); + } + + result + } else if has_windows { execute_projection_cached_window_fns(df, exprs, state)? } else { - POOL.install(|| { - exprs - .par_iter() - .map(|expr| expr.evaluate(df, state)) - .collect::>() - })? + run_exprs_par(df, exprs, state)? 
}; + state.clear_window_expr_cache(); - state.expr_cache = None; - check_expand_literals(selected_columns, zero_length) + Ok(selected_columns) } -fn check_expand_literals( +pub(super) fn check_expand_literals( mut selected_columns: Vec, zero_length: bool, ) -> PolarsResult { diff --git a/polars/polars-lazy/src/physical_plan/executors/projection.rs b/crates/polars-lazy/src/physical_plan/executors/projection.rs similarity index 68% rename from polars/polars-lazy/src/physical_plan/executors/projection.rs rename to crates/polars-lazy/src/physical_plan/executors/projection.rs index eded6267aafc6..b4e5a11ded9cf 100644 --- a/polars/polars-lazy/src/physical_plan/executors/projection.rs +++ b/crates/polars-lazy/src/physical_plan/executors/projection.rs @@ -4,6 +4,7 @@ use super::*; /// and a multiple PhysicalExpressions (create the output Series) pub struct ProjectionExec { pub(crate) input: Box, + pub(crate) cse_expr: Vec>, pub(crate) expr: Vec>, pub(crate) has_windows: bool, pub(crate) input_schema: SchemaRef, @@ -14,25 +15,31 @@ pub struct ProjectionExec { impl ProjectionExec { fn execute_impl( &mut self, - state: &mut ExecutionState, - df: DataFrame, + state: &ExecutionState, + mut df: DataFrame, ) -> PolarsResult { #[allow(clippy::let_and_return)] - let df = evaluate_physical_expressions(&df, &self.expr, state, self.has_windows); + let selected_cols = evaluate_physical_expressions( + &mut df, + &self.cse_expr, + &self.expr, + state, + self.has_windows, + )?; + #[allow(unused_mut)] + let mut df = check_expand_literals(selected_cols, df.height() == 0)?; // this only runs during testing and check if the runtime type matches the predicted schema #[cfg(test)] #[allow(unused_must_use)] { // TODO: also check the types. 
- if let Ok(df) = df.as_ref() { - for (l, r) in df.iter().zip(self.schema.iter_names()) { - assert_eq!(l.name(), r); - } + for (l, r) in df.iter().zip(self.schema.iter_names()) { + assert_eq!(l.name(), r); } } - df + Ok(df) } } @@ -41,7 +48,11 @@ impl Executor for ProjectionExec { #[cfg(debug_assertions)] { if state.verbose() { - println!("run ProjectionExec") + if self.cse_expr.is_empty() { + println!("run ProjectionExec"); + } else { + println!("run ProjectionExec with {} CSE", self.cse_expr.len()) + }; } } let df = self.input.execute(state)?; diff --git a/polars/polars-lazy/src/physical_plan/executors/python_scan.rs b/crates/polars-lazy/src/physical_plan/executors/python_scan.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/python_scan.rs rename to crates/polars-lazy/src/physical_plan/executors/python_scan.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/scan/csv.rs b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/scan/csv.rs rename to crates/polars-lazy/src/physical_plan/executors/scan/csv.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/scan/ipc.rs b/crates/polars-lazy/src/physical_plan/executors/scan/ipc.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/scan/ipc.rs rename to crates/polars-lazy/src/physical_plan/executors/scan/ipc.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/scan/mod.rs b/crates/polars-lazy/src/physical_plan/executors/scan/mod.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/scan/mod.rs rename to crates/polars-lazy/src/physical_plan/executors/scan/mod.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/scan/ndjson.rs b/crates/polars-lazy/src/physical_plan/executors/scan/ndjson.rs similarity index 95% rename from polars/polars-lazy/src/physical_plan/executors/scan/ndjson.rs rename to 
crates/polars-lazy/src/physical_plan/executors/scan/ndjson.rs index d9e2cb70d63d2..a6f6cedda2f78 100644 --- a/polars/polars-lazy/src/physical_plan/executors/scan/ndjson.rs +++ b/crates/polars-lazy/src/physical_plan/executors/scan/ndjson.rs @@ -20,7 +20,7 @@ impl AnonymousScan for LazyJsonLineReader { } fn schema(&self, infer_schema_length: Option) -> PolarsResult { - let f = std::fs::File::open(&self.path)?; + let f = polars_utils::open_file(&self.path)?; let mut reader = std::io::BufReader::new(f); let data_type = diff --git a/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs b/crates/polars-lazy/src/physical_plan/executors/scan/parquet.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs rename to crates/polars-lazy/src/physical_plan/executors/scan/parquet.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/slice.rs b/crates/polars-lazy/src/physical_plan/executors/slice.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/slice.rs rename to crates/polars-lazy/src/physical_plan/executors/slice.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/sort.rs b/crates/polars-lazy/src/physical_plan/executors/sort.rs similarity index 98% rename from polars/polars-lazy/src/physical_plan/executors/sort.rs rename to crates/polars-lazy/src/physical_plan/executors/sort.rs index 2d34175d72121..b3c64d0ab61b3 100644 --- a/polars/polars-lazy/src/physical_plan/executors/sort.rs +++ b/crates/polars-lazy/src/physical_plan/executors/sort.rs @@ -9,7 +9,7 @@ pub(crate) struct SortExec { impl SortExec { fn execute_impl( &mut self, - state: &mut ExecutionState, + state: &ExecutionState, mut df: DataFrame, ) -> PolarsResult { df.as_single_chunk_par(); diff --git a/polars/polars-lazy/src/physical_plan/executors/stack.rs b/crates/polars-lazy/src/physical_plan/executors/stack.rs similarity index 64% rename from polars/polars-lazy/src/physical_plan/executors/stack.rs rename to 
crates/polars-lazy/src/physical_plan/executors/stack.rs index 0871fa9067503..18f4572cc526f 100644 --- a/polars/polars-lazy/src/physical_plan/executors/stack.rs +++ b/crates/polars-lazy/src/physical_plan/executors/stack.rs @@ -3,31 +3,25 @@ use super::*; pub struct StackExec { pub(crate) input: Box, pub(crate) has_windows: bool, - pub(crate) expr: Vec>, + pub(crate) cse_exprs: Vec>, + pub(crate) exprs: Vec>, pub(crate) input_schema: SchemaRef, } impl StackExec { fn execute_impl( &mut self, - state: &mut ExecutionState, + state: &ExecutionState, mut df: DataFrame, ) -> PolarsResult { - state.expr_cache = Some(Default::default()); - let res = if self.has_windows { - // we have a different run here - // to ensure the window functions run sequential and share caches - execute_projection_cached_window_fns(&df, &self.expr, state)? - } else { - POOL.install(|| { - self.expr - .par_iter() - .map(|expr| expr.evaluate(&df, state)) - .collect::>>() - })? - }; + let res = evaluate_physical_expressions( + &mut df, + &self.cse_exprs, + &self.exprs, + state, + self.has_windows, + )?; state.clear_window_expr_cache(); - state.expr_cache = None; let schema = &*self.input_schema; df._add_columns(res, schema)?; @@ -41,14 +35,18 @@ impl Executor for StackExec { #[cfg(debug_assertions)] { if state.verbose() { - println!("run StackExec") + if self.cse_exprs.is_empty() { + println!("run StackExec"); + } else { + println!("run StackExec with {} CSE", self.cse_exprs.len()); + }; } } let df = self.input.execute(state)?; let profile_name = if state.has_node_timer() { let by = self - .expr + .exprs .iter() .map(|s| Ok(s.to_field(&self.input_schema)?.name)) .collect::>>()?; diff --git a/polars/polars-lazy/src/physical_plan/executors/udf.rs b/crates/polars-lazy/src/physical_plan/executors/udf.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/udf.rs rename to crates/polars-lazy/src/physical_plan/executors/udf.rs diff --git 
a/polars/polars-lazy/src/physical_plan/executors/union.rs b/crates/polars-lazy/src/physical_plan/executors/union.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/union.rs rename to crates/polars-lazy/src/physical_plan/executors/union.rs diff --git a/polars/polars-lazy/src/physical_plan/executors/unique.rs b/crates/polars-lazy/src/physical_plan/executors/unique.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/executors/unique.rs rename to crates/polars-lazy/src/physical_plan/executors/unique.rs diff --git a/polars/polars-lazy/src/physical_plan/exotic.rs b/crates/polars-lazy/src/physical_plan/exotic.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/exotic.rs rename to crates/polars-lazy/src/physical_plan/exotic.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/aggregation.rs b/crates/polars-lazy/src/physical_plan/expressions/aggregation.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/aggregation.rs rename to crates/polars-lazy/src/physical_plan/expressions/aggregation.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/alias.rs b/crates/polars-lazy/src/physical_plan/expressions/alias.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/alias.rs rename to crates/polars-lazy/src/physical_plan/expressions/alias.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/apply.rs b/crates/polars-lazy/src/physical_plan/expressions/apply.rs similarity index 98% rename from polars/polars-lazy/src/physical_plan/expressions/apply.rs rename to crates/polars-lazy/src/physical_plan/expressions/apply.rs index cae2bcbed458a..a5a8fd79d00c8 100644 --- a/polars/polars-lazy/src/physical_plan/expressions/apply.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/apply.rs @@ -443,6 +443,11 @@ impl ApplyExpr { } => (function, input), _ => return Ok(true), }; + // ensure the input of the function is 
only a `col(..)` + // if it does any arithmetic the code below is flawed + if !matches!(input[0], Expr::Column(_)) { + return Ok(true); + } match function { FunctionExpr::Boolean(BooleanFunction::IsNull) => { diff --git a/polars/polars-lazy/src/physical_plan/expressions/binary.rs b/crates/polars-lazy/src/physical_plan/expressions/binary.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/binary.rs rename to crates/polars-lazy/src/physical_plan/expressions/binary.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/cast.rs b/crates/polars-lazy/src/physical_plan/expressions/cast.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/cast.rs rename to crates/polars-lazy/src/physical_plan/expressions/cast.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/column.rs b/crates/polars-lazy/src/physical_plan/expressions/column.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/column.rs rename to crates/polars-lazy/src/physical_plan/expressions/column.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/count.rs b/crates/polars-lazy/src/physical_plan/expressions/count.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/count.rs rename to crates/polars-lazy/src/physical_plan/expressions/count.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/filter.rs b/crates/polars-lazy/src/physical_plan/expressions/filter.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/filter.rs rename to crates/polars-lazy/src/physical_plan/expressions/filter.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/group_iter.rs b/crates/polars-lazy/src/physical_plan/expressions/group_iter.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/group_iter.rs rename to crates/polars-lazy/src/physical_plan/expressions/group_iter.rs diff --git 
a/polars/polars-lazy/src/physical_plan/expressions/literal.rs b/crates/polars-lazy/src/physical_plan/expressions/literal.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/literal.rs rename to crates/polars-lazy/src/physical_plan/expressions/literal.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/mod.rs b/crates/polars-lazy/src/physical_plan/expressions/mod.rs similarity index 99% rename from polars/polars-lazy/src/physical_plan/expressions/mod.rs rename to crates/polars-lazy/src/physical_plan/expressions/mod.rs index befd3655be993..569158496f8ea 100644 --- a/polars/polars-lazy/src/physical_plan/expressions/mod.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/mod.rs @@ -2,7 +2,6 @@ mod aggregation; mod alias; mod apply; mod binary; -mod cache; mod cast; mod column; mod count; @@ -23,7 +22,6 @@ pub(crate) use aggregation::*; pub(crate) use alias::*; pub(crate) use apply::*; pub(crate) use binary::*; -pub(crate) use cache::*; pub(crate) use cast::*; pub(crate) use column::*; pub(crate) use count::*; diff --git a/polars/polars-lazy/src/physical_plan/expressions/slice.rs b/crates/polars-lazy/src/physical_plan/expressions/slice.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/slice.rs rename to crates/polars-lazy/src/physical_plan/expressions/slice.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/sort.rs b/crates/polars-lazy/src/physical_plan/expressions/sort.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/sort.rs rename to crates/polars-lazy/src/physical_plan/expressions/sort.rs diff --git a/polars/polars-lazy/src/physical_plan/expressions/sortby.rs b/crates/polars-lazy/src/physical_plan/expressions/sortby.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/expressions/sortby.rs rename to crates/polars-lazy/src/physical_plan/expressions/sortby.rs diff --git 
a/crates/polars-lazy/src/physical_plan/expressions/take.rs b/crates/polars-lazy/src/physical_plan/expressions/take.rs new file mode 100644 index 0000000000000..f354965f9ee5a --- /dev/null +++ b/crates/polars-lazy/src/physical_plan/expressions/take.rs @@ -0,0 +1,203 @@ +use std::sync::Arc; + +use polars_arrow::utils::CustomIterTools; +use polars_core::frame::groupby::GroupsProxy; +use polars_core::prelude::*; +use polars_core::utils::NoNull; + +use crate::physical_plan::state::ExecutionState; +use crate::prelude::*; + +pub struct TakeExpr { + pub(crate) phys_expr: Arc, + pub(crate) idx: Arc, + pub(crate) expr: Expr, +} + +impl TakeExpr { + fn finish( + &self, + df: &DataFrame, + state: &ExecutionState, + series: Series, + ) -> PolarsResult { + let idx = self.idx.evaluate(df, state)?; + + let nulls_before_cast = idx.null_count(); + + let idx = idx.cast(&IDX_DTYPE)?; + if idx.null_count() != nulls_before_cast { + self.oob_err()?; + } + let idx_ca = idx.idx()?; + + series.take(idx_ca) + } + + fn oob_err(&self) -> PolarsResult<()> { + polars_bail!(expr = self.expr, ComputeError: "index out of bounds"); + } +} + +impl PhysicalExpr for TakeExpr { + fn as_expression(&self) -> Option<&Expr> { + Some(&self.expr) + } + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + let series = self.phys_expr.evaluate(df, state)?; + self.finish(df, state, series) + } + + #[allow(clippy::ptr_arg)] + fn evaluate_on_groups<'a>( + &self, + df: &DataFrame, + groups: &'a GroupsProxy, + state: &ExecutionState, + ) -> PolarsResult> { + let mut ac = self.phys_expr.evaluate_on_groups(df, groups, state)?; + let mut idx = self.idx.evaluate_on_groups(df, groups, state)?; + + let idx = match idx.state { + AggState::AggregatedFlat(s) => { + let idx = s.cast(&IDX_DTYPE)?; + let idx = idx.idx().unwrap(); + + // The indexes are AggregatedFlat, meaning they are a single values pointing into + // a group. 
+ // If we zip this with the first of each group -> `idx + firs` then we can + // simply use a take operation on the whole array instead of per group. + + // The groups maybe scattered all over the place, so we sort by group + ac.sort_by_groups(); + + // A previous aggregation may have updated the groups + let groups = ac.groups(); + + // Determine the take indices + let idx: IdxCa = match groups.as_ref() { + GroupsProxy::Idx(groups) => { + if groups.all().iter().zip(idx).any(|(g, idx)| match idx { + None => true, + Some(idx) => idx >= g.len() as IdxSize, + }) { + self.oob_err()?; + } + + idx.into_iter() + .zip(groups.first().iter()) + .map(|(idx, first)| idx.map(|idx| idx + first)) + .collect_trusted() + } + GroupsProxy::Slice { groups, .. } => { + if groups.iter().zip(idx).any(|(g, idx)| match idx { + None => true, + Some(idx) => idx >= g[1], + }) { + self.oob_err()?; + } + + idx.into_iter() + .zip(groups.iter()) + .map(|(idx, g)| idx.map(|idx| idx + g[0])) + .collect_trusted() + } + }; + let taken = ac.flat_naive().take(&idx)?; + ac.with_series(taken, true, Some(&self.expr))?; + return Ok(ac); + } + AggState::AggregatedList(s) => s.list().unwrap().clone(), + // Maybe a literal as well, this needs a different path + AggState::NotAggregated(_) => { + let s = idx.aggregated(); + s.list().unwrap().clone() + } + AggState::Literal(s) => { + let idx = s.cast(&IDX_DTYPE)?; + let idx = idx.idx().unwrap(); + + return if idx.len() == 1 { + match idx.get(0) { + None => polars_bail!(ComputeError: "cannot take by a null"), + Some(idx) => { + if idx != 0 { + // We must make sure that the column we take from is sorted by + // groups otherwise we might point into the wrong group + ac.sort_by_groups() + } + // Make sure that we look at the updated groups. 
+ let groups = ac.groups(); + + // we offset the groups first by idx; + let idx: NoNull = match groups.as_ref() { + GroupsProxy::Idx(groups) => { + if groups.all().iter().any(|g| idx >= g.len() as IdxSize) { + self.oob_err()?; + } + + groups.first().iter().map(|f| *f + idx).collect_trusted() + } + GroupsProxy::Slice { groups, .. } => { + if groups.iter().any(|g| idx >= g[1]) { + self.oob_err()?; + } + + groups.iter().map(|g| g[0] + idx).collect_trusted() + } + }; + let taken = ac.flat_naive().take(&idx.into_inner())?; + ac.with_series(taken, true, Some(&self.expr))?; + ac.with_update_groups(UpdateGroups::WithGroupsLen); + Ok(ac) + } + } + } else { + let out = ac + .aggregated() + .list() + .unwrap() + .try_apply_amortized(|s| s.as_ref().take(idx))?; + + ac.with_series(out.into_series(), true, Some(&self.expr))?; + ac.with_update_groups(UpdateGroups::WithGroupsLen); + Ok(ac) + }; + } + }; + + let s = idx.cast(&DataType::List(Box::new(IDX_DTYPE)))?; + let idx = s.list().unwrap(); + + let mut taken = ac + .aggregated() + .list() + .unwrap() + .amortized_iter() + .zip(idx.amortized_iter()) + .map(|(s, idx)| { + s.and_then(|s| { + idx.map(|idx| { + let idx = idx.as_ref().idx().unwrap(); + s.as_ref().take(idx) + }) + }) + .transpose() + }) + .collect::>()?; + + taken.rename(ac.series().name()); + + ac.with_series(taken.into_series(), true, Some(&self.expr))?; + ac.with_update_groups(UpdateGroups::WithGroupsLen); + Ok(ac) + } + + fn to_field(&self, input_schema: &Schema) -> PolarsResult { + self.phys_expr.to_field(input_schema) + } + + fn is_valid_aggregation(&self) -> bool { + true + } +} diff --git a/polars/polars-lazy/src/physical_plan/expressions/ternary.rs b/crates/polars-lazy/src/physical_plan/expressions/ternary.rs similarity index 98% rename from polars/polars-lazy/src/physical_plan/expressions/ternary.rs rename to crates/polars-lazy/src/physical_plan/expressions/ternary.rs index 1281c0e9a4272..541b0deccb910 100644 --- 
a/polars/polars-lazy/src/physical_plan/expressions/ternary.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/ternary.rs @@ -199,7 +199,7 @@ impl PhysicalExpr for TernaryExpr { check_length(ca, mask)?; let mut out: ListChunked = ca .into_iter() - .zip(mask.into_iter()) + .zip(mask) .map(|(truthy, take)| match (truthy, take) { (Some(v), Some(true)) => Some(v), (Some(_), Some(false)) => None, @@ -217,7 +217,7 @@ impl PhysicalExpr for TernaryExpr { check_length(ca, mask)?; let mut out: ListChunked = ca .into_iter() - .zip(mask.into_iter()) + .zip(mask) .map(|(falsy, take)| match (falsy, take) { (Some(_), Some(true)) => None, (Some(v), Some(false)) => Some(v), @@ -239,7 +239,7 @@ impl PhysicalExpr for TernaryExpr { check_length(ca, mask)?; let mut out: ListChunked = ca .into_iter() - .zip(mask.into_iter()) + .zip(mask) .map(|(falsy, take)| match (falsy, take) { (Some(_), Some(true)) => Some(literal.clone()), (Some(v), Some(false)) => Some(v), @@ -256,7 +256,7 @@ impl PhysicalExpr for TernaryExpr { check_length(ca, mask)?; let mut out: ListChunked = ca .into_iter() - .zip(mask.into_iter()) + .zip(mask) .map(|(truthy, take)| match (truthy, take) { (Some(v), Some(true)) => Some(v), (Some(_), Some(false)) => Some(literal.clone()), diff --git a/polars/polars-lazy/src/physical_plan/expressions/window.rs b/crates/polars-lazy/src/physical_plan/expressions/window.rs similarity index 99% rename from polars/polars-lazy/src/physical_plan/expressions/window.rs rename to crates/polars-lazy/src/physical_plan/expressions/window.rs index 6e6f345db2d6f..547fe27dffbeb 100644 --- a/polars/polars-lazy/src/physical_plan/expressions/window.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/window.rs @@ -704,7 +704,7 @@ where } } let mut values = Vec::with_capacity(len); - let ptr = values.as_mut_ptr() as *mut T::Native; + let ptr: *mut T::Native = values.as_mut_ptr(); // safety: // we will write from different threads but we will never alias. 
let sync_ptr_values = unsafe { SyncPtr::new(ptr) }; diff --git a/polars/polars-lazy/src/physical_plan/file_cache.rs b/crates/polars-lazy/src/physical_plan/file_cache.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/file_cache.rs rename to crates/polars-lazy/src/physical_plan/file_cache.rs diff --git a/polars/polars-lazy/src/physical_plan/mod.rs b/crates/polars-lazy/src/physical_plan/mod.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/mod.rs rename to crates/polars-lazy/src/physical_plan/mod.rs diff --git a/polars/polars-lazy/src/physical_plan/node_timer.rs b/crates/polars-lazy/src/physical_plan/node_timer.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/node_timer.rs rename to crates/polars-lazy/src/physical_plan/node_timer.rs diff --git a/polars/polars-lazy/src/physical_plan/planner/expr.rs b/crates/polars-lazy/src/physical_plan/planner/expr.rs similarity index 98% rename from polars/polars-lazy/src/physical_plan/planner/expr.rs rename to crates/polars-lazy/src/physical_plan/planner/expr.rs index 6e6431622ec77..96607d317a7d5 100644 --- a/polars/polars-lazy/src/physical_plan/planner/expr.rs +++ b/crates/polars-lazy/src/physical_plan/planner/expr.rs @@ -531,15 +531,6 @@ pub(crate) fn create_physical_expr( ApplyOptions::ApplyGroups, ))) } - Cache { input, id } => { - state.has_cache = true; - let input = create_physical_expr(input, ctxt, expr_arena, schema, state)?; - Ok(Arc::new(CacheExpr::new( - input, - node_to_expr(expression, expr_arena), - id, - ))) - } Wildcard => panic!("should be no wildcard at this point"), Nth(_) => panic!("should be no nth at this point"), } diff --git a/polars/polars-lazy/src/physical_plan/planner/lp.rs b/crates/polars-lazy/src/physical_plan/planner/lp.rs similarity index 94% rename from polars/polars-lazy/src/physical_plan/planner/lp.rs rename to crates/polars-lazy/src/physical_plan/planner/lp.rs index 37b3869191eca..d79a6a8c156f0 100644 --- 
a/polars/polars-lazy/src/physical_plan/planner/lp.rs +++ b/crates/polars-lazy/src/physical_plan/planner/lp.rs @@ -238,7 +238,14 @@ pub fn create_physical_plan( let input = create_physical_plan(input, lp_arena, expr_arena)?; let mut state = ExpressionConversionState::new(POOL.current_num_threads() > expr.len()); let phys_expr = create_physical_expressions( - &expr, + expr.default_exprs(), + Context::Default, + expr_arena, + Some(&input_schema), + &mut state, + )?; + let cse_expr = create_physical_expressions( + expr.cse_exprs(), Context::Default, expr_arena, Some(&input_schema), @@ -246,6 +253,7 @@ pub fn create_physical_plan( )?; Ok(Box::new(executors::ProjectionExec { input, + cse_expr, expr: phys_expr, has_windows: state.has_windows, input_schema, @@ -272,6 +280,7 @@ pub fn create_physical_plan( )?; Ok(Box::new(executors::ProjectionExec { input, + cse_expr: vec![], expr: phys_expr, has_windows: state.has_windows, input_schema, @@ -312,6 +321,7 @@ pub fn create_physical_plan( output_schema, .. } => { + let options = Arc::try_unwrap(options).unwrap_or_else(|options| (*options).clone()); let predicate = predicate .map(|pred| { create_physical_expr( @@ -367,6 +377,7 @@ pub fn create_physical_plan( options, } => { let input_schema = lp_arena.get(input).schema(lp_arena).into_owned(); + let options = Arc::try_unwrap(options).unwrap_or_else(|options| (*options).clone()); let phys_keys = create_physical_expressions( &keys, Context::Default, @@ -494,6 +505,7 @@ pub fn create_physical_plan( None, &mut Default::default(), )?; + let options = Arc::try_unwrap(options).unwrap_or_else(|options| (*options).clone()); Ok(Box::new(executors::JoinExec::new( input_left, input_right, @@ -503,14 +515,27 @@ pub fn create_physical_plan( options.args, ))) } - HStack { input, exprs, .. 
} => { + HStack { + input, + exprs, + schema: _schema, + } => { let input_schema = lp_arena.get(input).schema(lp_arena).into_owned(); let input = create_physical_plan(input, lp_arena, expr_arena)?; let mut state = ExpressionConversionState::new(POOL.current_num_threads() > exprs.len()); - let phys_expr = create_physical_expressions( - &exprs, + + let cse_exprs = create_physical_expressions( + exprs.cse_exprs(), + Context::Default, + expr_arena, + Some(&input_schema), + &mut state, + )?; + + let phys_exprs = create_physical_expressions( + exprs.default_exprs(), Context::Default, expr_arena, Some(&input_schema), @@ -519,7 +544,8 @@ pub fn create_physical_plan( Ok(Box::new(executors::StackExec { input, has_windows: state.has_windows, - expr: phys_expr, + cse_exprs, + exprs: phys_exprs, input_schema, })) } diff --git a/polars/polars-lazy/src/physical_plan/planner/mod.rs b/crates/polars-lazy/src/physical_plan/planner/mod.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/planner/mod.rs rename to crates/polars-lazy/src/physical_plan/planner/mod.rs diff --git a/polars/polars-lazy/src/physical_plan/state.rs b/crates/polars-lazy/src/physical_plan/state.rs similarity index 93% rename from polars/polars-lazy/src/physical_plan/state.rs rename to crates/polars-lazy/src/physical_plan/state.rs index be2635a61920a..fc5c00a5e4673 100644 --- a/polars/polars-lazy/src/physical_plan/state.rs +++ b/crates/polars-lazy/src/physical_plan/state.rs @@ -63,8 +63,6 @@ impl From for StateFlags { pub struct ExecutionState { // cached by a `.cache` call and kept in memory for the duration of the plan. 
df_cache: Arc>>>>, - #[allow(clippy::type_complexity)] - pub(crate) expr_cache: Option>>>>>, // cache file reads until all branches got there file, then we delete it #[cfg(any(feature = "ipc", feature = "parquet", feature = "csv"))] pub(crate) file_cache: FileCache, @@ -111,7 +109,6 @@ impl ExecutionState { pub(super) fn split(&self) -> Self { Self { df_cache: self.df_cache.clone(), - expr_cache: self.expr_cache.clone(), #[cfg(any(feature = "ipc", feature = "parquet", feature = "csv"))] file_cache: self.file_cache.clone(), schema_cache: Default::default(), @@ -128,7 +125,6 @@ impl ExecutionState { pub(super) fn clone(&self) -> Self { Self { df_cache: self.df_cache.clone(), - expr_cache: self.expr_cache.clone(), #[cfg(any(feature = "ipc", feature = "parquet", feature = "csv"))] file_cache: self.file_cache.clone(), schema_cache: self.schema_cache.read().unwrap().clone().into(), @@ -149,7 +145,6 @@ impl ExecutionState { pub(crate) fn with_finger_prints(finger_prints: Option>) -> Self { Self { df_cache: Arc::new(Mutex::new(PlHashMap::default())), - expr_cache: None, schema_cache: Default::default(), #[cfg(any(feature = "ipc", feature = "parquet", feature = "csv"))] file_cache: FileCache::new(finger_prints), @@ -169,7 +164,6 @@ impl ExecutionState { } Self { df_cache: Default::default(), - expr_cache: None, schema_cache: Default::default(), #[cfg(any(feature = "ipc", feature = "parquet", feature = "csv"))] file_cache: FileCache::new(None), @@ -206,16 +200,6 @@ impl ExecutionState { .clone() } - pub(crate) fn get_expr_cache(&self, key: usize) -> Option>> { - self.expr_cache.as_ref().map(|cache| { - let mut guard = cache.lock().unwrap(); - guard - .entry(key) - .or_insert_with(|| Arc::new(OnceCell::new())) - .clone() - }) - } - /// Clear the cache used by the Window expressions pub(crate) fn clear_window_expr_cache(&self) { { diff --git a/polars/polars-lazy/src/physical_plan/streaming/checks.rs b/crates/polars-lazy/src/physical_plan/streaming/checks.rs similarity index 
100% rename from polars/polars-lazy/src/physical_plan/streaming/checks.rs rename to crates/polars-lazy/src/physical_plan/streaming/checks.rs diff --git a/polars/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs b/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs similarity index 91% rename from polars/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs rename to crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs index 219b67a854fad..1c3f8d1d0af9e 100644 --- a/polars/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs +++ b/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs @@ -59,36 +59,30 @@ fn jit_insert_slice( // slice AFTER the join has happened and the join will be an // operator use ALogicalPlan::*; - match lp_arena.get(node) { - Join { - options: - JoinOptions { - args: - JoinArgs { - slice: Some((offset, len)), - .. - }, - .. - }, - .. + let (offset, len) = match lp_arena.get(node) { + Join { options, .. } if options.args.slice.is_some() => { + let Some((offset, len)) = options.args.slice else { + unreachable!() + }; + (offset, len) } - | Union { + Union { options: UnionOptions { slice: Some((offset, len)), .. }, .. 
- } => { - let slice_node = lp_arena.add(Slice { - input: Node::default(), - offset: *offset, - len: *len as IdxSize, - }); - sink_nodes.push((operator_offset + 1, slice_node, Rc::new(RefCell::new(1)))); - } - _ => {} - } + } => (*offset, *len), + _ => return, + }; + + let slice_node = lp_arena.add(Slice { + input: Node::default(), + offset, + len: len as IdxSize, + }); + sink_nodes.push((operator_offset + 1, slice_node, Rc::new(RefCell::new(1)))); } pub(super) fn construct( @@ -193,7 +187,9 @@ pub(super) fn construct( // also pipelines are not ready to receive inputs otherwise pipelines.sort_by(|a, b| a.0.cmp(&b.0)); - let Some(final_sink) = final_sink else { return Ok(None) }; + let Some(final_sink) = final_sink else { + return Ok(None); + }; let insertion_location = match lp_arena.get(final_sink) { FileSink { input, @@ -219,7 +215,9 @@ pub(super) fn construct( None }; - let Some((_, mut most_left)) = pipelines.pop() else {unreachable!()}; + let Some((_, mut most_left)) = pipelines.pop() else { + unreachable!() + }; while let Some((_, rhs)) = pipelines.pop() { most_left = most_left.with_other_branch(rhs) } diff --git a/polars/polars-lazy/src/physical_plan/streaming/convert_alp.rs b/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs similarity index 99% rename from polars/polars-lazy/src/physical_plan/streaming/convert_alp.rs rename to crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs index b8fd4bc14446a..8e6ce83f1b69e 100644 --- a/polars/polars-lazy/src/physical_plan/streaming/convert_alp.rs +++ b/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs @@ -287,7 +287,9 @@ pub(crate) fn insert_streaming_nodes( } if *offset >= 0 => { insert_slice(root, *offset, *len as IdxSize, lp_arena, &mut state); state.streamable = true; - let Union {inputs, ..} = lp_arena.get(root) else {unreachable!()}; + let Union { inputs, .. 
} = lp_arena.get(root) else { + unreachable!() + }; for (i, input) in inputs.iter().enumerate() { let mut state = if i == 0 { // note the clone! diff --git a/polars/polars-lazy/src/physical_plan/streaming/mod.rs b/crates/polars-lazy/src/physical_plan/streaming/mod.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/streaming/mod.rs rename to crates/polars-lazy/src/physical_plan/streaming/mod.rs diff --git a/polars/polars-lazy/src/physical_plan/streaming/tree.rs b/crates/polars-lazy/src/physical_plan/streaming/tree.rs similarity index 100% rename from polars/polars-lazy/src/physical_plan/streaming/tree.rs rename to crates/polars-lazy/src/physical_plan/streaming/tree.rs diff --git a/polars/polars-lazy/src/prelude.rs b/crates/polars-lazy/src/prelude.rs similarity index 100% rename from polars/polars-lazy/src/prelude.rs rename to crates/polars-lazy/src/prelude.rs diff --git a/polars/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs similarity index 100% rename from polars/polars-lazy/src/tests/aggregations.rs rename to crates/polars-lazy/src/tests/aggregations.rs diff --git a/polars/polars-lazy/src/tests/arity.rs b/crates/polars-lazy/src/tests/arity.rs similarity index 100% rename from polars/polars-lazy/src/tests/arity.rs rename to crates/polars-lazy/src/tests/arity.rs diff --git a/polars/polars-lazy/src/tests/cse.rs b/crates/polars-lazy/src/tests/cse.rs similarity index 97% rename from polars/polars-lazy/src/tests/cse.rs rename to crates/polars-lazy/src/tests/cse.rs index 4fef63584456c..28f1e20b07414 100644 --- a/polars/polars-lazy/src/tests/cse.rs +++ b/crates/polars-lazy/src/tests/cse.rs @@ -19,7 +19,7 @@ fn test_cse_self_joins() -> PolarsResult<()> { let lf = lf .clone() .left_join(lf, col("fats_g"), col("fats_g")) - .with_common_subplan_elimination(true); + .with_comm_subplan_elim(true); cached_before_root(lf); @@ -41,7 +41,7 @@ fn test_cse_unions() -> PolarsResult<()> { }, )? 
.select([col("category"), col("fats_g")]) - .with_common_subplan_elimination(true); + .with_comm_subplan_elim(true); let (mut expr_arena, mut lp_arena) = get_arenas(); let lp = lf.clone().optimize(&mut lp_arena, &mut expr_arena).unwrap(); @@ -77,7 +77,7 @@ fn test_cse_cache_union_projection_pd() -> PolarsResult<()> { let q2 = q.filter(col("a").eq(lit(1))).select([col("a"), col("b")]); let q = q1 .left_join(q2, col("a"), col("a")) - .with_common_subplan_elimination(true); + .with_comm_subplan_elim(true); // check that the projection of a is not done before the cache let (mut expr_arena, mut lp_arena) = get_arenas(); @@ -242,7 +242,7 @@ fn test_cache_with_partial_projection() -> PolarsResult<()> { JoinType::Semi.into(), ); - let q = q.with_common_subplan_elimination(true); + let q = q.with_comm_subplan_elim(true); let (mut expr_arena, mut lp_arena) = get_arenas(); let lp = q.optimize(&mut lp_arena, &mut expr_arena).unwrap(); diff --git a/polars/polars-lazy/src/tests/io.rs b/crates/polars-lazy/src/tests/io.rs similarity index 100% rename from polars/polars-lazy/src/tests/io.rs rename to crates/polars-lazy/src/tests/io.rs diff --git a/polars/polars-lazy/src/tests/logical.rs b/crates/polars-lazy/src/tests/logical.rs similarity index 100% rename from polars/polars-lazy/src/tests/logical.rs rename to crates/polars-lazy/src/tests/logical.rs diff --git a/polars/polars-lazy/src/tests/mod.rs b/crates/polars-lazy/src/tests/mod.rs similarity index 100% rename from polars/polars-lazy/src/tests/mod.rs rename to crates/polars-lazy/src/tests/mod.rs diff --git a/polars/polars-lazy/src/tests/optimization_checks.rs b/crates/polars-lazy/src/tests/optimization_checks.rs similarity index 99% rename from polars/polars-lazy/src/tests/optimization_checks.rs rename to crates/polars-lazy/src/tests/optimization_checks.rs index 4f5ce98018bdf..602525288cae3 100644 --- a/polars/polars-lazy/src/tests/optimization_checks.rs +++ b/crates/polars-lazy/src/tests/optimization_checks.rs @@ -181,7 +181,7 
@@ pub fn test_slice_pushdown_join() -> PolarsResult<()> { ) .slice(1, 3) // this inserts a cache and blocks slice pushdown - .with_common_subplan_elimination(false); + .with_comm_subplan_elim(false); // test if optimization continued beyond the join node assert!(slice_at_scan(q.clone())); diff --git a/polars/polars-lazy/src/tests/predicate_queries.rs b/crates/polars-lazy/src/tests/predicate_queries.rs similarity index 98% rename from polars/polars-lazy/src/tests/predicate_queries.rs rename to crates/polars-lazy/src/tests/predicate_queries.rs index d2259fef77df2..0a854bf420a6a 100644 --- a/polars/polars-lazy/src/tests/predicate_queries.rs +++ b/crates/polars-lazy/src/tests/predicate_queries.rs @@ -44,7 +44,7 @@ fn test_issue_2472() -> PolarsResult<()> { let extract = col("group") .cast(DataType::Utf8) .str() - .extract(r#"(\d+-){4}(\w+)-"#, 2) + .extract(r"(\d+-){4}(\w+)-", 2) .cast(DataType::Int32) .alias("age"); let predicate = col("age").is_in(lit(Series::new("", [2i32]))); @@ -254,7 +254,7 @@ fn test_predicate_on_join_suffix_4788() -> PolarsResult<()> { .suffix("_") .finish() .filter(col("x").eq(1)) - .with_common_subplan_elimination(false); + .with_comm_subplan_elim(false); // the left hand side should have a predicate assert!(predicate_at_scan(q.clone())); diff --git a/polars/polars-lazy/src/tests/projection_queries.rs b/crates/polars-lazy/src/tests/projection_queries.rs similarity index 98% rename from polars/polars-lazy/src/tests/projection_queries.rs rename to crates/polars-lazy/src/tests/projection_queries.rs index 0d3d5712aab9a..30cf420baf360 100644 --- a/polars/polars-lazy/src/tests/projection_queries.rs +++ b/crates/polars-lazy/src/tests/projection_queries.rs @@ -102,7 +102,7 @@ fn scan_join_same_file() -> PolarsResult<()> { [col("category")], JoinType::Inner.into(), ) - .with_common_subplan_elimination(cse); + .with_comm_subplan_elim(cse); let out = q.collect()?; assert_eq!( out.get_column_names(), diff --git a/polars/polars-lazy/src/tests/queries.rs 
b/crates/polars-lazy/src/tests/queries.rs similarity index 100% rename from polars/polars-lazy/src/tests/queries.rs rename to crates/polars-lazy/src/tests/queries.rs diff --git a/polars/polars-lazy/src/tests/streaming.rs b/crates/polars-lazy/src/tests/streaming.rs similarity index 100% rename from polars/polars-lazy/src/tests/streaming.rs rename to crates/polars-lazy/src/tests/streaming.rs diff --git a/polars/polars-lazy/src/tests/tpch.rs b/crates/polars-lazy/src/tests/tpch.rs similarity index 98% rename from polars/polars-lazy/src/tests/tpch.rs rename to crates/polars-lazy/src/tests/tpch.rs index 5578681f264fc..c5f876477f5c6 100644 --- a/polars/polars-lazy/src/tests/tpch.rs +++ b/crates/polars-lazy/src/tests/tpch.rs @@ -83,7 +83,7 @@ fn test_q2() -> PolarsResult<()> { false, ) .limit(100) - .with_common_subplan_elimination(true); + .with_comm_subplan_elim(true); let out = q.collect()?; let schema = Schema::from_iter([ diff --git a/polars/polars-lazy/src/utils.rs b/crates/polars-lazy/src/utils.rs similarity index 100% rename from polars/polars-lazy/src/utils.rs rename to crates/polars-lazy/src/utils.rs diff --git a/polars/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml similarity index 94% rename from polars/polars-ops/Cargo.toml rename to crates/polars-ops/Cargo.toml index dae80e271e0cd..9b3129326a610 100644 --- a/polars/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -13,6 +13,8 @@ description = "More operations on polars data structures" argminmax = { version = "0.6.1", default-features = false, features = ["float"] } arrow.workspace = true base64 = { version = "0.21", optional = true } +chrono = { version = "0.4", default-features = false, features = ["std"], optional = true } +chrono-tz = { version = "0.8", optional = true } either.workspace = true hex = { version = "0.4", optional = true } indexmap.workspace = true @@ -52,6 +54,7 @@ approx_unique = [] fused = [] cutqcut = ["dtype-categorical", "dtype-struct"] rle = ["dtype-struct"] +timezones 
= ["chrono-tz", "chrono"] # extra utilities for BinaryChunked binary_encoding = ["base64", "hex"] diff --git a/polars/polars-ops/LICENSE b/crates/polars-ops/LICENSE similarity index 100% rename from polars/polars-ops/LICENSE rename to crates/polars-ops/LICENSE diff --git a/polars/polars-ops/README.md b/crates/polars-ops/README.md similarity index 100% rename from polars/polars-ops/README.md rename to crates/polars-ops/README.md diff --git a/polars/polars-ops/src/chunked_array/array/min_max.rs b/crates/polars-ops/src/chunked_array/array/min_max.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/array/min_max.rs rename to crates/polars-ops/src/chunked_array/array/min_max.rs diff --git a/polars/polars-ops/src/chunked_array/array/mod.rs b/crates/polars-ops/src/chunked_array/array/mod.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/array/mod.rs rename to crates/polars-ops/src/chunked_array/array/mod.rs diff --git a/polars/polars-ops/src/chunked_array/array/namespace.rs b/crates/polars-ops/src/chunked_array/array/namespace.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/array/namespace.rs rename to crates/polars-ops/src/chunked_array/array/namespace.rs diff --git a/polars/polars-ops/src/chunked_array/array/sum_mean.rs b/crates/polars-ops/src/chunked_array/array/sum_mean.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/array/sum_mean.rs rename to crates/polars-ops/src/chunked_array/array/sum_mean.rs diff --git a/polars/polars-ops/src/chunked_array/binary/mod.rs b/crates/polars-ops/src/chunked_array/binary/mod.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/binary/mod.rs rename to crates/polars-ops/src/chunked_array/binary/mod.rs diff --git a/polars/polars-ops/src/chunked_array/binary/namespace.rs b/crates/polars-ops/src/chunked_array/binary/namespace.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/binary/namespace.rs rename to 
crates/polars-ops/src/chunked_array/binary/namespace.rs diff --git a/crates/polars-ops/src/chunked_array/datetime/mod.rs b/crates/polars-ops/src/chunked_array/datetime/mod.rs new file mode 100644 index 0000000000000..a84031ff486d0 --- /dev/null +++ b/crates/polars-ops/src/chunked_array/datetime/mod.rs @@ -0,0 +1,4 @@ +#[cfg(feature = "timezones")] +mod replace_time_zone; +#[cfg(feature = "timezones")] +pub use replace_time_zone::*; diff --git a/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs b/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs new file mode 100644 index 0000000000000..cd313c16158fe --- /dev/null +++ b/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs @@ -0,0 +1,24 @@ +use polars_arrow::kernels::replace_time_zone as replace_time_zone_kernel; +use polars_core::prelude::*; + +pub fn replace_time_zone( + ca: &DatetimeChunked, + time_zone: Option<&str>, + use_earliest: Option, +) -> PolarsResult { + let out: PolarsResult<_> = { + let from = ca.time_zone().as_deref().unwrap_or("UTC"); + let to = time_zone.unwrap_or("UTC"); + let chunks = ca + .downcast_iter() + .map(|arr| { + replace_time_zone_kernel(arr, ca.time_unit().to_arrow(), from, to, use_earliest) + }) + .collect::>()?; + let out = unsafe { ChunkedArray::from_chunks(ca.name(), chunks) }; + Ok(out.into_datetime(ca.time_unit(), time_zone.map(|x| x.to_string()))) + }; + let mut out = out?; + out.set_sorted_flag(ca.is_sorted_flag()); + Ok(out) +} diff --git a/polars/polars-ops/src/chunked_array/interpolate.rs b/crates/polars-ops/src/chunked_array/interpolate.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/interpolate.rs rename to crates/polars-ops/src/chunked_array/interpolate.rs diff --git a/polars/polars-ops/src/chunked_array/list/any_all.rs b/crates/polars-ops/src/chunked_array/list/any_all.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/list/any_all.rs rename to 
crates/polars-ops/src/chunked_array/list/any_all.rs diff --git a/polars/polars-ops/src/chunked_array/list/count.rs b/crates/polars-ops/src/chunked_array/list/count.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/list/count.rs rename to crates/polars-ops/src/chunked_array/list/count.rs diff --git a/polars/polars-ops/src/chunked_array/list/hash.rs b/crates/polars-ops/src/chunked_array/list/hash.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/list/hash.rs rename to crates/polars-ops/src/chunked_array/list/hash.rs diff --git a/polars/polars-ops/src/chunked_array/list/min_max.rs b/crates/polars-ops/src/chunked_array/list/min_max.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/list/min_max.rs rename to crates/polars-ops/src/chunked_array/list/min_max.rs diff --git a/polars/polars-ops/src/chunked_array/list/mod.rs b/crates/polars-ops/src/chunked_array/list/mod.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/list/mod.rs rename to crates/polars-ops/src/chunked_array/list/mod.rs diff --git a/polars/polars-ops/src/chunked_array/list/namespace.rs b/crates/polars-ops/src/chunked_array/list/namespace.rs similarity index 99% rename from polars/polars-ops/src/chunked_array/list/namespace.rs rename to crates/polars-ops/src/chunked_array/list/namespace.rs index 9cbb733c4ca87..da61fa9ff5cb4 100644 --- a/polars/polars-ops/src/chunked_array/list/namespace.rs +++ b/crates/polars-ops/src/chunked_array/list/namespace.rs @@ -290,7 +290,7 @@ pub trait ListNameSpaceImpl: AsList { let idx_ca = idx.list().unwrap(); let mut out = list_ca .amortized_iter() - .zip(idx_ca.into_iter()) + .zip(idx_ca) .map(|(opt_s, opt_idx)| { { match (opt_s, opt_idx) { diff --git a/polars/polars-ops/src/chunked_array/list/sets.rs b/crates/polars-ops/src/chunked_array/list/sets.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/list/sets.rs rename to 
crates/polars-ops/src/chunked_array/list/sets.rs diff --git a/polars/polars-ops/src/chunked_array/list/sum_mean.rs b/crates/polars-ops/src/chunked_array/list/sum_mean.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/list/sum_mean.rs rename to crates/polars-ops/src/chunked_array/list/sum_mean.rs diff --git a/polars/polars-ops/src/chunked_array/list/to_struct.rs b/crates/polars-ops/src/chunked_array/list/to_struct.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/list/to_struct.rs rename to crates/polars-ops/src/chunked_array/list/to_struct.rs diff --git a/polars/polars-ops/src/chunked_array/mod.rs b/crates/polars-ops/src/chunked_array/mod.rs similarity index 84% rename from polars/polars-ops/src/chunked_array/mod.rs rename to crates/polars-ops/src/chunked_array/mod.rs index 5da1507191725..62052893ac9fd 100644 --- a/polars/polars-ops/src/chunked_array/mod.rs +++ b/crates/polars-ops/src/chunked_array/mod.rs @@ -1,6 +1,8 @@ #[cfg(feature = "dtype-array")] pub mod array; mod binary; +#[cfg(feature = "timezones")] +pub mod datetime; #[cfg(feature = "interpolate")] mod interpolate; pub mod list; @@ -13,6 +15,8 @@ mod sum; mod top_k; pub use binary::*; +#[cfg(feature = "timezones")] +pub use datetime::*; #[cfg(feature = "interpolate")] pub use interpolate::*; pub use list::*; diff --git a/polars/polars-ops/src/chunked_array/nan_propagating_aggregate.rs b/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/nan_propagating_aggregate.rs rename to crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs diff --git a/polars/polars-ops/src/chunked_array/set.rs b/crates/polars-ops/src/chunked_array/set.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/set.rs rename to crates/polars-ops/src/chunked_array/set.rs diff --git a/polars/polars-ops/src/chunked_array/strings/case.rs 
b/crates/polars-ops/src/chunked_array/strings/case.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/strings/case.rs rename to crates/polars-ops/src/chunked_array/strings/case.rs diff --git a/polars/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/strings/json_path.rs rename to crates/polars-ops/src/chunked_array/strings/json_path.rs diff --git a/polars/polars-ops/src/chunked_array/strings/justify.rs b/crates/polars-ops/src/chunked_array/strings/justify.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/strings/justify.rs rename to crates/polars-ops/src/chunked_array/strings/justify.rs diff --git a/polars/polars-ops/src/chunked_array/strings/mod.rs b/crates/polars-ops/src/chunked_array/strings/mod.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/strings/mod.rs rename to crates/polars-ops/src/chunked_array/strings/mod.rs diff --git a/polars/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs similarity index 99% rename from polars/polars-ops/src/chunked_array/strings/namespace.rs rename to crates/polars-ops/src/chunked_array/strings/namespace.rs index afe1e7134c053..1e056fbcc86b8 100644 --- a/polars/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -336,7 +336,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { let mut builder = ListUtf8ChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size()); - for (opt_s, opt_pat) in ca.into_iter().zip(pat.into_iter()) { + for (opt_s, opt_pat) in ca.into_iter().zip(pat) { match (opt_s, opt_pat) { (_, None) | (None, _) => builder.append_null(), (Some(s), Some(pat)) => { diff --git a/polars/polars-ops/src/chunked_array/strings/replace.rs b/crates/polars-ops/src/chunked_array/strings/replace.rs similarity index 93% 
rename from polars/polars-ops/src/chunked_array/strings/replace.rs rename to crates/polars-ops/src/chunked_array/strings/replace.rs index 5658e15e00ec2..72479ea81b292 100644 --- a/polars/polars-ops/src/chunked_array/strings/replace.rs +++ b/crates/polars-ops/src/chunked_array/strings/replace.rs @@ -61,8 +61,14 @@ pub(super) fn replace_lit_n_char( // set the end of this string region // safety: invariant of Utf8Array tells us that there is a next offset. - if let Some(next) = offsets_iter.next() { - end = *next as usize - 1; + + // must loop to skip null values, as they have the same offsets + for next in offsets_iter.by_ref() { + let new_end = *next as usize - 1; + if new_end != end { + end = new_end; + break; + } } } } diff --git a/polars/polars-ops/src/chunked_array/sum.rs b/crates/polars-ops/src/chunked_array/sum.rs similarity index 100% rename from polars/polars-ops/src/chunked_array/sum.rs rename to crates/polars-ops/src/chunked_array/sum.rs diff --git a/polars/polars-ops/src/chunked_array/top_k.rs b/crates/polars-ops/src/chunked_array/top_k.rs similarity index 93% rename from polars/polars-ops/src/chunked_array/top_k.rs rename to crates/polars-ops/src/chunked_array/top_k.rs index f50e8ad62dcff..c1b5ac9300320 100644 --- a/polars/polars-ops/src/chunked_array/top_k.rs +++ b/crates/polars-ops/src/chunked_array/top_k.rs @@ -17,7 +17,7 @@ impl PartialEq for Compare { impl PartialOrd for Compare { fn partial_cmp(&self, other: &Self) -> Option { - Some(compare_fn_nan_max(&self.0, &other.0)) + Some(self.cmp(other)) } } @@ -25,9 +25,7 @@ impl Eq for Compare {} impl Ord for Compare { fn cmp(&self, other: &Self) -> Ordering { - // Safety: - // we always return Some - unsafe { self.partial_cmp(other).unwrap_unchecked() } + compare_fn_nan_max(&self.0, &other.0) } } diff --git a/polars/polars-ops/src/frame/join/merge_sorted.rs b/crates/polars-ops/src/frame/join/merge_sorted.rs similarity index 100% rename from polars/polars-ops/src/frame/join/merge_sorted.rs rename to 
crates/polars-ops/src/frame/join/merge_sorted.rs diff --git a/polars/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs similarity index 100% rename from polars/polars-ops/src/frame/join/mod.rs rename to crates/polars-ops/src/frame/join/mod.rs diff --git a/polars/polars-ops/src/frame/mod.rs b/crates/polars-ops/src/frame/mod.rs similarity index 100% rename from polars/polars-ops/src/frame/mod.rs rename to crates/polars-ops/src/frame/mod.rs diff --git a/polars/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs similarity index 100% rename from polars/polars-ops/src/frame/pivot/mod.rs rename to crates/polars-ops/src/frame/pivot/mod.rs diff --git a/polars/polars-ops/src/frame/pivot/positioning.rs b/crates/polars-ops/src/frame/pivot/positioning.rs similarity index 99% rename from polars/polars-ops/src/frame/pivot/positioning.rs rename to crates/polars-ops/src/frame/pivot/positioning.rs index 65754c9ec038e..c64f861d6554a 100644 --- a/polars/polars-ops/src/frame/pivot/positioning.rs +++ b/crates/polars-ops/src/frame/pivot/positioning.rs @@ -131,6 +131,8 @@ where let col_locations = &col_locations[offset..offset + len]; let value_agg_phys = value_agg_phys.slice(offset as i64, len); + // todo! 
remove lint silencing + #[allow(clippy::useless_conversion)] for ((row_idx, col_idx), val) in row_locations .iter() .zip(col_locations) diff --git a/polars/polars-ops/src/lib.rs b/crates/polars-ops/src/lib.rs similarity index 100% rename from polars/polars-ops/src/lib.rs rename to crates/polars-ops/src/lib.rs diff --git a/polars/polars-ops/src/prelude.rs b/crates/polars-ops/src/prelude.rs similarity index 100% rename from polars/polars-ops/src/prelude.rs rename to crates/polars-ops/src/prelude.rs diff --git a/polars/polars-ops/src/series/mod.rs b/crates/polars-ops/src/series/mod.rs similarity index 100% rename from polars/polars-ops/src/series/mod.rs rename to crates/polars-ops/src/series/mod.rs diff --git a/polars/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs b/crates/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs similarity index 100% rename from polars/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs rename to crates/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs diff --git a/polars/polars-ops/src/series/ops/approx_algo/mod.rs b/crates/polars-ops/src/series/ops/approx_algo/mod.rs similarity index 100% rename from polars/polars-ops/src/series/ops/approx_algo/mod.rs rename to crates/polars-ops/src/series/ops/approx_algo/mod.rs diff --git a/polars/polars-ops/src/series/ops/approx_unique.rs b/crates/polars-ops/src/series/ops/approx_unique.rs similarity index 100% rename from polars/polars-ops/src/series/ops/approx_unique.rs rename to crates/polars-ops/src/series/ops/approx_unique.rs diff --git a/polars/polars-ops/src/series/ops/arg_min_max.rs b/crates/polars-ops/src/series/ops/arg_min_max.rs similarity index 100% rename from polars/polars-ops/src/series/ops/arg_min_max.rs rename to crates/polars-ops/src/series/ops/arg_min_max.rs diff --git a/polars/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs similarity index 94% rename from polars/polars-ops/src/series/ops/cut.rs rename to 
crates/polars-ops/src/series/ops/cut.rs index 493f884168cc5..40a51bc7f3539 100644 --- a/polars/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -64,16 +64,16 @@ pub fn cut( left_closed: bool, include_breaks: bool, ) -> PolarsResult { - polars_ensure!(!breaks.is_empty(), ShapeMismatch: "Breaks are empty"); polars_ensure!(!breaks.iter().any(|x| x.is_nan()), ComputeError: "Breaks cannot be NaN"); // Breaks must be sorted to cut inputs properly. let mut breaks = breaks; let sorted_breaks = breaks.as_mut_slice(); sorted_breaks.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); polars_ensure!(sorted_breaks.windows(2).all(|x| x[0] != x[1]), Duplicate: "Breaks are not unique"); - - polars_ensure!(sorted_breaks[0] > f64::NEG_INFINITY, ComputeError: "Don't include -inf in breaks"); - polars_ensure!(sorted_breaks[sorted_breaks.len() - 1] < f64::INFINITY, ComputeError: "Don't include inf in breaks"); + if !sorted_breaks.is_empty() { + polars_ensure!(sorted_breaks[0] > f64::NEG_INFINITY, ComputeError: "Don't include -inf in breaks"); + polars_ensure!(sorted_breaks[sorted_breaks.len() - 1] < f64::INFINITY, ComputeError: "Don't include inf in breaks"); + } let cutlabs = match labels { Some(ll) => { diff --git a/polars/polars-ops/src/series/ops/floor_divide.rs b/crates/polars-ops/src/series/ops/floor_divide.rs similarity index 99% rename from polars/polars-ops/src/series/ops/floor_divide.rs rename to crates/polars-ops/src/series/ops/floor_divide.rs index e673125295aa5..f536caeb2d493 100644 --- a/polars/polars-ops/src/series/ops/floor_divide.rs +++ b/crates/polars-ops/src/series/ops/floor_divide.rs @@ -45,7 +45,7 @@ fn floor_div_array( } else { let iter = a .into_iter() - .zip(b.into_iter()) + .zip(b) .map(|(opt_a, opt_b)| match (opt_a, opt_b) { (Some(&a), Some(&b)) => Some(floor_div_element(a, b)), _ => None, diff --git a/polars/polars-ops/src/series/ops/fused.rs b/crates/polars-ops/src/series/ops/fused.rs similarity index 100% rename from 
polars/polars-ops/src/series/ops/fused.rs rename to crates/polars-ops/src/series/ops/fused.rs diff --git a/polars/polars-ops/src/series/ops/is_first.rs b/crates/polars-ops/src/series/ops/is_first.rs similarity index 100% rename from polars/polars-ops/src/series/ops/is_first.rs rename to crates/polars-ops/src/series/ops/is_first.rs diff --git a/polars/polars-ops/src/series/ops/is_unique.rs b/crates/polars-ops/src/series/ops/is_unique.rs similarity index 100% rename from polars/polars-ops/src/series/ops/is_unique.rs rename to crates/polars-ops/src/series/ops/is_unique.rs diff --git a/polars/polars-ops/src/series/ops/log.rs b/crates/polars-ops/src/series/ops/log.rs similarity index 100% rename from polars/polars-ops/src/series/ops/log.rs rename to crates/polars-ops/src/series/ops/log.rs diff --git a/polars/polars-ops/src/series/ops/mod.rs b/crates/polars-ops/src/series/ops/mod.rs similarity index 100% rename from polars/polars-ops/src/series/ops/mod.rs rename to crates/polars-ops/src/series/ops/mod.rs diff --git a/polars/polars-ops/src/series/ops/rle.rs b/crates/polars-ops/src/series/ops/rle.rs similarity index 100% rename from polars/polars-ops/src/series/ops/rle.rs rename to crates/polars-ops/src/series/ops/rle.rs diff --git a/polars/polars-ops/src/series/ops/rolling.rs b/crates/polars-ops/src/series/ops/rolling.rs similarity index 100% rename from polars/polars-ops/src/series/ops/rolling.rs rename to crates/polars-ops/src/series/ops/rolling.rs diff --git a/polars/polars-ops/src/series/ops/search_sorted.rs b/crates/polars-ops/src/series/ops/search_sorted.rs similarity index 100% rename from polars/polars-ops/src/series/ops/search_sorted.rs rename to crates/polars-ops/src/series/ops/search_sorted.rs diff --git a/polars/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs similarity index 100% rename from polars/polars-ops/src/series/ops/to_dummies.rs rename to crates/polars-ops/src/series/ops/to_dummies.rs diff --git 
a/polars/polars-ops/src/series/ops/various.rs b/crates/polars-ops/src/series/ops/various.rs similarity index 82% rename from polars/polars-ops/src/series/ops/various.rs rename to crates/polars-ops/src/series/ops/various.rs index 5bf22ad0f0696..425807f30f438 100644 --- a/polars/polars-ops/src/series/ops/various.rs +++ b/crates/polars-ops/src/series/ops/various.rs @@ -1,5 +1,7 @@ #[cfg(feature = "hash")] use polars_core::export::ahash; +#[cfg(feature = "dtype-struct")] +use polars_core::prelude::sort::arg_sort_multiple::_get_rows_encoded_ca; use polars_core::prelude::*; use polars_core::series::IsSorted; @@ -42,6 +44,14 @@ pub trait SeriesMethods: SeriesSealed { fn is_sorted(&self, options: SortOptions) -> PolarsResult { let s = self.as_series(); + // for struct types we row-encode and recurse + #[cfg(feature = "dtype-struct")] + if matches!(s.dtype(), DataType::Struct(_)) { + let encoded = + _get_rows_encoded_ca("", &[s.clone()], &[options.descending], options.nulls_last)?; + return encoded.into_series().is_sorted(options); + } + // fast paths if (options.descending && options.nulls_last @@ -70,10 +80,12 @@ pub trait SeriesMethods: SeriesSealed { // Compare adjacent elements with no-copy slices that don't include any nulls let offset = !options.nulls_last as i64 * nc as i64; let (s1, s2) = (s.slice(offset, slen), s.slice(offset + 1, slen)); - match options.descending { - true => Ok(Series::gt_eq(&s1, &s2)?.all()), - false => Ok(Series::lt_eq(&s1, &s2)?.all()), - } + let cmp_op = if options.descending { + Series::gt_eq + } else { + Series::lt_eq + }; + Ok(cmp_op(&s1, &s2)?.all()) } } diff --git a/polars/polars-lazy/polars-pipe/Cargo.toml b/crates/polars-pipe/Cargo.toml similarity index 70% rename from polars/polars-lazy/polars-pipe/Cargo.toml rename to crates/polars-pipe/Cargo.toml index 5ee405d57da85..0770f40e5b8e6 100644 --- a/polars/polars-lazy/polars-pipe/Cargo.toml +++ b/crates/polars-pipe/Cargo.toml @@ -14,13 +14,13 @@ crossbeam-queue = { version = "0.3", 
optional = true } enum_dispatch = "0.3" hashbrown.workspace = true num-traits.workspace = true -polars-arrow = { version = "0.31.1", path = "../../polars-arrow", default-features = false } -polars-core = { version = "0.31.1", path = "../../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } -polars-io = { version = "0.31.1", path = "../../polars-io", default-features = false, features = ["ipc", "async"] } -polars-ops = { version = "0.31.1", path = "../../polars-ops", features = ["search_sorted"] } +polars-arrow = { version = "0.31.1", path = "../polars-arrow", default-features = false } +polars-core = { version = "0.31.1", path = "../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } +polars-io = { version = "0.31.1", path = "../polars-io", default-features = false, features = ["ipc", "async"] } +polars-ops = { version = "0.31.1", path = "../polars-ops", features = ["search_sorted"] } polars-plan = { version = "0.31.1", path = "../polars-plan", default-features = false, features = ["compile"] } -polars-row = { version = "0.31.1", path = "../../polars-row" } -polars-utils = { version = "0.31.1", path = "../../polars-utils", features = ["sysinfo"] } +polars-row = { version = "0.31.1", path = "../polars-row" } +polars-utils = { version = "0.31.1", path = "../polars-utils", features = ["sysinfo"] } rayon.workspace = true smartstring = { version = "1" } diff --git a/polars/polars-row/LICENSE b/crates/polars-pipe/LICENSE similarity index 100% rename from polars/polars-row/LICENSE rename to crates/polars-pipe/LICENSE diff --git a/crates/polars-pipe/README.md b/crates/polars-pipe/README.md new file mode 100644 index 0000000000000..9578d9703c103 --- /dev/null +++ b/crates/polars-pipe/README.md @@ -0,0 +1,5 @@ +# Polars Pipe + +`polars-pipe` is a sub-crate that provides OOC (out of core) algorithms to the polars physical plans. + +Not intended for external usage. 
diff --git a/polars/polars-lazy/polars-pipe/src/executors/mod.rs b/crates/polars-pipe/src/executors/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/mod.rs rename to crates/polars-pipe/src/executors/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/operators/filter.rs b/crates/polars-pipe/src/executors/operators/filter.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/operators/filter.rs rename to crates/polars-pipe/src/executors/operators/filter.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/operators/function.rs b/crates/polars-pipe/src/executors/operators/function.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/operators/function.rs rename to crates/polars-pipe/src/executors/operators/function.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/operators/mod.rs b/crates/polars-pipe/src/executors/operators/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/operators/mod.rs rename to crates/polars-pipe/src/executors/operators/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/operators/pass.rs b/crates/polars-pipe/src/executors/operators/pass.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/operators/pass.rs rename to crates/polars-pipe/src/executors/operators/pass.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/operators/placeholder.rs b/crates/polars-pipe/src/executors/operators/placeholder.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/operators/placeholder.rs rename to crates/polars-pipe/src/executors/operators/placeholder.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/operators/projection.rs b/crates/polars-pipe/src/executors/operators/projection.rs similarity index 57% rename from polars/polars-lazy/polars-pipe/src/executors/operators/projection.rs rename to 
crates/polars-pipe/src/executors/operators/projection.rs index cf407c6d13afe..efbd0d6b29533 100644 --- a/polars/polars-lazy/polars-pipe/src/executors/operators/projection.rs +++ b/crates/polars-pipe/src/executors/operators/projection.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use polars_core::error::PolarsResult; use polars_core::frame::DataFrame; use polars_core::schema::SchemaRef; +use polars_plan::utils::rename_cse_tmp_series; use crate::expressions::PhysicalPipedExpr; use crate::operators::{DataChunk, Operator, OperatorResult, PExecutionContext}; @@ -46,6 +47,7 @@ impl Operator for FastProjectionOperator { #[derive(Clone)] pub(crate) struct ProjectionOperator { pub(crate) exprs: Vec>, + pub(crate) cse_exprs: Option, } impl Operator for ProjectionOperator { @@ -54,19 +56,35 @@ impl Operator for ProjectionOperator { context: &PExecutionContext, chunk: &DataChunk, ) -> PolarsResult { + // add temporary cse column to the chunk + let cse_owned_chunk; + let chunk = if let Some(hstack) = &mut self.cse_exprs { + let OperatorResult::Finished(out) = hstack.execute(context, chunk)? 
else { + unreachable!() + }; + cse_owned_chunk = out; + &cse_owned_chunk + } else { + chunk + }; + let mut has_literals = false; let mut has_empty = false; let mut projected = self .exprs .iter() .map(|e| { - let s = e.evaluate(chunk, context.execution_state.as_any())?; - if s.len() == 1 { - has_literals = true; - } - if s.len() == 0 { - has_empty = true; + #[allow(unused_mut)] + let mut s = e.evaluate(chunk, context.execution_state.as_any())?; + + // correct the cse name + if self.cse_exprs.is_some() { + rename_cse_tmp_series(&mut s); } + + has_literals |= s.len() == 1; + has_empty |= s.len() == 0; + Ok(s) }) .collect::>>()?; @@ -92,7 +110,11 @@ impl Operator for ProjectionOperator { Box::new(self.clone()) } fn fmt(&self) -> &str { - "projection" + if self.cse_exprs.is_some() { + "projection[cse]" + } else { + "projection" + } } } @@ -100,6 +122,11 @@ impl Operator for ProjectionOperator { pub(crate) struct HstackOperator { pub(crate) exprs: Vec>, pub(crate) input_schema: SchemaRef, + pub(crate) cse_exprs: Option>, + // add columns without any checks + // this is needed for cse, as the temporary columns + // may have a different size + pub(crate) unchecked: bool, } impl Operator for HstackOperator { @@ -108,15 +135,44 @@ impl Operator for HstackOperator { context: &PExecutionContext, chunk: &DataChunk, ) -> PolarsResult { + // add temporary cse column to the chunk + let width = chunk.data.width(); + let cse_owned_chunk; + let chunk = if let Some(hstack) = &mut self.cse_exprs { + let OperatorResult::Finished(out) = hstack.execute(context, chunk)? 
else { + unreachable!() + }; + cse_owned_chunk = out; + &cse_owned_chunk + } else { + chunk + }; + let projected = self .exprs .iter() - .map(|e| e.evaluate(chunk, context.execution_state.as_any())) + .map(|e| { + #[allow(unused_mut)] + let mut res = e.evaluate(chunk, context.execution_state.as_any()); + + if self.cse_exprs.is_some() { + res = res.map(|mut s| { + rename_cse_tmp_series(&mut s); + s + }) + } + res + }) .collect::>>()?; - let mut df = chunk.data.clone(); + let mut df = DataFrame::new_no_checks(chunk.data.get_columns()[..width].to_vec()); + let schema = &*self.input_schema; - df._add_columns(projected, schema)?; + if self.unchecked { + unsafe { df.get_columns_mut().extend(projected) } + } else { + df._add_columns(projected, schema)?; + } let chunk = chunk.with_data(df); Ok(OperatorResult::Finished(chunk)) @@ -125,6 +181,10 @@ impl Operator for HstackOperator { Box::new(self.clone()) } fn fmt(&self) -> &str { - "hstack" + if self.cse_exprs.is_some() { + "hstack[cse]" + } else { + "hstack" + } } } diff --git a/polars/polars-lazy/polars-pipe/src/executors/operators/reproject.rs b/crates/polars-pipe/src/executors/operators/reproject.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/operators/reproject.rs rename to crates/polars-pipe/src/executors/operators/reproject.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/file_sink.rs b/crates/polars-pipe/src/executors/sinks/file_sink.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/file_sink.rs rename to crates/polars-pipe/src/executors/sinks/file_sink.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs diff --git 
a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/count.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/count.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/count.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/count.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/mean.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/mean.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/mean.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/mean.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/min_max.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/min_max.rs similarity index 100% rename from 
polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/min_max.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/min_max.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/mod.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/mod.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/sum.rs b/crates/polars-pipe/src/executors/sinks/groupby/aggregates/sum.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/aggregates/sum.rs rename to crates/polars-pipe/src/executors/sinks/groupby/aggregates/sum.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/eval.rs b/crates/polars-pipe/src/executors/sinks/groupby/generic/eval.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/eval.rs rename to crates/polars-pipe/src/executors/sinks/groupby/generic/eval.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/global.rs b/crates/polars-pipe/src/executors/sinks/groupby/generic/global.rs similarity index 98% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/global.rs rename to crates/polars-pipe/src/executors/sinks/groupby/generic/global.rs index f0d480b53c3d0..a65e31c8b30d4 100644 --- a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/global.rs +++ 
b/crates/polars-pipe/src/executors/sinks/groupby/generic/global.rs @@ -174,7 +174,7 @@ impl GlobalTable { } } - pub(super) fn merge_local_map(&self, finalized_local_map: &mut AggHashTable) { + pub(super) fn merge_local_map(&self, finalized_local_map: &AggHashTable) { // TODO! maybe parallelize? // needs unsafe, first benchmark. for (partition_i, pt_map) in self.inner_maps.iter().enumerate() { diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs similarity index 98% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs rename to crates/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs index 30088cfbd864a..7a0afc446c2df 100644 --- a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs +++ b/crates/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs @@ -152,14 +152,14 @@ impl AggHashTable { false } - pub(super) fn combine(&mut self, other: &mut Self) { + pub(super) fn combine(&mut self, other: &Self) { self.combine_impl(other, |_hash| true) } pub(super) fn combine_on_partition( &mut self, partition: usize, - other: &mut AggHashTable, + other: &AggHashTable, ) { let partition = partition as u64; self.combine_impl(other, |hash| { @@ -169,7 +169,7 @@ impl AggHashTable { pub(super) fn combine_impl( &mut self, - other: &mut AggHashTable, + other: &AggHashTable, on_condition: C, ) // takes a hash and if true, this keys will be combined diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/mod.rs b/crates/polars-pipe/src/executors/sinks/groupby/generic/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/mod.rs rename to crates/polars-pipe/src/executors/sinks/groupby/generic/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs 
b/crates/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs rename to crates/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/sink.rs b/crates/polars-pipe/src/executors/sinks/groupby/generic/sink.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/sink.rs rename to crates/polars-pipe/src/executors/sinks/groupby/generic/sink.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/source.rs b/crates/polars-pipe/src/executors/sinks/groupby/generic/source.rs similarity index 98% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/source.rs rename to crates/polars-pipe/src/executors/sinks/groupby/generic/source.rs index 9486745295440..d174e70998d9c 100644 --- a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/source.rs +++ b/crates/polars-pipe/src/executors/sinks/groupby/generic/source.rs @@ -64,7 +64,7 @@ impl Source for GroupBySource { if partition_dir.exists() { for file in std::fs::read_dir(partition_dir).expect("should be there") { let spilled = file.unwrap().path(); - let file = std::fs::File::open(spilled)?; + let file = polars_utils::open_file(spilled)?; let reader = IpcReader::new(file); let spilled = reader.finish().unwrap(); if spilled.n_chunks() > 1 { diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs b/crates/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs similarity index 97% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs rename to crates/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs index 3abea87e49119..0a4f5c52b3c7b 100644 --- 
a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs +++ b/crates/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs @@ -171,10 +171,7 @@ impl SpillPartitions { other.finish(); let other_payloads = std::mem::take(&mut other.finished_payloads); - for (part_self, part_other) in self - .finished_payloads - .iter_mut() - .zip(other_payloads.into_iter()) + for (part_self, part_other) in self.finished_payloads.iter_mut().zip(other_payloads) { part_self.extend(part_other) } @@ -214,7 +211,7 @@ impl SpillPartitions { }, ) }) - .chain(flattened.into_iter()) + .chain(flattened) } } @@ -280,7 +277,7 @@ impl ThreadLocalTable { } pub(super) fn combine(&mut self, other: &mut Self) { - self.inner_map.combine(&mut other.inner_map); + self.inner_map.combine(&other.inner_map); self.spill_partitions.combine(&mut other.spill_partitions); } diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/mod.rs b/crates/polars-pipe/src/executors/sinks/groupby/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/mod.rs rename to crates/polars-pipe/src/executors/sinks/groupby/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/ooc.rs b/crates/polars-pipe/src/executors/sinks/groupby/ooc.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/ooc.rs rename to crates/polars-pipe/src/executors/sinks/groupby/ooc.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/ooc_state.rs b/crates/polars-pipe/src/executors/sinks/groupby/ooc_state.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/ooc_state.rs rename to crates/polars-pipe/src/executors/sinks/groupby/ooc_state.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs b/crates/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs similarity index 100% rename from 
polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs rename to crates/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/string.rs b/crates/polars-pipe/src/executors/sinks/groupby/string.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/string.rs rename to crates/polars-pipe/src/executors/sinks/groupby/string.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/utils.rs b/crates/polars-pipe/src/executors/sinks/groupby/utils.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/groupby/utils.rs rename to crates/polars-pipe/src/executors/sinks/groupby/utils.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/io.rs b/crates/polars-pipe/src/executors/sinks/io.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/io.rs rename to crates/polars-pipe/src/executors/sinks/io.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/joins/cross.rs b/crates/polars-pipe/src/executors/sinks/joins/cross.rs similarity index 99% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/joins/cross.rs rename to crates/polars-pipe/src/executors/sinks/joins/cross.rs index d3207379e8189..3782b25b9903c 100644 --- a/polars/polars-lazy/polars-pipe/src/executors/sinks/joins/cross.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/cross.rs @@ -37,7 +37,7 @@ impl Sink for CrossJoin { fn combine(&mut self, other: &mut dyn Sink) { let other = other.as_any().downcast_mut::().unwrap(); let other_chunks = std::mem::take(&mut other.chunks); - self.chunks.extend(other_chunks.into_iter()); + self.chunks.extend(other_chunks); } fn split(&self, _thread_no: usize) -> Box { diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/joins/generic_build.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs similarity index 
100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/joins/generic_build.rs rename to crates/polars-pipe/src/executors/sinks/joins/generic_build.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/joins/inner_left.rs b/crates/polars-pipe/src/executors/sinks/joins/inner_left.rs similarity index 97% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/joins/inner_left.rs rename to crates/polars-pipe/src/executors/sinks/joins/inner_left.rs index 92d0f2366ea03..cfa093509084d 100644 --- a/polars/polars-lazy/polars-pipe/src/executors/sinks/joins/inner_left.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/inner_left.rs @@ -226,7 +226,13 @@ impl GenericJoinProbe { } let right_df = self.df_a.as_ref(); - let left_df = unsafe { chunk.data._take_unchecked_slice(&self.join_tuples_b, false) }; + // join tuples of left joins are always sorted + // this will ensure sorted flags maintain + let left_df = unsafe { + chunk + .data + ._take_unchecked_slice_sorted(&self.join_tuples_b, false, IsSorted::Ascending) + }; let right_df = unsafe { right_df._take_opt_chunked_unchecked_seq(&self.join_tuples_a_left_join) }; diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/joins/mod.rs b/crates/polars-pipe/src/executors/sinks/joins/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/joins/mod.rs rename to crates/polars-pipe/src/executors/sinks/joins/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/memory.rs b/crates/polars-pipe/src/executors/sinks/memory.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/memory.rs rename to crates/polars-pipe/src/executors/sinks/memory.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/mod.rs b/crates/polars-pipe/src/executors/sinks/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/mod.rs rename to crates/polars-pipe/src/executors/sinks/mod.rs diff 
--git a/polars/polars-lazy/polars-pipe/src/executors/sinks/ordered.rs b/crates/polars-pipe/src/executors/sinks/ordered.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/ordered.rs rename to crates/polars-pipe/src/executors/sinks/ordered.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/reproject.rs b/crates/polars-pipe/src/executors/sinks/reproject.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/reproject.rs rename to crates/polars-pipe/src/executors/sinks/reproject.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/slice.rs b/crates/polars-pipe/src/executors/sinks/slice.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/slice.rs rename to crates/polars-pipe/src/executors/sinks/slice.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/sort/mod.rs b/crates/polars-pipe/src/executors/sinks/sort/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/sort/mod.rs rename to crates/polars-pipe/src/executors/sinks/sort/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/sort/ooc.rs b/crates/polars-pipe/src/executors/sinks/sort/ooc.rs similarity index 96% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/sort/ooc.rs rename to crates/polars-pipe/src/executors/sinks/sort/ooc.rs index d66ed9b73a492..21d469f59608b 100644 --- a/polars/polars-lazy/polars-pipe/src/executors/sinks/sort/ooc.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/ooc.rs @@ -3,6 +3,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use crossbeam_queue::SegQueue; use polars_core::prelude::*; +use polars_core::series::IsSorted; use polars_core::utils::accumulate_dataframes_vertical_unchecked; use polars_core::POOL; use polars_io::ipc::IpcReader; @@ -15,7 +16,7 @@ use crate::executors::sinks::sort::source::SortSource; use crate::operators::FinalizedSink; pub(super) fn 
read_df(path: &Path) -> PolarsResult { - let file = std::fs::File::open(path)?; + let file = polars_utils::open_file(path)?; IpcReader::new(file).set_rechunk(false).finish() } @@ -186,8 +187,8 @@ fn partition_df(df: DataFrame, partitions: &IdxCa) -> PolarsResult<(DfIter, IdxC let out = match groups { GroupsProxy::Idx(idx) => { let iter = idx.into_iter().map(move |(_, group)| { - // groups are in bounds - unsafe { df._take_unchecked_slice(&group, false) } + // groups are in bounds and sorted + unsafe { df._take_unchecked_slice_sorted(&group, false, IsSorted::Ascending) } }); Box::new(iter) as DfIter } diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/sort/sink.rs b/crates/polars-pipe/src/executors/sinks/sort/sink.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/sort/sink.rs rename to crates/polars-pipe/src/executors/sinks/sort/sink.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/sort/sink_multiple.rs b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/sort/sink_multiple.rs rename to crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/sort/source.rs b/crates/polars-pipe/src/executors/sinks/sort/source.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/sort/source.rs rename to crates/polars-pipe/src/executors/sinks/sort/source.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sinks/utils.rs b/crates/polars-pipe/src/executors/sinks/utils.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sinks/utils.rs rename to crates/polars-pipe/src/executors/sinks/utils.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs similarity index 100% rename from 
polars/polars-lazy/polars-pipe/src/executors/sources/csv.rs rename to crates/polars-pipe/src/executors/sources/csv.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sources/frame.rs b/crates/polars-pipe/src/executors/sources/frame.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sources/frame.rs rename to crates/polars-pipe/src/executors/sources/frame.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sources/ipc_one_shot.rs b/crates/polars-pipe/src/executors/sources/ipc_one_shot.rs similarity index 94% rename from polars/polars-lazy/polars-pipe/src/executors/sources/ipc_one_shot.rs rename to crates/polars-pipe/src/executors/sources/ipc_one_shot.rs index 0343c5dfd7e85..6398956f79ce5 100644 --- a/polars/polars-lazy/polars-pipe/src/executors/sources/ipc_one_shot.rs +++ b/crates/polars-pipe/src/executors/sources/ipc_one_shot.rs @@ -15,7 +15,7 @@ pub struct IpcSourceOneShot { impl IpcSourceOneShot { #[allow(unused_variables)] pub(crate) fn new(path: &Path) -> PolarsResult { - let file = File::open(path)?; + let file = polars_utils::open_file(path)?; let reader = Some(IpcReader::new(file)); Ok(IpcSourceOneShot { reader }) diff --git a/polars/polars-lazy/polars-pipe/src/executors/sources/mod.rs b/crates/polars-pipe/src/executors/sources/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sources/mod.rs rename to crates/polars-pipe/src/executors/sources/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sources/parquet.rs rename to crates/polars-pipe/src/executors/sources/parquet.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sources/reproject.rs b/crates/polars-pipe/src/executors/sources/reproject.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sources/reproject.rs rename to 
crates/polars-pipe/src/executors/sources/reproject.rs diff --git a/polars/polars-lazy/polars-pipe/src/executors/sources/union.rs b/crates/polars-pipe/src/executors/sources/union.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/executors/sources/union.rs rename to crates/polars-pipe/src/executors/sources/union.rs diff --git a/polars/polars-lazy/polars-pipe/src/expressions.rs b/crates/polars-pipe/src/expressions.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/expressions.rs rename to crates/polars-pipe/src/expressions.rs diff --git a/polars/polars-lazy/polars-pipe/src/lib.rs b/crates/polars-pipe/src/lib.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/lib.rs rename to crates/polars-pipe/src/lib.rs diff --git a/polars/polars-lazy/polars-pipe/src/operators/chunks.rs b/crates/polars-pipe/src/operators/chunks.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/operators/chunks.rs rename to crates/polars-pipe/src/operators/chunks.rs diff --git a/polars/polars-lazy/polars-pipe/src/operators/context.rs b/crates/polars-pipe/src/operators/context.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/operators/context.rs rename to crates/polars-pipe/src/operators/context.rs diff --git a/polars/polars-lazy/polars-pipe/src/operators/mod.rs b/crates/polars-pipe/src/operators/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/operators/mod.rs rename to crates/polars-pipe/src/operators/mod.rs diff --git a/polars/polars-lazy/polars-pipe/src/operators/operator.rs b/crates/polars-pipe/src/operators/operator.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/operators/operator.rs rename to crates/polars-pipe/src/operators/operator.rs diff --git a/polars/polars-lazy/polars-pipe/src/operators/sink.rs b/crates/polars-pipe/src/operators/sink.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/operators/sink.rs rename 
to crates/polars-pipe/src/operators/sink.rs diff --git a/polars/polars-lazy/polars-pipe/src/operators/source.rs b/crates/polars-pipe/src/operators/source.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/operators/source.rs rename to crates/polars-pipe/src/operators/source.rs diff --git a/polars/polars-lazy/polars-pipe/src/pipeline/config.rs b/crates/polars-pipe/src/pipeline/config.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/pipeline/config.rs rename to crates/polars-pipe/src/pipeline/config.rs diff --git a/polars/polars-lazy/polars-pipe/src/pipeline/convert.rs b/crates/polars-pipe/src/pipeline/convert.rs similarity index 90% rename from polars/polars-lazy/polars-pipe/src/pipeline/convert.rs rename to crates/polars-pipe/src/pipeline/convert.rs index 358ef6df992e7..05c3e784fc53b 100644 --- a/polars/polars-lazy/polars-pipe/src/pipeline/convert.rs +++ b/crates/polars-pipe/src/pipeline/convert.rs @@ -7,6 +7,7 @@ use polars_core::prelude::*; use polars_core::with_match_physical_integer_polars_type; use polars_plan::prelude::*; +use crate::executors::operators::HstackOperator; use crate::executors::sinks::groupby::aggregates::convert_to_hash_agg; use crate::executors::sinks::groupby::GenericGroupby2; use crate::executors::sinks::*; @@ -17,7 +18,7 @@ use crate::pipeline::PipeLine; fn exprs_to_physical( exprs: &[Node], - expr_arena: &mut Arena, + expr_arena: &Arena, to_physical: &F, schema: Option<&SchemaRef>, ) -> PolarsResult>> @@ -118,7 +119,7 @@ where pub fn get_sink( node: Node, - lp_arena: &mut Arena, + lp_arena: &Arena, expr_arena: &mut Arena, to_physical: &F, ) -> PolarsResult> @@ -395,10 +396,29 @@ pub fn get_dummy_operator() -> Box { Box::new(operators::PlaceHolder {}) } +fn get_hstack( + exprs: &[Node], + expr_arena: &Arena, + to_physical: &F, + input_schema: SchemaRef, + cse_exprs: Option>, + unchecked: bool, +) -> PolarsResult +where + F: Fn(Node, &Arena, Option<&SchemaRef>) -> PolarsResult>, +{ + 
Ok(operators::HstackOperator { + exprs: exprs_to_physical(exprs, expr_arena, &to_physical, Some(&input_schema))?, + input_schema, + cse_exprs, + unchecked, + }) +} + pub fn get_operator( node: Node, - lp_arena: &mut Arena, - expr_arena: &mut Arena, + lp_arena: &Arena, + expr_arena: &Arena, to_physical: &F, ) -> PolarsResult> where @@ -408,17 +428,57 @@ where let op = match lp_arena.get(node) { Projection { expr, input, .. } => { let input_schema = lp_arena.get(*input).schema(lp_arena); + + let cse_exprs = expr.cse_exprs(); + let cse_exprs = if cse_exprs.is_empty() { + None + } else { + Some(get_hstack( + cse_exprs, + expr_arena, + to_physical, + (*input_schema).clone(), + None, + true, + )?) + }; + let op = operators::ProjectionOperator { - exprs: exprs_to_physical(expr, expr_arena, &to_physical, Some(&input_schema))?, + exprs: exprs_to_physical( + expr.default_exprs(), + expr_arena, + &to_physical, + Some(&input_schema), + )?, + cse_exprs, }; Box::new(op) as Box } HStack { exprs, input, .. 
} => { - let input_schema = (*lp_arena.get(*input).schema(lp_arena)).clone(); - let op = operators::HstackOperator { - exprs: exprs_to_physical(exprs, expr_arena, &to_physical, Some(&input_schema))?, - input_schema, + let input_schema = lp_arena.get(*input).schema(lp_arena); + + let cse_exprs = exprs.cse_exprs(); + let cse_exprs = if cse_exprs.is_empty() { + None + } else { + Some(Box::new(get_hstack( + cse_exprs, + expr_arena, + to_physical, + (*input_schema).clone(), + None, + true, + )?)) }; + let op = get_hstack( + exprs.default_exprs(), + expr_arena, + to_physical, + (*input_schema).clone(), + cse_exprs, + false, + )?; + Box::new(op) as Box } Selection { predicate, input } => { @@ -458,7 +518,7 @@ pub fn create_pipeline( operators: Vec>, operator_nodes: Vec, sink_nodes: Vec<(usize, Node, Rc>)>, - lp_arena: &mut Arena, + lp_arena: &Arena, expr_arena: &mut Arena, to_physical: F, verbose: bool, diff --git a/polars/polars-lazy/polars-pipe/src/pipeline/dispatcher.rs b/crates/polars-pipe/src/pipeline/dispatcher.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/pipeline/dispatcher.rs rename to crates/polars-pipe/src/pipeline/dispatcher.rs diff --git a/polars/polars-lazy/polars-pipe/src/pipeline/mod.rs b/crates/polars-pipe/src/pipeline/mod.rs similarity index 100% rename from polars/polars-lazy/polars-pipe/src/pipeline/mod.rs rename to crates/polars-pipe/src/pipeline/mod.rs diff --git a/polars/polars-lazy/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml similarity index 88% rename from polars/polars-lazy/polars-plan/Cargo.toml rename to crates/polars-plan/Cargo.toml index 090f7eded60a4..a741fd14ab348 100644 --- a/polars/polars-lazy/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -6,6 +6,9 @@ license = "MIT" repository = "https://github.com/pola-rs/polars" description = "Lazy query engine for the Polars DataFrame library" +[lib] +doctest = false + # See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] @@ -16,12 +19,12 @@ chrono-tz = { version = "0.8", optional = true } ciborium = { version = "0.2", optional = true } futures = { version = "0.3.25", optional = true } once_cell.workspace = true -polars-arrow = { version = "0.31.1", path = "../../polars-arrow" } -polars-core = { version = "0.31.1", path = "../../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } -polars-io = { version = "0.31.1", path = "../../polars-io", features = ["lazy", "csv"], default-features = false } -polars-ops = { version = "0.31.1", path = "../../polars-ops", default-features = false } -polars-time = { version = "0.31.1", path = "../../polars-time", optional = true } -polars-utils = { version = "0.31.1", path = "../../polars-utils" } +polars-arrow = { version = "0.31.1", path = "../polars-arrow" } +polars-core = { version = "0.31.1", path = "../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } +polars-io = { version = "0.31.1", path = "../polars-io", features = ["lazy", "csv"], default-features = false } +polars-ops = { version = "0.31.1", path = "../polars-ops", default-features = false } +polars-time = { version = "0.31.1", path = "../polars-time", optional = true } +polars-utils = { version = "0.31.1", path = "../polars-utils" } pyo3 = { version = "0.19", optional = true } rayon.workspace = true regex = { version = "1.6", optional = true } diff --git a/polars/polars-sql/LICENSE b/crates/polars-plan/LICENSE similarity index 100% rename from polars/polars-sql/LICENSE rename to crates/polars-plan/LICENSE diff --git a/crates/polars-plan/src/constants.rs b/crates/polars-plan/src/constants.rs new file mode 100644 index 0000000000000..f260e7ffcaa2f --- /dev/null +++ b/crates/polars-plan/src/constants.rs @@ -0,0 +1,2 @@ +pub static MAP_LIST_NAME: &str = "map_list"; +pub static CSE_REPLACED: &str = "__POLARS_CSER_"; diff --git 
a/polars/polars-lazy/polars-plan/src/dot.rs b/crates/polars-plan/src/dot.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dot.rs rename to crates/polars-plan/src/dot.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/arithmetic.rs b/crates/polars-plan/src/dsl/arithmetic.rs similarity index 84% rename from polars/polars-lazy/polars-plan/src/dsl/arithmetic.rs rename to crates/polars-plan/src/dsl/arithmetic.rs index a81a92fc37d1d..5cf14575cba30 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/arithmetic.rs +++ b/crates/polars-plan/src/dsl/arithmetic.rs @@ -53,7 +53,7 @@ impl Expr { pub fn pow>(self, exponent: E) -> Self { Expr::Function { input: vec![self, exponent.into()], - function: FunctionExpr::Pow, + function: FunctionExpr::Pow(PowFunction::Generic), options: FunctionOptions { collect_groups: ApplyOptions::ApplyFlat, ..Default::default() @@ -61,6 +61,16 @@ impl Expr { } } + /// Compute the square root of the given expression + pub fn sqrt(self) -> Self { + self.map_private(FunctionExpr::Pow(PowFunction::Sqrt)) + } + + /// Compute the cube root of the given expression + pub fn cbrt(self) -> Self { + self.map_private(FunctionExpr::Pow(PowFunction::Cbrt)) + } + /// Compute the cosine of the given expression #[cfg(feature = "trigonometry")] pub fn cos(self) -> Self { @@ -103,6 +113,19 @@ impl Expr { self.map_private(FunctionExpr::Trigonometry(TrigonometricFunction::ArcTan)) } + /// Compute the inverse tangent of the given expression, with the angle expressed as the argument of a complex number + #[cfg(feature = "trigonometry")] + pub fn arctan2(self, x: Self) -> Self { + Expr::Function { + input: vec![self, x], + function: FunctionExpr::Atan2, + options: FunctionOptions { + collect_groups: ApplyOptions::ApplyFlat, + ..Default::default() + }, + } + } + /// Compute the hyperbolic cosine of the given expression #[cfg(feature = "trigonometry")] pub fn cosh(self) -> Self { diff --git a/crates/polars-plan/src/dsl/arity.rs 
b/crates/polars-plan/src/dsl/arity.rs new file mode 100644 index 0000000000000..05ff22df52b08 --- /dev/null +++ b/crates/polars-plan/src/dsl/arity.rs @@ -0,0 +1,155 @@ +use super::*; + +/// Utility struct for the `when-then-otherwise` expression. +/// +/// Represents the state of the expression after [when] is called. +/// +/// In this state, `then` must be called to continue to finish the expression. +#[derive(Clone)] +pub struct When { + condition: Expr, +} + +/// Utility struct for the `when-then-otherwise` expression. +/// +/// Represents the state of the expression after `when(...).then(...)` is called. +#[derive(Clone)] +pub struct Then { + condition: Expr, + statement: Expr, +} + +/// Utility struct for the `when-then-otherwise` expression. +/// +/// Represents the state of the expression after an additional `when` is called. +/// +/// In this state, `then` must be called to continue to finish the expression. +#[derive(Clone)] +pub struct ChainedWhen { + conditions: Vec, + statements: Vec, +} + +/// Utility struct for the `when-then-otherwise` expression. +/// +/// Represents the state of the expression after an additional `then` is called. +#[derive(Clone)] +pub struct ChainedThen { + conditions: Vec, + statements: Vec, +} + +impl When { + /// Add a condition to the `when-then-otherwise` expression. + pub fn then>(self, expr: E) -> Then { + Then { + condition: self.condition, + statement: expr.into(), + } + } +} + +impl Then { + /// Attach a statement to the corresponding condition. + pub fn when>(self, condition: E) -> ChainedWhen { + ChainedWhen { + conditions: vec![self.condition, condition.into()], + statements: vec![self.statement], + } + } + + /// Define a default for the `when-then-otherwise` expression. 
+ pub fn otherwise>(self, statement: E) -> Expr { + ternary_expr(self.condition, self.statement, statement.into()) + } +} + +impl ChainedWhen { + pub fn then>(mut self, statement: E) -> ChainedThen { + self.statements.push(statement.into()); + ChainedThen { + conditions: self.conditions, + statements: self.statements, + } + } +} + +impl ChainedThen { + /// Add another condition to the `when-then-otherwise` expression. + pub fn when>(mut self, condition: E) -> ChainedWhen { + self.conditions.push(condition.into()); + + ChainedWhen { + conditions: self.conditions, + statements: self.statements, + } + } + + /// Define a default for the `when-then-otherwise` expression. + pub fn otherwise>(self, expr: E) -> Expr { + // we iterate the preds/ exprs last in first out + // and nest them. + // + // // this expr: + // when((col('x') == 'a')).then(1) + // .when(col('x') == 'b').then(2) + // .when(col('x') == 'c').then(3) + // .otherwise(4) + // + // needs to become: + // when((col('x') == 'a')).then(1) - + // .otherwise( | + // when(col('x') == 'b').then(2) - | + // .otherwise( | | + // pl.when(col('x') == 'c').then(3) | | + // .otherwise(4) | inner | outer + // ) | | + // ) _| _| + // + // by iterating LIFO we first create + // `inner` and then assign that to `otherwise`, + // which will be used in the next layer `outer` + // + + let conditions_iter = self.conditions.into_iter().rev(); + let mut statements_iter = self.statements.into_iter().rev(); + + let mut otherwise = expr.into(); + + for e in conditions_iter { + otherwise = ternary_expr( + e, + statements_iter + .next() + .expect("expr expected, did you call when().then().otherwise?"), + otherwise, + ); + } + + otherwise + } +} + +/// Start a `when-then-otherwise` expression. 
+pub fn when>(condition: E) -> When { + When { + condition: condition.into(), + } +} + +pub fn ternary_expr(predicate: Expr, truthy: Expr, falsy: Expr) -> Expr { + Expr::Ternary { + predicate: Box::new(predicate), + truthy: Box::new(truthy), + falsy: Box::new(falsy), + } +} + +/// Compute `op(l, r)` (or equivalently `l op r`). `l` and `r` must have types compatible with the Operator. +pub fn binary_expr(l: Expr, op: Operator, r: Expr) -> Expr { + Expr::BinaryExpr { + left: Box::new(l), + op, + right: Box::new(r), + } +} diff --git a/polars/polars-lazy/polars-plan/src/dsl/array.rs b/crates/polars-plan/src/dsl/array.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/array.rs rename to crates/polars-plan/src/dsl/array.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/binary.rs b/crates/polars-plan/src/dsl/binary.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/binary.rs rename to crates/polars-plan/src/dsl/binary.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/cat.rs b/crates/polars-plan/src/dsl/cat.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/cat.rs rename to crates/polars-plan/src/dsl/cat.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/dt.rs b/crates/polars-plan/src/dsl/dt.rs similarity index 92% rename from polars/polars-lazy/polars-plan/src/dsl/dt.rs rename to crates/polars-plan/src/dsl/dt.rs index ed6298690dcf1..f237a2c02c4ba 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/dt.rs +++ b/crates/polars-plan/src/dsl/dt.rs @@ -93,17 +93,6 @@ impl DateLikeNameSpace { ) } - /// Localize tz-naive Datetime Series to tz-aware Datetime Series. - // - // This method takes a naive Datetime Series and makes this time zone aware. - // It does not move the time to another time zone. 
- #[cfg(feature = "timezones")] - #[deprecated(note = "use replace_time_zone")] - pub fn tz_localize(self, tz: TimeZone) -> Expr { - self.0 - .map_private(FunctionExpr::TemporalExpr(TemporalFunction::TzLocalize(tz))) - } - /// Get the year of a Date/Datetime pub fn year(self) -> Expr { self.0 @@ -226,12 +215,10 @@ impl DateLikeNameSpace { .map_private(FunctionExpr::TemporalExpr(TemporalFunction::TimeStamp(tu))) } - pub fn truncate>(self, every: S, offset: S) -> Expr { - let every = every.as_ref().into(); - let offset = offset.as_ref().into(); + pub fn truncate(self, options: TruncateOptions) -> Expr { self.0 .map_private(FunctionExpr::TemporalExpr(TemporalFunction::Truncate( - every, offset, + options, ))) } @@ -285,11 +272,9 @@ impl DateLikeNameSpace { time_zone: Option, use_earliest: Option, ) -> Expr { - self.0 - .map_private(FunctionExpr::TemporalExpr(TemporalFunction::CastTimezone( - time_zone, - use_earliest, - ))) + self.0.map_private(FunctionExpr::TemporalExpr( + TemporalFunction::ReplaceTimeZone(time_zone, use_earliest), + )) } pub fn combine(self, time: Expr, tu: TimeUnit) -> Expr { diff --git a/polars/polars-lazy/polars-plan/src/dsl/expr.rs b/crates/polars-plan/src/dsl/expr.rs similarity index 99% rename from polars/polars-lazy/polars-plan/src/dsl/expr.rs rename to crates/polars-plan/src/dsl/expr.rs index 0414145efc756..a321ef0626ee8 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/expr.rs +++ b/crates/polars-plan/src/dsl/expr.rs @@ -154,10 +154,6 @@ pub enum Expr { output_type: GetOutput, options: FunctionOptions, }, - Cache { - input: Box, - id: usize, - }, /// Expressions in this node should only be expanding /// e.g. 
/// `Expr::Columns` diff --git a/polars/polars-lazy/polars-plan/src/dsl/expr_dyn_fn.rs b/crates/polars-plan/src/dsl/expr_dyn_fn.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/expr_dyn_fn.rs rename to crates/polars-plan/src/dsl/expr_dyn_fn.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/from.rs b/crates/polars-plan/src/dsl/from.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/from.rs rename to crates/polars-plan/src/dsl/from.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/abs.rs b/crates/polars-plan/src/dsl/function_expr/abs.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/abs.rs rename to crates/polars-plan/src/dsl/function_expr/abs.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/arg_where.rs b/crates/polars-plan/src/dsl/function_expr/arg_where.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/arg_where.rs rename to crates/polars-plan/src/dsl/function_expr/arg_where.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/array.rs rename to crates/polars-plan/src/dsl/function_expr/array.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/binary.rs b/crates/polars-plan/src/dsl/function_expr/binary.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/binary.rs rename to crates/polars-plan/src/dsl/function_expr/binary.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/boolean.rs b/crates/polars-plan/src/dsl/function_expr/boolean.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/boolean.rs rename to crates/polars-plan/src/dsl/function_expr/boolean.rs diff --git 
a/polars/polars-lazy/polars-plan/src/dsl/function_expr/bounds.rs b/crates/polars-plan/src/dsl/function_expr/bounds.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/bounds.rs rename to crates/polars-plan/src/dsl/function_expr/bounds.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/cat.rs b/crates/polars-plan/src/dsl/function_expr/cat.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/cat.rs rename to crates/polars-plan/src/dsl/function_expr/cat.rs index 455ae39e8805b..f2bc703b39fa0 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/cat.rs +++ b/crates/polars-plan/src/dsl/function_expr/cat.rs @@ -54,7 +54,9 @@ fn set_ordering(s: &Series, lexical: bool) -> PolarsResult { fn get_categories(s: &Series) -> PolarsResult { // categorical check let ca = s.categorical()?; - let DataType::Categorical(Some(rev_map)) = ca.dtype() else { unreachable!() }; + let DataType::Categorical(Some(rev_map)) = ca.dtype() else { + unreachable!() + }; let arr = rev_map.get_categories().clone().boxed(); Series::try_from((ca.name(), arr)) } diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/clip.rs b/crates/polars-plan/src/dsl/function_expr/clip.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/clip.rs rename to crates/polars-plan/src/dsl/function_expr/clip.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/concat.rs b/crates/polars-plan/src/dsl/function_expr/concat.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/concat.rs rename to crates/polars-plan/src/dsl/function_expr/concat.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/correlation.rs b/crates/polars-plan/src/dsl/function_expr/correlation.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/correlation.rs rename to 
crates/polars-plan/src/dsl/function_expr/correlation.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/cum.rs b/crates/polars-plan/src/dsl/function_expr/cum.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/cum.rs rename to crates/polars-plan/src/dsl/function_expr/cum.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/datetime.rs b/crates/polars-plan/src/dsl/function_expr/datetime.rs similarity index 81% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/datetime.rs rename to crates/polars-plan/src/dsl/function_expr/datetime.rs index 14eef0e168c30..c1d9054e702bf 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/datetime.rs +++ b/crates/polars-plan/src/dsl/function_expr/datetime.rs @@ -31,7 +31,7 @@ pub enum TemporalFunction { Microsecond, Nanosecond, TimeStamp(TimeUnit), - Truncate(String, String), + Truncate(TruncateOptions), #[cfg(feature = "date_offset")] MonthStart, #[cfg(feature = "date_offset")] @@ -42,19 +42,27 @@ pub enum TemporalFunction { DSTOffset, Round(String, String), #[cfg(feature = "timezones")] - CastTimezone(Option, Option), - #[cfg(feature = "timezones")] - TzLocalize(TimeZone), + ReplaceTimeZone(Option, Option), DateRange { every: Duration, closed: ClosedWindow, time_unit: Option, - tz: Option, + time_zone: Option, + }, + DateRanges { + every: Duration, + closed: ClosedWindow, + time_unit: Option, + time_zone: Option, }, TimeRange { every: Duration, closed: ClosedWindow, }, + TimeRanges { + every: Duration, + closed: ClosedWindow, + }, Combine(TimeUnit), } @@ -92,11 +100,11 @@ impl Display for TemporalFunction { DSTOffset => "dst_offset", Round(..) => "round", #[cfg(feature = "timezones")] - CastTimezone(_, _) => "replace_timezone", - #[cfg(feature = "timezones")] - TzLocalize(_) => "tz_localize", + ReplaceTimeZone(_, _) => "replace_time_zone", DateRange { .. } => return write!(f, "date_range"), + DateRanges { .. 
} => return write!(f, "date_ranges"), TimeRange { .. } => return write!(f, "time_range"), + TimeRanges { .. } => return write!(f, "time_ranges"), Combine(_) => "combine", }; write!(f, "dt.{s}") @@ -133,11 +141,10 @@ pub(super) fn ordinal_day(s: &Series) -> PolarsResult { pub(super) fn time(s: &Series) -> PolarsResult { match s.dtype() { #[cfg(feature = "timezones")] - DataType::Datetime(_, Some(_)) => s - .datetime() - .unwrap() - .replace_time_zone(None, None)? - .cast(&DataType::Time), + DataType::Datetime(_, Some(_)) => { + polars_ops::prelude::replace_time_zone(s.datetime().unwrap(), None, None)? + .cast(&DataType::Time) + } DataType::Datetime(_, _) => s.datetime().unwrap().cast(&DataType::Time), DataType::Date => s.datetime().unwrap().cast(&DataType::Time), DataType::Time => Ok(s.clone()), @@ -148,11 +155,10 @@ pub(super) fn date(s: &Series) -> PolarsResult { match s.dtype() { #[cfg(feature = "timezones")] DataType::Datetime(_, Some(tz)) => { - let mut out = s - .datetime() - .unwrap() - .replace_time_zone(None, None)? - .cast(&DataType::Date)?; + let mut out = { + polars_ops::chunked_array::replace_time_zone(s.datetime().unwrap(), None, None)? + .cast(&DataType::Date)? + }; if tz != "UTC" { // DST transitions may not preserve sortedness. out.set_sorted_flag(IsSorted::Not); @@ -168,11 +174,10 @@ pub(super) fn datetime(s: &Series) -> PolarsResult { match s.dtype() { #[cfg(feature = "timezones")] DataType::Datetime(tu, Some(tz)) => { - let mut out = s - .datetime() - .unwrap() - .replace_time_zone(None, None)? - .cast(&DataType::Datetime(*tu, None))?; + let mut out = { + polars_ops::chunked_array::replace_time_zone(s.datetime().unwrap(), None, None)? + .cast(&DataType::Datetime(*tu, None))? + }; if tz != "UTC" { // DST transitions may not preserve sortedness. 
out.set_sorted_flag(IsSorted::Not); @@ -205,28 +210,18 @@ pub(super) fn timestamp(s: &Series, tu: TimeUnit) -> PolarsResult { s.timestamp(tu).map(|ca| ca.into_series()) } -pub(super) fn truncate(s: &Series, every: &str, offset: &str) -> PolarsResult { - let every = Duration::parse(every); - let offset = Duration::parse(offset); +pub(super) fn truncate(s: &Series, options: &TruncateOptions) -> PolarsResult { let mut out = match s.dtype() { DataType::Datetime(_, tz) => match tz { #[cfg(feature = "timezones")] Some(tz) => s .datetime() .unwrap() - .truncate(every, offset, tz.parse::().ok().as_ref())? - .into_series(), - _ => s - .datetime() - .unwrap() - .truncate(every, offset, None)? + .truncate(options, tz.parse::().ok().as_ref())? .into_series(), + _ => s.datetime().unwrap().truncate(options, None)?.into_series(), }, - DataType::Date => s - .date() - .unwrap() - .truncate(every, offset, None)? - .into_series(), + DataType::Date => s.date().unwrap().truncate(options, None)?.into_series(), dt => polars_bail!(opq = round, got = dt, expected = "date/datetime"), }; out.set_sorted_flag(s.is_sorted_flag()); @@ -321,27 +316,3 @@ pub(super) fn round(s: &Series, every: &str, offset: &str) -> PolarsResult polars_bail!(opq = round, got = dt, expected = "date/datetime"), }) } - -#[cfg(feature = "timezones")] -pub(super) fn replace_timezone( - s: &Series, - time_zone: Option<&str>, - use_earliest: Option, -) -> PolarsResult { - let ca = s.datetime()?; - ca.replace_time_zone(time_zone, use_earliest) - .map(|ca| ca.into_series()) -} - -#[cfg(feature = "timezones")] -#[deprecated(note = "use replace_time_zone")] -pub(super) fn tz_localize(s: &Series, tz: &str) -> PolarsResult { - let ca = s.datetime()?.clone(); - polars_ensure!( - ca.time_zone().as_ref().map_or(true, |tz| tz.is_empty()), - ComputeError: - "cannot localize a tz-aware datetime \ - (consider using 'dt.convert_time_zone' or 'dt.replace_time_zone')" - ); - Ok(ca.replace_time_zone(Some(tz), None)?.into_series()) -} diff 
--git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/dispatch.rs b/crates/polars-plan/src/dsl/function_expr/dispatch.rs similarity index 75% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/dispatch.rs rename to crates/polars-plan/src/dsl/function_expr/dispatch.rs index 5ca34b389efae..d44c5844b8d21 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/dispatch.rs +++ b/crates/polars-plan/src/dsl/function_expr/dispatch.rs @@ -32,3 +32,13 @@ pub(super) fn set_sorted_flag(s: &Series, sorted: IsSorted) -> PolarsResult, + use_earliest: Option, +) -> PolarsResult { + let ca = s.datetime().unwrap(); + Ok(polars_ops::prelude::replace_time_zone(ca, time_zone, use_earliest)?.into_series()) +} diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/fill_null.rs b/crates/polars-plan/src/dsl/function_expr/fill_null.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/fill_null.rs rename to crates/polars-plan/src/dsl/function_expr/fill_null.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/fused.rs b/crates/polars-plan/src/dsl/function_expr/fused.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/fused.rs rename to crates/polars-plan/src/dsl/function_expr/fused.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/list.rs b/crates/polars-plan/src/dsl/function_expr/list.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/list.rs rename to crates/polars-plan/src/dsl/function_expr/list.rs index 4bffb97ad84df..f42e745653d7b 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -97,7 +97,7 @@ pub(super) fn slice(args: &mut [Series]) -> PolarsResult> { list_ca .amortized_iter() - .zip(length_ca.into_iter()) + .zip(length_ca) .map(|(opt_s, opt_length)| match (opt_s, opt_length) { (Some(s), Some(length)) => 
Some(s.as_ref().slice(offset, length as usize)), _ => None, @@ -134,8 +134,8 @@ pub(super) fn slice(args: &mut [Series]) -> PolarsResult> { list_ca .amortized_iter() - .zip(offset_ca.into_iter()) - .zip(length_ca.into_iter()) + .zip(offset_ca) + .zip(length_ca) .map( |((opt_s, opt_offset), opt_length)| match (opt_s, opt_offset, opt_length) { (Some(s), Some(offset), Some(length)) => { @@ -226,7 +226,7 @@ pub(super) fn take(args: &[Series], null_on_oob: bool) -> PolarsResult { let idx = &args[1]; let ca = ca.list()?; - if idx.len() == 1 { + if idx.len() == 1 && null_on_oob { // fast path let idx = idx.get(0)?.try_extract::()?; let out = ca.lst_get(idx)?; diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/log.rs b/crates/polars-plan/src/dsl/function_expr/log.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/log.rs rename to crates/polars-plan/src/dsl/function_expr/log.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs similarity index 93% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs rename to crates/polars-plan/src/dsl/function_expr/mod.rs index 0493c231a429d..3dd74b58331f8 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -79,6 +79,7 @@ pub use self::boolean::BooleanFunction; pub(crate) use self::cat::CategoricalFunction; #[cfg(feature = "temporal")] pub(super) use self::datetime::TemporalFunction; +pub(super) use self::pow::PowFunction; #[cfg(feature = "range")] pub(super) use self::range::RangeFunction; #[cfg(feature = "strings")] @@ -95,7 +96,7 @@ pub enum FunctionExpr { #[cfg(feature = "abs")] Abs, NullCount, - Pow, + Pow(PowFunction), #[cfg(feature = "row_hash")] Hash(u64, u64, u64, u64), #[cfg(feature = "arg_where")] @@ -113,6 +114,8 @@ pub enum FunctionExpr { DateOffset(polars_time::Duration), #[cfg(feature = "trigonometry")] 
Trigonometry(TrigonometricFunction), + #[cfg(feature = "trigonometry")] + Atan2, #[cfg(feature = "sign")] Sign, FillNull { @@ -241,7 +244,7 @@ impl Display for FunctionExpr { #[cfg(feature = "abs")] Abs => "abs", NullCount => "null_count", - Pow => "pow", + Pow(func) => return write!(f, "{func}"), #[cfg(feature = "row_hash")] Hash(_, _, _, _) => "hash", #[cfg(feature = "arg_where")] @@ -259,6 +262,8 @@ impl Display for FunctionExpr { DateOffset(_) => "dt.offset_by", #[cfg(feature = "trigonometry")] Trigonometry(func) => return write!(f, "{func}"), + #[cfg(feature = "trigonometry")] + Atan2 => return write!(f, "arctan2"), #[cfg(feature = "sign")] Sign => "sign", FillNull { .. } => "fill_null", @@ -428,9 +433,11 @@ impl From for SpecialEq> { }; wrap!(f) } - Pow => { - wrap!(pow::pow) - } + Pow(func) => match func { + PowFunction::Generic => wrap!(pow::pow), + PowFunction::Sqrt => map!(pow::sqrt), + PowFunction::Cbrt => map!(pow::cbrt), + }, #[cfg(feature = "row_hash")] Hash(k0, k1, k2, k3) => { map!(row_hash::row_hash, k0, k1, k2, k3) @@ -455,10 +462,16 @@ impl From for SpecialEq> { DateOffset(offset) => { map_owned!(temporal::date_offset, offset) } + #[cfg(feature = "trigonometry")] Trigonometry(trig_function) => { map!(trigonometry::apply_trigonometric_function, trig_function) } + #[cfg(feature = "trigonometry")] + Atan2 => { + wrap!(trigonometry::apply_arctan2) + } + #[cfg(feature = "sign")] Sign => { map!(sign::sign) @@ -696,7 +709,6 @@ impl From for SpecialEq> { } #[cfg(feature = "temporal")] -#[allow(deprecated)] // tz_localize impl From for SpecialEq> { fn from(func: TemporalFunction) -> Self { use TemporalFunction::*; @@ -720,7 +732,9 @@ impl From for SpecialEq> { Microsecond => map!(datetime::microsecond), Nanosecond => map!(datetime::nanosecond), TimeStamp(tu) => map!(datetime::timestamp, tu), - Truncate(every, offset) => map!(datetime::truncate, &every, &offset), + Truncate(truncate_options) => { + map!(datetime::truncate, &truncate_options) + } 
#[cfg(feature = "date_offset")] MonthStart => map!(datetime::month_start), #[cfg(feature = "date_offset")] @@ -731,17 +745,15 @@ impl From for SpecialEq> { DSTOffset => map!(datetime::dst_offset), Round(every, offset) => map!(datetime::round, &every, &offset), #[cfg(feature = "timezones")] - CastTimezone(tz, use_earliest) => { - map!(datetime::replace_timezone, tz.as_deref(), use_earliest) + ReplaceTimeZone(tz, use_earliest) => { + map!(dispatch::replace_time_zone, tz.as_deref(), use_earliest) } - #[cfg(feature = "timezones")] - TzLocalize(tz) => map!(datetime::tz_localize, &tz), Combine(tu) => map_as_slice!(temporal::combine, tu), DateRange { every, closed, time_unit, - tz, + time_zone, } => { map_as_slice!( temporal::temporal_range_dispatch, @@ -749,7 +761,22 @@ impl From for SpecialEq> { every, closed, time_unit, - tz.clone() + time_zone.clone() + ) + } + DateRanges { + every, + closed, + time_unit, + time_zone, + } => { + map_as_slice!( + temporal::temporal_range_dispatch, + "date_range", + every, + closed, + time_unit, + time_zone.clone() ) } TimeRange { every, closed } => { @@ -762,6 +789,16 @@ impl From for SpecialEq> { None ) } + TimeRanges { every, closed } => { + map_as_slice!( + temporal::temporal_range_dispatch, + "time_range", + every, + closed, + None, + None + ) + } } } } diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/nan.rs b/crates/polars-plan/src/dsl/function_expr/nan.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/nan.rs rename to crates/polars-plan/src/dsl/function_expr/nan.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/pow.rs b/crates/polars-plan/src/dsl/function_expr/pow.rs similarity index 61% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/pow.rs rename to crates/polars-plan/src/dsl/function_expr/pow.rs index bf8a7e92e0b03..27b97d9494bd2 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/pow.rs +++ 
b/crates/polars-plan/src/dsl/function_expr/pow.rs @@ -5,6 +5,25 @@ use polars_core::export::num::{Float, ToPrimitive}; use super::*; +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)] +pub enum PowFunction { + Generic, + Sqrt, + Cbrt, +} + +impl Display for PowFunction { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + use self::*; + match self { + PowFunction::Generic => write!(f, "pow"), + PowFunction::Sqrt => write!(f, "sqrt"), + PowFunction::Cbrt => write!(f, "cbrt"), + } + } +} + fn pow_on_floats(base: &ChunkedArray, exponent: &Series) -> PolarsResult> where T: PolarsFloatType, @@ -17,7 +36,7 @@ where if exponent.len() == 1 { let Some(exponent_value) = exponent.get(0) else { - return Ok(Some(Series::full_null(base.name(), base.len(), &dtype))) + return Ok(Some(Series::full_null(base.name(), base.len(), &dtype))); }; let s = match exponent_value.to_f64().unwrap() { a if a == 1.0 => base.clone().into_series(), @@ -46,7 +65,7 @@ where } else { Ok(Some( base.into_iter() - .zip(exponent.into_iter()) + .zip(exponent) .map(|(opt_base, opt_exponent)| match (opt_base, opt_exponent) { (Some(base), Some(exponent)) => Some(num::pow::Pow::pow(base, exponent)), _ => None, @@ -91,3 +110,57 @@ pub(super) fn pow(s: &mut [Series]) -> PolarsResult> { ), } } + +pub(super) fn sqrt(base: &Series) -> PolarsResult { + use DataType::*; + match base.dtype() { + Float32 => { + let ca = base.f32().unwrap(); + sqrt_on_floats(ca) + } + Float64 => { + let ca = base.f64().unwrap(); + sqrt_on_floats(ca) + } + _ => { + let base = base.cast(&DataType::Float64)?; + sqrt(&base) + } + } +} + +fn sqrt_on_floats(base: &ChunkedArray) -> PolarsResult +where + T: PolarsFloatType, + T::Native: num::pow::Pow + ToPrimitive + Float, + ChunkedArray: IntoSeries, +{ + Ok(base.apply(|v| v.sqrt()).into_series()) +} + +pub(super) fn cbrt(base: &Series) -> PolarsResult { + use DataType::*; + match base.dtype() { + Float32 => { + let ca 
= base.f32().unwrap(); + cbrt_on_floats(ca) + } + Float64 => { + let ca = base.f64().unwrap(); + cbrt_on_floats(ca) + } + _ => { + let base = base.cast(&DataType::Float64)?; + cbrt(&base) + } + } +} + +fn cbrt_on_floats(base: &ChunkedArray) -> PolarsResult +where + T: PolarsFloatType, + T::Native: num::pow::Pow + ToPrimitive + Float, + ChunkedArray: IntoSeries, +{ + Ok(base.apply(|v| v.cbrt()).into_series()) +} diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/random.rs b/crates/polars-plan/src/dsl/function_expr/random.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/random.rs rename to crates/polars-plan/src/dsl/function_expr/random.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/range.rs b/crates/polars-plan/src/dsl/function_expr/range.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/range.rs rename to crates/polars-plan/src/dsl/function_expr/range.rs index 611e0fa8b5fe2..19d1c75a62adb 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range.rs @@ -136,7 +136,7 @@ pub(super) fn int_ranges(s: &[Series], step: i64) -> PolarsResult { DataType::Int64, ); - for (opt_start, opt_end) in start.into_iter().zip(end.into_iter()) { + for (opt_start, opt_end) in start.into_iter().zip(end) { match (opt_start, opt_end) { (Some(start_v), Some(end_v)) => match step { 1 => { diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/rolling.rs b/crates/polars-plan/src/dsl/function_expr/rolling.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/rolling.rs rename to crates/polars-plan/src/dsl/function_expr/rolling.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/round.rs b/crates/polars-plan/src/dsl/function_expr/round.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/round.rs rename to 
crates/polars-plan/src/dsl/function_expr/round.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/row_hash.rs b/crates/polars-plan/src/dsl/function_expr/row_hash.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/row_hash.rs rename to crates/polars-plan/src/dsl/function_expr/row_hash.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs similarity index 90% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs rename to crates/polars-plan/src/dsl/function_expr/schema.rs index fc0f414b4c776..82dd321fd8532 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs +++ b/crates/polars-plan/src/dsl/function_expr/schema.rs @@ -14,7 +14,7 @@ impl FunctionExpr { #[cfg(feature = "abs")] Abs => mapper.with_same_dtype(), NullCount => mapper.with_dtype(IDX_DTYPE), - Pow => mapper.map_to_float_dtype(), + Pow(_) => mapper.map_to_float_dtype(), Coalesce => mapper.map_to_supertype(), #[cfg(feature = "row_hash")] Hash(..) => mapper.with_dtype(DataType::UInt64), @@ -58,23 +58,44 @@ impl FunctionExpr { DSTOffset => DataType::Duration(TimeUnit::Milliseconds), Round(..) 
=> mapper.with_same_dtype().unwrap().dtype, #[cfg(feature = "timezones")] - CastTimezone(tz, _use_earliest) => { + ReplaceTimeZone(tz, _use_earliest) => { return mapper.map_datetime_dtype_timezone(tz.as_ref()) } - #[cfg(feature = "timezones")] - TzLocalize(tz) => return mapper.map_datetime_dtype_timezone(Some(tz)), DateRange { every, closed: _, time_unit, - tz, + time_zone, } => { - // output dtype may change based on `every`, `tz`, and `time_unit` - return mapper.map_to_date_range_dtype(every, time_unit, tz); + // output dtype may change based on `every`, `time_unit`, and `time_zone` + let inner_dtype = + mapper.map_to_date_range_dtype(every, time_unit, time_zone)?; + return Ok(Field::new("date", DataType::List(Box::new(inner_dtype)))); } + DateRanges { + every, + closed: _, + time_unit, + time_zone, + } => { + // output dtype may change based on `every`, `time_unit`, and `time_zone` + let inner_dtype = + mapper.map_to_date_range_dtype(every, time_unit, time_zone)?; + return Ok(Field::new( + "date_range", + DataType::List(Box::new(inner_dtype)), + )); + } + TimeRange { .. } => { return Ok(Field::new("time", DataType::List(Box::new(DataType::Time)))); } + TimeRanges { .. } => { + return Ok(Field::new( + "time_range", + DataType::List(Box::new(DataType::Time)), + )); + } Combine(tu) => match mapper.with_same_dtype().unwrap().dtype { DataType::Datetime(_, tz) => DataType::Datetime(*tu, tz), DataType::Date => DataType::Datetime(*tu, None), @@ -102,6 +123,8 @@ impl FunctionExpr { DateOffset(_) => mapper.with_same_dtype(), #[cfg(feature = "trigonometry")] Trigonometry(_) => mapper.map_to_float_dtype(), + #[cfg(feature = "trigonometry")] + Atan2 => mapper.map_to_float_dtype(), #[cfg(feature = "sign")] Sign => mapper.with_dtype(DataType::Int64), FillNull { super_type, .. 
} => mapper.with_dtype(super_type.clone()), @@ -154,9 +177,15 @@ impl FunctionExpr { match s { FieldByIndex(index) => { let (index, _) = slice_offsets(*index, 0, fields.len()); - fields.get(index).cloned().ok_or_else( - || polars_err!(ComputeError: "index out of bounds in `struct.field`"), - ) + if let DataType::Struct(flds) = &fields[0].dtype { + flds.get(index).cloned().ok_or_else( + || polars_err!(ComputeError: "index out of bounds in `struct.field`") + ) + } else { + polars_bail!( + ComputeError: "expected struct dtype, got: `{}`", &fields[0].dtype + ) + } } FieldByName(name) => { if let DataType::Struct(flds) = &fields[0].dtype { @@ -344,7 +373,7 @@ impl<'a> FieldsMapper<'a> { every: &Duration, time_unit: &Option, tz: &Option, - ) -> PolarsResult { + ) -> PolarsResult { let inner_dtype = match (&self.map_to_supertype()?.dtype, time_unit, tz, every) { #[cfg(feature = "timezones")] (DataType::Datetime(tu, Some(field_tz)), time_unit, Some(tz), _) => { @@ -394,7 +423,7 @@ impl<'a> FieldsMapper<'a> { polars_bail!(ComputeError: "expected Date or Datetime, got {}", dtype) } }; - Ok(Field::new("date", DataType::List(Box::new(inner_dtype)))) + Ok(inner_dtype) } /// Map the dtypes to the "supertype" of a list of lists. 
@@ -429,7 +458,7 @@ impl<'a> FieldsMapper<'a> { if let DataType::Datetime(tu, _) = dt { Ok(DataType::Datetime(*tu, tz.cloned())) } else { - polars_bail!(op = "cast-timezone", got = dt, expected = "Datetime"); + polars_bail!(op = "replace-time-zone", got = dt, expected = "Datetime"); } }) } diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/search_sorted.rs b/crates/polars-plan/src/dsl/function_expr/search_sorted.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/search_sorted.rs rename to crates/polars-plan/src/dsl/function_expr/search_sorted.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/shift_and_fill.rs b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/shift_and_fill.rs rename to crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/shrink_type.rs b/crates/polars-plan/src/dsl/function_expr/shrink_type.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/shrink_type.rs rename to crates/polars-plan/src/dsl/function_expr/shrink_type.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/sign.rs b/crates/polars-plan/src/dsl/function_expr/sign.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/sign.rs rename to crates/polars-plan/src/dsl/function_expr/sign.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs rename to crates/polars-plan/src/dsl/function_expr/strings.rs index 60486b1cc0734..57236d05ab963 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -187,8 +187,9 @@ 
pub(super) fn lengths(s: &Series) -> PolarsResult { #[cfg(feature = "regex")] pub(super) fn contains(s: &[Series], literal: bool, strict: bool) -> PolarsResult { - let ca = &s[0].utf8()?; - let pat = &s[1].utf8()?; + // TODO! move to polars-ops + let ca = s[0].utf8()?; + let pat = s[1].utf8()?; let mut out: BooleanChunked = match pat.len() { 1 => match pat.get(0) { @@ -204,7 +205,7 @@ pub(super) fn contains(s: &[Series], literal: bool, strict: bool) -> PolarsResul _ => { if literal { ca.into_iter() - .zip(pat.into_iter()) + .zip(pat) .map(|(opt_src, opt_val)| match (opt_src, opt_val) { (Some(src), Some(pat)) => src.contains(pat), _ => false, @@ -212,7 +213,7 @@ pub(super) fn contains(s: &[Series], literal: bool, strict: bool) -> PolarsResul .collect_trusted() } else if strict { ca.into_iter() - .zip(pat.into_iter()) + .zip(pat) .map(|(opt_src, opt_val)| match (opt_src, opt_val) { (Some(src), Some(pat)) => { let re = Regex::new(pat)?; @@ -223,7 +224,7 @@ pub(super) fn contains(s: &[Series], literal: bool, strict: bool) -> PolarsResul .collect::>()? 
} else { ca.into_iter() - .zip(pat.into_iter()) + .zip(pat) .map(|(opt_src, opt_val)| match (opt_src, opt_val) { (Some(src), Some(pat)) => Regex::new(pat).ok().map(|re| re.is_match(src)), _ => Some(false), @@ -238,8 +239,8 @@ pub(super) fn contains(s: &[Series], literal: bool, strict: bool) -> PolarsResul } pub(super) fn ends_with(s: &[Series]) -> PolarsResult { - let ca = &s[0].utf8()?; - let sub = &s[1].utf8()?; + let ca = s[0].utf8()?; + let sub = s[1].utf8()?; let mut out: BooleanChunked = match sub.len() { 1 => match sub.get(0) { @@ -248,7 +249,7 @@ pub(super) fn ends_with(s: &[Series]) -> PolarsResult { }, _ => ca .into_iter() - .zip(sub.into_iter()) + .zip(sub) .map(|(opt_src, opt_val)| match (opt_src, opt_val) { (Some(src), Some(val)) => src.ends_with(val), _ => false, @@ -261,8 +262,8 @@ pub(super) fn ends_with(s: &[Series]) -> PolarsResult { } pub(super) fn starts_with(s: &[Series]) -> PolarsResult { - let ca = &s[0].utf8()?; - let sub = &s[1].utf8()?; + let ca = s[0].utf8()?; + let sub = s[1].utf8()?; let mut out: BooleanChunked = match sub.len() { 1 => match sub.get(0) { @@ -271,7 +272,7 @@ pub(super) fn starts_with(s: &[Series]) -> PolarsResult { }, _ => ca .into_iter() - .zip(sub.into_iter()) + .zip(sub) .map(|(opt_src, opt_val)| match (opt_src, opt_val) { (Some(src), Some(val)) => src.starts_with(val), _ => false, @@ -537,7 +538,7 @@ where { let mut out: Utf8Chunked = ca .into_iter() - .zip(val.into_iter()) + .zip(val) .map(|(opt_src, opt_val)| match (opt_src, opt_val) { (Some(src), Some(val)) => Some(f(src, val)), _ => None, diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/struct_.rs b/crates/polars-plan/src/dsl/function_expr/struct_.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/struct_.rs rename to crates/polars-plan/src/dsl/function_expr/struct_.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/temporal.rs b/crates/polars-plan/src/dsl/function_expr/temporal.rs 
similarity index 89% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/temporal.rs rename to crates/polars-plan/src/dsl/function_expr/temporal.rs index 6e695732a52ff..ae07aa4fd6360 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/temporal.rs +++ b/crates/polars-plan/src/dsl/function_expr/temporal.rs @@ -79,11 +79,12 @@ pub(super) fn combine(s: &[Series], tu: TimeUnit) -> PolarsResult { let result_naive = datetime + duration; match tz { #[cfg(feature = "timezones")] - Some(tz) => Ok(result_naive - .datetime() - .unwrap() - .replace_time_zone(Some(tz), None)? - .into()), + Some(tz) => Ok(polars_ops::prelude::replace_time_zone( + result_naive.datetime().unwrap(), + Some(tz), + None, + )? + .into()), _ => Ok(result_naive), } } @@ -132,21 +133,22 @@ pub(super) fn temporal_range_dispatch( let (mut start, mut stop) = match dtype { #[cfg(feature = "timezones")] DataType::Datetime(_, Some(_)) => ( - start - .cast(&dtype)? - .datetime() - .unwrap() - .replace_time_zone(None, None)? - .into_series() - .to_physical_repr() - .cast(&DataType::Int64)?, - stop.cast(&dtype)? - .datetime() - .unwrap() - .replace_time_zone(None, None)? - .into_series() - .to_physical_repr() - .cast(&DataType::Int64)?, + polars_ops::prelude::replace_time_zone( + start.cast(&dtype)?.datetime().unwrap(), + None, + None, + )? + .into_series() + .to_physical_repr() + .cast(&DataType::Int64)?, + polars_ops::prelude::replace_time_zone( + stop.cast(&dtype)?.datetime().unwrap(), + None, + None, + )? 
+ .into_series() + .to_physical_repr() + .cast(&DataType::Int64)?, ), _ => ( start @@ -184,7 +186,7 @@ pub(super) fn temporal_range_dispatch( start.len() * 5, DataType::Int32, ); - for (start, stop) in start.into_iter().zip(stop.into_iter()) { + for (start, stop) in start.into_iter().zip(stop) { match (start, stop) { (Some(start), Some(stop)) => { let rng = date_range_impl( @@ -213,7 +215,7 @@ pub(super) fn temporal_range_dispatch( start.len() * 5, DataType::Int64, ); - for (start, stop) in start.into_iter().zip(stop.into_iter()) { + for (start, stop) in start.into_iter().zip(stop) { match (start, stop) { (Some(start), Some(stop)) => { let rng = date_range_impl("", start, stop, every, closed, tu, tz.as_ref())?; @@ -231,7 +233,7 @@ pub(super) fn temporal_range_dispatch( start.len() * 5, DataType::Int64, ); - for (start, stop) in start.into_iter().zip(stop.into_iter()) { + for (start, stop) in start.into_iter().zip(stop) { match (start, stop) { (Some(start), Some(stop)) => { let rng = date_range_impl( diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs similarity index 74% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/trigonometry.rs rename to crates/polars-plan/src/dsl/function_expr/trigonometry.rs index 1810baea576a4..866ccb713ec39 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -1,4 +1,5 @@ use num::Float; +use polars_arrow::utils::CustomIterTools; use polars_core::export::num; use super::*; @@ -68,6 +69,78 @@ pub(super) fn apply_trigonometric_function( } } +pub(super) fn apply_arctan2(s: &mut [Series]) -> PolarsResult> { + let y = &s[0]; + let x = &s[1]; + + let y_len = y.len(); + let x_len = x.len(); + + match (y_len, x_len) { + (1, _) | (_, 1) => arctan2_on_series(y, x), + (len_a, len_b) if len_a == len_b => arctan2_on_series(y, x), + _ => polars_bail!( + 
ComputeError: + "y shape: {} in `arctan2` expression does not match that of x: {}", + y_len, x_len, + ), + } +} + +fn arctan2_on_series(y: &Series, x: &Series) -> PolarsResult> { + use DataType::*; + match y.dtype() { + Float32 => { + let y_ca: &ChunkedArray = y.f32().unwrap(); + arctan2_on_floats(y_ca, x) + } + Float64 => { + let y_ca: &ChunkedArray = y.f64().unwrap(); + arctan2_on_floats(y_ca, x) + } + _ => { + let y = y.cast(&DataType::Float64)?; + arctan2_on_series(&y, x) + } + } +} + +fn arctan2_on_floats(y: &ChunkedArray, x: &Series) -> PolarsResult> +where + T: PolarsFloatType, + T::Native: Float, + ChunkedArray: IntoSeries, +{ + let dtype = T::get_dtype(); + let x = x.cast(&dtype)?; + let x = y.unpack_series_matching_type(&x).unwrap(); + + if x.len() == 1 { + let x_value = x + .get(0) + .ok_or_else(|| polars_err!(ComputeError: "arctan2 x value is null"))?; + + Ok(Some(y.apply(|v| v.atan2(x_value)).into_series())) + } else if y.len() == 1 { + let y_value = y + .get(0) + .ok_or_else(|| polars_err!(ComputeError: "arctan2 y value is null"))?; + + Ok(Some(x.apply(|v| y_value.atan2(v)).into_series())) + } else { + Ok(Some( + y.into_iter() + .zip(x) + .map(|(opt_y, opt_x)| match (opt_y, opt_x) { + (Some(y), Some(x)) => Some(y.atan2(x)), + _ => None, + }) + .collect_trusted::>() + .into_series(), + )) + } +} + fn apply_trigonometric_function_to_float( ca: &ChunkedArray, trig_function: TrigonometricFunction, diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/unique.rs b/crates/polars-plan/src/dsl/function_expr/unique.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/function_expr/unique.rs rename to crates/polars-plan/src/dsl/function_expr/unique.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/arity.rs b/crates/polars-plan/src/dsl/functions/arity.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/functions/arity.rs rename to crates/polars-plan/src/dsl/functions/arity.rs diff --git 
a/polars/polars-lazy/polars-plan/src/dsl/functions/coerce.rs b/crates/polars-plan/src/dsl/functions/coerce.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/functions/coerce.rs rename to crates/polars-plan/src/dsl/functions/coerce.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/concat.rs b/crates/polars-plan/src/dsl/functions/concat.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/functions/concat.rs rename to crates/polars-plan/src/dsl/functions/concat.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/correlation.rs b/crates/polars-plan/src/dsl/functions/correlation.rs similarity index 95% rename from polars/polars-lazy/polars-plan/src/dsl/functions/correlation.rs rename to crates/polars-plan/src/dsl/functions/correlation.rs index 41f5b530fccb0..f5912c390a207 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/functions/correlation.rs +++ b/crates/polars-plan/src/dsl/functions/correlation.rs @@ -72,8 +72,6 @@ pub fn spearman_rank_corr(a: Expr, b: Expr, ddof: u8, propagate_nans: bool) -> E #[cfg(feature = "rolling_window")] pub fn rolling_corr(x: Expr, y: Expr, options: RollingCovOptions) -> Expr { - let x = x.cache(); - let y = y.cache(); // see: https://github.com/pandas-dev/pandas/blob/v1.5.1/pandas/core/window/rolling.py#L1780-L1804 let rolling_options = RollingOptions { window_size: Duration::new(options.window_size as i64), @@ -96,8 +94,7 @@ pub fn rolling_corr(x: Expr, y: Expr, options: RollingCovOptions) -> Expr { let count_x_y = (x + y) .is_not_null() .cast(DataType::Float64) - .rolling_sum(rolling_options_count) - .cache(); + .rolling_sum(rolling_options_count); let numerator = (mean_x_y - mean_x * mean_y) * (count_x_y.clone() / (count_x_y - lit(ddof))); let denominator = (var_x * var_y).pow(lit(0.5)); @@ -106,8 +103,6 @@ pub fn rolling_corr(x: Expr, y: Expr, options: RollingCovOptions) -> Expr { #[cfg(feature = "rolling_window")] pub fn rolling_cov(x: Expr, y: Expr, 
options: RollingCovOptions) -> Expr { - let x = x.cache(); - let y = y.cache(); // see: https://github.com/pandas-dev/pandas/blob/91111fd99898d9dcaa6bf6bedb662db4108da6e6/pandas/core/window/rolling.py#L1700 let rolling_options = RollingOptions { window_size: Duration::new(options.window_size as i64), @@ -126,8 +121,7 @@ pub fn rolling_cov(x: Expr, y: Expr, options: RollingCovOptions) -> Expr { let count_x_y = (x + y) .is_not_null() .cast(DataType::Float64) - .rolling_sum(rolling_options_count) - .cache(); + .rolling_sum(rolling_options_count); let ddof = options.ddof as f64; diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/horizontal.rs b/crates/polars-plan/src/dsl/functions/horizontal.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/functions/horizontal.rs rename to crates/polars-plan/src/dsl/functions/horizontal.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/index.rs b/crates/polars-plan/src/dsl/functions/index.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/functions/index.rs rename to crates/polars-plan/src/dsl/functions/index.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/mod.rs b/crates/polars-plan/src/dsl/functions/mod.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/functions/mod.rs rename to crates/polars-plan/src/dsl/functions/mod.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/range.rs b/crates/polars-plan/src/dsl/functions/range.rs similarity index 74% rename from polars/polars-lazy/polars-plan/src/dsl/functions/range.rs rename to crates/polars-plan/src/dsl/functions/range.rs index 5bb7a63664a58..bbf8211879594 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/functions/range.rs +++ b/crates/polars-plan/src/dsl/functions/range.rs @@ -82,7 +82,7 @@ pub fn date_range( every: Duration, closed: ClosedWindow, time_unit: Option, - tz: Option, + time_zone: Option, ) -> Expr { let input = vec![start, end]; @@ -92,7 
+92,7 @@ pub fn date_range( every, closed, time_unit, - tz, + time_zone, }), options: FunctionOptions { collect_groups: ApplyOptions::ApplyGroups, @@ -103,7 +103,36 @@ pub fn date_range( } } -/// Create a time range from a `start` and `stop` expression. +/// Create a column of date ranges from a `start` and `stop` expression. +#[cfg(feature = "temporal")] +pub fn date_ranges( + start: Expr, + end: Expr, + every: Duration, + closed: ClosedWindow, + time_unit: Option, + time_zone: Option, +) -> Expr { + let input = vec![start, end]; + + Expr::Function { + input, + function: FunctionExpr::TemporalExpr(TemporalFunction::DateRanges { + every, + closed, + time_unit, + time_zone, + }), + options: FunctionOptions { + collect_groups: ApplyOptions::ApplyGroups, + cast_to_supertypes: true, + allow_rename: true, + ..Default::default() + }, + } +} + +/// Generate a time range. #[cfg(feature = "temporal")] pub fn time_range(start: Expr, end: Expr, every: Duration, closed: ClosedWindow) -> Expr { let input = vec![start, end]; @@ -120,6 +149,23 @@ pub fn time_range(start: Expr, end: Expr, every: Duration, closed: ClosedWindow) } } +/// Create a column of time ranges from a `start` and `stop` expression. +#[cfg(feature = "temporal")] +pub fn time_ranges(start: Expr, end: Expr, every: Duration, closed: ClosedWindow) -> Expr { + let input = vec![start, end]; + + Expr::Function { + input, + function: FunctionExpr::TemporalExpr(TemporalFunction::TimeRanges { every, closed }), + options: FunctionOptions { + collect_groups: ApplyOptions::ApplyGroups, + cast_to_supertypes: false, + allow_rename: true, + ..Default::default() + }, + } +} + /// Create a column of length `n` containing `n` copies of the literal `value`. Generally you won't need this function, /// as `lit(value)` already represents a column containing only `value` whose length is automatically set to the correct /// number of rows. 
diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/selectors.rs b/crates/polars-plan/src/dsl/functions/selectors.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/functions/selectors.rs rename to crates/polars-plan/src/dsl/functions/selectors.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/syntactic_sugar.rs b/crates/polars-plan/src/dsl/functions/syntactic_sugar.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/functions/syntactic_sugar.rs rename to crates/polars-plan/src/dsl/functions/syntactic_sugar.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/temporal.rs b/crates/polars-plan/src/dsl/functions/temporal.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/dsl/functions/temporal.rs rename to crates/polars-plan/src/dsl/functions/temporal.rs index 53c556262b7a6..2b035d265cc7e 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/functions/temporal.rs +++ b/crates/polars-plan/src/dsl/functions/temporal.rs @@ -18,6 +18,7 @@ macro_rules! 
impl_unit_setter { /// /// # Examples /// ``` +/// use polars_plan::prelude::*; /// // construct a DatetimeArgs set to July 20, 1969 at 20:17 /// let args = DatetimeArgs::new(lit(1969), lit(7), lit(20)).with_hms(lit(20), lit(17), lit(0)); /// // or @@ -137,12 +138,12 @@ pub fn datetime(args: DatetimeArgs) -> Expr { let ca: Int64Chunked = year .into_iter() - .zip(month.into_iter()) - .zip(day.into_iter()) - .zip(hour.into_iter()) - .zip(minute.into_iter()) - .zip(second.into_iter()) - .zip(microsecond.into_iter()) + .zip(month) + .zip(day) + .zip(hour) + .zip(minute) + .zip(second) + .zip(microsecond) .map(|((((((y, m), d), h), mnt), s), us)| { if let (Some(y), Some(m), Some(d), Some(h), Some(mnt), Some(s), Some(us)) = (y, m, d, h, mnt, s, us) diff --git a/polars/polars-lazy/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/list.rs rename to crates/polars-plan/src/dsl/list.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/meta.rs b/crates/polars-plan/src/dsl/meta.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/meta.rs rename to crates/polars-plan/src/dsl/meta.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/dsl/mod.rs rename to crates/polars-plan/src/dsl/mod.rs index 120ec51da67d6..f7529943e348d 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -821,6 +821,12 @@ impl Expr { self.map_private(FunctionExpr::Floor) } + /// Constant Pi + #[cfg(feature = "round_series")] + pub fn pi() -> Self { + lit(std::f64::consts::PI) + } + /// Ceil underlying floating point array to the highest integers smaller or equal to the float value. 
#[cfg(feature = "round_series")] pub fn ceil(self) -> Self { @@ -1126,10 +1132,8 @@ impl Expr { /// Keep the original root name /// - /// ``` - /// use polars_core::prelude::*; - /// use polars_lazy::prelude::*; - /// + /// ```rust,no_run + /// # use polars_plan::prelude::*; /// fn example(df: LazyFrame) -> LazyFrame { /// df.select([ /// // even thought the alias yields a different column name, @@ -1493,6 +1497,25 @@ impl Expr { }) } + #[cfg(feature = "cutqcut")] + pub fn qcut_uniform( + self, + n_bins: usize, + labels: Option>, + left_closed: bool, + allow_duplicates: bool, + include_breaks: bool, + ) -> Expr { + let probs = (1..n_bins).map(|b| b as f64 / n_bins as f64).collect(); + self.apply_private(FunctionExpr::QCut { + probs, + labels, + left_closed, + allow_duplicates, + include_breaks, + }) + } + #[cfg(feature = "rle")] pub fn rle(self) -> Expr { self.apply_private(FunctionExpr::RLE) @@ -1744,20 +1767,6 @@ impl Expr { self.apply_private(FunctionExpr::SetSortedFlag(sorted)) } - /// Cache this expression, so that it is executed only once per context. 
- pub fn cache(self) -> Expr { - match &self { - // don't cache cheap no-ops - Expr::Column(_) => self, - Expr::Alias(input, _) if matches!(**input, Expr::Column(_)) => self, - _ => { - let input = Box::new(self); - let id = input.as_ref() as *const Expr as usize; - Self::Cache { input, id } - } - } - } - #[cfg(feature = "row_hash")] /// Compute the hash of every element pub fn hash(self, k0: u64, k1: u64, k2: u64, k3: u64) -> Expr { diff --git a/polars/polars-lazy/polars-plan/src/dsl/names.rs b/crates/polars-plan/src/dsl/names.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/names.rs rename to crates/polars-plan/src/dsl/names.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/options.rs b/crates/polars-plan/src/dsl/options.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/options.rs rename to crates/polars-plan/src/dsl/options.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/python_udf.rs b/crates/polars-plan/src/dsl/python_udf.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/python_udf.rs rename to crates/polars-plan/src/dsl/python_udf.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/random.rs b/crates/polars-plan/src/dsl/random.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/random.rs rename to crates/polars-plan/src/dsl/random.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/selector.rs b/crates/polars-plan/src/dsl/selector.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/selector.rs rename to crates/polars-plan/src/dsl/selector.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/dsl/string.rs rename to crates/polars-plan/src/dsl/string.rs diff --git a/polars/polars-lazy/polars-plan/src/dsl/struct_.rs b/crates/polars-plan/src/dsl/struct_.rs similarity index 100% rename from 
polars/polars-lazy/polars-plan/src/dsl/struct_.rs rename to crates/polars-plan/src/dsl/struct_.rs diff --git a/polars/polars-lazy/polars-plan/src/frame/mod.rs b/crates/polars-plan/src/frame/mod.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/frame/mod.rs rename to crates/polars-plan/src/frame/mod.rs diff --git a/polars/polars-lazy/polars-plan/src/frame/opt_state.rs b/crates/polars-plan/src/frame/opt_state.rs similarity index 80% rename from polars/polars-lazy/polars-plan/src/frame/opt_state.rs rename to crates/polars-plan/src/frame/opt_state.rs index 28cf448c76f98..30508f7dfa338 100644 --- a/polars/polars-lazy/polars-plan/src/frame/opt_state.rs +++ b/crates/polars-plan/src/frame/opt_state.rs @@ -8,7 +8,9 @@ pub struct OptState { pub file_caching: bool, pub slice_pushdown: bool, #[cfg(feature = "cse")] - pub common_subplan_elimination: bool, + pub comm_subplan_elim: bool, + #[cfg(feature = "cse")] + pub comm_subexpr_elim: bool, pub streaming: bool, } @@ -23,7 +25,9 @@ impl Default for OptState { // will be toggled by a scan operation such as csv scan or parquet scan file_caching: false, #[cfg(feature = "cse")] - common_subplan_elimination: true, + comm_subplan_elim: true, + #[cfg(feature = "cse")] + comm_subexpr_elim: false, streaming: false, } } diff --git a/polars/polars-lazy/polars-plan/src/global.rs b/crates/polars-plan/src/global.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/global.rs rename to crates/polars-plan/src/global.rs diff --git a/polars/polars-lazy/polars-plan/src/lib.rs b/crates/polars-plan/src/lib.rs similarity index 64% rename from polars/polars-lazy/polars-plan/src/lib.rs rename to crates/polars-plan/src/lib.rs index 7b949bb5848a4..01fde0ab0d230 100644 --- a/polars/polars-lazy/polars-plan/src/lib.rs +++ b/crates/polars-plan/src/lib.rs @@ -1,4 +1,5 @@ #![cfg_attr(docsrs, feature(doc_auto_cfg))] +#![cfg_attr(feature = "nightly", allow(clippy::needless_pass_by_ref_mut))] // remove once stable pub mod 
constants; pub mod dot; diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/aexpr/mod.rs b/crates/polars-plan/src/logical_plan/aexpr/mod.rs similarity index 66% rename from polars/polars-lazy/polars-plan/src/logical_plan/aexpr/mod.rs rename to crates/polars-plan/src/logical_plan/aexpr/mod.rs index 379add5595d68..b4fea241db5c4 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/aexpr/mod.rs +++ b/crates/polars-plan/src/logical_plan/aexpr/mod.rs @@ -10,6 +10,8 @@ use polars_utils::arena::{Arena, Node}; use strum_macros::IntoStaticStr; use crate::dsl::function_expr::FunctionExpr; +#[cfg(feature = "cse")] +use crate::logical_plan::visitor::AexprNode; use crate::logical_plan::Context; use crate::prelude::aexpr::NodeInputs::Single; use crate::prelude::names::COUNT; @@ -43,6 +45,34 @@ pub enum AAggExpr { AggGroups(Node), } +impl AAggExpr { + pub(super) fn equal_nodes(&self, other: &AAggExpr) -> bool { + use AAggExpr::*; + match (self, other) { + ( + Min { + propagate_nans: l, .. + }, + Min { + propagate_nans: r, .. + }, + ) => l == r, + ( + Max { + propagate_nans: l, .. + }, + Max { + propagate_nans: r, .. + }, + ) => l == r, + (Quantile { interpol: l, .. }, Quantile { interpol: r, .. 
}) => l == r, + (Std(_, l), Std(_, r)) => l == r, + (Var(_, l), Var(_, r)) => l == r, + _ => std::mem::discriminant(self) == std::mem::discriminant(other), + } + } +} + impl From for GroupByMethod { fn from(value: AAggExpr) -> Self { use AAggExpr::*; @@ -145,13 +175,25 @@ pub enum AExpr { }, Count, Nth(i64), - Cache { - input: Node, - id: usize, - }, } impl AExpr { + #[cfg(feature = "cse")] + pub(crate) fn is_equal(l: Node, r: Node, arena: &Arena) -> bool { + let arena = arena as *const Arena as *mut Arena; + // safety: we can pass a *mut pointer + // the equality operation will not access mutable + unsafe { + let ae_node_l = AexprNode::from_raw(l, arena); + let ae_node_r = AexprNode::from_raw(r, arena); + ae_node_l == ae_node_r + } + } + + #[cfg(feature = "cse")] + pub(crate) fn col(name: &str) -> Self { + AExpr::Column(Arc::from(name)) + } /// Any expression that is sensitive to the number of elements in a group /// - Aggregations /// - Sorts @@ -182,7 +224,6 @@ impl AExpr { | Ternary { .. } | Wildcard | Cast { .. } - | Cache{..} | Filter { .. } => false, } } @@ -198,26 +239,119 @@ impl AExpr { .map(|f| f.data_type().clone()) } + /// Push nodes at this level to a pre-allocated stack + pub(crate) fn nodes(&self, container: &mut Vec) { + use AExpr::*; + + match self { + Nth(_) | Column(_) | Literal(_) | Wildcard | Count => {} + Alias(e, _) => container.push(*e), + BinaryExpr { left, op: _, right } => { + // reverse order so that left is popped first + container.push(*right); + container.push(*left); + } + Cast { expr, .. } => container.push(*expr), + Sort { expr, .. } => container.push(*expr), + Take { expr, idx } => { + container.push(*idx); + // latest, so that it is popped first + container.push(*expr); + } + SortBy { expr, by, .. 
} => { + for node in by { + container.push(*node) + } + // latest, so that it is popped first + container.push(*expr); + } + Filter { input, by } => { + container.push(*by); + // latest, so that it is popped first + container.push(*input); + } + Agg(agg_e) => { + let node = agg_e.get_input().first(); + container.push(node); + } + Ternary { + truthy, + falsy, + predicate, + } => { + container.push(*predicate); + container.push(*falsy); + // latest, so that it is popped first + container.push(*truthy); + } + AnonymousFunction { input, .. } | Function { input, .. } => + // we iterate in reverse order, so that the lhs is popped first and will be found + // as the root columns/ input columns by `_suffix` and `_keep_name` etc. + { + input + .iter() + .rev() + .copied() + .for_each(|node| container.push(node)) + } + Explode(e) => container.push(*e), + Window { + function, + partition_by, + order_by, + options: _, + } => { + for e in partition_by.iter().rev() { + container.push(*e); + } + if let Some(e) = order_by { + container.push(*e); + } + // latest so that it is popped first + container.push(*function); + } + Slice { + input, + offset, + length, + } => { + container.push(*length); + container.push(*offset); + // latest so that it is popped first + container.push(*input); + } + } + } + pub(crate) fn replace_inputs(mut self, inputs: &[Node]) -> Self { use AExpr::*; let input = match &mut self { Column(_) | Literal(_) | Wildcard | Count | Nth(_) => return self, Alias(input, _) => input, Cast { expr, .. } => expr, - Explode(input) | Slice { input, .. } | Cache { input, .. } => input, + Explode(input) | Slice { input, .. } => input, BinaryExpr { left, right, .. } => { - *left = inputs[0]; - *right = inputs[1]; + *right = inputs[0]; + *left = inputs[1]; return self; } - Sort { expr, .. } | Take { expr, .. } => expr, + Take { expr, idx } => { + *idx = inputs[0]; + *expr = inputs[1]; + return self; + } + Sort { expr, .. } => expr, SortBy { expr, by, .. 
} => { *expr = *inputs.last().unwrap(); by.clear(); by.extend_from_slice(&inputs[..inputs.len() - 1]); return self; } - Filter { input, .. } => input, + Filter { input, by, .. } => { + *by = inputs[0]; + *input = inputs[1]; + return self; + } Agg(a) => { a.set_input(inputs[0]); return self; @@ -227,9 +361,9 @@ impl AExpr { falsy, predicate, } => { - *truthy = inputs[0]; + *predicate = inputs[0]; *falsy = inputs[1]; - *predicate = inputs[2]; + *truthy = inputs[2]; return self; } AnonymousFunction { input, .. } | Function { input, .. } => { @@ -243,8 +377,10 @@ impl AExpr { order_by, .. } => { - *function = inputs[0]; - partition_by.extend_from_slice(&inputs[1..]); + *function = *inputs.last().unwrap(); + partition_by.clear(); + partition_by.extend_from_slice(&inputs[..inputs.len() - 1]); + assert!(order_by.is_none()); return self; } @@ -253,57 +389,6 @@ impl AExpr { self } - pub(crate) fn get_input(&self) -> NodeInputs { - use AExpr::*; - use NodeInputs::*; - match self { - Alias(input, _) => Single(*input), - Cast { expr, .. } => Single(*expr), - Explode(input) => Single(*input), - Column(_) => Leaf, - Literal(_) => Leaf, - BinaryExpr { left, right, .. } => Many(vec![*left, *right]), - Sort { expr, .. } => Single(*expr), - Take { expr, .. } => Single(*expr), - SortBy { expr, by, .. } => { - let mut many = by.clone(); - many.push(*expr); - Many(many) - } - Filter { input, .. } => Single(*input), - Agg(a) => a.get_input(), - Ternary { - truthy, - falsy, - predicate, - } => Many(vec![*truthy, *falsy, *predicate]), - // we iterate in reverse order, so that the lhs is popped first and will be found - // as the root columns/ input columns by `_suffix` and `_keep_name` etc. - AnonymousFunction { input, .. } | Function { input, .. } => match input.len() { - 1 => Single(input[0]), - _ => Many(input.iter().copied().rev().collect()), - }, - Window { - function, - order_by, - partition_by, - .. 
- } => { - let mut out = Vec::with_capacity(partition_by.len() + 2); - out.push(*function); - if let Some(a) = order_by { - out.push(*a); - } - out.extend(partition_by); - Many(out) - } - Wildcard => panic!("no wildcard expected"), - Slice { input, .. } => Single(*input), - Cache { input, .. } => Single(*input), - Count => Leaf, - Nth(_) => Leaf, - } - } pub(crate) fn is_leaf(&self) -> bool { matches!( self, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/aexpr/schema.rs b/crates/polars-plan/src/logical_plan/aexpr/schema.rs similarity index 99% rename from polars/polars-lazy/polars-plan/src/logical_plan/aexpr/schema.rs rename to crates/polars-plan/src/logical_plan/aexpr/schema.rs index 6bab8a340af4c..d102bcecba2cb 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/aexpr/schema.rs +++ b/crates/polars-plan/src/logical_plan/aexpr/schema.rs @@ -207,7 +207,6 @@ impl AExpr { function.get_field(schema, ctxt, &fields) } Slice { input, .. } => arena.get(*input).to_field(schema, ctxt, arena), - Cache { input, .. 
} => arena.get(*input).to_field(schema, ctxt, arena), Wildcard => panic!("should be no wildcard at this point"), Nth(_) => panic!("should be no nth at this point"), } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/alp.rs b/crates/polars-plan/src/logical_plan/alp.rs similarity index 72% rename from polars/polars-lazy/polars-plan/src/logical_plan/alp.rs rename to crates/polars-plan/src/logical_plan/alp.rs index c922e947c7079..45f1a76b93f10 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/alp.rs +++ b/crates/polars-plan/src/logical_plan/alp.rs @@ -5,11 +5,12 @@ use std::sync::Arc; use polars_core::prelude::*; use polars_utils::arena::{Arena, Node}; +use super::projection_expr::*; use crate::logical_plan::functions::FunctionNode; -use crate::logical_plan::schema::{det_join_schema, FileInfo}; +use crate::logical_plan::schema::FileInfo; use crate::logical_plan::FileScan; use crate::prelude::*; -use crate::utils::{aexprs_to_schema, PushNode}; +use crate::utils::PushNode; /// ALogicalPlan is a representation of LogicalPlan with Nodes which are allocated in an Arena #[derive(Clone, Debug)] @@ -19,7 +20,7 @@ pub enum ALogicalPlan { file_info: FileInfo, output_schema: Option, predicate: Option, - options: AnonymousScanOptions, + options: Arc, }, #[cfg(feature = "python")] PythonScan { @@ -55,7 +56,7 @@ pub enum ALogicalPlan { }, Projection { input: Node, - expr: Vec, + expr: ProjectionExprs, schema: SchemaRef, }, LocalProjection { @@ -80,7 +81,7 @@ pub enum ALogicalPlan { schema: SchemaRef, apply: Option>, maintain_order: bool, - options: GroupbyOptions, + options: Arc, }, Join { input_left: Node, @@ -88,11 +89,11 @@ pub enum ALogicalPlan { schema: SchemaRef, left_on: Vec, right_on: Vec, - options: JoinOptions, + options: Arc, }, HStack { input: Node, - exprs: Vec, + exprs: ProjectionExprs, schema: SchemaRef, }, Distinct { @@ -253,7 +254,7 @@ impl ALogicalPlan { }, Projection { schema, .. 
} => Projection { input: inputs[0], - expr: exprs, + expr: ProjectionExprs::new(exprs), schema: schema.clone(), }, Aggregate { @@ -303,7 +304,7 @@ impl ALogicalPlan { }, HStack { schema, .. } => HStack { input: inputs[0], - exprs, + exprs: ProjectionExprs::new(exprs), schema: schema.clone(), }, Scan { @@ -501,194 +502,12 @@ impl ALogicalPlan { } } -pub struct ALogicalPlanBuilder<'a> { - root: Node, - expr_arena: &'a mut Arena, - lp_arena: &'a mut Arena, -} - -impl<'a> ALogicalPlanBuilder<'a> { - pub(crate) fn new( - root: Node, - expr_arena: &'a mut Arena, - lp_arena: &'a mut Arena, - ) -> Self { - ALogicalPlanBuilder { - root, - expr_arena, - lp_arena, - } - } - - pub(crate) fn from_lp( - lp: ALogicalPlan, - expr_arena: &'a mut Arena, - lp_arena: &'a mut Arena, - ) -> Self { - let root = lp_arena.add(lp); - ALogicalPlanBuilder { - root, - expr_arena, - lp_arena, - } - } - - pub fn project_local(self, exprs: Vec) -> Self { - let input_schema = self.lp_arena.get(self.root).schema(self.lp_arena); - let schema = aexprs_to_schema(&exprs, &input_schema, Context::Default, self.expr_arena); - let lp = ALogicalPlan::LocalProjection { - expr: exprs, - input: self.root, - schema: Arc::new(schema), - }; - let node = self.lp_arena.add(lp); - ALogicalPlanBuilder::new(node, self.expr_arena, self.lp_arena) - } - - pub fn project(self, exprs: Vec) -> Self { - let input_schema = self.lp_arena.get(self.root).schema(self.lp_arena); - let schema = aexprs_to_schema(&exprs, &input_schema, Context::Default, self.expr_arena); - - // if len == 0, no projection has to be done. This is a select all operation. 
- if !exprs.is_empty() { - let lp = ALogicalPlan::Projection { - expr: exprs, - input: self.root, - schema: Arc::new(schema), - }; - let node = self.lp_arena.add(lp); - ALogicalPlanBuilder::new(node, self.expr_arena, self.lp_arena) - } else { - self - } - } - - pub fn build(self) -> ALogicalPlan { - if self.root.0 == self.lp_arena.len() { - self.lp_arena.pop().unwrap() - } else { - self.lp_arena.take(self.root) - } - } - - pub(crate) fn schema(&'a self) -> Cow<'a, SchemaRef> { - self.lp_arena.get(self.root).schema(self.lp_arena) - } - - pub(crate) fn with_columns(self, exprs: Vec) -> Self { - let schema = self.schema(); - let mut new_schema = (**schema).clone(); - - for e in &exprs { - let field = self - .expr_arena - .get(*e) - .to_field(&schema, Context::Default, self.expr_arena) - .unwrap(); - - new_schema.with_column(field.name().clone(), field.data_type().clone()); - } - - let lp = ALogicalPlan::HStack { - input: self.root, - exprs, - schema: Arc::new(new_schema), - }; - let root = self.lp_arena.add(lp); - Self::new(root, self.expr_arena, self.lp_arena) - } - - pub fn groupby( - self, - keys: Vec, - aggs: Vec, - apply: Option>, - maintain_order: bool, - options: GroupbyOptions, - ) -> Self { - let current_schema = self.schema(); - // TODO! 
add this line if LogicalPlan is dropped in favor of ALogicalPlan - // let aggs = rewrite_projections(aggs, current_schema); - - let mut schema = - aexprs_to_schema(&keys, ¤t_schema, Context::Default, self.expr_arena); - let other = aexprs_to_schema( - &aggs, - ¤t_schema, - Context::Aggregation, - self.expr_arena, - ); - schema.merge(other); - - #[cfg(feature = "dynamic_groupby")] - { - let index_columns = &[ - options - .rolling - .as_ref() - .map(|options| &options.index_column), - options - .dynamic - .as_ref() - .map(|options| &options.index_column), - ]; - for &name in index_columns.iter().flatten() { - let dtype = current_schema.get(name).unwrap(); - schema.with_column(name.clone(), dtype.clone()); - } - } - - let lp = ALogicalPlan::Aggregate { - input: self.root, - keys, - aggs, - schema: Arc::new(schema), - apply, - maintain_order, - options, - }; - let root = self.lp_arena.add(lp); - Self::new(root, self.expr_arena, self.lp_arena) - } - - pub fn join( - self, - other: Node, - left_on: Vec, - right_on: Vec, - options: JoinOptions, - ) -> Self { - let schema_left = self.schema(); - let schema_right = self.lp_arena.get(other).schema(self.lp_arena); - - let left_on_exprs = left_on - .iter() - .map(|node| node_to_expr(*node, self.expr_arena)) - .collect::>(); - let right_on_exprs = right_on - .iter() - .map(|node| node_to_expr(*node, self.expr_arena)) - .collect::>(); - - let schema = det_join_schema( - &schema_left, - &schema_right, - &left_on_exprs, - &right_on_exprs, - &options, - ) - .unwrap(); - - let lp = ALogicalPlan::Join { - input_left: self.root, - input_right: other, - schema, - left_on, - right_on, - options, - }; +#[cfg(test)] +mod test { + use super::*; - let root = self.lp_arena.add(lp); - Self::new(root, self.expr_arena, self.lp_arena) + #[test] + fn test_alp_size() { + assert!(std::mem::size_of::() <= 152); } } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/anonymous_scan.rs b/crates/polars-plan/src/logical_plan/anonymous_scan.rs 
similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/anonymous_scan.rs rename to crates/polars-plan/src/logical_plan/anonymous_scan.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/apply.rs b/crates/polars-plan/src/logical_plan/apply.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/apply.rs rename to crates/polars-plan/src/logical_plan/apply.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs b/crates/polars-plan/src/logical_plan/builder.rs similarity index 92% rename from polars/polars-lazy/polars-plan/src/logical_plan/builder.rs rename to crates/polars-plan/src/logical_plan/builder.rs index 493baf93c92e8..7f82633fc63a2 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs +++ b/crates/polars-plan/src/logical_plan/builder.rs @@ -5,7 +5,6 @@ use std::io::{Read, Seek}; use polars_core::cloud::CloudOptions; use polars_core::frame::explode::MeltArgs; use polars_core::prelude::*; -use polars_core::utils::try_get_supertype; #[cfg(feature = "ipc")] use polars_io::ipc::IpcReader; #[cfg(all(feature = "parquet", feature = "async"))] @@ -26,6 +25,7 @@ use polars_io::{ csv::NullValues, }; +use super::builder_functions::*; use crate::logical_plan::functions::FunctionNode; use crate::logical_plan::projection::{is_regex_projection, rewrite_projections}; use crate::logical_plan::schema::{det_join_schema, FileInfo}; @@ -103,7 +103,7 @@ impl LogicalPlanBuilder { function, file_info, predicate: None, - options: AnonymousScanOptions { + options: Arc::new(AnonymousScanOptions { fmt_str: name, schema, skip_rows, @@ -111,7 +111,7 @@ impl LogicalPlanBuilder { output_schema: None, with_columns: None, predicate: None, - }, + }), } .into()) } @@ -144,7 +144,7 @@ impl LogicalPlanBuilder { ParquetAsyncReader::file_info(&uri, cloud_options.as_ref())? 
} } else { - let file = std::fs::File::open(&path)?; + let file = polars_utils::open_file(&path)?; let mut reader = ParquetReader::new(file); (reader.schema()?, reader.num_rows()?) }; @@ -195,7 +195,7 @@ impl LogicalPlanBuilder { use polars_io::SerReader as _; let path = path.into(); - let file = std::fs::File::open(&path)?; + let file = polars_utils::open_file(&path)?; let mut reader = IpcReader::new(file); let mut schema = reader.schema()?; @@ -253,7 +253,15 @@ impl LogicalPlanBuilder { try_parse_dates: bool, ) -> PolarsResult { let path = path.into(); - let mut file = std::fs::File::open(&path)?; + let mut file = polars_utils::open_file(&path).map_err(|e| { + let path = path.to_string_lossy(); + if path.len() > 88 { + let path: String = path.chars().skip(path.len() - 88).collect(); + polars_err!(ComputeError: "error open file: ...{}, {}", path, e) + } else { + polars_err!(ComputeError: "error open file: {}, {}", path, e) + } + })?; let mut magic_nr = [0u8; 2]; file.read_exact(&mut magic_nr) .map_err(|_| polars_err!(NoData: "empty csv"))?; @@ -612,7 +620,7 @@ impl LogicalPlanBuilder { schema: Arc::new(schema), apply, maintain_order, - options, + options: Arc::new(options), } .into() } @@ -659,23 +667,20 @@ impl LogicalPlanBuilder { let schema = try_delayed!(self.0.schema(), &self.0, into); let columns = try_delayed!(rewrite_projections(columns, &schema, &[]), &self.0, into); - let mut schema = (**schema).clone(); - // columns to string let columns = columns .iter() .map(|e| { if let Expr::Column(name) = e { - if let Some(DataType::List(inner)) = schema.get(name) { - let inner = *inner.clone(); - schema.with_column(name.as_ref().into(), inner); - } name.clone() } else { panic!("expected column expression") } }) - .collect(); + .collect::]>>(); + + let mut schema = (**schema).clone(); + try_delayed!(explode_schema(&mut schema, &columns), &self.0, into); LogicalPlan::MapFunction { input: Box::new(self.0), @@ -697,6 +702,22 @@ impl LogicalPlanBuilder { .into() } + 
pub fn row_count(self, name: &str, offset: Option) -> Self { + let mut schema = try_delayed!(self.0.schema(), &self.0, into).into_owned(); + let schema_mut = Arc::make_mut(&mut schema); + row_count_schema(schema_mut, name); + + LogicalPlan::MapFunction { + input: Box::new(self.0), + function: FunctionNode::RowCount { + name: Arc::from(name), + offset, + schema, + }, + } + .into() + } + pub fn distinct(self, options: DistinctOptions) -> Self { LogicalPlan::Distinct { input: Box::new(self.0), @@ -719,7 +740,7 @@ impl LogicalPlanBuilder { other: LogicalPlan, left_on: Vec, right_on: Vec, - options: JoinOptions, + options: Arc, ) -> Self { for e in left_on.iter().chain(right_on.iter()) { if has_expr(e, |e| matches!(e, Expr::Alias(_, _))) { @@ -810,49 +831,3 @@ impl LogicalPlanBuilder { .into() } } - -pub(crate) fn det_melt_schema(args: &MeltArgs, input_schema: &Schema) -> SchemaRef { - let mut new_schema = args - .id_vars - .iter() - .map(|id| Field::new(id, input_schema.get(id).unwrap().clone())) - .collect::(); - let variable_name = args - .variable_name - .as_ref() - .cloned() - .unwrap_or_else(|| "variable".into()); - let value_name = args - .value_name - .as_ref() - .cloned() - .unwrap_or_else(|| "value".into()); - - new_schema.with_column(variable_name, DataType::Utf8); - - // We need to determine the supertype of all value columns. 
- let mut st = None; - - // take all columns that are not in `id_vars` as `value_var` - if args.value_vars.is_empty() { - let id_vars = PlHashSet::from_iter(&args.id_vars); - for (name, dtype) in input_schema.iter() { - if !id_vars.contains(name) { - match &st { - None => st = Some(dtype.clone()), - Some(st_) => st = Some(try_get_supertype(st_, dtype).unwrap()), - } - } - } - } else { - for name in &args.value_vars { - let dtype = input_schema.get(name).unwrap(); - match &st { - None => st = Some(dtype.clone()), - Some(st_) => st = Some(try_get_supertype(st_, dtype).unwrap()), - } - } - } - new_schema.with_column(value_name, st.unwrap()); - Arc::new(new_schema) -} diff --git a/crates/polars-plan/src/logical_plan/builder_alp.rs b/crates/polars-plan/src/logical_plan/builder_alp.rs new file mode 100644 index 0000000000000..df3ccd54b04d8 --- /dev/null +++ b/crates/polars-plan/src/logical_plan/builder_alp.rs @@ -0,0 +1,239 @@ +use std::borrow::Cow; + +use super::builder_functions::*; +use super::*; +use crate::logical_plan::projection_expr::ProjectionExprs; + +pub struct ALogicalPlanBuilder<'a> { + root: Node, + expr_arena: &'a mut Arena, + lp_arena: &'a mut Arena, +} + +impl<'a> ALogicalPlanBuilder<'a> { + pub(crate) fn new( + root: Node, + expr_arena: &'a mut Arena, + lp_arena: &'a mut Arena, + ) -> Self { + ALogicalPlanBuilder { + root, + expr_arena, + lp_arena, + } + } + + pub(crate) fn from_lp( + lp: ALogicalPlan, + expr_arena: &'a mut Arena, + lp_arena: &'a mut Arena, + ) -> Self { + let root = lp_arena.add(lp); + ALogicalPlanBuilder { + root, + expr_arena, + lp_arena, + } + } + + fn add_alp(self, lp: ALogicalPlan) -> Self { + let node = self.lp_arena.add(lp); + ALogicalPlanBuilder::new(node, self.expr_arena, self.lp_arena) + } + + pub fn project_local(self, exprs: Vec) -> Self { + let input_schema = self.lp_arena.get(self.root).schema(self.lp_arena); + let schema = aexprs_to_schema(&exprs, &input_schema, Context::Default, self.expr_arena); + let lp = 
ALogicalPlan::LocalProjection { + expr: exprs, + input: self.root, + schema: Arc::new(schema), + }; + self.add_alp(lp) + } + + pub fn project(self, exprs: Vec) -> Self { + let input_schema = self.lp_arena.get(self.root).schema(self.lp_arena); + let schema = aexprs_to_schema(&exprs, &input_schema, Context::Default, self.expr_arena); + + // if len == 0, no projection has to be done. This is a select all operation. + if !exprs.is_empty() { + let lp = ALogicalPlan::Projection { + expr: exprs.into(), + input: self.root, + schema: Arc::new(schema), + }; + let node = self.lp_arena.add(lp); + ALogicalPlanBuilder::new(node, self.expr_arena, self.lp_arena) + } else { + self + } + } + + pub fn build(self) -> ALogicalPlan { + if self.root.0 == self.lp_arena.len() { + self.lp_arena.pop().unwrap() + } else { + self.lp_arena.take(self.root) + } + } + + pub(crate) fn schema(&'a self) -> Cow<'a, SchemaRef> { + self.lp_arena.get(self.root).schema(self.lp_arena) + } + + pub(crate) fn with_columns(self, exprs: Vec) -> Self { + let schema = self.schema(); + let mut new_schema = (**schema).clone(); + + for e in &exprs { + let field = self + .expr_arena + .get(*e) + .to_field(&schema, Context::Default, self.expr_arena) + .unwrap(); + + new_schema.with_column(field.name().clone(), field.data_type().clone()); + } + + let lp = ALogicalPlan::HStack { + input: self.root, + exprs: ProjectionExprs::new(exprs), + schema: Arc::new(new_schema), + }; + self.add_alp(lp) + } + + // call this if the schema needs to be updated + pub(crate) fn explode(self, columns: Arc<[Arc]>) -> Self { + let mut schema = (*self.schema().into_owned()).clone(); + explode_schema(&mut schema, &columns).unwrap(); + + let lp = ALogicalPlan::MapFunction { + input: self.root, + function: FunctionNode::Explode { + columns, + schema: Arc::new(schema), + }, + }; + self.add_alp(lp) + } + + pub fn groupby( + self, + keys: Vec, + aggs: Vec, + apply: Option>, + maintain_order: bool, + options: Arc, + ) -> Self { + let current_schema 
= self.schema(); + // TODO! add this line if LogicalPlan is dropped in favor of ALogicalPlan + // let aggs = rewrite_projections(aggs, current_schema); + + let mut schema = + aexprs_to_schema(&keys, ¤t_schema, Context::Default, self.expr_arena); + let other = aexprs_to_schema( + &aggs, + ¤t_schema, + Context::Aggregation, + self.expr_arena, + ); + schema.merge(other); + + #[cfg(feature = "dynamic_groupby")] + { + let index_columns = &[ + options + .rolling + .as_ref() + .map(|options| &options.index_column), + options + .dynamic + .as_ref() + .map(|options| &options.index_column), + ]; + for &name in index_columns.iter().flatten() { + let dtype = current_schema.get(name).unwrap(); + schema.with_column(name.clone(), dtype.clone()); + } + } + + let lp = ALogicalPlan::Aggregate { + input: self.root, + keys, + aggs, + schema: Arc::new(schema), + apply, + maintain_order, + options, + }; + self.add_alp(lp) + } + + pub fn join( + self, + other: Node, + left_on: Vec, + right_on: Vec, + options: Arc, + ) -> Self { + let schema_left = self.schema(); + let schema_right = self.lp_arena.get(other).schema(self.lp_arena); + + let left_on_exprs = left_on + .iter() + .map(|node| node_to_expr(*node, self.expr_arena)) + .collect::>(); + let right_on_exprs = right_on + .iter() + .map(|node| node_to_expr(*node, self.expr_arena)) + .collect::>(); + + let schema = det_join_schema( + &schema_left, + &schema_right, + &left_on_exprs, + &right_on_exprs, + &options, + ) + .unwrap(); + + let lp = ALogicalPlan::Join { + input_left: self.root, + input_right: other, + schema, + left_on, + right_on, + options, + }; + + self.add_alp(lp) + } + + pub fn melt(self, args: Arc) -> Self { + let schema = self.schema(); + let schema = det_melt_schema(&args, &schema); + let lp = ALogicalPlan::MapFunction { + input: self.root, + function: FunctionNode::Melt { args, schema }, + }; + self.add_alp(lp) + } + + pub fn row_count(self, name: Arc, offset: Option) -> Self { + let mut schema = 
self.schema().into_owned(); + let schema_mut = Arc::make_mut(&mut schema); + row_count_schema(schema_mut, name.as_ref()); + + let lp = ALogicalPlan::MapFunction { + input: self.root, + function: FunctionNode::RowCount { + name, + offset, + schema, + }, + }; + self.add_alp(lp) + } +} diff --git a/crates/polars-plan/src/logical_plan/builder_functions.rs b/crates/polars-plan/src/logical_plan/builder_functions.rs new file mode 100644 index 0000000000000..ed1f08dc25960 --- /dev/null +++ b/crates/polars-plan/src/logical_plan/builder_functions.rs @@ -0,0 +1,66 @@ +use polars_core::utils::try_get_supertype; + +use super::*; + +// Has functions that create schema's for both the `LogicalPlan` and the `AlogicalPlan` builders. + +pub(super) fn explode_schema(schema: &mut Schema, columns: &[Arc]) -> PolarsResult<()> { + // columns to string + columns.iter().try_for_each(|name| { + if let DataType::List(inner) = schema.try_get(name)? { + let inner = *inner.clone(); + schema.with_column(name.as_ref().into(), inner); + }; + Ok(()) + }) +} + +pub(super) fn det_melt_schema(args: &MeltArgs, input_schema: &Schema) -> SchemaRef { + let mut new_schema = args + .id_vars + .iter() + .map(|id| Field::new(id, input_schema.get(id).unwrap().clone())) + .collect::(); + let variable_name = args + .variable_name + .as_ref() + .cloned() + .unwrap_or_else(|| "variable".into()); + let value_name = args + .value_name + .as_ref() + .cloned() + .unwrap_or_else(|| "value".into()); + + new_schema.with_column(variable_name, DataType::Utf8); + + // We need to determine the supertype of all value columns. 
+ let mut st = None; + + // take all columns that are not in `id_vars` as `value_var` + if args.value_vars.is_empty() { + let id_vars = PlHashSet::from_iter(&args.id_vars); + for (name, dtype) in input_schema.iter() { + if !id_vars.contains(name) { + match &st { + None => st = Some(dtype.clone()), + Some(st_) => st = Some(try_get_supertype(st_, dtype).unwrap()), + } + } + } + } else { + for name in &args.value_vars { + let dtype = input_schema.get(name).unwrap(); + match &st { + None => st = Some(dtype.clone()), + Some(st_) => st = Some(try_get_supertype(st_, dtype).unwrap()), + } + } + } + new_schema.with_column(value_name, st.unwrap()); + Arc::new(new_schema) +} + +pub(super) fn row_count_schema(schema: &mut Schema, name: &str) { + schema.insert_at_index(0, name.into(), IDX_DTYPE).unwrap(); +} diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/conversion.rs b/crates/polars-plan/src/logical_plan/conversion.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/logical_plan/conversion.rs rename to crates/polars-plan/src/logical_plan/conversion.rs index c6a30bac056d5..9ed908ead7bac 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/conversion.rs +++ b/crates/polars-plan/src/logical_plan/conversion.rs @@ -145,10 +145,6 @@ pub fn to_aexpr(expr: Expr, arena: &mut Arena) -> Node { offset: to_aexpr(*offset, arena), length: to_aexpr(*length, arena), }, - Expr::Cache { input, id } => AExpr::Cache { - input: to_aexpr(*input, arena), - id, - }, Expr::Wildcard => AExpr::Wildcard, Expr::Count => AExpr::Count, Expr::Nth(i) => AExpr::Nth(i), @@ -239,10 +235,10 @@ pub fn to_alp( input, schema, } => { - let exp = expr.into_iter().map(|x| to_aexpr(x, expr_arena)).collect(); + let expr = expr.into_iter().map(|x| to_aexpr(x, expr_arena)).collect(); let i = to_alp(*input, expr_arena, lp_arena)?; ALogicalPlan::Projection { - expr: exp, + expr, input: i, schema, } @@ -600,10 +596,6 @@ pub fn node_to_expr(node: Node, expr_arena: &Arena) -> Expr { 
offset: Box::new(node_to_expr(offset, expr_arena)), length: Box::new(node_to_expr(length, expr_arena)), }, - AExpr::Cache { input, id } => Expr::Cache { - input: Box::new(node_to_expr(input, expr_arena)), - id, - }, AExpr::Count => Expr::Count, AExpr::Nth(i) => Expr::Nth(i), AExpr::Wildcard => Expr::Wildcard, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/debug.rs b/crates/polars-plan/src/logical_plan/debug.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/debug.rs rename to crates/polars-plan/src/logical_plan/debug.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/file_scan.rs b/crates/polars-plan/src/logical_plan/file_scan.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/file_scan.rs rename to crates/polars-plan/src/logical_plan/file_scan.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/format.rs b/crates/polars-plan/src/logical_plan/format.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/logical_plan/format.rs rename to crates/polars-plan/src/logical_plan/format.rs index acff7e0e2043c..3aeb9af9dc45b 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/format.rs +++ b/crates/polars-plan/src/logical_plan/format.rs @@ -137,7 +137,8 @@ impl LogicalPlan { ) } Selection { predicate, input } => { - write!(f, "{:indent$}FILTER {predicate:?} FROM", "")?; + // this one is writeln because we don't increase indent (which inserts a line) + writeln!(f, "{:indent$}FILTER {predicate:?} FROM", "")?; input._format(f, indent) } DataFrameScan { @@ -373,7 +374,6 @@ impl Debug for Expr { RenameAlias { expr, .. } => write!(f, ".rename_alias({expr:?})"), Columns(names) => write!(f, "cols({names:?})"), DtypeColumn(dt) => write!(f, "dtype_columns({dt:?})"), - Cache { input, .. 
} => write!(f, "{input:?}.cache()"), Selector(_) => write!(f, "SELECTOR"), } } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/drop.rs b/crates/polars-plan/src/logical_plan/functions/drop.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/functions/drop.rs rename to crates/polars-plan/src/logical_plan/functions/drop.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/explode.rs b/crates/polars-plan/src/logical_plan/functions/explode.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/functions/explode.rs rename to crates/polars-plan/src/logical_plan/functions/explode.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/merge_sorted.rs b/crates/polars-plan/src/logical_plan/functions/merge_sorted.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/functions/merge_sorted.rs rename to crates/polars-plan/src/logical_plan/functions/merge_sorted.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs b/crates/polars-plan/src/logical_plan/functions/mod.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs rename to crates/polars-plan/src/logical_plan/functions/mod.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/python_udf.rs b/crates/polars-plan/src/logical_plan/functions/python_udf.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/functions/python_udf.rs rename to crates/polars-plan/src/logical_plan/functions/python_udf.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/rename.rs b/crates/polars-plan/src/logical_plan/functions/rename.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/functions/rename.rs rename to crates/polars-plan/src/logical_plan/functions/rename.rs diff --git 
a/polars/polars-lazy/polars-plan/src/logical_plan/iterator.rs b/crates/polars-plan/src/logical_plan/iterator.rs similarity index 64% rename from polars/polars-lazy/polars-plan/src/logical_plan/iterator.rs rename to crates/polars-plan/src/logical_plan/iterator.rs index 840f690b77c7c..545e4ef377ce3 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/iterator.rs +++ b/crates/polars-plan/src/logical_plan/iterator.rs @@ -2,7 +2,6 @@ use polars_arrow::error::PolarsResult; use crate::prelude::*; -#[macro_export] macro_rules! push_expr { ($current_expr:expr, $push:ident, $iter:ident) => {{ use Expr::*; @@ -93,7 +92,6 @@ macro_rules! push_expr { } Exclude(e, _) => $push(e), KeepName(e) => $push(e), - Cache { input, .. } => $push(input), RenameAlias { expr, .. } => $push(expr), // pass Selector(_) => {} @@ -137,8 +135,7 @@ impl<'a> ExprMut<'a> { if !f(current_expr)? { break; } - let mut push = |e: &'a mut Expr| self.stack.push(e); - push_expr!(current_expr, push, iter_mut); + current_expr.nodes_mut(&mut self.stack) } Ok(()) } @@ -153,14 +150,24 @@ impl<'a> Iterator for ExprIter<'a> { fn next(&mut self) -> Option { self.stack.pop().map(|current_expr| { - let mut push = |e: &'a Expr| self.stack.push(e); - - push_expr!(current_expr, push, iter); + current_expr.nodes(&mut self.stack); current_expr }) } } +impl Expr { + pub fn nodes<'a>(&'a self, container: &mut Vec<&'a Expr>) { + let mut push = |e: &'a Expr| container.push(e); + push_expr!(self, push, iter); + } + + pub fn nodes_mut<'a>(&'a mut self, container: &mut Vec<&'a mut Expr>) { + let mut push = |e: &'a mut Expr| container.push(e); + push_expr!(self, push, iter_mut); + } +} + impl<'a> IntoIterator for &'a Expr { type Item = &'a Expr; type IntoIter = ExprIter<'a>; @@ -172,105 +179,6 @@ impl<'a> IntoIterator for &'a Expr { } } -impl AExpr { - /// Push nodes at this level to a pre-allocated stack - pub(crate) fn nodes<'a>(&'a self, container: &mut Vec) { - let mut push = |e: &'a Node| container.push(*e); - use 
AExpr::*; - - match self { - Nth(_) | Column(_) | Literal(_) | Wildcard | Count => {} - Alias(e, _) => push(e), - BinaryExpr { left, op: _, right } => { - // reverse order so that left is popped first - push(right); - push(left); - } - Cast { expr, .. } => push(expr), - Cache { input, .. } => push(input), - Sort { expr, .. } => push(expr), - Take { expr, idx } => { - push(idx); - // latest, so that it is popped first - push(expr); - } - SortBy { expr, by, .. } => { - for node in by { - push(node) - } - // latest, so that it is popped first - push(expr); - } - Filter { input, by } => { - push(by); - // latest, so that it is popped first - push(input); - } - Agg(agg_e) => { - use AAggExpr::*; - match agg_e { - Max { input, .. } => push(input), - Min { input, .. } => push(input), - Mean(e) => push(e), - Median(e) => push(e), - NUnique(e) => push(e), - First(e) => push(e), - Last(e) => push(e), - Implode(e) => push(e), - Count(e) => push(e), - Quantile { expr, .. } => push(expr), - Sum(e) => push(e), - AggGroups(e) => push(e), - Std(e, _) => push(e), - Var(e, _) => push(e), - } - } - Ternary { - truthy, - falsy, - predicate, - } => { - push(predicate); - push(falsy); - // latest, so that it is popped first - push(truthy); - } - AnonymousFunction { input, .. } | Function { input, .. } => - // we iterate in reverse order, so that the lhs is popped first and will be found - // as the root columns/ input columns by `_suffix` and `_keep_name` etc. 
- { - input.iter().rev().for_each(push) - } - Explode(e) => push(e), - Window { - function, - partition_by, - order_by, - options: _, - } => { - for e in partition_by.iter().rev() { - push(e); - } - if let Some(e) = order_by { - push(e); - } - // latest so that it is popped first - push(function); - } - Slice { - input, - offset, - length, - } => { - push(length); - push(offset); - // latest so that it is popped first - push(input); - } - } - } -} - pub struct AExprIter<'a> { stack: Vec, arena: Option<&'a Arena>, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/lit.rs b/crates/polars-plan/src/logical_plan/lit.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/lit.rs rename to crates/polars-plan/src/logical_plan/lit.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/mod.rs b/crates/polars-plan/src/logical_plan/mod.rs similarity index 93% rename from polars/polars-lazy/polars-plan/src/logical_plan/mod.rs rename to crates/polars-plan/src/logical_plan/mod.rs index 050685a054cc7..e0421eaa69183 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/mod.rs +++ b/crates/polars-plan/src/logical_plan/mod.rs @@ -16,6 +16,8 @@ pub(crate) mod anonymous_scan; mod apply; mod builder; +mod builder_alp; +pub mod builder_functions; pub(crate) mod conversion; #[cfg(feature = "debugging")] pub(crate) mod debug; @@ -27,10 +29,11 @@ mod lit; pub(crate) mod optimizer; pub(crate) mod options; pub(crate) mod projection; +mod projection_expr; #[cfg(feature = "python")] mod pyarrow; mod schema; -#[cfg(feature = "meta")] +#[cfg(any(feature = "meta", feature = "cse"))] pub(crate) mod tree_format; pub mod visitor; @@ -39,6 +42,7 @@ pub use alp::*; pub use anonymous_scan::*; pub use apply::*; pub use builder::*; +pub use builder_alp::*; pub use conversion::*; pub use file_scan::*; pub use functions::*; @@ -142,7 +146,7 @@ pub enum LogicalPlan { function: Arc, file_info: FileInfo, predicate: Option, - options: AnonymousScanOptions, 
+ options: Arc, }, #[cfg(feature = "python")] PythonScan { options: PythonOptions }, @@ -196,7 +200,7 @@ pub enum LogicalPlan { #[cfg_attr(feature = "serde", serde(skip))] apply: Option>, maintain_order: bool, - options: GroupbyOptions, + options: Arc, }, /// Join operation Join { @@ -205,7 +209,7 @@ pub enum LogicalPlan { schema: SchemaRef, left_on: Vec, right_on: Vec, - options: JoinOptions, + options: Arc, }, /// Adding columns to the table without a Join HStack { @@ -275,4 +279,13 @@ impl LogicalPlan { pub fn describe(&self) -> String { format!("{self:#?}") } + + pub fn to_alp(self) -> PolarsResult<(Node, Arena, Arena)> { + let mut lp_arena = Arena::with_capacity(16); + let mut expr_arena = Arena::with_capacity(16); + + let node = to_alp(self, &mut expr_arena, &mut lp_arena)?; + + Ok((node, lp_arena, expr_arena)) + } } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/cache_states.rs b/crates/polars-plan/src/logical_plan/optimizer/cache_states.rs similarity index 99% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/cache_states.rs rename to crates/polars-plan/src/logical_plan/optimizer/cache_states.rs index 656b8143e4548..2d1ada49ec1c0 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/cache_states.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/cache_states.rs @@ -74,6 +74,7 @@ pub(super) fn set_cache_states( // also self-referencing plans can deadlock on the files they lock Join { options, .. } if has_caches && options.allow_parallel => { if let Join { options, .. 
} = lp_arena.get_mut(current_node) { + let options = Arc::make_mut(options); options.allow_parallel = false; } } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/cse.rs b/crates/polars-plan/src/logical_plan/optimizer/cse.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/cse.rs rename to crates/polars-plan/src/logical_plan/optimizer/cse.rs index 13081fd7efd83..2deb4b5fdd2c5 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/cse.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/cse.rs @@ -88,12 +88,12 @@ fn expr_nodes_equal(a: &[Node], b: &[Node], expr_arena: &Arena) -> bool { a.len() == b.len() && a.iter() .zip(b) - .all(|(a, b)| node_to_expr(*a, expr_arena) == node_to_expr(*b, expr_arena)) + .all(|(a, b)| AExpr::is_equal(*a, *b, expr_arena)) } fn predicate_equal(a: Option, b: Option, expr_arena: &Arena) -> bool { match (a, b) { - (Some(l), Some(r)) => node_to_expr(l, expr_arena) == node_to_expr(r, expr_arena), + (Some(l), Some(r)) => AExpr::is_equal(l, r, expr_arena), (None, None) => true, _ => false, } @@ -135,7 +135,7 @@ fn lp_node_equal(a: &ALogicalPlan, b: &ALogicalPlan, expr_arena: &Arena) && predicate_equal(*predicate_left, *predicate_right, expr_arena) } (Selection { predicate: l, .. }, Selection { predicate: r, .. }) => { - node_to_expr(*l, expr_arena) == node_to_expr(*r, expr_arena) + AExpr::is_equal(*l, *r, expr_arena) } (Projection { expr: l, .. }, Projection { expr: r, .. }) | (HStack { exprs: l, .. }, HStack { exprs: r, .. 
}) => expr_nodes_equal(l, r, expr_arena), diff --git a/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs b/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs new file mode 100644 index 0000000000000..0fb9d86c09c58 --- /dev/null +++ b/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs @@ -0,0 +1,646 @@ +use std::rc::Rc; + +use polars_utils::vec::CapacityByFactor; + +use super::*; +use crate::constants::CSE_REPLACED; +use crate::logical_plan::projection_expr::ProjectionExprs; +use crate::logical_plan::visitor::{RewriteRecursion, VisitRecursion}; +use crate::prelude::visitor::{ALogicalPlanNode, AexprNode, RewritingVisitor, TreeWalker, Visitor}; + +/// Identifier that shows the sub-expression path. +/// Must implement hash and equality and ideally +/// have little collisions +/// We will do a full expression comparison to check if the +/// expressions with equal identifiers are truly equal +// TODO! try to use a hash `usize` for this? +type Identifier = Rc; +/// Identifier maps to Expr Node and count. +type SubExprCount = PlHashMap; +/// (post_visit_idx, identifier); +type IdentifierArray = Vec<(usize, Identifier)>; + +fn replace_name(id: &str) -> String { + format!("{}{}", CSE_REPLACED, id) +} + +#[derive(Debug)] +enum VisitRecord { + /// entered a new expression + Entered(usize), + /// every visited sub-expression pushes their identifier to the stack + SubExprId(Identifier), +} + +/// Goes through an expression and generates a identifier +/// +/// The visitor uses a `visit_stack` to track traversal order. +/// +/// # Entering a node +/// When `pre-visit` is called we enter a new (sub)-expression and +/// we add `Entered` to the stack. +/// # Leaving a node +/// On `post-visit` when we leave the node and we pop all `SubExprIds` nodes. +/// Those are considered sub-expression of the leaving node +/// +/// We also record an `id_array` that followed the pre-visit order. This +/// is used to cache the `Identifiers`. 
+// +// # Example (this is not a docstring as clippy complains about spacing) +// Say we have the expression: `(col("f00").min() * col("bar")).sum()` +// with the following call tree: +// +// sum +// +// | +// +// binary: * +// +// | | +// +// col(bar) min +// +// | +// +// col(f00) +// +// # call order +// function-called stack stack-after(pop until E, push I) # ID +// pre-visit: sum E - +// pre-visit: binary: * EE - +// pre-visit: col(bar) EEE - +// post-visit: col(bar) EEE EEI id: col(bar) +// pre-visit: min EEIE - +// pre-visit: col(f00) EEIEE - +// post-visit: col(f00) EEIEE EEIEI id: col(f00) +// post-visit: min EEIEI EEII id: min!col(f00) +// post-visit: binary: * EEII EI id: binary: *!min!col(f00)!col(bar) +// post-visit: sum EI I id: sum!binary: *!min!col(f00)!col(bar) +struct ExprIdentifierVisitor<'a> { + se_count: &'a mut SubExprCount, + identifier_array: &'a mut IdentifierArray, + // index in pre-visit traversal order + pre_visit_idx: usize, + post_visit_idx: usize, + visit_stack: &'a mut Vec, + /// Offset in the identifier array + /// this allows us to use a single `vec` on multiple expressions + id_array_offset: usize, + // whether the expression replaced a subexpression + has_sub_expr: bool, + // During aggregation we only identify element-wise operations + is_groupby: bool, +} + +impl ExprIdentifierVisitor<'_> { + fn new<'a>( + se_count: &'a mut SubExprCount, + identifier_array: &'a mut IdentifierArray, + visit_stack: &'a mut Vec, + is_groupby: bool, + ) -> ExprIdentifierVisitor<'a> { + let id_array_offset = identifier_array.len(); + ExprIdentifierVisitor { + se_count, + identifier_array, + pre_visit_idx: 0, + post_visit_idx: 0, + visit_stack, + id_array_offset, + has_sub_expr: false, + is_groupby, + } + } + + /// pop all visit-records until an `Entered` is found. We accumulate a `SubExprId`s + /// to `id`. Finally we return the expression `idx` and `Identifier`. + /// This works due to the stack. 
+ /// If we traverse another expression in the mean time, it will get popped of the stack first + /// so the returned identifier belongs to a single sub-expression + fn pop_until_entered(&mut self) -> (usize, Identifier) { + let mut id = String::new(); + + while let Some(item) = self.visit_stack.pop() { + match item { + VisitRecord::Entered(idx) => return (idx, Rc::from(id)), + VisitRecord::SubExprId(s) => { + id.push('!'); + id.push_str(s.as_ref()); + } + } + } + unreachable!() + } + + fn accept_node(&self, ae: &AExpr) -> bool { + match ae { + // skip window functions for now until we properly implemented the physical side + AExpr::Column(_) + | AExpr::Count + | AExpr::Literal(_) + | AExpr::Window { .. } + | AExpr::Alias(_, _) => false, + #[cfg(feature = "random")] + AExpr::Function { + function: FunctionExpr::Random { .. }, + .. + } => false, + _ => { + // during aggregation we only store elementwise operation in the state + // other operations we cannot add to the state as they have the output size of the + // groups, not the original dataframe + if self.is_groupby { + match ae { + AExpr::Agg(_) | AExpr::AnonymousFunction { .. } => false, + AExpr::Function { options, .. 
} => !options.is_groups_sensitive(), + _ => true, + } + } else { + true + } + } + } + } +} + +impl Visitor for ExprIdentifierVisitor<'_> { + type Node = AexprNode; + + fn pre_visit(&mut self, _node: &Self::Node) -> PolarsResult { + self.visit_stack + .push(VisitRecord::Entered(self.pre_visit_idx)); + self.pre_visit_idx += 1; + + // implement default placeholders + self.identifier_array + .push((self.id_array_offset, "".into())); + + Ok(VisitRecursion::Continue) + } + + fn post_visit(&mut self, node: &Self::Node) -> PolarsResult { + let ae = node.to_aexpr(); + self.post_visit_idx += 1; + + let (pre_visit_idx, sub_expr_id) = self.pop_until_entered(); + + // if we don't store this node + // we only push the visit_stack, so the parents know the trail + if !self.accept_node(ae) { + self.identifier_array[pre_visit_idx + self.id_array_offset].0 = self.post_visit_idx; + self.visit_stack + .push(VisitRecord::SubExprId(Rc::from(format!("{:E}", ae)))); + return Ok(VisitRecursion::Continue); + } + + // create the id of this node + let id: Identifier = Rc::from(format!("{:E}{}", ae, sub_expr_id)); + + // store the created id + self.identifier_array[pre_visit_idx + self.id_array_offset] = + (self.post_visit_idx, id.clone()); + + // We popped until entered, push this Id on the stack so the trail + // is available for the parent expression + self.visit_stack.push(VisitRecord::SubExprId(id.clone())); + + let (_, se_count) = self.se_count.entry(id).or_insert_with(|| (node.node(), 0)); + + *se_count += 1; + self.has_sub_expr |= *se_count > 1; + + Ok(VisitRecursion::Continue) + } +} + +struct CommonSubExprRewriter<'a> { + sub_expr_map: &'a SubExprCount, + identifier_array: &'a IdentifierArray, + /// keep track of the replaced identifiers + replaced_identifiers: &'a mut PlHashSet, + + max_post_visit_idx: usize, + /// index in traversal order in which `identifier_array` + /// was written. This is the index in `identifier_array`. 
+ visited_idx: usize, + /// Offset in the identifier array + /// this allows us to use a single `vec` on multiple expressions + id_array_offset: usize, +} + +impl<'a> CommonSubExprRewriter<'a> { + fn new( + sub_expr_map: &'a SubExprCount, + identifier_array: &'a IdentifierArray, + replaced_identifiers: &'a mut PlHashSet, + id_array_offset: usize, + ) -> Self { + Self { + sub_expr_map, + identifier_array, + replaced_identifiers, + max_post_visit_idx: 0, + visited_idx: 0, + id_array_offset, + } + } +} + +// # Example +// Expression tree with [pre-visit,post-visit] indices +// counted from 1 +// [1,8] binary: + +// +// | | +// +// [2,2] sum [4,7] sum +// +// | | +// +// [3,1] col(foo) [5,6] binary: * +// +// | | +// +// [6,3] col(bar) [7,5] sum +// +// | +// +// [8,4] col(foo) +// +// in this tree `col(foo).sum()` should be post-visited/mutated +// so if we are at `[2,2]` +// +// call stack +// pre-visit [1,8] binary -> no_mutate_and_continue -> visits children +// pre-visit [2,2] sum -> mutate_and_stop -> does not visit children +// post-visit [2,2] sum -> skip index to [4,7] (because we didn't visit children) +// pre-visit [4,7] sum -> no_mutate_and_continue -> visits children +// pre-visit [5,6] binary -> no_mutate_and_continue -> visits children +// pre-visit [6,3] col -> stop_recursion -> does not mutate +// pre-visit [7,5] sum -> mutate_and_stop -> does not visit children +// post-visit [7,5] -> skip index to end +impl RewritingVisitor for CommonSubExprRewriter<'_> { + type Node = AexprNode; + + fn pre_visit(&mut self, ae_node: &Self::Node) -> PolarsResult { + if self.visited_idx + self.id_array_offset >= self.identifier_array.len() + || self.max_post_visit_idx + > self.identifier_array[self.visited_idx + self.id_array_offset].0 + { + return Ok(RewriteRecursion::Stop); + } + + let id = &self.identifier_array[self.visited_idx + self.id_array_offset].1; + + // placeholder not overwritten, so we can skip this sub-expression + if id.is_empty() { + self.visited_idx 
+= 1; + let recurse = if ae_node.is_leaf() { + RewriteRecursion::Stop + } else { + // continue visit its children to see + // if there are cse + RewriteRecursion::NoMutateAndContinue + }; + return Ok(recurse); + } + + let (node, count) = self.sub_expr_map.get(id).unwrap(); + if *count > 1 + // this does a full expression traversal to check if the expression is truly + // the same + && ae_node.binary(*node, |l, r| l == r) + { + self.replaced_identifiers.insert(id.clone()); + // rewrite this sub-expression, don't visit its children + Ok(RewriteRecursion::MutateAndStop) + } else { + // This is a unique expression + // visit its children to see if they are cse + self.visited_idx += 1; + Ok(RewriteRecursion::NoMutateAndContinue) + } + } + + fn mutate(&mut self, mut node: Self::Node) -> PolarsResult { + let (post_visit_count, id) = + &self.identifier_array[self.visited_idx + self.id_array_offset]; + self.visited_idx += 1; + + // TODO!: check if we ever hit this branch + if *post_visit_count < self.max_post_visit_idx { + return Ok(node); + } + + self.max_post_visit_idx = *post_visit_count; + // DFS, so every post_visit that is smaller than `post_visit_count` + // is a subexpression of this node and we can skip that + // + // `self.visited_idx` will influence recursion strategy in `pre_visit` + // see call-stack comment above + while self.visited_idx < self.identifier_array.len() - self.id_array_offset + && *post_visit_count > self.identifier_array[self.visited_idx + self.id_array_offset].0 + { + self.visited_idx += 1; + } + + let name = replace_name(id.as_ref()); + node.assign(AExpr::col(name.as_ref())); + + Ok(node) + } +} + +pub(crate) struct CommonSubExprOptimizer<'a> { + expr_arena: &'a mut Arena, + // amortize allocations + // these are cleared per lp node + se_count: SubExprCount, + id_array: IdentifierArray, + id_array_offsets: Vec, + replaced_identifiers: PlHashSet, + // these are cleared per expr node + visit_stack: Vec, +} + +impl<'a> CommonSubExprOptimizer<'a> 
{ + pub(crate) fn new(expr_arena: &'a mut Arena) -> Self { + Self { + expr_arena, + se_count: Default::default(), + id_array: Default::default(), + visit_stack: Default::default(), + id_array_offsets: Default::default(), + replaced_identifiers: Default::default(), + } + } + + fn visit_expression( + &mut self, + ae_node: AexprNode, + is_groupby: bool, + ) -> PolarsResult<(usize, bool)> { + let mut visitor = ExprIdentifierVisitor::new( + &mut self.se_count, + &mut self.id_array, + &mut self.visit_stack, + is_groupby, + ); + ae_node.visit(&mut visitor).map(|_| ())?; + Ok((visitor.id_array_offset, visitor.has_sub_expr)) + } + fn mutate_expression( + &mut self, + ae_node: AexprNode, + id_array_offset: usize, + ) -> PolarsResult { + let mut rewriter = CommonSubExprRewriter::new( + &self.se_count, + &self.id_array, + &mut self.replaced_identifiers, + id_array_offset, + ); + ae_node.rewrite(&mut rewriter) + } + + fn find_cse( + &mut self, + expr: &[Node], + expr_arena: &mut Arena, + id_array_offsets: &mut Vec, + is_groupby: bool, + ) -> PolarsResult> { + let mut has_sub_expr = false; + + // first get all cse's + for node in expr { + // the visitor can return early thus depleted its stack + // on a previous iteration + self.visit_stack.clear(); + + // visit expressions and collect sub-expression counts + let (id_array_offset, this_expr_has_se) = + AexprNode::with_context(*node, expr_arena, |ae_node| { + self.visit_expression(ae_node, is_groupby) + })?; + id_array_offsets.push(id_array_offset as u32); + has_sub_expr |= this_expr_has_se; + } + + if has_sub_expr { + let mut new_expr = Vec::with_capacity_by_factor(expr.len(), 1.3); + + // then rewrite the expressions that have a cse count > 1 + for (node, offset) in expr.iter().zip(id_array_offsets.iter()) { + let new_node = AexprNode::with_context(*node, expr_arena, |ae_node| { + self.mutate_expression(ae_node, *offset as usize) + })?; + new_expr.push(new_node.node()) + } + // Add the tmp columns + for id in 
&self.replaced_identifiers { + let (node, _count) = self.se_count.get(id).unwrap(); + let name = replace_name(id.as_ref()); + let ae = AExpr::Alias(*node, Arc::from(name)); + let node = expr_arena.add(ae); + new_expr.push(node) + } + let expr = ProjectionExprs::new_with_cse(new_expr, self.replaced_identifiers.len()); + Ok(Some(expr)) + } else { + Ok(None) + } + } +} + +impl<'a> RewritingVisitor for CommonSubExprOptimizer<'a> { + type Node = ALogicalPlanNode; + + fn pre_visit(&mut self, node: &Self::Node) -> PolarsResult { + use ALogicalPlan::*; + Ok(match node.to_alp() { + Projection { .. } | HStack { .. } | Aggregate { .. } => { + RewriteRecursion::MutateAndContinue + } + _ => RewriteRecursion::NoMutateAndContinue, + }) + } + + fn mutate(&mut self, mut node: Self::Node) -> PolarsResult { + let mut expr_arena = Arena::new(); + std::mem::swap(self.expr_arena, &mut expr_arena); + let mut id_array_offsets = std::mem::take(&mut self.id_array_offsets); + + self.se_count.clear(); + self.id_array.clear(); + id_array_offsets.clear(); + self.replaced_identifiers.clear(); + + match node.to_alp() { + ALogicalPlan::Projection { + input, + expr, + schema, + } => { + if let Some(expr) = + self.find_cse(expr, &mut expr_arena, &mut id_array_offsets, false)? + { + let lp = ALogicalPlan::Projection { + input: *input, + expr, + schema: schema.clone(), + }; + node.replace(lp); + } + } + ALogicalPlan::HStack { + input, + exprs, + schema, + } => { + if let Some(exprs) = + self.find_cse(exprs, &mut expr_arena, &mut id_array_offsets, false)? + { + let lp = ALogicalPlan::HStack { + input: *input, + exprs, + schema: schema.clone(), + }; + node.replace(lp); + } + } + ALogicalPlan::Aggregate { + input, + keys, + aggs, + options, + maintain_order, + apply, + schema, + } => { + if let Some(aggs) = + self.find_cse(aggs, &mut expr_arena, &mut id_array_offsets, true)? 
+ { + let keys = keys.clone(); + let options = options.clone(); + let schema = schema.clone(); + let apply = apply.clone(); + let maintain_order = *maintain_order; + let input = *input; + + let input = node.with_arena_mut(|lp_arena| { + let lp = ALogicalPlanBuilder::new(input, &mut expr_arena, lp_arena) + .with_columns(aggs.cse_exprs().to_vec()) + .build(); + lp_arena.add(lp) + }); + + let lp = ALogicalPlan::Aggregate { + input, + keys, + aggs: aggs.default_exprs().to_vec(), + options, + schema, + maintain_order, + apply, + }; + node.replace(lp); + } + } + _ => {} + }; + std::mem::swap(self.expr_arena, &mut expr_arena); + self.id_array_offsets = id_array_offsets; + Ok(node) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_cse_replacer() { + let e = (col("f00").sum() * col("bar")).sum() + col("f00").sum(); + + let mut arena = Arena::new(); + let node = to_aexpr(e, &mut arena); + + let mut se_count = Default::default(); + + // Pre-fill `id_array` with a value to also check if we deal with the offset correct; + let mut id_array = vec![(0, Rc::from("")); 1]; + let id_array_offset = id_array.len(); + let mut visit_stack = vec![]; + let mut visitor = + ExprIdentifierVisitor::new(&mut se_count, &mut id_array, &mut visit_stack, false); + + AexprNode::with_context(node, &mut arena, |ae_node| ae_node.visit(&mut visitor)).unwrap(); + + let mut replaced_ids = Default::default(); + let mut rewriter = + CommonSubExprRewriter::new(&se_count, &id_array, &mut replaced_ids, id_array_offset); + let ae_node = + AexprNode::with_context(node, &mut arena, |ae_node| ae_node.rewrite(&mut rewriter)) + .unwrap(); + + let e = node_to_expr(ae_node.node(), &arena); + assert_eq!( + format!("{}", e), + r#"[(col("__POLARS_CSER_sum!col(f00)")) + ([(col("bar")) * (col("__POLARS_CSER_sum!col(f00)"))].sum())]"# + ); + } + + #[test] + fn test_lp_cse_replacer() { + let df = df![ + "a" => [1, 2, 3], + "b" => [4, 5, 6], + ] + .unwrap(); + + let e = col("a").sum(); + + let lp = 
LogicalPlanBuilder::from_existing_df(df) + .project(vec![ + e.clone() * col("b"), + e.clone() * col("b") + e, + col("b"), + ]) + .build(); + + let (node, mut lp_arena, mut expr_arena) = lp.to_alp().unwrap(); + let mut optimizer = CommonSubExprOptimizer::new(&mut expr_arena); + + let out = ALogicalPlanNode::with_context(node, &mut lp_arena, |alp_node| { + alp_node.rewrite(&mut optimizer) + }) + .unwrap(); + + let ALogicalPlan::Projection { expr, .. } = out.to_alp() else { + unreachable!() + }; + + let default = expr.default_exprs(); + assert_eq!(default.len(), 3); + assert_eq!( + format!("{}", node_to_expr(default[0], &expr_arena)), + r#"[(col("b")) * (col("__POLARS_CSER_sum!col(a)"))]"# + ); + assert_eq!( + format!("{}", node_to_expr(default[1], &expr_arena)), + r#"[(col("__POLARS_CSER_sum!col(a)")) + ([(col("b")) * (col("__POLARS_CSER_sum!col(a)"))])]"# + ); + assert_eq!( + format!("{}", node_to_expr(default[2], &expr_arena)), + r#"col("b")"# + ); + + let cse = expr.cse_exprs(); + assert_eq!(cse.len(), 1); + assert_eq!( + format!("{}", node_to_expr(cse[0], &expr_arena)), + r#"col("a").sum().alias("__POLARS_CSER_sum!col(a)")"# + ); + } +} diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/delay_rechunk.rs b/crates/polars-plan/src/logical_plan/optimizer/delay_rechunk.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/delay_rechunk.rs rename to crates/polars-plan/src/logical_plan/optimizer/delay_rechunk.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/drop_nulls.rs b/crates/polars-plan/src/logical_plan/optimizer/drop_nulls.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/drop_nulls.rs rename to crates/polars-plan/src/logical_plan/optimizer/drop_nulls.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/fast_projection.rs b/crates/polars-plan/src/logical_plan/optimizer/fast_projection.rs similarity index 99% rename 
from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/fast_projection.rs rename to crates/polars-plan/src/logical_plan/optimizer/fast_projection.rs index e796ff0a4032b..b883849511884 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/fast_projection.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/fast_projection.rs @@ -32,7 +32,7 @@ impl FastProjectionAndCollapse { fn impl_fast_projection( input: Node, expr: &[Node], - expr_arena: &mut Arena, + expr_arena: &Arena, ) -> Option { let mut columns = Vec::with_capacity(expr.len()); for node in expr.iter() { diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/file_caching.rs b/crates/polars-plan/src/logical_plan/optimizer/file_caching.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/file_caching.rs rename to crates/polars-plan/src/logical_plan/optimizer/file_caching.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/flatten_union.rs b/crates/polars-plan/src/logical_plan/optimizer/flatten_union.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/flatten_union.rs rename to crates/polars-plan/src/logical_plan/optimizer/flatten_union.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/fused.rs b/crates/polars-plan/src/logical_plan/optimizer/fused.rs similarity index 91% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/fused.rs rename to crates/polars-plan/src/logical_plan/optimizer/fused.rs index 7ece608af5671..76117d96bbd2a 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/fused.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/fused.rs @@ -26,7 +26,9 @@ fn check_eligible( expr_arena: &Arena, lp_arena: &Arena, ) -> PolarsResult<(Option, Option)> { - let Some(input_node) = lp_arena.get(lp_node).get_input() else {return Ok((None, None))}; + let Some(input_node) = 
lp_arena.get(lp_node).get_input() else { + return Ok((None, None)); + }; let schema = lp_arena.get(input_node).schema(lp_arena); let field_left = expr_arena .get(*left) @@ -35,7 +37,14 @@ fn check_eligible( .get(*right) .get_type(&schema, Context::Default, expr_arena)?; let type_left = &field_left.dtype; - if type_left.is_numeric() && type_right.is_numeric() { + // Exclude literals for now as these will not benefit from fused operations downstream #9857 + // This optimization would also interfere with the `col -> lit` type-coercion rules + // And it might also interfere with constant folding which is a more suitable optimizations here + if type_left.is_numeric() + && type_right.is_numeric() + && !has_aexpr_literal(*left, expr_arena) + && !has_aexpr_literal(*right, expr_arena) + { Ok((Some(true), Some(field_left))) } else { Ok((Some(false), None)) @@ -45,7 +54,7 @@ fn check_eligible( impl OptimizationRule for FusedArithmetic { #[allow(clippy::float_cmp)] fn optimize_expr( - &self, + &mut self, expr_arena: &mut Arena, expr_node: Node, lp_arena: &Arena, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/mod.rs similarity index 89% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs rename to crates/polars-plan/src/logical_plan/optimizer/mod.rs index 71f997ad79451..1b8b08e68a7b0 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/mod.rs @@ -9,6 +9,8 @@ mod cse; mod delay_rechunk; mod drop_nulls; +#[cfg(feature = "cse")] +mod cse_expr; mod fast_projection; #[cfg(any(feature = "ipc", feature = "parquet", feature = "csv", feature = "cse"))] pub(crate) mod file_caching; @@ -37,6 +39,10 @@ pub use type_coercion::TypeCoercionRule; use self::flatten_union::FlattenUnionRule; pub use crate::frame::{AllowedOptimizations, OptState}; +#[cfg(feature = "cse")] +use 
crate::logical_plan::optimizer::cse_expr::CommonSubExprOptimizer; +#[cfg(feature = "cse")] +use crate::logical_plan::visitor::*; pub trait Optimize { fn optimize(&self, logical_plan: LogicalPlan) -> PolarsResult; @@ -64,10 +70,12 @@ pub fn optimize( let slice_pushdown = opt_state.slice_pushdown; let streaming = opt_state.streaming; #[cfg(feature = "cse")] - let cse = opt_state.common_subplan_elimination; + let comm_subplan_elim = opt_state.comm_subplan_elim; + #[cfg(feature = "cse")] + let comm_subexpr_elim = opt_state.comm_subexpr_elim; #[allow(unused_variables)] - let agg_scan_projection = opt_state.file_caching; + let agg_scan_projection = opt_state.file_caching && !streaming; // gradually fill the rules passed to the optimizer let opt = StackOptimizer {}; @@ -80,7 +88,7 @@ pub fn optimize( let mut lp_top = to_alp(logical_plan, expr_arena, lp_arena)?; #[cfg(feature = "cse")] - let cse_changed = if cse { + let cse_changed = if comm_subplan_elim { let (lp, changed) = cse::elim_cmn_subplans(lp_top, lp_arena, expr_arena); lp_top = lp; changed @@ -177,6 +185,16 @@ pub fn optimize( lp_top = opt.optimize_loop(&mut rules, expr_arena, lp_arena, lp_top)?; + // This one should run (nearly) last as this modifies the projections + #[cfg(feature = "cse")] + if comm_subexpr_elim { + let mut optimizer = CommonSubExprOptimizer::new(expr_arena); + lp_top = ALogicalPlanNode::with_context(lp_top, lp_arena, |alp_node| { + alp_node.rewrite(&mut optimizer) + })? 
+ .node() + } + // during debug we check if the optimizations have not modified the final schema #[cfg(debug_assertions)] { diff --git a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/join.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/join.rs new file mode 100644 index 0000000000000..b9e932673b85e --- /dev/null +++ b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/join.rs @@ -0,0 +1,128 @@ +use super::*; + +fn should_block_join_specific(ae: &AExpr, how: &JoinType) -> bool { + use AExpr::*; + match ae { + // joins can produce null values + Function { + function: + FunctionExpr::Boolean(BooleanFunction::IsNotNull) + | FunctionExpr::Boolean(BooleanFunction::IsNull) + | FunctionExpr::FillNull { .. }, + .. + } => join_produces_null(how), + // joins can produce duplicates + #[cfg(feature = "is_unique")] + Function { + function: + FunctionExpr::Boolean(BooleanFunction::IsUnique) + | FunctionExpr::Boolean(BooleanFunction::IsDuplicated), + .. + } => true, + #[cfg(feature = "is_first")] + Function { + function: FunctionExpr::Boolean(BooleanFunction::IsFirst), + .. + } => true, + // any operation that checks for equality or ordering can be wrong because + // the join can produce null values + // TODO! check if we can be less conservative here + BinaryExpr { op, .. 
} => !matches!(op, Operator::NotEq) && join_produces_null(how), + _ => false, + } +} + +fn join_produces_null(how: &JoinType) -> bool { + #[cfg(feature = "asof_join")] + { + matches!( + how, + JoinType::Left | JoinType::Outer | JoinType::Cross | JoinType::AsOf(_) + ) + } + #[cfg(not(feature = "asof_join"))] + { + matches!(how, JoinType::Left | JoinType::Outer | JoinType::Cross) + } +} + +#[allow(clippy::too_many_arguments)] +pub(super) fn process_join( + opt: &PredicatePushDown, + lp_arena: &mut Arena, + expr_arena: &mut Arena, + input_left: Node, + input_right: Node, + left_on: Vec, + right_on: Vec, + schema: SchemaRef, + options: Arc, + acc_predicates: PlHashMap, Node>, +) -> PolarsResult { + use ALogicalPlan::*; + let schema_left = lp_arena.get(input_left).schema(lp_arena); + let schema_right = lp_arena.get(input_right).schema(lp_arena); + + let mut pushdown_left = init_hashmap(Some(acc_predicates.len())); + let mut pushdown_right = init_hashmap(Some(acc_predicates.len())); + let mut local_predicates = Vec::with_capacity(acc_predicates.len()); + + for (_, predicate) in acc_predicates { + // check if predicate can pass the joins node + if has_aexpr(predicate, expr_arena, |ae| { + should_block_join_specific(ae, &options.args.how) + }) { + local_predicates.push(predicate); + continue; + } + // these indicate to which tables we are going to push down the predicate + let mut filter_left = false; + let mut filter_right = false; + + // predicate should not have an aggregation or window function as that would + // be influenced by join + #[allow(clippy::suspicious_else_formatting)] + if !predicate_is_pushdown_boundary(predicate, expr_arena) { + if check_input_node(predicate, &schema_left, expr_arena) { + insert_and_combine_predicate(&mut pushdown_left, predicate, expr_arena); + filter_left = true; + } + // this is `else if` because if the predicate is in the left hand side + // the right hand side should be renamed with the suffix. 
+ // in that case we should not push down as the user wants to filter on `x` + // not on `x_rhs`. + else if check_input_node(predicate, &schema_right, expr_arena) { + insert_and_combine_predicate(&mut pushdown_right, predicate, expr_arena); + filter_right = true; + } + } + match (filter_left, filter_right, &options.args.how) { + // if not pushed down on one of the tables we have to do it locally. + (false, false, _) | + // if left join and predicate only available in right table, + // 'we should not filter right, because that would lead to + // invalid results. + // see: #2057 + (false, true, JoinType::Left) + => { + local_predicates.push(predicate); + continue; + }, + // business as usual + _ => {} + } + } + + opt.pushdown_and_assign(input_left, pushdown_left, lp_arena, expr_arena)?; + opt.pushdown_and_assign(input_right, pushdown_right, lp_arena, expr_arena)?; + + let lp = Join { + input_left, + input_right, + left_on, + right_on, + schema, + options, + }; + Ok(opt.optional_apply_predicate(lp, local_predicates, lp_arena, expr_arena)) +} diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/keys.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/keys.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/keys.rs rename to crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/keys.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs similarity index 78% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs rename to crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs index 10cc1c0a560ce..134787fa78a96 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs +++ 
b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs @@ -1,3 +1,4 @@ +mod join; mod keys; mod rename; mod utils; @@ -9,26 +10,13 @@ use utils::*; use super::*; use crate::dsl::function_expr::FunctionExpr; use crate::logical_plan::{optimizer, Context}; +use crate::prelude::optimizer::predicate_pushdown::join::process_join; use crate::prelude::optimizer::predicate_pushdown::rename::process_rename; use crate::utils::{aexprs_to_schema, check_input_node, has_aexpr}; #[derive(Default)] pub struct PredicatePushDown {} -fn join_produces_null(how: &JoinType) -> bool { - #[cfg(feature = "asof_join")] - { - matches!( - how, - JoinType::Left | JoinType::Outer | JoinType::Cross | JoinType::AsOf(_) - ) - } - #[cfg(not(feature = "asof_join"))] - { - matches!(how, JoinType::Left | JoinType::Outer | JoinType::Cross) - } -} - impl PredicatePushDown { fn optional_apply_predicate( &self, @@ -361,104 +349,16 @@ impl PredicatePushDown { schema, options, } => { - let schema_left = lp_arena.get(input_left).schema(lp_arena); - let schema_right = lp_arena.get(input_right).schema(lp_arena); - - let mut pushdown_left = optimizer::init_hashmap(Some(acc_predicates.len())); - let mut pushdown_right = optimizer::init_hashmap(Some(acc_predicates.len())); - let mut local_predicates = Vec::with_capacity(acc_predicates.len()); - - for (_, predicate) in acc_predicates { - // unique and duplicated can be caused by joins - #[cfg(feature = "is_unique")] - let matches = { - |e: &AExpr| matches!(e, AExpr::Function{ - function: FunctionExpr::Boolean(BooleanFunction::IsDuplicated) - | FunctionExpr::Boolean(BooleanFunction::IsUnique), - .. - }) - }; - #[cfg(not(feature = "is_unique"))] - let matches = { - |_e: &AExpr| false - }; - - - let checks_nulls = - |e: &AExpr| matches!(e, AExpr::Function{ - function: FunctionExpr::Boolean(BooleanFunction::IsNotNull) - | FunctionExpr::Boolean(BooleanFunction::IsNull), - .. 
- }) || - // any operation that checks for equality or ordering can be wrong because - // the join can produce null values - matches!(e, AExpr::BinaryExpr {op, ..} if !matches!(op, Operator::NotEq)); - if has_aexpr(predicate, expr_arena, matches) - // join might create null values. - || has_aexpr(predicate, expr_arena, checks_nulls) - // only these join types produce null values - && join_produces_null(&options.args.how) { - local_predicates.push(predicate); - continue; - } - // these indicate to which tables we are going to push down the predicate - let mut filter_left = false; - let mut filter_right = false; - - // predicate should not have an aggregation or window function as that would - // be influenced by join - #[allow(clippy::suspicious_else_formatting)] - if !predicate_is_pushdown_boundary(predicate, expr_arena) { - if check_input_node(predicate, &schema_left, expr_arena) { - insert_and_combine_predicate( - &mut pushdown_left, - predicate, - expr_arena, - ); - filter_left = true; - } - // this is `else if` because if the predicate is in the left hand side - // the right hand side should be renamed with the suffix. - // in that case we should not push down as the user wants to filter on `x` - // not on `x_rhs`. - else if check_input_node(predicate, &schema_right, expr_arena) { - insert_and_combine_predicate( - &mut pushdown_right, - predicate, - expr_arena, - ); - filter_right = true; - } - } - match (filter_left, filter_right, &options.args.how) { - // if not pushed down on one of the tables we have to do it locally. - (false, false, _) | - // if left join and predicate only available in right table, - // 'we should not filter right, because that would lead to - // invalid results. 
- // see: #2057 - (false, true, JoinType::Left) - => { - local_predicates.push(predicate); - continue; - }, - // business as usual - _ => {} - } - } - - self.pushdown_and_assign(input_left, pushdown_left, lp_arena, expr_arena)?; - self.pushdown_and_assign(input_right, pushdown_right, lp_arena, expr_arena)?; - - let lp = Join { - input_left, - input_right, - left_on, - right_on, - schema, - options, - }; - Ok(self.optional_apply_predicate(lp, local_predicates, lp_arena, expr_arena)) + process_join(self, lp_arena, + expr_arena, + input_left, + input_right, + left_on, + right_on, + schema, + options, + acc_predicates + ) } MapFunction { ref function, .. } => { if function.allow_predicate_pd() @@ -617,21 +517,3 @@ impl PredicatePushDown { self.push_down(logical_plan, acc_predicates, lp_arena, expr_arena) } } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_insert_and_combine_predicate() { - let mut acc_predicates = PlHashMap::with_capacity(32); - let mut expr_arena = Arena::new(); - - let predicate_expr = col("foo").gt(col("bar")); - let predicate = to_aexpr(predicate_expr.clone(), &mut expr_arena); - insert_and_combine_predicate(&mut acc_predicates, predicate, &mut expr_arena); - let root = *acc_predicates.get("foo").unwrap(); - let expr = node_to_expr(root, &expr_arena); - assert_eq!(format!("{:?}", &expr), format!("{:?}", predicate_expr)); - } -} diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/rename.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/rename.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/rename.rs rename to crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/rename.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs similarity index 97% rename from 
polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs rename to crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs index 6e4e41bb2d32b..3526e5b303a98 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs @@ -214,7 +214,13 @@ fn rename_predicate_columns_due_to_aliased_projection( let projection_aexpr = expr_arena.get(projection_node); if let AExpr::Alias(_, alias_name) = projection_aexpr { let alias_name = alias_name.as_ref(); - let projection_roots = aexpr_to_leaf_names(projection_node, expr_arena); + let projection_leaves = aexpr_to_leaf_names(projection_node, expr_arena); + + // this means the leaf is a literal + if projection_leaves.is_empty() { + return LoopBehavior::Nothing; + } + // if this alias refers to one of the predicates in the upper nodes // we rename the column of the predicate before we push it downwards. 
if let Some(predicate) = acc_predicates.remove(alias_name) { @@ -222,11 +228,11 @@ fn rename_predicate_columns_due_to_aliased_projection( local_predicates.push(predicate); return LoopBehavior::Continue; } - if projection_roots.len() == 1 { + if projection_leaves.len() == 1 { // we were able to rename the alias column with the root column name // before pushing down the predicate let predicate = - rename_aexpr_leaf_names(predicate, expr_arena, projection_roots[0].clone()); + rename_aexpr_leaf_names(predicate, expr_arena, projection_leaves[0].clone()); insert_and_combine_predicate(acc_predicates, predicate, expr_arena); } else { @@ -259,7 +265,7 @@ fn rename_predicate_columns_due_to_aliased_projection( /// Implementation for both Hstack and Projection pub(super) fn rewrite_projection_node( expr_arena: &mut Arena, - lp_arena: &mut Arena, + lp_arena: &Arena, acc_predicates: &mut PlHashMap, Node>, projections: Vec, input: Node, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/melt.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/melt.rs similarity index 90% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/melt.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/melt.rs index d31d5d2da7cd7..4309ea08e11bc 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/melt.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/melt.rs @@ -43,6 +43,12 @@ pub(super) fn process_melt( lp_arena, expr_arena, )?; + + // re-make melt node so that the schema is updated + let lp = ALogicalPlanBuilder::new(input, expr_arena, lp_arena) + .melt(args.clone()) + .build(); + if local_projections.is_empty() { Ok(lp) } else { diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/mod.rs 
b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/mod.rs similarity index 81% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/mod.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/mod.rs index d3b97f84be747..b476124e808ed 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/functions/mod.rs @@ -15,11 +15,6 @@ pub(super) fn process_functions( lp_arena: &mut Arena, expr_arena: &mut Arena, ) -> PolarsResult { - let lp = ALogicalPlan::MapFunction { - input, - function: function.clone(), - }; - use FunctionNode::*; match function { Rename { @@ -43,6 +38,11 @@ pub(super) fn process_functions( lp_arena, expr_arena, )?; + + let lp = ALogicalPlan::MapFunction { + input, + function: function.clone(), + }; Ok(lp) } Explode { columns, .. } => { @@ -57,19 +57,32 @@ pub(super) fn process_functions( lp_arena, expr_arena, )?; - Ok(lp) + Ok(ALogicalPlanBuilder::new(input, expr_arena, lp_arena) + .explode(columns.clone()) + .build()) + } + Melt { args, .. } => { + let lp = ALogicalPlan::MapFunction { + input, + function: function.clone(), + }; + + process_melt( + proj_pd, + lp, + args, + input, + acc_projections, + projections_seen, + lp_arena, + expr_arena, + ) } - Melt { args, .. 
} => process_melt( - proj_pd, - lp, - args, - input, - acc_projections, - projections_seen, - lp_arena, - expr_arena, - ), _ => { + let lp = ALogicalPlan::MapFunction { + input, + function: function.clone(), + }; if function.allow_projection_pd() && !acc_projections.is_empty() { let original_acc_projection_len = acc_projections.len(); diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/generic.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/generic.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/generic.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/generic.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs index e5875e1e9bebd..09586b88490cd 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs @@ -9,7 +9,7 @@ pub(super) fn process_groupby( apply: Option>, schema: SchemaRef, maintain_order: bool, - options: GroupbyOptions, + options: Arc, acc_projections: Vec, projected_names: PlHashSet>, projections_seen: usize, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/hstack.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/hstack.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/hstack.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/hstack.rs diff --git 
a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs index 04cc897a478b7..9bd5290edeb9a 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs @@ -35,7 +35,7 @@ pub(super) fn process_asof_join( input_right: Node, left_on: Vec, right_on: Vec, - options: JoinOptions, + options: Arc, acc_projections: Vec, _projected_names: PlHashSet>, projections_seen: usize, @@ -51,7 +51,9 @@ pub(super) fn process_asof_join( let mut names_right = PlHashSet::with_capacity(n); let mut local_projection = Vec::with_capacity(n); - let JoinType::AsOf(asof_options) = &options.args.how else {unreachable!()}; + let JoinType::AsOf(asof_options) = &options.args.how else { + unreachable!() + }; // if there are no projections we don't have to do anything (all columns are projected) // otherwise we build local projections to sort out proper column names due to the @@ -196,7 +198,7 @@ pub(super) fn process_join( input_right: Node, left_on: Vec, right_on: Vec, - options: JoinOptions, + options: Arc, acc_projections: Vec, _projected_names: PlHashSet>, projections_seen: usize, @@ -436,7 +438,7 @@ fn resolve_join_suffixes( input_right: Node, left_on: Vec, right_on: Vec, - options: JoinOptions, + options: Arc, lp_arena: &mut Arena, expr_arena: &mut Arena, local_projection: &mut [Node], diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs similarity index 95% rename from 
polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs index 53a2b2f104343..d853e7be86b58 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs @@ -24,7 +24,7 @@ use crate::prelude::optimizer::projection_pushdown::projection::process_projecti use crate::prelude::optimizer::projection_pushdown::rename::process_rename; use crate::prelude::*; use crate::utils::{ - aexpr_assign_renamed_leaf, aexpr_to_leaf_names, aexpr_to_leaf_nodes, check_input_node, + aexpr_assign_renamed_leaf, aexpr_to_column_nodes, aexpr_to_leaf_names, check_input_node, expr_is_projected_upstream, }; @@ -73,7 +73,7 @@ fn get_scan_columns( fn split_acc_projections( acc_projections: Vec, down_schema: &Schema, - expr_arena: &mut Arena, + expr_arena: &Arena, expands_schema: bool, ) -> (Vec, Vec, PlHashSet>) { // If node above has as many columns as the projection there is nothing to pushdown. 
@@ -99,10 +99,10 @@ fn add_expr_to_accumulated( expr: Node, acc_projections: &mut Vec, projected_names: &mut PlHashSet>, - expr_arena: &mut Arena, + expr_arena: &Arena, ) { - for root_node in aexpr_to_leaf_nodes(expr, expr_arena) { - for name in aexpr_to_leaf_names(root_node, expr_arena) { + for root_node in aexpr_to_column_nodes_iter(expr, expr_arena) { + for name in aexpr_to_leaf_names_iter(root_node, expr_arena) { if projected_names.insert(name) { acc_projections.push(root_node) } @@ -218,24 +218,26 @@ impl ProjectionPushDown { pushdown_right: &mut Vec, names_left: &mut PlHashSet>, names_right: &mut PlHashSet>, - expr_arena: &mut Arena, + expr_arena: &Arena, ) -> (bool, bool) { let mut pushed_at_least_one = false; let mut already_projected = false; let names = aexpr_to_leaf_names(proj, expr_arena); - let root_projections = aexpr_to_leaf_nodes(proj, expr_arena); + let root_projections = aexpr_to_column_nodes(proj, expr_arena); for (name, root_projection) in names.into_iter().zip(root_projections) { - let was_not_in_left = names_left.insert(name.clone()); - let was_not_in_right = names_right.insert(name.clone()); - already_projected |= !was_not_in_left; - already_projected |= !was_not_in_right; + let is_in_left = names_left.contains(&name); + let is_in_right = names_right.contains(&name); + already_projected |= is_in_left; + already_projected |= is_in_right; - if check_input_node(root_projection, schema_left, expr_arena) && was_not_in_left { + if check_input_node(root_projection, schema_left, expr_arena) && !is_in_left { + names_left.insert(name.clone()); pushdown_left.push(proj); pushed_at_least_one = true; } - if check_input_node(root_projection, schema_right, expr_arena) && was_not_in_right { + if check_input_node(root_projection, schema_right, expr_arena) && !is_in_right { + names_right.insert(name.clone()); pushdown_right.push(proj); pushed_at_least_one = true; } @@ -326,7 +328,7 @@ impl ProjectionPushDown { Projection { expr, input, .. 
} => process_projection( self, input, - expr, + expr.exprs(), acc_projections, projected_names, projections_seen, @@ -362,9 +364,11 @@ impl ProjectionPushDown { output_schema, } => { if function.allows_projection_pushdown() { - options.with_columns = get_scan_columns(&mut acc_projections, expr_arena, None); + let mut_options = Arc::make_mut(&mut options); + mut_options.with_columns = + get_scan_columns(&mut acc_projections, expr_arena, None); - let output_schema = if options.with_columns.is_none() { + let output_schema = if mut_options.with_columns.is_none() { None } else { Some(Arc::new(update_scan_schema( @@ -374,7 +378,7 @@ impl ProjectionPushDown { true, )?)) }; - options.output_schema = output_schema.clone(); + mut_options.output_schema = output_schema.clone(); let lp = AnonymousScan { function, @@ -483,7 +487,7 @@ impl ProjectionPushDown { if !acc_projections.is_empty() { // Make sure that the column(s) used for the sort is projected by_column.iter().for_each(|node| { - aexpr_to_leaf_nodes(*node, expr_arena) + aexpr_to_column_nodes(*node, expr_arena) .iter() .for_each(|root| { add_expr_to_accumulated( @@ -628,7 +632,7 @@ impl ProjectionPushDown { HStack { input, exprs, .. 
} => process_hstack( self, input, - exprs, + exprs.exprs(), acc_projections, projected_names, projections_seen, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/semi_anti_join.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/semi_anti_join.rs similarity index 99% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/semi_anti_join.rs rename to crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/semi_anti_join.rs index a9e949bae54b5..7e0cee38462ec 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/semi_anti_join.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/semi_anti_join.rs @@ -8,7 +8,7 @@ pub(super) fn process_semi_anti_join( input_right: Node, left_on: Vec, right_on: Vec, - options: JoinOptions, + options: Arc, acc_projections: Vec, _projected_names: PlHashSet>, projections_seen: usize, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/simplify_expr.rs b/crates/polars-plan/src/logical_plan/optimizer/simplify_expr.rs similarity index 92% rename from 
polars/polars-lazy/polars-plan/src/logical_plan/optimizer/simplify_expr.rs rename to crates/polars-plan/src/logical_plan/optimizer/simplify_expr.rs index 9ad4458a96af3..d668e35fc1690 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/simplify_expr.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/simplify_expr.rs @@ -107,7 +107,7 @@ pub struct SimplifyBooleanRule {} impl OptimizationRule for SimplifyBooleanRule { fn optimize_expr( - &self, + &mut self, expr_arena: &mut Arena, expr_node: Node, _: &Arena, @@ -440,7 +440,7 @@ pub struct SimplifyExprRule {} impl OptimizationRule for SimplifyExprRule { #[allow(clippy::float_cmp)] fn optimize_expr( - &self, + &mut self, expr_arena: &mut Arena, expr_node: Node, _lp_arena: &Arena, @@ -623,11 +623,12 @@ impl OptimizationRule for SimplifyExprRule { } } AExpr::Cast { - expr, data_type, .. + expr, + data_type, + strict, } => { let input = expr_arena.get(*expr); - // faster casts (we only do strict casts) - inline_cast(input, data_type) + inline_cast(input, data_type, *strict)? 
} // flatten nested concat_str calls #[cfg(all(feature = "strings", feature = "concat_str"))] @@ -665,17 +666,55 @@ impl OptimizationRule for SimplifyExprRule { } } -fn inline_cast(input: &AExpr, dtype: &DataType) -> Option { - match (input, dtype) { - #[cfg(feature = "dtype-duration")] - (AExpr::Literal(lv), _) if !matches!(dtype, DataType::Unknown) => { - let av = lv.to_anyvalue()?; - let out = av.cast(dtype).ok()?; - let lv: LiteralValue = out.try_into().ok()?; - Some(AExpr::Literal(lv)) - } - _ => None, - } +fn inline_cast(input: &AExpr, dtype: &DataType, strict: bool) -> PolarsResult> { + let lv = match (input, dtype) { + (AExpr::Literal(lv), _) if !matches!(dtype, DataType::Unknown) => match lv { + LiteralValue::Series(s) => { + let s = if strict { + s.strict_cast(dtype) + } else { + s.cast(dtype) + }?; + LiteralValue::Series(SpecialEq::new(s)) + } + _ => { + let Some(av) = lv.to_anyvalue() else { + return Ok(None); + }; + match (av, dtype) { + // casting null always remains null + (AnyValue::Null, _) => return Ok(None), + // series cast should do this one + #[cfg(feature = "dtype-datetime")] + (AnyValue::Datetime(_, _, _), DataType::Datetime(_, _)) => return Ok(None), + #[cfg(feature = "dtype-duration")] + (AnyValue::Duration(_, _), _) => return Ok(None), + #[cfg(feature = "dtype-categorical")] + (AnyValue::Categorical(_, _, _), _) | (_, DataType::Categorical(_)) => { + return Ok(None) + } + #[cfg(feature = "dtype-struct")] + (_, DataType::Struct(_)) => return Ok(None), + (av, _) => { + // raise in debug builds so we can fix them + // in release we continue and apply the cast later + #[cfg(debug_assertions)] + let out = { av.cast(dtype)? }; + #[cfg(not(debug_assertions))] + let out = { + match av.cast(&dtype) { + Ok(out) => out, + Err(_) => return Ok(None), + } + }; + out.try_into()? 
+ } + } + } + }, + _ => return Ok(None), + }; + Ok(Some(AExpr::Literal(lv))) } #[test] diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs b/crates/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs similarity index 91% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs rename to crates/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs index c8a3c31ee3b62..6e5dbbba5c802 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs @@ -10,7 +10,7 @@ fn pushdown(input: Node, offset: Node, length: Node, arena: &mut Arena) - impl OptimizationRule for SlicePushDown { fn optimize_expr( - &self, + &mut self, expr_arena: &mut Arena, expr_node: Node, _lp_arena: &Arena, @@ -27,11 +27,13 @@ impl OptimizationRule for SlicePushDown { use AExpr::*; let out = match expr_arena.get(*input) { - m @ Alias(..) | m @ Cast { .. } => { - let m = m.clone(); - let input = m.get_input().first(); + ae @ Alias(..) | ae @ Cast { .. 
} => { + let ae = ae.clone(); + self.scratch.clear(); + ae.nodes(&mut self.scratch); + let input = self.scratch[0]; let new_input = pushdown(input, offset, length, expr_arena); - Some(m.replace_inputs(&[new_input])) + Some(ae.replace_inputs(&[new_input])) } Literal(lv) => { match lv { diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_lp.rs b/crates/polars-plan/src/logical_plan/optimizer/slice_pushdown_lp.rs similarity index 95% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_lp.rs rename to crates/polars-plan/src/logical_plan/optimizer/slice_pushdown_lp.rs index 39b69d095b9c3..099a1e2ab928e 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_lp.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/slice_pushdown_lp.rs @@ -4,6 +4,7 @@ use crate::prelude::*; pub(super) struct SlicePushDown { streaming: bool, + pub scratch: Vec, } #[derive(Copy, Clone)] @@ -14,7 +15,10 @@ struct State { impl SlicePushDown { pub(super) fn new(streaming: bool) -> Self { - Self { streaming } + Self { + streaming, + scratch: vec![], + } } // slice will be done at this node if we found any @@ -110,7 +114,8 @@ impl SlicePushDown { // TODO! we currently skip slice pushdown if there is a predicate. 
// we can modify the readers to only limit after predicates have been applied Some(state)) if state.offset == 0 && predicate.is_none() => { - options.n_rows = Some(state.len as usize); + let mut_options = Arc::make_mut(&mut options); + mut_options.n_rows = Some(state.len as usize); let lp = AnonymousScan { function, file_info, @@ -202,7 +207,8 @@ impl SlicePushDown { // then assign the slice state to the join operation - options.args.slice = Some((state.offset, state.len as usize)); + let mut_options = Arc::make_mut(&mut options); + mut_options.args.slice = Some((state.offset, state.len as usize)); Ok(Join { input_left, @@ -219,7 +225,8 @@ impl SlicePushDown { let input_lp = self.pushdown(input_lp, None, lp_arena, expr_arena)?; let input= lp_arena.add(input_lp); - options.slice = Some((state.offset, state.len as usize)); + let mut_options= Arc::make_mut(&mut options); + mut_options.slice = Some((state.offset, state.len as usize)); Ok(Aggregate { input, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/stack_opt.rs b/crates/polars-plan/src/logical_plan/optimizer/stack_opt.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/stack_opt.rs rename to crates/polars-plan/src/logical_plan/optimizer/stack_opt.rs index bb140baf461ff..7ca87324f8521 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/stack_opt.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/stack_opt.rs @@ -66,7 +66,7 @@ impl StackOptimizer { continue; } } - for rule in rules.iter() { + for rule in rules.iter_mut() { // keep iterating over same rule while let Some(x) = rule.optimize_expr( expr_arena, @@ -104,7 +104,7 @@ pub trait OptimizationRule { None } fn optimize_expr( - &self, + &mut self, _expr_arena: &mut Arena, _expr_node: Node, _lp_arena: &Arena, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs 
similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs rename to crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs similarity index 99% rename from polars/polars-lazy/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs rename to crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs index 79746a40d8bf7..9539b8c84fbec 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs @@ -273,7 +273,7 @@ fn get_aexpr_and_type<'a>( impl OptimizationRule for TypeCoercionRule { fn optimize_expr( - &self, + &mut self, expr_arena: &mut Arena, expr_node: Node, lp_arena: &Arena, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/options.rs b/crates/polars-plan/src/logical_plan/options.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/options.rs rename to crates/polars-plan/src/logical_plan/options.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/projection.rs b/crates/polars-plan/src/logical_plan/projection.rs similarity index 96% rename from polars/polars-lazy/polars-plan/src/logical_plan/projection.rs rename to crates/polars-plan/src/logical_plan/projection.rs index 61fe7b6112600..4620d1ebbe13e 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/projection.rs +++ b/crates/polars-plan/src/logical_plan/projection.rs @@ -24,6 +24,17 @@ pub(super) fn replace_wildcard_with_column(mut expr: Expr, column_name: Arc expr } +pub fn remove_exclude(mut expr: Expr) -> Expr { + expr.mutate().apply(|e| { + if let Expr::Exclude(input, _) = e { + *e = remove_exclude(std::mem::take(input)); + } + // always keep iterating all inputs + true + }); + expr +} + fn 
rewrite_special_aliases(expr: Expr) -> PolarsResult { // the blocks are added by cargo fmt #[allow(clippy::blocks_in_if_conditions)] @@ -102,7 +113,7 @@ fn expand_regex( regex::Regex::new(pattern).map_err(|e| polars_err!(ComputeError: "invalid regex {}", e))?; for name in schema.iter_names() { if re.is_match(name) && !exclude.contains(name.as_str()) { - let mut new_expr = expr.clone(); + let mut new_expr = remove_exclude(expr.clone()); new_expr.mutate().apply(|e| match &e { Expr::Column(pat) if pat.as_ref() == pattern => { @@ -113,7 +124,7 @@ fn expand_regex( }); let new_expr = rewrite_special_aliases(new_expr)?; - result.push(new_expr) + result.push(new_expr); } } Ok(()) @@ -139,18 +150,7 @@ fn replace_regex( match regex { None => { regex = Some(name); - if exclude.is_empty() { - expand_regex(expr, result, schema, name, exclude)? - } else { - // iterate until we find the Exclude node - // we remove that node from the expression - for e in expr.into_iter() { - if let Expr::Exclude(e, _) = e { - expand_regex(e, result, schema, name, exclude)?; - break; - } - } - } + expand_regex(expr, result, schema, name, exclude)?; } Some(r) => { polars_ensure!( @@ -588,7 +588,9 @@ fn replace_selector(expr: &mut Expr, schema: &Schema, keys: &[Expr]) -> PolarsRe members .into_iter() .map(|e| { - let Expr::Column(name) = e else {unreachable!()}; + let Expr::Column(name) = e else { + unreachable!() + }; name.to_string() }) .collect(), diff --git a/crates/polars-plan/src/logical_plan/projection_expr.rs b/crates/polars-plan/src/logical_plan/projection_expr.rs new file mode 100644 index 0000000000000..3921a99498d2d --- /dev/null +++ b/crates/polars-plan/src/logical_plan/projection_expr.rs @@ -0,0 +1,87 @@ +use std::ops::Deref; + +use super::*; + +#[derive(Debug, Clone)] +pub struct ProjectionExprs { + expr: Vec, + /// offset from the back + /// `expr[expr.len() - common_sub_offset..]` + /// are the common sub expressions + common_sub_offset: usize, +} + +impl Deref for ProjectionExprs { 
+ type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.expr + } +} + +impl From> for ProjectionExprs { + fn from(value: Vec) -> Self { + Self::new(value) + } +} + +impl FromIterator for ProjectionExprs { + fn from_iter>(iter: T) -> Self { + let expr = iter.into_iter().collect(); + Self::new(expr) + } +} + +impl ProjectionExprs { + pub(crate) fn new(expr: Vec) -> Self { + Self::new_with_cse(expr, 0) + } + + pub fn default_exprs(&self) -> &[Node] { + &self.expr[..self.expr.len() - self.common_sub_offset] + } + + pub fn cse_exprs(&self) -> &[Node] { + &self.expr[self.expr.len() - self.common_sub_offset..] + } + + pub(crate) fn new_with_cse(expr: Vec, common_sub_offset: usize) -> Self { + Self { + expr, + common_sub_offset, + } + } + + pub(crate) fn has_sub_exprs(&self) -> bool { + self.common_sub_offset != 0 + } + + fn dbg_assert_no_sub_exprs(&self) { + debug_assert!(!self.has_sub_exprs(), "should not have sub-expressions yet"); + } + + pub(crate) fn exprs(self) -> Vec { + self.dbg_assert_no_sub_exprs(); + self.expr + } +} + +impl IntoIterator for ProjectionExprs { + type Item = Node; + type IntoIter = as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + assert!(!self.has_sub_exprs(), "should not have sub-expressions yet"); + self.expr.into_iter() + } +} + +impl<'a> IntoIterator for &'a ProjectionExprs { + type Item = &'a Node; + type IntoIter = <&'a Vec as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + assert!(!self.has_sub_exprs(), "should not have sub-expressions yet"); + self.expr.iter() + } +} diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/pyarrow.rs b/crates/polars-plan/src/logical_plan/pyarrow.rs similarity index 100% rename from polars/polars-lazy/polars-plan/src/logical_plan/pyarrow.rs rename to crates/polars-plan/src/logical_plan/pyarrow.rs diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs b/crates/polars-plan/src/logical_plan/schema.rs similarity index 97% rename from 
polars/polars-lazy/polars-plan/src/logical_plan/schema.rs rename to crates/polars-plan/src/logical_plan/schema.rs index 9cdf5fd74dbec..3dcfc5f1aa774 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs +++ b/crates/polars-plan/src/logical_plan/schema.rs @@ -131,12 +131,15 @@ pub fn set_estimated_row_counts( right_on, } = lp_arena.take(root) { + let mut_options = Arc::make_mut(&mut options); let (known_size, estimated_size, filter_count_left) = set_estimated_row_counts(input_left, lp_arena, expr_arena, 0); - options.rows_left = estimate_sizes(known_size, estimated_size, filter_count_left); + mut_options.rows_left = + estimate_sizes(known_size, estimated_size, filter_count_left); let (known_size, estimated_size, filter_count_right) = set_estimated_row_counts(input_right, lp_arena, expr_arena, 0); - options.rows_right = estimate_sizes(known_size, estimated_size, filter_count_right); + mut_options.rows_right = + estimate_sizes(known_size, estimated_size, filter_count_right); let mut out = match options.args.how { JoinType::Left => { diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/tree_format.rs b/crates/polars-plan/src/logical_plan/tree_format.rs similarity index 98% rename from polars/polars-lazy/polars-plan/src/logical_plan/tree_format.rs rename to crates/polars-plan/src/logical_plan/tree_format.rs index 344cc36d42b68..33ce757817b5a 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/tree_format.rs +++ b/crates/polars-plan/src/logical_plan/tree_format.rs @@ -55,7 +55,6 @@ impl UpperExp for AExpr { AExpr::Slice { .. } => "slice", AExpr::Count => "count", AExpr::Nth(v) => return write!(f, "nth({})", v), - AExpr::Cache { id, .. 
} => return write!(f, "cache({id:x})"), }; write!(f, "{s}") diff --git a/crates/polars-plan/src/logical_plan/visitor/expr.rs b/crates/polars-plan/src/logical_plan/visitor/expr.rs new file mode 100644 index 0000000000000..0960935cc0660 --- /dev/null +++ b/crates/polars-plan/src/logical_plan/visitor/expr.rs @@ -0,0 +1,249 @@ +use super::*; +use crate::prelude::*; + +impl TreeWalker for Expr { + fn apply_children<'a>( + &'a self, + op: &mut dyn FnMut(&Self) -> PolarsResult, + ) -> PolarsResult { + let mut scratch = vec![]; + + self.nodes(&mut scratch); + + for child in scratch { + match op(child)? { + VisitRecursion::Continue => {} + // early stop + VisitRecursion::Skip => return Ok(VisitRecursion::Continue), + VisitRecursion::Stop => return Ok(VisitRecursion::Stop), + } + } + Ok(VisitRecursion::Continue) + } + + fn map_children(self, _op: &mut dyn FnMut(Self) -> PolarsResult) -> PolarsResult { + todo!() + } +} + +pub struct AexprNode { + node: Node, + arena: *mut Arena, +} + +impl AexprNode { + /// Don't use this directly, use [`Self::with_context`] + /// + /// # Safety + /// This will keep a pointer to `arena`. The caller must ensure it stays alive. + unsafe fn new(node: Node, arena: &mut Arena) -> Self { + Self { node, arena } + } + + /// # Safety + /// This will keep a pointer to `arena`. The caller must ensure it stays alive. + pub(crate) unsafe fn from_raw(node: Node, arena: *mut Arena) -> Self { + Self { node, arena } + } + + /// Safe interface. Take the `&mut Arena` only for the duration of `op`. + pub fn with_context(node: Node, arena: &mut Arena, mut op: F) -> T + where + F: FnMut(AexprNode) -> T, + { + // safety: we drop this context before arena is out of scope + unsafe { op(Self::new(node, arena)) } + } + + /// Get the `Node`. + pub fn node(&self) -> Node { + self.node + } + + /// Apply an operation with the underlying `Arena`. 
+ pub fn with_arena<'a, F, T>(&self, op: F) -> T + where + F: FnOnce(&'a Arena) -> T, + { + let arena = unsafe { &(*self.arena) }; + + op(arena) + } + + /// Apply an operation with the underlying `Arena`. + pub fn with_arena_mut<'a, F, T>(&mut self, op: F) -> T + where + F: FnOnce(&'a mut Arena) -> T, + { + let arena = unsafe { &mut (*self.arena) }; + + op(arena) + } + + /// Assign an `AExpr` to underlying arena. + pub fn assign(&mut self, ae: AExpr) { + let node = self.with_arena_mut(|arena| arena.add(ae)); + self.node = node + } + + /// Take a `Node` and convert it an `AExprNode` and call + /// `F` with `self` and the new created `AExprNode` + pub fn binary(&self, other: Node, op: F) -> T + where + F: FnOnce(&AexprNode, &AexprNode) -> T, + { + // this is safe as we remain in context + let other = unsafe { AexprNode::from_raw(other, self.arena) }; + op(self, &other) + } + + pub fn to_aexpr(&self) -> &AExpr { + self.with_arena(|arena| arena.get(self.node)) + } + + pub fn to_expr(&self) -> Expr { + self.with_arena(|arena| node_to_expr(self.node, arena)) + } + + // traverses all nodes and does a full equality check + fn is_equal(&self, other: &Self, scratch1: &mut Vec, scratch2: &mut Vec) -> bool { + self.with_arena(|arena| { + let self_ae = self.to_aexpr(); + let other_ae = arena.get(other.node()); + + use AExpr::*; + let this_node_equal = match (self_ae, other_ae) { + (Alias(_, l), Alias(_, r)) => l == r, + (Column(l), Column(r)) => l == r, + (Literal(l), Literal(r)) => l == r, + (Nth(l), Nth(r)) => l == r, + (Window { options: l, .. }, Window { options: r, .. }) => l == r, + ( + Cast { + strict: strict_l, + data_type: dtl, + .. + }, + Cast { + strict: strict_r, + data_type: dtr, + .. + }, + ) => strict_l == strict_r && dtl == dtr, + (Sort { options: l, .. }, Sort { options: r, .. }) => l == r, + (Take { .. }, Take { .. }) + | (Filter { .. }, Filter { .. }) + | (Ternary { .. }, Ternary { .. 
}) + | (Count, Count) + | (Explode(_), Explode(_)) => true, + (SortBy { descending: l, .. }, SortBy { descending: r, .. }) => l == r, + (Agg(l), Agg(r)) => l.equal_nodes(r), + ( + Function { + function: fl, + options: ol, + .. + }, + Function { + function: fr, + options: or, + .. + }, + ) => fl == fr && ol == or, + (AnonymousFunction { function: l, .. }, AnonymousFunction { function: r, .. }) => { + // check only data pointer as location + let l = l.as_ref() as *const _ as *const () as usize; + let r = r.as_ref() as *const _ as *const () as usize; + l == r + } + (BinaryExpr { op: l, .. }, BinaryExpr { op: r, .. }) => l == r, + _ => false, + }; + + if !this_node_equal { + return false; + } + + self_ae.nodes(scratch1); + other_ae.nodes(scratch2); + + loop { + match (scratch1.pop(), scratch2.pop()) { + (Some(l), Some(r)) => { + // safety: we can pass a *mut pointer + // the equality operation will not access mutable + let l = unsafe { AexprNode::from_raw(l, self.arena) }; + let r = unsafe { AexprNode::from_raw(r, self.arena) }; + + if !l.is_equal(&r, scratch1, scratch2) { + return false; + } + } + (None, None) => return true, + _ => return false, + } + } + }) + } + + #[cfg(feature = "cse")] + pub(crate) fn is_leaf(&self) -> bool { + matches!(self.to_aexpr(), AExpr::Column(_) | AExpr::Literal(_)) + } +} + +impl PartialEq for AexprNode { + fn eq(&self, other: &Self) -> bool { + let mut scratch1 = vec![]; + let mut scratch2 = vec![]; + self.is_equal(other, &mut scratch1, &mut scratch2) + } +} + +impl TreeWalker for AexprNode { + fn apply_children<'a>( + &'a self, + op: &mut dyn FnMut(&Self) -> PolarsResult, + ) -> PolarsResult { + let mut scratch = vec![]; + + self.to_aexpr().nodes(&mut scratch); + for node in scratch { + let aenode = AexprNode { + node, + arena: self.arena, + }; + match op(&aenode)? 
{ + VisitRecursion::Continue => {} + // early stop + VisitRecursion::Skip => return Ok(VisitRecursion::Continue), + VisitRecursion::Stop => return Ok(VisitRecursion::Stop), + } + } + Ok(VisitRecursion::Continue) + } + + fn map_children( + mut self, + op: &mut dyn FnMut(Self) -> PolarsResult, + ) -> PolarsResult { + let mut scratch = vec![]; + + let ae = self.to_aexpr(); + ae.nodes(&mut scratch); + + // rewrite the nodes + for node in &mut scratch { + let aenode = AexprNode { + node: *node, + arena: self.arena, + }; + *node = op(aenode)?.node; + } + + let ae = ae.clone().replace_inputs(&scratch); + let node = self.with_arena_mut(move |arena| arena.add(ae)); + self.node = node; + Ok(self) + } +} diff --git a/crates/polars-plan/src/logical_plan/visitor/lp.rs b/crates/polars-plan/src/logical_plan/visitor/lp.rs new file mode 100644 index 0000000000000..abd3572db27fa --- /dev/null +++ b/crates/polars-plan/src/logical_plan/visitor/lp.rs @@ -0,0 +1,143 @@ +use std::borrow::Cow; + +use polars_core::schema::SchemaRef; + +use super::*; +use crate::prelude::*; + +pub struct ALogicalPlanNode { + node: Node, + arena: *mut Arena, +} + +impl ALogicalPlanNode { + /// Don't use this directly, use [`Self::with_context`] + /// + /// # Safety + /// This will keep a pointer to `arena`. The caller must ensure it stays alive. + unsafe fn new(node: Node, arena: &mut Arena) -> Self { + Self { node, arena } + } + + /// # Safety + /// This will keep a pointer to `arena`. The caller must ensure it stays alive. + pub(crate) unsafe fn from_raw(node: Node, arena: *mut Arena) -> Self { + Self { node, arena } + } + + /// Safe interface. Take the `&mut Arena` only for the duration of `op`. 
+ pub fn with_context(node: Node, arena: &mut Arena, mut op: F) -> T + where + F: FnMut(ALogicalPlanNode) -> T, + { + // safety: we drop this context before arena is out of scope + unsafe { op(Self::new(node, arena)) } + } + + pub fn node(&self) -> Node { + self.node + } + + pub fn with_arena<'a, F, T>(&self, op: F) -> T + where + F: Fn(&'a Arena) -> T, + { + let arena = unsafe { &(*self.arena) }; + + op(arena) + } + + pub fn with_arena_mut<'a, F, T>(&mut self, op: F) -> T + where + F: FnOnce(&'a mut Arena) -> T, + { + let arena = unsafe { &mut (*self.arena) }; + + op(arena) + } + + /// Add a new `ALogicalPlan` to the arena and set that node to `Self`. + pub fn assign(&mut self, ae: ALogicalPlan) { + let node = self.with_arena_mut(|arena| arena.add(ae)); + self.node = node + } + + /// Replace the current `Node` with a new `ALogicalPlan`. + pub fn replace(&mut self, ae: ALogicalPlan) { + let node = self.node; + self.with_arena_mut(|arena| arena.replace(node, ae)); + } + + pub fn to_alp(&self) -> &ALogicalPlan { + self.with_arena(|arena| arena.get(self.node)) + } + + pub fn to_alp_mut(&mut self) -> &mut ALogicalPlan { + let node = self.node; + self.with_arena_mut(|arena| arena.get_mut(node)) + } + + pub fn schema(&self) -> Cow { + self.with_arena(|arena| arena.get(self.node).schema(arena)) + } + + /// Take a `Node` and convert it an `ALogicalPlanNode` and call + /// `F` with `self` and the new created `ALogicalPlanNode` + pub fn binary(&self, other: Node, op: F) -> T + where + F: FnOnce(&ALogicalPlanNode, &ALogicalPlanNode) -> T, + { + // this is safe as we remain in context + let other = unsafe { ALogicalPlanNode::from_raw(other, self.arena) }; + op(self, &other) + } +} + +impl TreeWalker for ALogicalPlanNode { + fn apply_children<'a>( + &'a self, + op: &mut dyn FnMut(&Self) -> PolarsResult, + ) -> PolarsResult { + let mut scratch = vec![]; + + self.to_alp().copy_inputs(&mut scratch); + for node in scratch { + let lp_node = ALogicalPlanNode { + node, + arena: 
self.arena, + }; + match op(&lp_node)? { + VisitRecursion::Continue => {} + // early stop + VisitRecursion::Skip => return Ok(VisitRecursion::Continue), + VisitRecursion::Stop => return Ok(VisitRecursion::Stop), + } + } + Ok(VisitRecursion::Continue) + } + + fn map_children( + mut self, + op: &mut dyn FnMut(Self) -> PolarsResult, + ) -> PolarsResult { + let mut inputs = vec![]; + let mut exprs = vec![]; + + let lp = self.to_alp(); + lp.copy_inputs(&mut inputs); + lp.copy_exprs(&mut exprs); + + // rewrite the nodes + for node in &mut inputs { + let lp_node = ALogicalPlanNode { + node: *node, + arena: self.arena, + }; + *node = op(lp_node)?.node; + } + + let lp = lp.with_exprs_and_input(exprs, inputs); + self.with_arena_mut(move |arena| arena.replace(self.node, lp)); + Ok(self) + } +} diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/visitor/mod.rs b/crates/polars-plan/src/logical_plan/visitor/mod.rs similarity index 92% rename from polars/polars-lazy/polars-plan/src/logical_plan/visitor/mod.rs rename to crates/polars-plan/src/logical_plan/visitor/mod.rs index 3fdaed6f35ce8..1ae5ba5601967 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/visitor/mod.rs +++ b/crates/polars-plan/src/logical_plan/visitor/mod.rs @@ -2,9 +2,11 @@ use polars_arrow::error::PolarsResult; mod expr; +mod lp; mod visitors; pub use expr::*; +pub use lp::*; pub use visitors::*; /// Controls how the [`TreeWalker`] recursion should proceed for [`TreeWalker::visit`]. @@ -22,9 +24,9 @@ pub enum VisitRecursion { #[derive(Debug)] pub enum RewriteRecursion { /// Continue the visit to this node and children. - Continue, + MutateAndContinue, /// Don't mutate this node, continue visiting the children - Skip, + NoMutateAndContinue, /// Stop and return. 
/// This doesn't visit the children Stop, diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/visitor/visitors.rs b/crates/polars-plan/src/logical_plan/visitor/visitors.rs similarity index 62% rename from polars/polars-lazy/polars-plan/src/logical_plan/visitor/visitors.rs rename to crates/polars-plan/src/logical_plan/visitor/visitors.rs index 7689cce898696..b06f7ee4b34b5 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/visitor/visitors.rs +++ b/crates/polars-plan/src/logical_plan/visitor/visitors.rs @@ -35,8 +35,8 @@ pub trait TreeWalker: Sized { let mutate_this_node = match rewriter.pre_visit(&self)? { RewriteRecursion::MutateAndStop => return rewriter.mutate(self), RewriteRecursion::Stop => return Ok(self), - RewriteRecursion::Continue => true, - RewriteRecursion::Skip => false, + RewriteRecursion::MutateAndContinue => true, + RewriteRecursion::NoMutateAndContinue => false, }; let after_applied_children = self.map_children(&mut |node| node.rewrite(rewriter))?; @@ -69,63 +69,8 @@ pub trait RewritingVisitor { /// Invoked before any children of `node` are visited. 
fn pre_visit(&mut self, _node: &Self::Node) -> PolarsResult { - Ok(RewriteRecursion::Continue) + Ok(RewriteRecursion::MutateAndContinue) } fn mutate(&mut self, node: Self::Node) -> PolarsResult; } - -#[cfg(test)] -mod test { - use super::*; - use crate::prelude::*; - - #[test] - fn test_visitor() { - struct VisitPath { - pre_idx: usize, - pre_stack: Vec, - #[allow(dead_code)] - post_idx: usize, - post_stack: Vec, - } - - impl VisitPath { - fn new() -> Self { - Self { - pre_idx: 0, - pre_stack: vec![], - post_idx: 0, - post_stack: vec![], - } - } - } - - impl Visitor for VisitPath { - type Node = AexprNode; - - fn pre_visit(&mut self, _node: &Self::Node) -> PolarsResult { - self.pre_idx += 1; - self.pre_stack.push(self.pre_idx); - Ok(VisitRecursion::Continue) - } - - fn post_visit(&mut self, _node: &Self::Node) -> PolarsResult { - // self.post_idx += 1; - let idx = self.pre_stack.pop().unwrap(); - self.post_stack.push(idx); - Ok(VisitRecursion::Continue) - } - } - - let e = (col("f00").sum() * col("bar")).sum() + col("f00").sum(); - let mut arena = Arena::new(); - let node = to_aexpr(e, &mut arena); - let mut visitor = VisitPath::new(); - - AexprNode::with_context(node, &mut arena, |node| node.visit(&mut visitor).unwrap()); - - dbg!(visitor.pre_stack); - dbg!(visitor.post_stack); - } -} diff --git a/polars/polars-lazy/polars-plan/src/prelude.rs b/crates/polars-plan/src/prelude.rs similarity index 94% rename from polars/polars-lazy/polars-plan/src/prelude.rs rename to crates/polars-plan/src/prelude.rs index e6c7f255f0c85..f84bf050adb99 100644 --- a/polars/polars-lazy/polars-plan/src/prelude.rs +++ b/crates/polars-plan/src/prelude.rs @@ -17,7 +17,6 @@ pub(crate) use polars_time::{ pub use polars_utils::arena::{Arena, Node}; pub use crate::dsl::*; -pub(crate) use crate::logical_plan::alp::*; pub(crate) use crate::logical_plan::conversion::*; #[cfg(feature = "debugging")] pub use crate::logical_plan::debug::*; diff --git a/polars/polars-lazy/polars-plan/src/utils.rs 
b/crates/polars-plan/src/utils.rs similarity index 90% rename from polars/polars-lazy/polars-plan/src/utils.rs rename to crates/polars-plan/src/utils.rs index c20ac02813bae..c3f79f49f6f69 100644 --- a/polars/polars-lazy/polars-plan/src/utils.rs +++ b/crates/polars-plan/src/utils.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use polars_core::prelude::*; use smartstring::alias::String as SmartString; +use crate::constants::CSE_REPLACED; use crate::logical_plan::iterator::ArenaExprIter; use crate::logical_plan::Context; use crate::prelude::names::COUNT; @@ -124,6 +125,10 @@ pub fn has_aexpr_window(current_node: Node, arena: &Arena) -> bool { has_aexpr(current_node, arena, |e| matches!(e, AExpr::Window { .. })) } +pub fn has_aexpr_literal(current_node: Node, arena: &Arena) -> bool { + has_aexpr(current_node, arena, |e| matches!(e, AExpr::Literal(_))) +} + /// Can check if an expression tree has a matching_expr. This /// requires a dummy expression to be created that will be used to patter match against. 
pub(crate) fn has_expr(current_expr: &Expr, matches: F) -> bool @@ -219,28 +224,26 @@ pub fn expr_to_leaf_column_name(expr: &Expr) -> PolarsResult> { } } -fn is_leaf_aexpr(ae: &AExpr) -> bool { +fn is_column_aexpr(ae: &AExpr) -> bool { matches!(ae, AExpr::Column(_) | AExpr::Wildcard) } #[allow(clippy::type_complexity)] -pub(crate) fn aexpr_to_leaf_nodes_iter<'a>( +pub(crate) fn aexpr_to_column_nodes_iter<'a>( root: Node, arena: &'a Arena, ) -> FlatMap, Option, fn((Node, &'a AExpr)) -> Option> { - arena.iter(root).flat_map( - |(node, ae)| { - if is_leaf_aexpr(ae) { - Some(node) - } else { - None - } - }, - ) + arena.iter(root).flat_map(|(node, ae)| { + if is_column_aexpr(ae) { + Some(node) + } else { + None + } + }) } -pub(crate) fn aexpr_to_leaf_nodes(root: Node, arena: &Arena) -> Vec { - aexpr_to_leaf_nodes_iter(root, arena).collect() +pub(crate) fn aexpr_to_column_nodes(root: Node, arena: &Arena) -> Vec { + aexpr_to_column_nodes_iter(root, arena).collect() } /// Rename the roots of the expression to a single name. @@ -272,7 +275,7 @@ pub(crate) fn rename_matching_aexpr_leaf_names( current: &str, new_name: &str, ) -> Node { - let mut leaves = aexpr_to_leaf_nodes_iter(node, arena); + let mut leaves = aexpr_to_column_nodes_iter(node, arena); if leaves.any(|node| matches!(arena.get(node), AExpr::Column(name) if &**name == current)) { // we convert to expression as we cannot easily copy the aexpr. 
@@ -298,7 +301,7 @@ pub(crate) fn aexpr_assign_renamed_leaf( current: &str, new_name: &str, ) -> Node { - let leafs = aexpr_to_leaf_nodes_iter(node, arena); + let leafs = aexpr_to_column_nodes_iter(node, arena); for node in leafs { match arena.get(node) { @@ -339,7 +342,7 @@ pub fn aexpr_to_leaf_names_iter( node: Node, arena: &Arena, ) -> impl Iterator> + '_ { - aexpr_to_leaf_nodes_iter(node, arena).map(|node| match arena.get(node) { + aexpr_to_column_nodes_iter(node, arena).map(|node| match arena.get(node) { // expecting only columns here, wildcards and dtypes should already be replaced AExpr::Column(name) => name.clone(), e => { @@ -394,7 +397,7 @@ where pub fn expr_is_projected_upstream( e: &Node, input: Node, - lp_arena: &mut Arena, + lp_arena: &Arena, expr_arena: &Arena, projected_names: &PlHashSet>, ) -> bool { @@ -407,3 +410,15 @@ pub fn expr_is_projected_upstream( let output_name = output_field.name(); projected_names.contains(output_name.as_str()) } + +pub fn rename_cse_tmp_series(s: &mut Series) { + if s.name().starts_with(CSE_REPLACED) { + let field = s.field().into_owned(); + let name = &field.name; + let pat = r#"col("#; + let offset = name.rfind(pat).unwrap() + pat.len(); + // -1 is `)` of `col(foo)` + let name = &name[offset..name.len() - 1]; + s.rename(name); + } +} diff --git a/polars/polars-row/Cargo.toml b/crates/polars-row/Cargo.toml similarity index 100% rename from polars/polars-row/Cargo.toml rename to crates/polars-row/Cargo.toml diff --git a/polars/polars-time/LICENSE b/crates/polars-row/LICENSE similarity index 100% rename from polars/polars-time/LICENSE rename to crates/polars-row/LICENSE diff --git a/polars/polars-row/README.md b/crates/polars-row/README.md similarity index 100% rename from polars/polars-row/README.md rename to crates/polars-row/README.md diff --git a/polars/polars-row/src/decode.rs b/crates/polars-row/src/decode.rs similarity index 100% rename from polars/polars-row/src/decode.rs rename to 
crates/polars-row/src/decode.rs diff --git a/polars/polars-row/src/encode.rs b/crates/polars-row/src/encode.rs similarity index 100% rename from polars/polars-row/src/encode.rs rename to crates/polars-row/src/encode.rs diff --git a/polars/polars-row/src/fixed.rs b/crates/polars-row/src/fixed.rs similarity index 100% rename from polars/polars-row/src/fixed.rs rename to crates/polars-row/src/fixed.rs diff --git a/polars/polars-row/src/lib.rs b/crates/polars-row/src/lib.rs similarity index 100% rename from polars/polars-row/src/lib.rs rename to crates/polars-row/src/lib.rs diff --git a/polars/polars-row/src/row.rs b/crates/polars-row/src/row.rs similarity index 100% rename from polars/polars-row/src/row.rs rename to crates/polars-row/src/row.rs diff --git a/polars/polars-row/src/utils.rs b/crates/polars-row/src/utils.rs similarity index 100% rename from polars/polars-row/src/utils.rs rename to crates/polars-row/src/utils.rs diff --git a/polars/polars-row/src/variable.rs b/crates/polars-row/src/variable.rs similarity index 100% rename from polars/polars-row/src/variable.rs rename to crates/polars-row/src/variable.rs diff --git a/polars/polars-sql/Cargo.toml b/crates/polars-sql/Cargo.toml similarity index 91% rename from polars/polars-sql/Cargo.toml rename to crates/polars-sql/Cargo.toml index db7871220d6be..d1ecce7d1c7c8 100644 --- a/polars/polars-sql/Cargo.toml +++ b/crates/polars-sql/Cargo.toml @@ -18,7 +18,7 @@ parquet = ["polars-lazy/parquet"] polars-arrow = { version = "0.31.1", path = "../polars-arrow", features = ["like"] } polars-core = { version = "0.31.1", path = "../polars-core", features = [] } polars-lazy = { version = "0.31.1", path = "../polars-lazy", features = ["compile", "strings", "cross_join", "trigonometry", "abs", "round_series", "log", "regex", "is_in", "meta", "cum_agg"] } -polars-plan = { version = "0.31.1", path = "../polars-lazy/polars-plan", features = ["compile"] } +polars-plan = { version = "0.31.1", path = "../polars-plan", features = 
["compile"] } serde = "1" serde_json = { version = "1" } # sqlparser = { git = "https://github.com/sqlparser-rs/sqlparser-rs.git", rev = "ae3b5844c839072c235965fe0d1bddc473dced87" } diff --git a/polars/polars-utils/LICENSE b/crates/polars-sql/LICENSE similarity index 100% rename from polars/polars-utils/LICENSE rename to crates/polars-sql/LICENSE diff --git a/polars/polars-sql/README.md b/crates/polars-sql/README.md similarity index 100% rename from polars/polars-sql/README.md rename to crates/polars-sql/README.md diff --git a/polars/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs similarity index 100% rename from polars/polars-sql/src/context.rs rename to crates/polars-sql/src/context.rs diff --git a/polars/polars-sql/src/functions.rs b/crates/polars-sql/src/functions.rs similarity index 95% rename from polars/polars-sql/src/functions.rs rename to crates/polars-sql/src/functions.rs index 70a75718a12ed..3c4c33d7030e0 100644 --- a/polars/polars-sql/src/functions.rs +++ b/crates/polars-sql/src/functions.rs @@ -81,6 +81,11 @@ pub(crate) enum PolarsSqlFunctions { /// SELECT ATAN(column_1) from df; /// ``` Atan, + /// SQL 'atan2' function + /// ```sql + /// SELECT ATAN2(column_1) from df; + /// ``` + Atan2, /// SQL 'acosd' function /// ```sql /// SELECT ACOSD(column_1) from df; @@ -96,6 +101,11 @@ pub(crate) enum PolarsSqlFunctions { /// SELECT ATAND(column_1) from df; /// ``` AtanD, + /// SQL 'atan2d' function + /// ```sql + /// SELECT ATAN2D(column_1) from df; + /// ``` + Atan2D, /// SQL 'ceil' function /// ```sql /// SELECT CEIL(column_1) from df; @@ -111,6 +121,11 @@ pub(crate) enum PolarsSqlFunctions { /// SELECT FLOOR(column_1) from df; /// ``` Floor, + /// SQL 'pi' function + /// ```sql + /// SELECT PI() from df; + /// ``` + Pi, /// SQL 'ln' function /// ```sql /// SELECT LN(column_1) from df; @@ -141,6 +156,16 @@ pub(crate) enum PolarsSqlFunctions { /// SELECT POW(column_1, 2) from df; /// ``` Pow, + /// SQL 'sqrt' function + /// ```sql + /// SELECT 
SQRT(column_1) from df; + /// ``` + Sqrt, + /// SQL 'cbrt' function + /// ```sql + /// SELECT CBRT(column_1) from df; + /// ``` + Cbrt, /// SQL 'round' function /// ```sql /// SELECT ROUND(column_1, 3) from df; @@ -352,8 +377,11 @@ impl PolarsSqlFunctions { "asin", "asind", "atan", + "atan2", + "atan2d", "atand", "avg", + "cbrt", "ceil", "ceiling", "cos", @@ -379,6 +407,7 @@ impl PolarsSqlFunctions { "max", "min", "octet_length", + "pi", "pow", "power", "radians", @@ -386,6 +415,7 @@ impl PolarsSqlFunctions { "rtrim", "sin", "sind", + "sqrt", "starts_with", "stddev", "sum", @@ -420,20 +450,25 @@ impl TryFrom<&'_ SQLFunction> for PolarsSqlFunctions { "acos" => Self::Acos, "asin" => Self::Asin, "atan" => Self::Atan, + "atan2" => Self::Atan2, "acosd" => Self::AcosD, "asind" => Self::AsinD, "atand" => Self::AtanD, + "atan2d" => Self::Atan2D, "degrees" => Self::Degrees, "radians" => Self::Radians, "ceil" | "ceiling" => Self::Ceil, "exp" => Self::Exp, "floor" => Self::Floor, + "pi" => Self::Pi, "ln" => Self::Ln, "log" => Self::Log, "log10" => Self::Log10, "log1p" => Self::Log1p, "log2" => Self::Log2, "pow" | "power" => Self::Pow, + "sqrt" => Self::Sqrt, + "cbrt" => Self::Cbrt, "round" => Self::Round, // ---- @@ -506,20 +541,25 @@ impl SqlFunctionVisitor<'_> { Acos => self.visit_unary(Expr::arccos), Asin => self.visit_unary(Expr::arcsin), Atan => self.visit_unary(Expr::arctan), + Atan2 => self.visit_binary(Expr::arctan2), AcosD => self.visit_unary(|e| e.arccos().degrees()), AsinD => self.visit_unary(|e| e.arcsin().degrees()), AtanD => self.visit_unary(|e| e.arctan().degrees()), + Atan2D => self.visit_binary(|e, s| e.arctan2(s).degrees()), Degrees => self.visit_unary(Expr::degrees), Radians => self.visit_unary(Expr::radians), Ceil => self.visit_unary(Expr::ceil), Exp => self.visit_unary(Expr::exp), Floor => self.visit_unary(Expr::floor), + Pi => self.visit_nullary(Expr::pi), Ln => self.visit_unary(|e| e.log(std::f64::consts::E)), Log => self.visit_binary(Expr::log), Log10 
=> self.visit_unary(|e| e.log(10.0)), Log1p => self.visit_unary(Expr::log1p), Log2 => self.visit_unary(|e| e.log(2.0)), Pow => self.visit_binary::(Expr::pow), + Sqrt => self.visit_unary(Expr::sqrt), + Cbrt => self.visit_unary(Expr::cbrt), Round => match function.args.len() { 1 => self.visit_unary(|e| e.round(0)), 2 => self.try_visit_binary(|e, decimals| { @@ -756,6 +796,14 @@ impl SqlFunctionVisitor<'_> { } } + fn visit_nullary(&self, f: impl Fn() -> Expr) -> PolarsResult { + let args = extract_args(self.func); + if !args.is_empty() { + return self.not_supported_error(); + } + Ok(f()) + } + fn visit_count(&self) -> PolarsResult { let args = extract_args(self.func); match (self.func.distinct, args.as_slice()) { diff --git a/polars/polars-sql/src/keywords.rs b/crates/polars-sql/src/keywords.rs similarity index 100% rename from polars/polars-sql/src/keywords.rs rename to crates/polars-sql/src/keywords.rs diff --git a/polars/polars-sql/src/lib.rs b/crates/polars-sql/src/lib.rs similarity index 100% rename from polars/polars-sql/src/lib.rs rename to crates/polars-sql/src/lib.rs diff --git a/polars/polars-sql/src/sql_expr.rs b/crates/polars-sql/src/sql_expr.rs similarity index 94% rename from polars/polars-sql/src/sql_expr.rs rename to crates/polars-sql/src/sql_expr.rs index 31aaabf56de79..d73cd2429220d 100644 --- a/polars/polars-sql/src/sql_expr.rs +++ b/crates/polars-sql/src/sql_expr.rs @@ -406,10 +406,6 @@ impl SqlExprVisitor<'_> { else_result, } = expr { - if operand.is_some() { - polars_bail!(ComputeError: "CASE operand is not yet supported"); - } - polars_ensure!( conditions.len() == results.len(), ComputeError: "WHEN and THEN expressions must have the same length" @@ -432,6 +428,34 @@ impl SqlExprVisitor<'_> { None => polars_bail!(ComputeError: "ELSE expression is required"), }; + if let Some(operand_expr) = operand { + let first_operand_expr = self.visit_expr(operand_expr)?; + + let first = first.unwrap(); + let first_cond = 
first_operand_expr.eq(self.visit_expr(first.0)?); + let first_then = self.visit_expr(first.1)?; + let expr = when(first_cond).then(first_then); + let next = when_thens.next(); + + let mut when_then = if let Some((cond, res)) = next { + let second_operand_expr = self.visit_expr(operand_expr)?; + let cond = second_operand_expr.eq(self.visit_expr(cond)?); + let res = self.visit_expr(res)?; + expr.when(cond).then(res) + } else { + return Ok(expr.otherwise(else_res)); + }; + + for (cond, res) in when_thens { + let new_operand_expr = self.visit_expr(operand_expr)?; + let cond = new_operand_expr.eq(self.visit_expr(cond)?); + let res = self.visit_expr(res)?; + when_then = when_then.when(cond).then(res); + } + + return Ok(when_then.otherwise(else_res)); + } + let first = first.unwrap(); let first_cond = self.visit_expr(first.0)?; let first_then = self.visit_expr(first.1)?; diff --git a/polars/polars-sql/src/table_functions.rs b/crates/polars-sql/src/table_functions.rs similarity index 100% rename from polars/polars-sql/src/table_functions.rs rename to crates/polars-sql/src/table_functions.rs diff --git a/polars/polars-sql/tests/functions_cumulative.rs b/crates/polars-sql/tests/functions_cumulative.rs similarity index 100% rename from polars/polars-sql/tests/functions_cumulative.rs rename to crates/polars-sql/tests/functions_cumulative.rs diff --git a/polars/polars-sql/tests/functions_io.rs b/crates/polars-sql/tests/functions_io.rs similarity index 100% rename from polars/polars-sql/tests/functions_io.rs rename to crates/polars-sql/tests/functions_io.rs diff --git a/polars/polars-sql/tests/functions_math.rs b/crates/polars-sql/tests/functions_math.rs similarity index 81% rename from polars/polars-sql/tests/functions_math.rs rename to crates/polars-sql/tests/functions_math.rs index a5e39b8e3f370..b0b6d2c9eac95 100644 --- a/polars/polars-sql/tests/functions_math.rs +++ b/crates/polars-sql/tests/functions_math.rs @@ -17,6 +17,7 @@ fn test_math_functions() { ACOS(a) AS acos, 
ASIN(a) AS asin, ATAN(a) AS atan, + PI() AS pi, CEIL(a) AS ceil, EXP(a) AS exp, FLOOR(a) AS floor, @@ -25,7 +26,9 @@ fn test_math_functions() { LOG10(a) AS log10, LOG(a, 5) AS log5, LOG1P(a) AS log1p, - POW(a, 2) AS pow + POW(a, 2) AS pow, + SQRT(a) AS sqrt, + CBRT(a) AS cbrt FROM df"#; let df_sql = context.execute(sql).unwrap().collect().unwrap(); let df_pl = df @@ -36,6 +39,7 @@ fn test_math_functions() { col("a").arccos().alias("acos"), col("a").arcsin().alias("asin"), col("a").arctan().alias("atan"), + lit(std::f64::consts::PI).alias("pi"), col("a").ceil().alias("ceil"), col("a").exp().alias("exp"), col("a").floor().alias("floor"), @@ -45,8 +49,12 @@ fn test_math_functions() { col("a").log(5.0).alias("log5"), col("a").log1p().alias("log1p"), col("a").pow(2.0).alias("pow"), + col("a").sqrt().alias("sqrt"), + col("a").cbrt().alias("cbrt"), ]) .collect() .unwrap(); + println!("{}", df_pl.head(Some(10))); + println!("{}", df_sql.head(Some(10))); assert!(df_sql.frame_equal_missing(&df_pl)); } diff --git a/polars/polars-sql/tests/functions_meta.rs b/crates/polars-sql/tests/functions_meta.rs similarity index 100% rename from polars/polars-sql/tests/functions_meta.rs rename to crates/polars-sql/tests/functions_meta.rs diff --git a/polars/polars-sql/tests/functions_string.rs b/crates/polars-sql/tests/functions_string.rs similarity index 100% rename from polars/polars-sql/tests/functions_string.rs rename to crates/polars-sql/tests/functions_string.rs diff --git a/polars/polars-sql/tests/iss_7436.rs b/crates/polars-sql/tests/iss_7436.rs similarity index 100% rename from polars/polars-sql/tests/iss_7436.rs rename to crates/polars-sql/tests/iss_7436.rs diff --git a/polars/polars-sql/tests/iss_7437.rs b/crates/polars-sql/tests/iss_7437.rs similarity index 100% rename from polars/polars-sql/tests/iss_7437.rs rename to crates/polars-sql/tests/iss_7437.rs diff --git a/polars/polars-sql/tests/iss_7440.rs b/crates/polars-sql/tests/iss_7440.rs similarity index 100% rename from 
polars/polars-sql/tests/iss_7440.rs rename to crates/polars-sql/tests/iss_7440.rs diff --git a/polars/polars-sql/tests/iss_8395.rs b/crates/polars-sql/tests/iss_8395.rs similarity index 100% rename from polars/polars-sql/tests/iss_8395.rs rename to crates/polars-sql/tests/iss_8395.rs diff --git a/polars/polars-sql/tests/iss_8419.rs b/crates/polars-sql/tests/iss_8419.rs similarity index 100% rename from polars/polars-sql/tests/iss_8419.rs rename to crates/polars-sql/tests/iss_8419.rs diff --git a/polars/polars-sql/tests/ops_distinct_on.rs b/crates/polars-sql/tests/ops_distinct_on.rs similarity index 100% rename from polars/polars-sql/tests/ops_distinct_on.rs rename to crates/polars-sql/tests/ops_distinct_on.rs diff --git a/polars/polars-sql/tests/simple_exprs.rs b/crates/polars-sql/tests/simple_exprs.rs similarity index 95% rename from polars/polars-sql/tests/simple_exprs.rs rename to crates/polars-sql/tests/simple_exprs.rs index 7d58133800884..68653d8def23a 100644 --- a/polars/polars-sql/tests/simple_exprs.rs +++ b/crates/polars-sql/tests/simple_exprs.rs @@ -528,6 +528,30 @@ fn test_case_expr() { assert!(df_sql.frame_equal(&df_pl)); } +#[test] +fn test_case_expr_with_expression() { + let df = create_sample_df().unwrap(); + let mut context = SQLContext::new(); + context.register("df", df.clone().lazy()); + let sql = r#" + SELECT + CASE b%2 + WHEN 0 THEN 'even' + WHEN 1 THEN 'odd' + ELSE 'No?' 
+ END AS parity + FROM df"#; + let df_sql = context.execute(sql).unwrap().collect().unwrap(); + let case_expr = when((col("b") % lit(2)).eq(lit(0))) + .then(lit("even")) + .when((col("b") % lit(2)).eq(lit(1))) + .then(lit("odd")) + .otherwise(lit("No?")) + .alias("parity"); + let df_pl = df.lazy().select(&[case_expr]).collect().unwrap(); + assert!(df_sql.frame_equal(&df_pl)); +} + #[test] fn test_sql_expr() { let df = create_sample_df().unwrap(); diff --git a/polars/polars-sql/tests/statements.rs b/crates/polars-sql/tests/statements.rs similarity index 100% rename from polars/polars-sql/tests/statements.rs rename to crates/polars-sql/tests/statements.rs diff --git a/polars/polars-time/Cargo.toml b/crates/polars-time/Cargo.toml similarity index 96% rename from polars/polars-time/Cargo.toml rename to crates/polars-time/Cargo.toml index e7e0ff20adeea..44fd531155736 100644 --- a/polars/polars-time/Cargo.toml +++ b/crates/polars-time/Cargo.toml @@ -30,7 +30,7 @@ dtype-time = ["polars-core/dtype-time", "polars-core/temporal"] dtype-duration = ["polars-core/dtype-duration", "polars-core/temporal"] rolling_window = ["polars-core/rolling_window", "dtype-duration"] fmt = ["polars-core/fmt"] -timezones = ["chrono-tz", "dtype-datetime", "polars-core/timezones", "polars-arrow/timezones"] +timezones = ["chrono-tz", "dtype-datetime", "polars-core/timezones", "polars-arrow/timezones", "polars-ops/timezones"] test = ["dtype-date", "dtype-datetime", "polars-core/fmt"] diff --git a/crates/polars-time/LICENSE b/crates/polars-time/LICENSE new file mode 120000 index 0000000000000..30cff7403da04 --- /dev/null +++ b/crates/polars-time/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/polars/polars-time/README.md b/crates/polars-time/README.md similarity index 100% rename from polars/polars-time/README.md rename to crates/polars-time/README.md diff --git a/polars/polars-time/src/base_utc_offset.rs b/crates/polars-time/src/base_utc_offset.rs similarity index 100% 
rename from polars/polars-time/src/base_utc_offset.rs rename to crates/polars-time/src/base_utc_offset.rs diff --git a/polars/polars-time/src/chunkedarray/date.rs b/crates/polars-time/src/chunkedarray/date.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/date.rs rename to crates/polars-time/src/chunkedarray/date.rs diff --git a/polars/polars-time/src/chunkedarray/datetime.rs b/crates/polars-time/src/chunkedarray/datetime.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/datetime.rs rename to crates/polars-time/src/chunkedarray/datetime.rs diff --git a/polars/polars-time/src/chunkedarray/duration.rs b/crates/polars-time/src/chunkedarray/duration.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/duration.rs rename to crates/polars-time/src/chunkedarray/duration.rs diff --git a/polars/polars-time/src/chunkedarray/kernels.rs b/crates/polars-time/src/chunkedarray/kernels.rs similarity index 76% rename from polars/polars-time/src/chunkedarray/kernels.rs rename to crates/polars-time/src/chunkedarray/kernels.rs index e34adbdb08a11..8d558d629396f 100644 --- a/polars/polars-time/src/chunkedarray/kernels.rs +++ b/crates/polars-time/src/chunkedarray/kernels.rs @@ -4,10 +4,10 @@ use chrono::{Datelike, NaiveDate, NaiveDateTime, Timelike}; use polars_arrow::export::arrow::array::{BooleanArray, PrimitiveArray}; use polars_arrow::export::arrow::compute::arity::unary; #[cfg(feature = "dtype-time")] -use polars_arrow::export::arrow::temporal_conversions::time64ns_to_time; +use polars_arrow::export::arrow::temporal_conversions::time64ns_to_time_opt; use polars_arrow::export::arrow::temporal_conversions::{ - date32_to_datetime, timestamp_ms_to_datetime, timestamp_ns_to_datetime, - timestamp_us_to_datetime, + date32_to_datetime_opt, timestamp_ms_to_datetime_opt, timestamp_ns_to_datetime_opt, + timestamp_us_to_datetime_opt, }; use super::super::windows::calendar::*; @@ -44,13 +44,17 @@ impl PolarsIso for NaiveDate { } 
macro_rules! to_temporal_unit { - ($name: ident, $chrono_method: ident, $to_datetime_fn: expr, $dtype_in: ty, $dtype_out:expr) => { - pub(crate) fn $name(arr: &PrimitiveArray<$dtype_in>) -> ArrayRef { + ($name: ident, $chrono_method: ident, $to_datetime_fn: expr, + $primitive_in: ty, + $primitive_out: ty, + $dtype_out:expr) => { + pub(crate) fn $name(arr: &PrimitiveArray<$primitive_in>) -> ArrayRef { Box::new(unary( arr, |value| { - let dt = $to_datetime_fn(value); - dt.$chrono_method() + $to_datetime_fn(value) + .map(|dt| dt.$chrono_method()) + .unwrap_or(value as $primitive_out) }, $dtype_out, )) as ArrayRef @@ -65,8 +69,9 @@ macro_rules! to_boolean_temporal_unit { .values() .iter() .map(|value| { - let dt = $to_datetime_fn(*value); - $boolean_method(dt.$chrono_method()) + $to_datetime_fn(*value) + .map(|dt| $boolean_method(dt.$chrono_method())) + .unwrap_or(false) }) .collect::>(); Box::new(BooleanArray::new( @@ -83,15 +88,17 @@ macro_rules! to_boolean_temporal_unit { to_temporal_unit!( date_to_iso_week, week, - date32_to_datetime, + date32_to_datetime_opt, i32, + u32, ArrowDataType::UInt32 ); #[cfg(feature = "dtype-date")] to_temporal_unit!( date_to_iso_year, iso_year, - date32_to_datetime, + date32_to_datetime_opt, + i32, i32, ArrowDataType::Int32 ); @@ -99,15 +106,17 @@ to_temporal_unit!( to_temporal_unit!( date_to_iso_weekday, p_weekday, - date32_to_datetime, + date32_to_datetime_opt, i32, + u32, ArrowDataType::UInt32 ); #[cfg(feature = "dtype-date")] to_temporal_unit!( date_to_year, year, - date32_to_datetime, + date32_to_datetime_opt, + i32, i32, ArrowDataType::Int32 ); @@ -116,31 +125,34 @@ to_boolean_temporal_unit!( date_to_is_leap_year, year, is_leap_year, - date32_to_datetime, + date32_to_datetime_opt, i32 ); #[cfg(feature = "dtype-date")] to_temporal_unit!( date_to_month, month, - date32_to_datetime, + date32_to_datetime_opt, i32, + u32, ArrowDataType::UInt32 ); #[cfg(feature = "dtype-date")] to_temporal_unit!( date_to_day, day, - date32_to_datetime, 
+ date32_to_datetime_opt, i32, + u32, ArrowDataType::UInt32 ); #[cfg(feature = "dtype-date")] to_temporal_unit!( date_to_ordinal, ordinal, - date32_to_datetime, + date32_to_datetime_opt, i32, + u32, ArrowDataType::UInt32 ); @@ -149,32 +161,36 @@ to_temporal_unit!( to_temporal_unit!( time_to_hour, hour, - time64ns_to_time, + time64ns_to_time_opt, i64, + u32, ArrowDataType::UInt32 ); #[cfg(feature = "dtype-time")] to_temporal_unit!( time_to_minute, minute, - time64ns_to_time, + time64ns_to_time_opt, i64, + u32, ArrowDataType::UInt32 ); #[cfg(feature = "dtype-time")] to_temporal_unit!( time_to_second, second, - time64ns_to_time, + time64ns_to_time_opt, i64, + u32, ArrowDataType::UInt32 ); #[cfg(feature = "dtype-time")] to_temporal_unit!( time_to_nanosecond, nanosecond, - time64ns_to_time, + time64ns_to_time_opt, i64, + u32, ArrowDataType::UInt32 ); @@ -182,8 +198,9 @@ to_temporal_unit!( to_temporal_unit!( datetime_to_ordinal_ns, ordinal, - timestamp_ns_to_datetime, + timestamp_ns_to_datetime_opt, i64, + u32, ArrowDataType::UInt32 ); @@ -191,16 +208,18 @@ to_temporal_unit!( to_temporal_unit!( datetime_to_ordinal_ms, ordinal, - timestamp_ms_to_datetime, + timestamp_ms_to_datetime_opt, i64, + u32, ArrowDataType::UInt32 ); #[cfg(feature = "dtype-datetime")] to_temporal_unit!( datetime_to_ordinal_us, ordinal, - timestamp_us_to_datetime, + timestamp_us_to_datetime_opt, i64, + u32, ArrowDataType::UInt32 ); @@ -208,8 +227,9 @@ to_temporal_unit!( to_temporal_unit!( datetime_to_iso_year_ns, iso_year, - timestamp_ns_to_datetime, + timestamp_ns_to_datetime_opt, i64, + i32, ArrowDataType::Int32 ); @@ -217,8 +237,9 @@ to_temporal_unit!( to_temporal_unit!( datetime_to_iso_year_us, iso_year, - timestamp_us_to_datetime, + timestamp_us_to_datetime_opt, i64, + i32, ArrowDataType::Int32 ); @@ -226,8 +247,9 @@ to_temporal_unit!( to_temporal_unit!( datetime_to_iso_year_ms, iso_year, - timestamp_ms_to_datetime, + timestamp_ms_to_datetime_opt, i64, + i32, ArrowDataType::Int32 ); 
#[cfg(feature = "dtype-datetime")] @@ -235,7 +257,7 @@ to_boolean_temporal_unit!( datetime_to_is_leap_year_ns, year, is_leap_year, - timestamp_ns_to_datetime, + timestamp_ns_to_datetime_opt, i64 ); #[cfg(feature = "dtype-datetime")] @@ -243,7 +265,7 @@ to_boolean_temporal_unit!( datetime_to_is_leap_year_us, year, is_leap_year, - timestamp_us_to_datetime, + timestamp_us_to_datetime_opt, i64 ); #[cfg(feature = "dtype-datetime")] @@ -251,6 +273,6 @@ to_boolean_temporal_unit!( datetime_to_is_leap_year_ms, year, is_leap_year, - timestamp_ms_to_datetime, + timestamp_ms_to_datetime_opt, i64 ); diff --git a/polars/polars-time/src/chunkedarray/mod.rs b/crates/polars-time/src/chunkedarray/mod.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/mod.rs rename to crates/polars-time/src/chunkedarray/mod.rs diff --git a/polars/polars-time/src/chunkedarray/rolling_window/floats.rs b/crates/polars-time/src/chunkedarray/rolling_window/floats.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/rolling_window/floats.rs rename to crates/polars-time/src/chunkedarray/rolling_window/floats.rs diff --git a/polars/polars-time/src/chunkedarray/rolling_window/ints.rs b/crates/polars-time/src/chunkedarray/rolling_window/ints.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/rolling_window/ints.rs rename to crates/polars-time/src/chunkedarray/rolling_window/ints.rs diff --git a/polars/polars-time/src/chunkedarray/rolling_window/mod.rs b/crates/polars-time/src/chunkedarray/rolling_window/mod.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/rolling_window/mod.rs rename to crates/polars-time/src/chunkedarray/rolling_window/mod.rs diff --git a/polars/polars-time/src/chunkedarray/rolling_window/rolling_kernels/mod.rs b/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/mod.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/rolling_window/rolling_kernels/mod.rs rename to 
crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/mod.rs diff --git a/polars/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs b/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs rename to crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs diff --git a/polars/polars-time/src/chunkedarray/time.rs b/crates/polars-time/src/chunkedarray/time.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/time.rs rename to crates/polars-time/src/chunkedarray/time.rs diff --git a/polars/polars-time/src/chunkedarray/utf8/infer.rs b/crates/polars-time/src/chunkedarray/utf8/infer.rs similarity index 99% rename from polars/polars-time/src/chunkedarray/utf8/infer.rs rename to crates/polars-time/src/chunkedarray/utf8/infer.rs index 15f908db3b752..3663d0757b460 100644 --- a/polars/polars-time/src/chunkedarray/utf8/infer.rs +++ b/crates/polars-time/src/chunkedarray/utf8/infer.rs @@ -523,14 +523,14 @@ pub(crate) fn to_datetime( Pattern::DatetimeYMDZ => infer.coerce_utf8(ca).datetime().map(|ca| { let mut ca = ca.clone(); ca.set_time_unit(tu); - ca.replace_time_zone(Some("UTC"), None) + polars_ops::prelude::replace_time_zone(&ca, Some("UTC"), None) })?, _ => infer.coerce_utf8(ca).datetime().map(|ca| { let mut ca = ca.clone(); ca.set_time_unit(tu); match tz { #[cfg(feature = "timezones")] - Some(tz) => ca.replace_time_zone(Some(tz), None), + Some(tz) => polars_ops::prelude::replace_time_zone(&ca, Some(tz), None), _ => Ok(ca), } })?, diff --git a/polars/polars-time/src/chunkedarray/utf8/mod.rs b/crates/polars-time/src/chunkedarray/utf8/mod.rs similarity index 98% rename from polars/polars-time/src/chunkedarray/utf8/mod.rs rename to crates/polars-time/src/chunkedarray/utf8/mod.rs index 9862ae6b101a4..7bf2d17f0f326 100644 --- 
a/polars/polars-time/src/chunkedarray/utf8/mod.rs +++ b/crates/polars-time/src/chunkedarray/utf8/mod.rs @@ -303,7 +303,9 @@ pub trait Utf8Methods: AsUtf8 { ca.rename(utf8_ca.name()); match (tz_aware, tz) { #[cfg(feature = "timezones")] - (false, Some(tz)) => ca.into_datetime(tu, None).replace_time_zone(Some(tz), None), + (false, Some(tz)) => { + polars_ops::prelude::replace_time_zone(&ca.into_datetime(tu, None), Some(tz), None) + } #[cfg(feature = "timezones")] (true, _) => Ok(ca.into_datetime(tu, Some("UTC".to_string()))), _ => Ok(ca.into_datetime(tu, None)), @@ -516,7 +518,11 @@ pub trait Utf8Methods: AsUtf8 { ca.rename(utf8_ca.name()); match tz { #[cfg(feature = "timezones")] - Some(tz) => ca.into_datetime(tu, None).replace_time_zone(Some(tz), None), + Some(tz) => polars_ops::prelude::replace_time_zone( + &ca.into_datetime(tu, None), + Some(tz), + None, + ), _ => Ok(ca.into_datetime(tu, None)), } } diff --git a/polars/polars-time/src/chunkedarray/utf8/patterns.rs b/crates/polars-time/src/chunkedarray/utf8/patterns.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/utf8/patterns.rs rename to crates/polars-time/src/chunkedarray/utf8/patterns.rs diff --git a/polars/polars-time/src/chunkedarray/utf8/strptime.rs b/crates/polars-time/src/chunkedarray/utf8/strptime.rs similarity index 100% rename from polars/polars-time/src/chunkedarray/utf8/strptime.rs rename to crates/polars-time/src/chunkedarray/utf8/strptime.rs diff --git a/polars/polars-time/src/date_range.rs b/crates/polars-time/src/date_range.rs similarity index 100% rename from polars/polars-time/src/date_range.rs rename to crates/polars-time/src/date_range.rs diff --git a/polars/polars-time/src/dst_offset.rs b/crates/polars-time/src/dst_offset.rs similarity index 100% rename from polars/polars-time/src/dst_offset.rs rename to crates/polars-time/src/dst_offset.rs diff --git a/polars/polars-time/src/groupby/dynamic.rs b/crates/polars-time/src/groupby/dynamic.rs similarity index 98% rename 
from polars/polars-time/src/groupby/dynamic.rs rename to crates/polars-time/src/groupby/dynamic.rs index b8592c0cff216..27ad694504462 100644 --- a/polars/polars-time/src/groupby/dynamic.rs +++ b/crates/polars-time/src/groupby/dynamic.rs @@ -7,7 +7,7 @@ use polars_core::series::IsSorted; use polars_core::utils::ensure_sorted_arg; use polars_core::utils::flatten::flatten_par; use polars_core::POOL; -use polars_utils::slice::SortedSlice; +use polars_utils::slice::{GetSaferUnchecked, SortedSlice}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use smartstring::alias::String as SmartString; @@ -638,15 +638,19 @@ fn update_subgroups_idx( sub_groups .iter() .map(|&[first, len]| { - let new_first = unsafe { *base_g.1.get_unchecked(first as usize) }; + let new_first = if len == 0 { + // in case the group is empty + // keep the original first so that the + // groupby keys still point to the original group + base_g.0 + } else { + unsafe { *base_g.1.get_unchecked_release(first as usize) } + }; let first = first as usize; let len = len as usize; let idx = (first..first + len) - .map(|i| { - debug_assert!(i < base_g.1.len()); - unsafe { *base_g.1.get_unchecked(i) } - }) + .map(|i| unsafe { *base_g.1.get_unchecked_release(i) }) .collect_trusted::>(); (new_first, idx) }) diff --git a/polars/polars-time/src/groupby/mod.rs b/crates/polars-time/src/groupby/mod.rs similarity index 100% rename from polars/polars-time/src/groupby/mod.rs rename to crates/polars-time/src/groupby/mod.rs diff --git a/polars/polars-time/src/lib.rs b/crates/polars-time/src/lib.rs similarity index 100% rename from polars/polars-time/src/lib.rs rename to crates/polars-time/src/lib.rs diff --git a/polars/polars-time/src/month_end.rs b/crates/polars-time/src/month_end.rs similarity index 100% rename from polars/polars-time/src/month_end.rs rename to crates/polars-time/src/month_end.rs diff --git a/polars/polars-time/src/month_start.rs b/crates/polars-time/src/month_start.rs similarity index 99% 
rename from polars/polars-time/src/month_start.rs rename to crates/polars-time/src/month_start.rs index 4676fa8ce7d70..6b5fefaf6daa0 100644 --- a/polars/polars-time/src/month_start.rs +++ b/crates/polars-time/src/month_start.rs @@ -43,7 +43,7 @@ pub(crate) fn roll_backward( let ndt = NaiveDateTime::new(date, time); let t = match tz { #[cfg(feature = "timezones")] - Some(tz) => datetime_to_timestamp(localize_datetime(ndt, tz)?), + Some(tz) => datetime_to_timestamp(localize_datetime(ndt, tz, None)?), _ => datetime_to_timestamp(ndt), }; Ok(t) diff --git a/polars/polars-time/src/prelude.rs b/crates/polars-time/src/prelude.rs similarity index 100% rename from polars/polars-time/src/prelude.rs rename to crates/polars-time/src/prelude.rs diff --git a/polars/polars-time/src/round.rs b/crates/polars-time/src/round.rs similarity index 100% rename from polars/polars-time/src/round.rs rename to crates/polars-time/src/round.rs diff --git a/polars/polars-time/src/series/_trait.rs b/crates/polars-time/src/series/_trait.rs similarity index 100% rename from polars/polars-time/src/series/_trait.rs rename to crates/polars-time/src/series/_trait.rs diff --git a/polars/polars-time/src/series/implementations/boolean.rs b/crates/polars-time/src/series/implementations/boolean.rs similarity index 100% rename from polars/polars-time/src/series/implementations/boolean.rs rename to crates/polars-time/src/series/implementations/boolean.rs diff --git a/polars/polars-time/src/series/implementations/categoricals.rs b/crates/polars-time/src/series/implementations/categoricals.rs similarity index 100% rename from polars/polars-time/src/series/implementations/categoricals.rs rename to crates/polars-time/src/series/implementations/categoricals.rs diff --git a/polars/polars-time/src/series/implementations/date.rs b/crates/polars-time/src/series/implementations/date.rs similarity index 100% rename from polars/polars-time/src/series/implementations/date.rs rename to 
crates/polars-time/src/series/implementations/date.rs diff --git a/polars/polars-time/src/series/implementations/datetime.rs b/crates/polars-time/src/series/implementations/datetime.rs similarity index 100% rename from polars/polars-time/src/series/implementations/datetime.rs rename to crates/polars-time/src/series/implementations/datetime.rs diff --git a/polars/polars-time/src/series/implementations/duration.rs b/crates/polars-time/src/series/implementations/duration.rs similarity index 100% rename from polars/polars-time/src/series/implementations/duration.rs rename to crates/polars-time/src/series/implementations/duration.rs diff --git a/polars/polars-time/src/series/implementations/floats.rs b/crates/polars-time/src/series/implementations/floats.rs similarity index 100% rename from polars/polars-time/src/series/implementations/floats.rs rename to crates/polars-time/src/series/implementations/floats.rs diff --git a/polars/polars-time/src/series/implementations/integers.rs b/crates/polars-time/src/series/implementations/integers.rs similarity index 100% rename from polars/polars-time/src/series/implementations/integers.rs rename to crates/polars-time/src/series/implementations/integers.rs diff --git a/polars/polars-time/src/series/implementations/list.rs b/crates/polars-time/src/series/implementations/list.rs similarity index 100% rename from polars/polars-time/src/series/implementations/list.rs rename to crates/polars-time/src/series/implementations/list.rs diff --git a/polars/polars-time/src/series/implementations/mod.rs b/crates/polars-time/src/series/implementations/mod.rs similarity index 100% rename from polars/polars-time/src/series/implementations/mod.rs rename to crates/polars-time/src/series/implementations/mod.rs diff --git a/polars/polars-time/src/series/implementations/object.rs b/crates/polars-time/src/series/implementations/object.rs similarity index 100% rename from polars/polars-time/src/series/implementations/object.rs rename to 
crates/polars-time/src/series/implementations/object.rs diff --git a/polars/polars-time/src/series/implementations/struct_.rs b/crates/polars-time/src/series/implementations/struct_.rs similarity index 100% rename from polars/polars-time/src/series/implementations/struct_.rs rename to crates/polars-time/src/series/implementations/struct_.rs diff --git a/polars/polars-time/src/series/implementations/time.rs b/crates/polars-time/src/series/implementations/time.rs similarity index 100% rename from polars/polars-time/src/series/implementations/time.rs rename to crates/polars-time/src/series/implementations/time.rs diff --git a/polars/polars-time/src/series/implementations/utf8.rs b/crates/polars-time/src/series/implementations/utf8.rs similarity index 100% rename from polars/polars-time/src/series/implementations/utf8.rs rename to crates/polars-time/src/series/implementations/utf8.rs diff --git a/polars/polars-time/src/series/mod.rs b/crates/polars-time/src/series/mod.rs similarity index 100% rename from polars/polars-time/src/series/mod.rs rename to crates/polars-time/src/series/mod.rs diff --git a/crates/polars-time/src/truncate.rs b/crates/polars-time/src/truncate.rs new file mode 100644 index 0000000000000..ef250864963ef --- /dev/null +++ b/crates/polars-time/src/truncate.rs @@ -0,0 +1,58 @@ +#[cfg(feature = "dtype-date")] +use polars_arrow::export::arrow::temporal_conversions::{MILLISECONDS, SECONDS_IN_DAY}; +use polars_arrow::time_zone::Tz; +use polars_core::prelude::*; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::prelude::*; +#[derive(Clone, PartialEq, Debug, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct TruncateOptions { + /// Period length + pub every: String, + /// Offset of the window + pub offset: String, + /// How to deal with ambiguous datetimes + pub use_earliest: Option, +} + +pub trait PolarsTruncate { + fn truncate(&self, options: &TruncateOptions, tz: Option<&Tz>) -> 
PolarsResult + where + Self: Sized; +} + +#[cfg(feature = "dtype-datetime")] +impl PolarsTruncate for DatetimeChunked { + fn truncate(&self, options: &TruncateOptions, tz: Option<&Tz>) -> PolarsResult { + let every = Duration::parse(&options.every); + let offset = Duration::parse(&options.offset); + let w = Window::new(every, every, offset); + + let func = match self.time_unit() { + TimeUnit::Nanoseconds => Window::truncate_ns, + TimeUnit::Microseconds => Window::truncate_us, + TimeUnit::Milliseconds => Window::truncate_ms, + }; + + Ok(self + .try_apply(|t| func(&w, t, tz, options.use_earliest))? + .into_datetime(self.time_unit(), self.time_zone().clone())) + } +} + +#[cfg(feature = "dtype-date")] +impl PolarsTruncate for DateChunked { + fn truncate(&self, options: &TruncateOptions, _tz: Option<&Tz>) -> PolarsResult { + let every = Duration::parse(&options.every); + let offset = Duration::parse(&options.offset); + let w = Window::new(every, every, offset); + Ok(self + .try_apply(|t| { + const MSECS_IN_DAY: i64 = MILLISECONDS * SECONDS_IN_DAY; + Ok((w.truncate_ms(MSECS_IN_DAY * t as i64, None, None)? / MSECS_IN_DAY) as i32) + })? 
+ .into_date()) + } +} diff --git a/polars/polars-time/src/upsample.rs b/crates/polars-time/src/upsample.rs similarity index 100% rename from polars/polars-time/src/upsample.rs rename to crates/polars-time/src/upsample.rs diff --git a/polars/polars-time/src/utils.rs b/crates/polars-time/src/utils.rs similarity index 61% rename from polars/polars-time/src/utils.rs rename to crates/polars-time/src/utils.rs index e6b4e1736c325..019fe231ca70b 100644 --- a/polars/polars-time/src/utils.rs +++ b/crates/polars-time/src/utils.rs @@ -12,15 +12,24 @@ use polars_arrow::time_zone::Tz; use polars_core::prelude::{polars_bail, PolarsResult, TimeUnit}; #[cfg(feature = "timezones")] -pub(crate) fn localize_datetime(ndt: NaiveDateTime, tz: &Tz) -> PolarsResult { +pub(crate) fn localize_datetime( + ndt: NaiveDateTime, + tz: &Tz, + use_earliest: Option, +) -> PolarsResult { // e.g. '2021-01-01 03:00' -> '2021-01-01 03:00CDT' match tz.from_local_datetime(&ndt) { LocalResult::Single(tz) => Ok(tz.naive_utc()), - LocalResult::Ambiguous(_, _) => { - polars_bail!( - ComputeError: format!("datetime '{}' is ambiguous in time zone '{}'. Ambiguous datetimes are not yet supported", ndt, tz) - ) - } + LocalResult::Ambiguous(dt_earliest, dt_latest) => match use_earliest { + Some(true) => Ok(dt_earliest.naive_utc()), + Some(false) => Ok(dt_latest.naive_utc()), + None => polars_bail!(ComputeError: + format!("datetime '{}' is ambiguous in time zone '{}'. \ + Please use `use_earliest` to tell how it should be localized. \ + If you got here from a function which doesn't have a `use_earliest` argument, \ + please open an issue at https://github.com/pola-rs/polars/issues.", ndt, tz) + ), + }, LocalResult::None => { polars_bail!( ComputeError: format!("datetime '{}' is non-existent in time zone '{}'. 
Non-existent datetimes are not yet supported", ndt, tz) @@ -39,13 +48,22 @@ pub(crate) fn unlocalize_datetime(ndt: NaiveDateTime, tz: &Tz) -> NaiveDateTime pub(crate) fn localize_timestamp(timestamp: i64, tu: TimeUnit, tz: Tz) -> PolarsResult { match tu { TimeUnit::Nanoseconds => { - Ok(localize_datetime(timestamp_ns_to_datetime(timestamp), &tz)?.timestamp_nanos()) + Ok( + localize_datetime(timestamp_ns_to_datetime(timestamp), &tz, None)? + .timestamp_nanos(), + ) } TimeUnit::Microseconds => { - Ok(localize_datetime(timestamp_us_to_datetime(timestamp), &tz)?.timestamp_micros()) + Ok( + localize_datetime(timestamp_us_to_datetime(timestamp), &tz, None)? + .timestamp_micros(), + ) } TimeUnit::Milliseconds => { - Ok(localize_datetime(timestamp_ms_to_datetime(timestamp), &tz)?.timestamp_millis()) + Ok( + localize_datetime(timestamp_ms_to_datetime(timestamp), &tz, None)? + .timestamp_millis(), + ) } } } diff --git a/polars/polars-time/src/windows/bounds.rs b/crates/polars-time/src/windows/bounds.rs similarity index 100% rename from polars/polars-time/src/windows/bounds.rs rename to crates/polars-time/src/windows/bounds.rs diff --git a/polars/polars-time/src/windows/calendar.rs b/crates/polars-time/src/windows/calendar.rs similarity index 100% rename from polars/polars-time/src/windows/calendar.rs rename to crates/polars-time/src/windows/calendar.rs diff --git a/polars/polars-time/src/windows/duration.rs b/crates/polars-time/src/windows/duration.rs similarity index 95% rename from polars/polars-time/src/windows/duration.rs rename to crates/polars-time/src/windows/duration.rs index e6c11e4701ae0..90cfc2eee5f2e 100644 --- a/polars/polars-time/src/windows/duration.rs +++ b/crates/polars-time/src/windows/duration.rs @@ -45,7 +45,7 @@ pub struct Duration { impl PartialOrd for Duration { fn partial_cmp(&self, other: &Self) -> Option { - self.duration_ns().partial_cmp(&other.duration_ns()) + Some(self.cmp(other)) } } @@ -438,6 +438,7 @@ impl Duration { nsecs_to_unit: F, 
timestamp_to_datetime: G, datetime_to_timestamp: J, + _use_earliest: Option, ) -> PolarsResult where F: Fn(i64) -> i64, @@ -465,6 +466,7 @@ impl Duration { Some(tz) => Ok(datetime_to_timestamp(localize_datetime( timestamp_to_datetime(t - remainder), tz, + _use_earliest, )?)), _ => Ok(t - remainder), } @@ -484,6 +486,7 @@ impl Duration { Some(tz) => Ok(datetime_to_timestamp(localize_datetime( first_day_of_week.and_time(NaiveTime::default()), tz, + _use_earliest, )?)), _ => Ok(datetime_to_timestamp( first_day_of_week.and_time(NaiveTime::default()), @@ -509,6 +512,7 @@ impl Duration { Some(tz) => Ok(datetime_to_timestamp(localize_datetime( timestamp_to_datetime(t - remainder), tz, + _use_earliest, )?)), _ => Ok(t - remainder), } @@ -536,7 +540,11 @@ impl Duration { ))?; match tz { #[cfg(feature = "timezones")] - Some(tz) => Ok(datetime_to_timestamp(localize_datetime(dt, tz)?)), + Some(tz) => Ok(datetime_to_timestamp(localize_datetime( + dt, + tz, + _use_earliest, + )?)), _ => Ok(datetime_to_timestamp(dt)), } } @@ -548,37 +556,55 @@ impl Duration { // Truncate the given ns timestamp by the window boundary. #[inline] - pub fn truncate_ns(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { + pub fn truncate_ns( + &self, + t: i64, + tz: Option<&Tz>, + use_earliest: Option, + ) -> PolarsResult { self.truncate_impl( t, tz, |nsecs| nsecs, timestamp_ns_to_datetime, datetime_to_timestamp_ns, + use_earliest, ) } // Truncate the given ns timestamp by the window boundary. #[inline] - pub fn truncate_us(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { + pub fn truncate_us( + &self, + t: i64, + tz: Option<&Tz>, + use_earliest: Option, + ) -> PolarsResult { self.truncate_impl( t, tz, |nsecs| nsecs / 1000, timestamp_us_to_datetime, datetime_to_timestamp_us, + use_earliest, ) } // Truncate the given ms timestamp by the window boundary. 
#[inline] - pub fn truncate_ms(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { + pub fn truncate_ms( + &self, + t: i64, + tz: Option<&Tz>, + use_earliest: Option, + ) -> PolarsResult { self.truncate_impl( t, tz, |nsecs| nsecs / 1_000_000, timestamp_ms_to_datetime, datetime_to_timestamp_ms, + use_earliest, ) } @@ -607,7 +633,7 @@ impl Duration { let dt = Self::add_month(ts, d.months, d.negative, d.saturating)?; new_t = match tz { #[cfg(feature = "timezones")] - Some(tz) => datetime_to_timestamp(localize_datetime(dt, tz)?), + Some(tz) => datetime_to_timestamp(localize_datetime(dt, tz, None)?), _ => datetime_to_timestamp(dt), }; } @@ -620,8 +646,11 @@ impl Duration { new_t = datetime_to_timestamp(unlocalize_datetime(timestamp_to_datetime(t), tz)); new_t += if d.negative { -t_weeks } else { t_weeks }; - new_t = - datetime_to_timestamp(localize_datetime(timestamp_to_datetime(new_t), tz)?); + new_t = datetime_to_timestamp(localize_datetime( + timestamp_to_datetime(new_t), + tz, + None, + )?); } _ => new_t += if d.negative { -t_weeks } else { t_weeks }, }; @@ -635,8 +664,11 @@ impl Duration { new_t = datetime_to_timestamp(unlocalize_datetime(timestamp_to_datetime(t), tz)); new_t += if d.negative { -t_days } else { t_days }; - new_t = - datetime_to_timestamp(localize_datetime(timestamp_to_datetime(new_t), tz)?); + new_t = datetime_to_timestamp(localize_datetime( + timestamp_to_datetime(new_t), + tz, + None, + )?); } _ => new_t += if d.negative { -t_days } else { t_days }, }; diff --git a/polars/polars-time/src/windows/groupby.rs b/crates/polars-time/src/windows/groupby.rs similarity index 100% rename from polars/polars-time/src/windows/groupby.rs rename to crates/polars-time/src/windows/groupby.rs diff --git a/polars/polars-time/src/windows/mod.rs b/crates/polars-time/src/windows/mod.rs similarity index 100% rename from polars/polars-time/src/windows/mod.rs rename to crates/polars-time/src/windows/mod.rs diff --git a/polars/polars-time/src/windows/test.rs 
b/crates/polars-time/src/windows/test.rs similarity index 100% rename from polars/polars-time/src/windows/test.rs rename to crates/polars-time/src/windows/test.rs diff --git a/polars/polars-time/src/windows/window.rs b/crates/polars-time/src/windows/window.rs similarity index 90% rename from polars/polars-time/src/windows/window.rs rename to crates/polars-time/src/windows/window.rs index efaa5cc4d4d81..1a419ced36f85 100644 --- a/polars/polars-time/src/windows/window.rs +++ b/crates/polars-time/src/windows/window.rs @@ -31,73 +31,88 @@ impl Window { } /// Truncate the given ns timestamp by the window boundary. - pub fn truncate_ns(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { - let t = self.every.truncate_ns(t, tz)?; + pub fn truncate_ns( + &self, + t: i64, + tz: Option<&Tz>, + use_earliest: Option, + ) -> PolarsResult { + let t = self.every.truncate_ns(t, tz, use_earliest)?; self.offset.add_ns(t, tz) } pub fn truncate_no_offset_ns(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { - self.every.truncate_ns(t, tz) + self.every.truncate_ns(t, tz, None) } /// Truncate the given us timestamp by the window boundary. 
- pub fn truncate_us(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { - let t = self.every.truncate_us(t, tz)?; + pub fn truncate_us( + &self, + t: i64, + tz: Option<&Tz>, + use_earliest: Option, + ) -> PolarsResult { + let t = self.every.truncate_us(t, tz, use_earliest)?; self.offset.add_us(t, tz) } pub fn truncate_no_offset_us(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { - self.every.truncate_us(t, tz) + self.every.truncate_us(t, tz, None) } - pub fn truncate_ms(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { - let t = self.every.truncate_ms(t, tz)?; + pub fn truncate_ms( + &self, + t: i64, + tz: Option<&Tz>, + use_earliest: Option, + ) -> PolarsResult { + let t = self.every.truncate_ms(t, tz, use_earliest)?; self.offset.add_ms(t, tz) } #[inline] pub fn truncate_no_offset_ms(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { - self.every.truncate_ms(t, tz) + self.every.truncate_ms(t, tz, None) } /// Round the given ns timestamp by the window boundary. pub fn round_ns(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { let t = t + self.every.duration_ns() / 2_i64; - self.truncate_ns(t, tz) + self.truncate_ns(t, tz, None) } /// Round the given us timestamp by the window boundary. pub fn round_us(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { let t = t + self.every.duration_ns() / (2 * timeunit_scale(ArrowTimeUnit::Nanosecond, ArrowTimeUnit::Microsecond) as i64); - self.truncate_us(t, tz) + self.truncate_us(t, tz, None) } /// Round the given ms timestamp by the window boundary. pub fn round_ms(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { let t = t + self.every.duration_ns() / (2 * timeunit_scale(ArrowTimeUnit::Nanosecond, ArrowTimeUnit::Millisecond) as i64); - self.truncate_ms(t, tz) + self.truncate_ms(t, tz, None) } /// returns the bounds for the earliest window bounds /// that contains the given time t. For underlapping windows that /// do not contain time t, the window directly after time t will be returned. 
pub fn get_earliest_bounds_ns(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { - let start = self.truncate_ns(t, tz)?; + let start = self.truncate_ns(t, tz, None)?; let stop = self.period.add_ns(start, tz)?; Ok(Bounds::new_checked(start, stop)) } pub fn get_earliest_bounds_us(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { - let start = self.truncate_us(t, tz)?; + let start = self.truncate_us(t, tz, None)?; let stop = self.period.add_us(start, tz)?; Ok(Bounds::new_checked(start, stop)) } pub fn get_earliest_bounds_ms(&self, t: i64, tz: Option<&Tz>) -> PolarsResult { - let start = self.truncate_ms(t, tz)?; + let start = self.truncate_ms(t, tz, None)?; let stop = self.period.add_ms(start, tz)?; Ok(Bounds::new_checked(start, stop)) diff --git a/polars/polars-utils/Cargo.toml b/crates/polars-utils/Cargo.toml similarity index 90% rename from polars/polars-utils/Cargo.toml rename to crates/polars-utils/Cargo.toml index e73ba57fcab7c..5b847234f4ec0 100644 --- a/polars/polars-utils/Cargo.toml +++ b/crates/polars-utils/Cargo.toml @@ -13,6 +13,7 @@ ahash.workspace = true hashbrown.workspace = true num-traits.workspace = true once_cell.workspace = true +polars-error = { version = "0.31.1", path = "../polars-error" } rayon.workspace = true smartstring.workspace = true sysinfo = { version = "0.29", default-features = false, optional = true } diff --git a/crates/polars-utils/LICENSE b/crates/polars-utils/LICENSE new file mode 120000 index 0000000000000..30cff7403da04 --- /dev/null +++ b/crates/polars-utils/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/polars/polars-utils/README.md b/crates/polars-utils/README.md similarity index 100% rename from polars/polars-utils/README.md rename to crates/polars-utils/README.md diff --git a/polars/polars-utils/src/aliases.rs b/crates/polars-utils/src/aliases.rs similarity index 100% rename from polars/polars-utils/src/aliases.rs rename to crates/polars-utils/src/aliases.rs diff --git 
a/polars/polars-utils/src/arena.rs b/crates/polars-utils/src/arena.rs similarity index 100% rename from polars/polars-utils/src/arena.rs rename to crates/polars-utils/src/arena.rs diff --git a/polars/polars-utils/src/atomic.rs b/crates/polars-utils/src/atomic.rs similarity index 100% rename from polars/polars-utils/src/atomic.rs rename to crates/polars-utils/src/atomic.rs diff --git a/polars/polars-utils/src/cell.rs b/crates/polars-utils/src/cell.rs similarity index 100% rename from polars/polars-utils/src/cell.rs rename to crates/polars-utils/src/cell.rs diff --git a/polars/polars-utils/src/contention_pool.rs b/crates/polars-utils/src/contention_pool.rs similarity index 100% rename from polars/polars-utils/src/contention_pool.rs rename to crates/polars-utils/src/contention_pool.rs diff --git a/polars/polars-utils/src/error.rs b/crates/polars-utils/src/error.rs similarity index 100% rename from polars/polars-utils/src/error.rs rename to crates/polars-utils/src/error.rs diff --git a/polars/polars-utils/src/fmt.rs b/crates/polars-utils/src/fmt.rs similarity index 100% rename from polars/polars-utils/src/fmt.rs rename to crates/polars-utils/src/fmt.rs diff --git a/polars/polars-utils/src/functions.rs b/crates/polars-utils/src/functions.rs similarity index 100% rename from polars/polars-utils/src/functions.rs rename to crates/polars-utils/src/functions.rs diff --git a/crates/polars-utils/src/io.rs b/crates/polars-utils/src/io.rs new file mode 100644 index 0000000000000..0cc4ca1db455d --- /dev/null +++ b/crates/polars-utils/src/io.rs @@ -0,0 +1,19 @@ +use std::fs::File; +use std::path::Path; + +use polars_error::*; + +pub fn open_file

(path: P) -> PolarsResult +where + P: AsRef, +{ + std::fs::File::open(&path).map_err(|e| { + let path = path.as_ref().to_string_lossy(); + if path.len() > 88 { + let path: String = path.chars().skip(path.len() - 88).collect(); + polars_err!(ComputeError: "error open file: ...{}, {}", path, e) + } else { + polars_err!(ComputeError: "error open file: {}, {}", path, e) + } + }) +} diff --git a/polars/polars-utils/src/iter/enumerate_idx.rs b/crates/polars-utils/src/iter/enumerate_idx.rs similarity index 100% rename from polars/polars-utils/src/iter/enumerate_idx.rs rename to crates/polars-utils/src/iter/enumerate_idx.rs diff --git a/polars/polars-utils/src/iter/mod.rs b/crates/polars-utils/src/iter/mod.rs similarity index 100% rename from polars/polars-utils/src/iter/mod.rs rename to crates/polars-utils/src/iter/mod.rs diff --git a/polars/polars-utils/src/lib.rs b/crates/polars-utils/src/lib.rs similarity index 93% rename from polars/polars-utils/src/lib.rs rename to crates/polars-utils/src/lib.rs index b5fafb4e36fb4..7a4941c4a09ff 100644 --- a/polars/polars-utils/src/lib.rs +++ b/crates/polars-utils/src/lib.rs @@ -27,3 +27,6 @@ pub mod macros; pub mod vec; #[cfg(target_family = "wasm")] pub mod wasm; + +pub mod io; +pub use io::open_file; diff --git a/polars/polars-utils/src/macros.rs b/crates/polars-utils/src/macros.rs similarity index 100% rename from polars/polars-utils/src/macros.rs rename to crates/polars-utils/src/macros.rs diff --git a/polars/polars-utils/src/mem.rs b/crates/polars-utils/src/mem.rs similarity index 100% rename from polars/polars-utils/src/mem.rs rename to crates/polars-utils/src/mem.rs diff --git a/polars/polars-utils/src/slice.rs b/crates/polars-utils/src/slice.rs similarity index 100% rename from polars/polars-utils/src/slice.rs rename to crates/polars-utils/src/slice.rs diff --git a/polars/polars-utils/src/sort.rs b/crates/polars-utils/src/sort.rs similarity index 100% rename from polars/polars-utils/src/sort.rs rename to 
crates/polars-utils/src/sort.rs diff --git a/polars/polars-utils/src/sync.rs b/crates/polars-utils/src/sync.rs similarity index 100% rename from polars/polars-utils/src/sync.rs rename to crates/polars-utils/src/sync.rs diff --git a/polars/polars-utils/src/sys.rs b/crates/polars-utils/src/sys.rs similarity index 100% rename from polars/polars-utils/src/sys.rs rename to crates/polars-utils/src/sys.rs diff --git a/polars/polars-utils/src/unwrap.rs b/crates/polars-utils/src/unwrap.rs similarity index 100% rename from polars/polars-utils/src/unwrap.rs rename to crates/polars-utils/src/unwrap.rs diff --git a/polars/polars-utils/src/vec.rs b/crates/polars-utils/src/vec.rs similarity index 86% rename from polars/polars-utils/src/vec.rs rename to crates/polars-utils/src/vec.rs index fd9e0d712fd73..df7f332dc0c14 100644 --- a/polars/polars-utils/src/vec.rs +++ b/crates/polars-utils/src/vec.rs @@ -63,3 +63,14 @@ impl PushUnchecked for Vec { self.set_len(self.len() + 1); } } + +pub trait CapacityByFactor { + fn with_capacity_by_factor(original_len: usize, factor: f64) -> Self; +} + +impl CapacityByFactor for Vec { + fn with_capacity_by_factor(original_len: usize, factor: f64) -> Self { + let cap = (original_len as f64 * factor) as usize; + Vec::with_capacity(cap) + } +} diff --git a/polars/polars-utils/src/wasm.rs b/crates/polars-utils/src/wasm.rs similarity index 100% rename from polars/polars-utils/src/wasm.rs rename to crates/polars-utils/src/wasm.rs diff --git a/polars/Cargo.toml b/crates/polars/Cargo.toml similarity index 93% rename from polars/Cargo.toml rename to crates/polars/Cargo.toml index 48a3ef38dc15c..4a5c0576aed19 100644 --- a/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -312,13 +312,13 @@ bench = [ ] [dependencies] -polars-algo = { version = "0.31.1", path = "./polars-algo", optional = true } -polars-core = { version = "0.31.1", path = "./polars-core", features = ["docs"], default-features = false } -polars-io = { version = "0.31.1", path = "./polars-io", 
features = [], default-features = false, optional = true } -polars-lazy = { version = "0.31.1", path = "./polars-lazy", features = [], default-features = false, optional = true } -polars-ops = { version = "0.31.1", path = "./polars-ops" } -polars-sql = { version = "0.31.1", path = "./polars-sql", default-features = false, optional = true } -polars-time = { version = "0.31.1", path = "./polars-time", default-features = false, optional = true } +polars-algo = { version = "0.31.1", path = "../polars-algo", optional = true } +polars-core = { version = "0.31.1", path = "../polars-core", features = ["docs"], default-features = false } +polars-io = { version = "0.31.1", path = "../polars-io", features = [], default-features = false, optional = true } +polars-lazy = { version = "0.31.1", path = "../polars-lazy", features = [], default-features = false, optional = true } +polars-ops = { version = "0.31.1", path = "../polars-ops" } +polars-sql = { version = "0.31.1", path = "../polars-sql", default-features = false, optional = true } +polars-time = { version = "0.31.1", path = "../polars-time", default-features = false, optional = true } # enable js feature for getrandom to work in wasm [target.'cfg(target_family = "wasm")'.dependencies.getrandom] diff --git a/crates/polars/LICENSE b/crates/polars/LICENSE new file mode 120000 index 0000000000000..30cff7403da04 --- /dev/null +++ b/crates/polars/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/polars/build.rs b/crates/polars/build.rs similarity index 100% rename from polars/build.rs rename to crates/polars/build.rs diff --git a/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs similarity index 100% rename from polars/src/docs/eager.rs rename to crates/polars/src/docs/eager.rs diff --git a/polars/src/docs/lazy.rs b/crates/polars/src/docs/lazy.rs similarity index 100% rename from polars/src/docs/lazy.rs rename to crates/polars/src/docs/lazy.rs diff --git a/polars/src/docs/mod.rs 
b/crates/polars/src/docs/mod.rs similarity index 100% rename from polars/src/docs/mod.rs rename to crates/polars/src/docs/mod.rs diff --git a/polars/src/docs/performance.rs b/crates/polars/src/docs/performance.rs similarity index 100% rename from polars/src/docs/performance.rs rename to crates/polars/src/docs/performance.rs diff --git a/polars/src/export.rs b/crates/polars/src/export.rs similarity index 100% rename from polars/src/export.rs rename to crates/polars/src/export.rs diff --git a/polars/src/lib.rs b/crates/polars/src/lib.rs similarity index 100% rename from polars/src/lib.rs rename to crates/polars/src/lib.rs diff --git a/polars/src/prelude.rs b/crates/polars/src/prelude.rs similarity index 100% rename from polars/src/prelude.rs rename to crates/polars/src/prelude.rs diff --git a/polars/src/sql.rs b/crates/polars/src/sql.rs similarity index 100% rename from polars/src/sql.rs rename to crates/polars/src/sql.rs diff --git a/polars/tests/it/core/date_like.rs b/crates/polars/tests/it/core/date_like.rs similarity index 100% rename from polars/tests/it/core/date_like.rs rename to crates/polars/tests/it/core/date_like.rs diff --git a/polars/tests/it/core/groupby.rs b/crates/polars/tests/it/core/groupby.rs similarity index 100% rename from polars/tests/it/core/groupby.rs rename to crates/polars/tests/it/core/groupby.rs diff --git a/polars/tests/it/core/joins.rs b/crates/polars/tests/it/core/joins.rs similarity index 100% rename from polars/tests/it/core/joins.rs rename to crates/polars/tests/it/core/joins.rs diff --git a/polars/tests/it/core/list.rs b/crates/polars/tests/it/core/list.rs similarity index 100% rename from polars/tests/it/core/list.rs rename to crates/polars/tests/it/core/list.rs diff --git a/polars/tests/it/core/mod.rs b/crates/polars/tests/it/core/mod.rs similarity index 100% rename from polars/tests/it/core/mod.rs rename to crates/polars/tests/it/core/mod.rs diff --git a/polars/tests/it/core/ops/mod.rs b/crates/polars/tests/it/core/ops/mod.rs 
similarity index 100% rename from polars/tests/it/core/ops/mod.rs rename to crates/polars/tests/it/core/ops/mod.rs diff --git a/polars/tests/it/core/ops/take.rs b/crates/polars/tests/it/core/ops/take.rs similarity index 100% rename from polars/tests/it/core/ops/take.rs rename to crates/polars/tests/it/core/ops/take.rs diff --git a/polars/tests/it/core/pivot.rs b/crates/polars/tests/it/core/pivot.rs similarity index 100% rename from polars/tests/it/core/pivot.rs rename to crates/polars/tests/it/core/pivot.rs diff --git a/polars/tests/it/core/random.rs b/crates/polars/tests/it/core/random.rs similarity index 100% rename from polars/tests/it/core/random.rs rename to crates/polars/tests/it/core/random.rs diff --git a/polars/tests/it/core/rolling_window.rs b/crates/polars/tests/it/core/rolling_window.rs similarity index 100% rename from polars/tests/it/core/rolling_window.rs rename to crates/polars/tests/it/core/rolling_window.rs diff --git a/polars/tests/it/core/series.rs b/crates/polars/tests/it/core/series.rs similarity index 100% rename from polars/tests/it/core/series.rs rename to crates/polars/tests/it/core/series.rs diff --git a/polars/tests/it/core/utils.rs b/crates/polars/tests/it/core/utils.rs similarity index 100% rename from polars/tests/it/core/utils.rs rename to crates/polars/tests/it/core/utils.rs diff --git a/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs similarity index 99% rename from polars/tests/it/io/csv.rs rename to crates/polars/tests/it/io/csv.rs index 051594373ea79..f7a0a0087a2af 100644 --- a/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -4,7 +4,7 @@ use polars::io::RowCount; use super::*; -const FOODS_CSV: &str = "../examples/datasets/foods1.csv"; +const FOODS_CSV: &str = "../../examples/datasets/foods1.csv"; #[test] fn write_csv() { diff --git a/polars/tests/it/io/ipc_stream.rs b/crates/polars/tests/it/io/ipc_stream.rs similarity index 100% rename from polars/tests/it/io/ipc_stream.rs rename to 
crates/polars/tests/it/io/ipc_stream.rs diff --git a/polars/tests/it/io/json.rs b/crates/polars/tests/it/io/json.rs similarity index 100% rename from polars/tests/it/io/json.rs rename to crates/polars/tests/it/io/json.rs diff --git a/polars/tests/it/io/mod.rs b/crates/polars/tests/it/io/mod.rs similarity index 100% rename from polars/tests/it/io/mod.rs rename to crates/polars/tests/it/io/mod.rs diff --git a/polars/tests/it/io/parquet.rs b/crates/polars/tests/it/io/parquet.rs similarity index 100% rename from polars/tests/it/io/parquet.rs rename to crates/polars/tests/it/io/parquet.rs diff --git a/polars/tests/it/joins.rs b/crates/polars/tests/it/joins.rs similarity index 100% rename from polars/tests/it/joins.rs rename to crates/polars/tests/it/joins.rs diff --git a/polars/tests/it/lazy/aggregation.rs b/crates/polars/tests/it/lazy/aggregation.rs similarity index 100% rename from polars/tests/it/lazy/aggregation.rs rename to crates/polars/tests/it/lazy/aggregation.rs diff --git a/polars/tests/it/lazy/cse.rs b/crates/polars/tests/it/lazy/cse.rs similarity index 94% rename from polars/tests/it/lazy/cse.rs rename to crates/polars/tests/it/lazy/cse.rs index f5db791b7d24e..a474a272d6335 100644 --- a/polars/tests/it/lazy/cse.rs +++ b/crates/polars/tests/it/lazy/cse.rs @@ -28,7 +28,7 @@ fn test_cse_union_schema_6504() -> PolarsResult<()> { }, ) .unwrap() - .with_common_subplan_elimination(true) + .with_comm_subplan_elim(true) .collect()?; let expected = df![ "a" => [1, 0], diff --git a/polars/tests/it/lazy/explodes.rs b/crates/polars/tests/it/lazy/explodes.rs similarity index 100% rename from polars/tests/it/lazy/explodes.rs rename to crates/polars/tests/it/lazy/explodes.rs diff --git a/polars/tests/it/lazy/expressions/apply.rs b/crates/polars/tests/it/lazy/expressions/apply.rs similarity index 100% rename from polars/tests/it/lazy/expressions/apply.rs rename to crates/polars/tests/it/lazy/expressions/apply.rs diff --git a/polars/tests/it/lazy/expressions/arity.rs 
b/crates/polars/tests/it/lazy/expressions/arity.rs similarity index 100% rename from polars/tests/it/lazy/expressions/arity.rs rename to crates/polars/tests/it/lazy/expressions/arity.rs diff --git a/polars/tests/it/lazy/expressions/expand.rs b/crates/polars/tests/it/lazy/expressions/expand.rs similarity index 100% rename from polars/tests/it/lazy/expressions/expand.rs rename to crates/polars/tests/it/lazy/expressions/expand.rs diff --git a/polars/tests/it/lazy/expressions/filter.rs b/crates/polars/tests/it/lazy/expressions/filter.rs similarity index 100% rename from polars/tests/it/lazy/expressions/filter.rs rename to crates/polars/tests/it/lazy/expressions/filter.rs diff --git a/polars/tests/it/lazy/expressions/is_in.rs b/crates/polars/tests/it/lazy/expressions/is_in.rs similarity index 100% rename from polars/tests/it/lazy/expressions/is_in.rs rename to crates/polars/tests/it/lazy/expressions/is_in.rs diff --git a/polars/tests/it/lazy/expressions/mod.rs b/crates/polars/tests/it/lazy/expressions/mod.rs similarity index 100% rename from polars/tests/it/lazy/expressions/mod.rs rename to crates/polars/tests/it/lazy/expressions/mod.rs diff --git a/polars/tests/it/lazy/expressions/slice.rs b/crates/polars/tests/it/lazy/expressions/slice.rs similarity index 100% rename from polars/tests/it/lazy/expressions/slice.rs rename to crates/polars/tests/it/lazy/expressions/slice.rs diff --git a/polars/tests/it/lazy/expressions/window.rs b/crates/polars/tests/it/lazy/expressions/window.rs similarity index 100% rename from polars/tests/it/lazy/expressions/window.rs rename to crates/polars/tests/it/lazy/expressions/window.rs diff --git a/polars/tests/it/lazy/folds.rs b/crates/polars/tests/it/lazy/folds.rs similarity index 100% rename from polars/tests/it/lazy/folds.rs rename to crates/polars/tests/it/lazy/folds.rs diff --git a/polars/tests/it/lazy/functions.rs b/crates/polars/tests/it/lazy/functions.rs similarity index 100% rename from polars/tests/it/lazy/functions.rs rename to 
crates/polars/tests/it/lazy/functions.rs diff --git a/polars/tests/it/lazy/groupby.rs b/crates/polars/tests/it/lazy/groupby.rs similarity index 100% rename from polars/tests/it/lazy/groupby.rs rename to crates/polars/tests/it/lazy/groupby.rs diff --git a/polars/tests/it/lazy/groupby_dynamic.rs b/crates/polars/tests/it/lazy/groupby_dynamic.rs similarity index 100% rename from polars/tests/it/lazy/groupby_dynamic.rs rename to crates/polars/tests/it/lazy/groupby_dynamic.rs diff --git a/polars/tests/it/lazy/mod.rs b/crates/polars/tests/it/lazy/mod.rs similarity index 98% rename from polars/tests/it/lazy/mod.rs rename to crates/polars/tests/it/lazy/mod.rs index 2419976ce95b1..7dc6c62bb775c 100644 --- a/polars/tests/it/lazy/mod.rs +++ b/crates/polars/tests/it/lazy/mod.rs @@ -10,6 +10,7 @@ mod groupby_dynamic; mod predicate_queries; mod projection_queries; mod queries; +mod schema; use polars::prelude::*; diff --git a/polars/tests/it/lazy/predicate_queries.rs b/crates/polars/tests/it/lazy/predicate_queries.rs similarity index 93% rename from polars/tests/it/lazy/predicate_queries.rs rename to crates/polars/tests/it/lazy/predicate_queries.rs index 785d3da904f8c..d749dd37d8ded 100644 --- a/polars/tests/it/lazy/predicate_queries.rs +++ b/crates/polars/tests/it/lazy/predicate_queries.rs @@ -235,3 +235,21 @@ fn test_predicate_on_join_select_4884() -> PolarsResult<()> { assert_eq!(out, expected); Ok(()) } + +#[test] +fn test_predicate_pushdown_block_8847() -> PolarsResult<()> { + let ldf = df![ + "A" => [1, 2, 3] + ]? 
+ .lazy(); + + let q = ldf + .with_column(lit(1).strict_cast(DataType::Int32).alias("B")) + .drop_nulls(None) + .filter(col("B").eq(lit(1))); + + let out = q.collect()?; + assert_eq!(out.get_column_names(), &["A", "B"]); + + Ok(()) +} diff --git a/polars/tests/it/lazy/projection_queries.rs b/crates/polars/tests/it/lazy/projection_queries.rs similarity index 100% rename from polars/tests/it/lazy/projection_queries.rs rename to crates/polars/tests/it/lazy/projection_queries.rs diff --git a/polars/tests/it/lazy/queries.rs b/crates/polars/tests/it/lazy/queries.rs similarity index 100% rename from polars/tests/it/lazy/queries.rs rename to crates/polars/tests/it/lazy/queries.rs diff --git a/crates/polars/tests/it/lazy/schema.rs b/crates/polars/tests/it/lazy/schema.rs new file mode 100644 index 0000000000000..3684c8b0376f6 --- /dev/null +++ b/crates/polars/tests/it/lazy/schema.rs @@ -0,0 +1,30 @@ +use super::*; + +#[test] +fn test_schema_update_after_projection_pd() -> PolarsResult<()> { + let df = df![ + "a" => [1], + "b" => [1], + "c" => [1], + ]?; + + let q = df + .lazy() + .with_column(col("a").implode()) + .explode([col("a")]) + .select([cols(["a", "b"])]); + + // run optimizations + let (node, lp_arena, _expr_arena) = q.to_alp_optimized()?; + // get the explode node + let input = lp_arena.get(node).get_inputs()[0]; + // assert the schema has been corrected with the projection pushdown run + lp_arena.get(input); + let schema = lp_arena.get(input).schema(&lp_arena).into_owned(); + let mut expected = Schema::new(); + expected.with_column("a".into(), DataType::Int32); + expected.with_column("b".into(), DataType::Int32); + assert_eq!(schema.as_ref(), &expected); + + Ok(()) +} diff --git a/polars/tests/it/main.rs b/crates/polars/tests/it/main.rs similarity index 56% rename from polars/tests/it/main.rs rename to crates/polars/tests/it/main.rs index a8b34dec8aa75..8cf14da210c37 100644 --- a/polars/tests/it/main.rs +++ b/crates/polars/tests/it/main.rs @@ -6,4 +6,4 @@ mod 
lazy; mod schema; mod time; -pub static FOODS_CSV: &str = "../examples/datasets/foods1.csv"; +pub static FOODS_CSV: &str = "../../examples/datasets/foods1.csv"; diff --git a/polars/tests/it/schema.rs b/crates/polars/tests/it/schema.rs similarity index 100% rename from polars/tests/it/schema.rs rename to crates/polars/tests/it/schema.rs diff --git a/polars/tests/it/time/date_range.rs b/crates/polars/tests/it/time/date_range.rs similarity index 100% rename from polars/tests/it/time/date_range.rs rename to crates/polars/tests/it/time/date_range.rs diff --git a/polars/tests/it/time/mod.rs b/crates/polars/tests/it/time/mod.rs similarity index 100% rename from polars/tests/it/time/mod.rs rename to crates/polars/tests/it/time/mod.rs diff --git a/examples/python_rust_compiled_function/Cargo.toml b/examples/python_rust_compiled_function/Cargo.toml index acc214294d80c..f919458e7a5b6 100644 --- a/examples/python_rust_compiled_function/Cargo.toml +++ b/examples/python_rust_compiled_function/Cargo.toml @@ -9,6 +9,7 @@ name = "my_polars_functions" crate-type = ["cdylib"] [dependencies] -polars = { path = "../../polars" } -polars-arrow = { path = "../../polars/polars-arrow" } +polars = { path = "../../crates/polars" } +polars-arrow = { path = "../../crates/polars-arrow" } + pyo3 = { version = "0.19", features = ["extension-module"] } diff --git a/examples/python_rust_compiled_function/src/lib.rs b/examples/python_rust_compiled_function/src/lib.rs index a51941903a877..7e67a9c29fe1c 100644 --- a/examples/python_rust_compiled_function/src/lib.rs +++ b/examples/python_rust_compiled_function/src/lib.rs @@ -18,7 +18,7 @@ fn hamming_distance(series_a: &PyAny, series_b: &PyAny) -> PyResult { fn hamming_distance_impl(a: &Series, b: &Series) -> PolarsResult { Ok(a.utf8()? .into_iter() - .zip(b.utf8()?.into_iter()) + .zip(b.utf8()?) 
.map(|(lhs, rhs)| hamming_distance_strs(lhs, rhs)) .collect()) } diff --git a/examples/read_csv/Cargo.toml b/examples/read_csv/Cargo.toml index 12cf09f9a0b0b..c75c54be9793b 100644 --- a/examples/read_csv/Cargo.toml +++ b/examples/read_csv/Cargo.toml @@ -10,4 +10,4 @@ write_output = ["polars/ipc", "polars/parquet"] default = ["write_output"] [dependencies] -polars = { path = "../../polars", features = ["lazy", "csv", "ipc"] } +polars = { path = "../../crates/polars", features = ["lazy", "csv", "ipc"] } diff --git a/examples/read_json/Cargo.toml b/examples/read_json/Cargo.toml index 62dbe4e6858f5..26d4da9a04b80 100644 --- a/examples/read_json/Cargo.toml +++ b/examples/read_json/Cargo.toml @@ -6,4 +6,4 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -polars = { path = "../../polars", features = ["json"] } +polars = { path = "../../crates/polars", features = ["json"] } diff --git a/examples/read_parquet/Cargo.toml b/examples/read_parquet/Cargo.toml index a19edb61eab0a..cf8035a46772b 100644 --- a/examples/read_parquet/Cargo.toml +++ b/examples/read_parquet/Cargo.toml @@ -6,4 +6,4 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -polars = { path = "../../polars", features = ["lazy", "parquet"] } +polars = { path = "../../crates/polars", features = ["lazy", "parquet"] } diff --git a/examples/read_parquet_cloud/Cargo.toml b/examples/read_parquet_cloud/Cargo.toml index 10bfca12b59a3..6197af0b792bf 100644 --- a/examples/read_parquet_cloud/Cargo.toml +++ b/examples/read_parquet_cloud/Cargo.toml @@ -7,4 +7,4 @@ edition = "2021" [dependencies] aws-creds = "0.35.0" -polars = { path = "../../polars", features = ["lazy", "aws"] } +polars = { path = "../../crates/polars", features = ["lazy", "aws"] } diff --git a/examples/string_filter/Cargo.toml b/examples/string_filter/Cargo.toml index c82d527c1b6a7..1dc4e0f356e45 
100644 --- a/examples/string_filter/Cargo.toml +++ b/examples/string_filter/Cargo.toml @@ -6,4 +6,4 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -polars = { path = "../../polars", features = ["strings", "lazy"] } +polars = { path = "../../crates/polars", features = ["strings", "lazy"] } diff --git a/polars-cli/Cargo.toml b/polars-cli/Cargo.toml index f6d8bd3c56c96..e82fe2960b948 100644 --- a/polars-cli/Cargo.toml +++ b/polars-cli/Cargo.toml @@ -23,12 +23,13 @@ json = ["polars/json"] ipc = ["polars/ipc"] [dependencies] +polars = { version = "0.31.1", path = "../crates/polars", features = ["lazy", "sql", "dtype-full", "serde-lazy"] } + atty = { version = "0.2" } ciborium = "0.2.0" clap = { version = "4.2.2", features = ["derive", "cargo"] } nu-ansi-term = { version = "0.47.0", optional = true } once_cell.workspace = true -polars = { version = "0.31.1", path = "../polars", features = ["lazy", "sql", "dtype-full", "serde-lazy"] } reedline = { version = "0.21.0" } serde = { version = "1.0.160", features = ["derive"] } sqlparser = "0.34" diff --git a/polars/LICENSE b/polars/LICENSE deleted file mode 120000 index ea5b60640b01f..0000000000000 --- a/polars/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../LICENSE \ No newline at end of file diff --git a/polars/polars-arrow/src/bitmap/mod.rs b/polars/polars-arrow/src/bitmap/mod.rs deleted file mode 100644 index ea3ca07130f86..0000000000000 --- a/polars/polars-arrow/src/bitmap/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod mutable; diff --git a/polars/polars-lazy/polars-pipe/LICENSE b/polars/polars-lazy/polars-pipe/LICENSE deleted file mode 120000 index 5853aaea53bc0..0000000000000 --- a/polars/polars-lazy/polars-pipe/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../../../LICENSE \ No newline at end of file diff --git a/polars/polars-lazy/polars-pipe/README.md b/polars/polars-lazy/polars-pipe/README.md deleted file mode 100644 index 1b503c33f7dfb..0000000000000 
--- a/polars/polars-lazy/polars-pipe/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Polars Pipe - -`polars-pipe` is a sub-crate of polars-lazy that provides OOC (out of core) algorithms to the polars physical plans. - -Not intended for external usage diff --git a/polars/polars-lazy/polars-plan/LICENSE b/polars/polars-lazy/polars-plan/LICENSE deleted file mode 120000 index 5853aaea53bc0..0000000000000 --- a/polars/polars-lazy/polars-plan/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../../../LICENSE \ No newline at end of file diff --git a/polars/polars-lazy/polars-plan/src/constants.rs b/polars/polars-lazy/polars-plan/src/constants.rs deleted file mode 100644 index 7c62d9cd56da8..0000000000000 --- a/polars/polars-lazy/polars-plan/src/constants.rs +++ /dev/null @@ -1 +0,0 @@ -pub static MAP_LIST_NAME: &str = "map_list"; diff --git a/polars/polars-lazy/polars-plan/src/dsl/arity.rs b/polars/polars-lazy/polars-plan/src/dsl/arity.rs deleted file mode 100644 index d752ee1fb78f6..0000000000000 --- a/polars/polars-lazy/polars-plan/src/dsl/arity.rs +++ /dev/null @@ -1,141 +0,0 @@ -use super::*; - -/// Intermediate state of `when(..).then(..).otherwise(..)` expr. -#[derive(Clone)] -pub struct When { - predicate: Expr, -} - -/// Intermediate state of `when(..).then(..).otherwise(..)` expr. -#[derive(Clone)] -pub struct WhenThen { - predicate: Expr, - then: Expr, -} - -/// Intermediate state of chain when then exprs. -/// -/// ```text -/// when(..).then(..) -/// when(..).then(..) -/// when(..).then(..) 
-/// .otherwise(..)` -/// ``` -#[derive(Clone)] -#[must_use] -pub struct WhenThenThen { - predicates: Vec, - thens: Vec, -} - -impl When { - pub fn then>(self, expr: E) -> WhenThen { - WhenThen { - predicate: self.predicate, - then: expr.into(), - } - } -} - -impl WhenThen { - pub fn when>(self, predicate: E) -> WhenThenThen { - WhenThenThen { - predicates: vec![self.predicate, predicate.into()], - thens: vec![self.then], - } - } - - pub fn otherwise>(self, expr: E) -> Expr { - Expr::Ternary { - predicate: Box::new(self.predicate), - truthy: Box::new(self.then), - falsy: Box::new(expr.into()), - } - } -} - -impl WhenThenThen { - pub fn then(mut self, expr: Expr) -> Self { - self.thens.push(expr); - self - } - - pub fn when(mut self, predicate: Expr) -> Self { - self.predicates.push(predicate); - self - } - - pub fn otherwise(self, expr: Expr) -> Expr { - // we iterate the preds/ exprs last in first out - // and nest them. - // - // // this expr: - // when((col('x') == 'a')).then(1) - // .when(col('x') == 'a').then(2) - // .when(col('x') == 'b').then(3) - // .otherwise(4) - // - // needs to become: - // when((col('x') == 'a')).then(1) - - // .otherwise( | - // when(col('x') == 'a').then(2) - | - // .otherwise( | | - // pl.when(col('x') == 'b').then(3) | | - // .otherwise(4) | inner | outer - // ) | | - // ) _| _| - // - // by iterating lifo we first create - // `inner` and then assign that to `otherwise`, - // which will be used in the next layer `outer` - // - - let pred_iter = self.predicates.into_iter().rev(); - let mut then_iter = self.thens.into_iter().rev(); - - let mut otherwise = expr; - - for e in pred_iter { - otherwise = Expr::Ternary { - predicate: Box::new(e), - truthy: Box::new( - then_iter - .next() - .expect("expr expected, did you call when().then().otherwise?"), - ), - falsy: Box::new(otherwise), - } - } - if then_iter.next().is_some() { - panic!( - "this expr is not properly constructed. \ - Every `when` should have an accompanied `then` call." 
- ) - } - otherwise - } -} - -/// Start a when-then-otherwise expression -pub fn when>(predicate: E) -> When { - When { - predicate: predicate.into(), - } -} - -pub fn ternary_expr(predicate: Expr, truthy: Expr, falsy: Expr) -> Expr { - Expr::Ternary { - predicate: Box::new(predicate), - truthy: Box::new(truthy), - falsy: Box::new(falsy), - } -} - -/// Compute `op(l, r)` (or equivalently `l op r`). `l` and `r` must have types compatible with the Operator. -pub fn binary_expr(l: Expr, op: Operator, r: Expr) -> Expr { - Expr::BinaryExpr { - left: Box::new(l), - op, - right: Box::new(r), - } -} diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/visitor/expr.rs b/polars/polars-lazy/polars-plan/src/logical_plan/visitor/expr.rs deleted file mode 100644 index 3bf8ed8d738ef..0000000000000 --- a/polars/polars-lazy/polars-plan/src/logical_plan/visitor/expr.rs +++ /dev/null @@ -1,131 +0,0 @@ -use super::*; -use crate::prelude::*; -use crate::push_expr; - -impl TreeWalker for Expr { - fn apply_children<'a>( - &'a self, - op: &mut dyn FnMut(&Self) -> PolarsResult, - ) -> PolarsResult { - let mut scratch = vec![]; - - let mut push = |e: &'a Expr| scratch.push(e); - push_expr!(self, push, iter); - - for child in scratch { - match op(child)? { - VisitRecursion::Continue => {} - // early stop - VisitRecursion::Skip => return Ok(VisitRecursion::Continue), - VisitRecursion::Stop => return Ok(VisitRecursion::Stop), - } - } - Ok(VisitRecursion::Continue) - } - - fn map_children(self, _op: &mut dyn FnMut(Self) -> PolarsResult) -> PolarsResult { - todo!() - } -} - -pub struct AexprNode { - node: Node, - arena: *mut Arena, -} - -impl AexprNode { - /// Don't use this directly, use [`Self::with_context`] - /// - /// # Safety - /// This will keep a pointer to `arena`. The caller must ensure it stays alive. - unsafe fn new(node: Node, arena: &mut Arena) -> Self { - Self { node, arena } - } - - /// Safe interface. Take the `&mut Arena` only for the duration of `op`. 
- pub fn with_context(node: Node, arena: &mut Arena, mut op: F) -> T - where - F: FnMut(AexprNode) -> T, - { - // safety: we drop this context before arena is out of scope - unsafe { op(Self::new(node, arena)) } - } - - pub fn node(&self) -> Node { - self.node - } - - pub fn with_arena<'a, F, T>(&self, op: F) -> T - where - F: Fn(&'a Arena) -> T, - { - let arena = unsafe { &(*self.arena) }; - - op(arena) - } - - pub fn with_arena_mut<'a, F, T>(&mut self, op: F) -> T - where - F: FnOnce(&'a mut Arena) -> T, - { - let arena = unsafe { &mut (*self.arena) }; - - op(arena) - } - - pub fn to_aexpr(&self) -> &AExpr { - self.with_arena(|arena| arena.get(self.node)) - } - - pub fn to_expr(&self) -> Expr { - self.with_arena(|arena| node_to_expr(self.node, arena)) - } -} - -impl TreeWalker for AexprNode { - fn apply_children<'a>( - &'a self, - op: &mut dyn FnMut(&Self) -> PolarsResult, - ) -> PolarsResult { - let mut scratch = vec![]; - - self.to_aexpr().nodes(&mut scratch); - for node in scratch { - let aenode = AexprNode { - node, - arena: self.arena, - }; - match op(&aenode)? 
{ - VisitRecursion::Continue => {} - // early stop - VisitRecursion::Skip => return Ok(VisitRecursion::Continue), - VisitRecursion::Stop => return Ok(VisitRecursion::Stop), - } - } - Ok(VisitRecursion::Continue) - } - - fn map_children( - mut self, - op: &mut dyn FnMut(Self) -> PolarsResult, - ) -> PolarsResult { - let mut scratch = vec![]; - - let ae = self.to_aexpr(); - ae.nodes(&mut scratch); - - // rewrite the nodes - for node in &mut scratch { - let aenode = AexprNode { - node: *node, - arena: self.arena, - }; - *node = op(aenode)?.node; - } - - let ae = ae.clone().replace_inputs(&scratch); - let node = self.with_arena_mut(move |arena| arena.add(ae)); - self.node = node; - Ok(self) - } -} diff --git a/polars/polars-lazy/src/physical_plan/expressions/cache.rs b/polars/polars-lazy/src/physical_plan/expressions/cache.rs deleted file mode 100644 index bb65885bf8183..0000000000000 --- a/polars/polars-lazy/src/physical_plan/expressions/cache.rs +++ /dev/null @@ -1,89 +0,0 @@ -use super::*; - -pub struct CacheExpr { - pub(crate) physical_expr: Arc, - expr: Expr, - id: usize, -} - -impl CacheExpr { - pub fn new(physical_expr: Arc, expr: Expr, id: usize) -> Self { - Self { - physical_expr, - expr, - id, - } - } -} - -impl PhysicalExpr for CacheExpr { - fn as_expression(&self) -> Option<&Expr> { - Some(&self.expr) - } - - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { - if let Some(cached) = state.get_expr_cache(self.id) { - let mut hit = true; - let out = cached - .get_or_try_init(|| { - hit = false; - self.physical_expr.evaluate(df, state) - }) - .cloned(); - if state.verbose() { - if hit { - eprintln!("cache hit: {:?}", self.expr) - } else { - eprintln!("cache miss: {:?}", self.expr) - } - } - out - } else { - self.physical_expr.evaluate(df, state) - } - } - - #[allow(clippy::ptr_arg)] - fn evaluate_on_groups<'a>( - &self, - df: &DataFrame, - groups: &'a GroupsProxy, - state: &ExecutionState, - ) -> PolarsResult> { - if let Some(cached) 
= state.get_expr_cache(self.id) { - let mut hit = true; - let aggregated = cached - .get_or_try_init(|| { - let mut agg = self.physical_expr.evaluate_on_groups(df, groups, state)?; - hit = false; - PolarsResult::Ok(agg.aggregated()) - })? - .clone(); - if state.verbose() { - if hit { - eprintln!("cache hit: {:?}", self.expr) - } else { - eprintln!("cache miss: {:?}", self.expr) - } - } - Ok(AggregationContext::new( - aggregated, - Cow::Borrowed(groups), - true, - )) - } else { - self.physical_expr.evaluate_on_groups(df, groups, state) - } - } - - fn to_field(&self, input_schema: &Schema) -> PolarsResult { - self.physical_expr.to_field(input_schema) - } - - fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { - None - } - fn is_valid_aggregation(&self) -> bool { - self.physical_expr.is_valid_aggregation() - } -} diff --git a/polars/polars-lazy/src/physical_plan/expressions/take.rs b/polars/polars-lazy/src/physical_plan/expressions/take.rs deleted file mode 100644 index be67d9b77c08f..0000000000000 --- a/polars/polars-lazy/src/physical_plan/expressions/take.rs +++ /dev/null @@ -1,211 +0,0 @@ -use std::sync::Arc; - -use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::GroupsProxy; -use polars_core::prelude::*; -use polars_core::utils::NoNull; - -use crate::physical_plan::state::ExecutionState; -use crate::prelude::*; - -pub struct TakeExpr { - pub(crate) phys_expr: Arc, - pub(crate) idx: Arc, - pub(crate) expr: Expr, -} - -impl TakeExpr { - fn finish( - &self, - df: &DataFrame, - state: &ExecutionState, - series: Series, - ) -> PolarsResult { - let idx = self.idx.evaluate(df, state)?; - - let nulls_before_cast = idx.null_count(); - - let idx = idx.cast(&IDX_DTYPE)?; - if idx.null_count() != nulls_before_cast { - self.oob_err()?; - } - let idx_ca = idx.idx()?; - - series.take(idx_ca) - } - - fn oob_err(&self) -> PolarsResult<()> { - polars_bail!(expr = self.expr, ComputeError: "index out of bounds"); - } -} - -impl 
PhysicalExpr for TakeExpr { - fn as_expression(&self) -> Option<&Expr> { - Some(&self.expr) - } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { - let series = self.phys_expr.evaluate(df, state)?; - self.finish(df, state, series) - } - - #[allow(clippy::ptr_arg)] - fn evaluate_on_groups<'a>( - &self, - df: &DataFrame, - groups: &'a GroupsProxy, - state: &ExecutionState, - ) -> PolarsResult> { - let mut ac = self.phys_expr.evaluate_on_groups(df, groups, state)?; - let mut idx = self.idx.evaluate_on_groups(df, groups, state)?; - - let idx = - match idx.state { - AggState::AggregatedFlat(s) => { - let idx = s.cast(&IDX_DTYPE)?; - let idx = idx.idx().unwrap(); - - // The indexes are AggregatedFlat, meaning they are a single values pointing into - // a group. - // If we zip this with the first of each group -> `idx + firs` then we can - // simply use a take operation on the whole array instead of per group. - - // The groups maybe scattered all over the place, so we sort by group - ac.sort_by_groups(); - - // A previous aggregation may have updated the groups - let groups = ac.groups(); - - // Determine the take indices - let idx: IdxCa = - match groups.as_ref() { - GroupsProxy::Idx(groups) => { - if groups.all().iter().zip(idx.into_iter()).any( - |(g, idx)| match idx { - None => true, - Some(idx) => idx >= g.len() as IdxSize, - }, - ) { - self.oob_err()?; - } - - idx.into_iter() - .zip(groups.first().iter()) - .map(|(idx, first)| idx.map(|idx| idx + first)) - .collect_trusted() - } - GroupsProxy::Slice { groups, .. 
} => { - if groups - .iter() - .zip(idx.into_iter()) - .any(|(g, idx)| match idx { - None => true, - Some(idx) => idx >= g[1], - }) - { - self.oob_err()?; - } - - idx.into_iter() - .zip(groups.iter()) - .map(|(idx, g)| idx.map(|idx| idx + g[0])) - .collect_trusted() - } - }; - let taken = ac.flat_naive().take(&idx)?; - ac.with_series(taken, true, Some(&self.expr))?; - return Ok(ac); - } - AggState::AggregatedList(s) => s.list().unwrap().clone(), - // Maybe a literal as well, this needs a different path - AggState::NotAggregated(_) => { - let s = idx.aggregated(); - s.list().unwrap().clone() - } - AggState::Literal(s) => { - let idx = s.cast(&IDX_DTYPE)?; - let idx = idx.idx().unwrap(); - - return if idx.len() == 1 { - match idx.get(0) { - None => polars_bail!(ComputeError: "cannot take by a null"), - Some(idx) => { - if idx != 0 { - // We must make sure that the column we take from is sorted by - // groups otherwise we might point into the wrong group - ac.sort_by_groups() - } - // Make sure that we look at the updated groups. - let groups = ac.groups(); - - // we offset the groups first by idx; - let idx: NoNull = match groups.as_ref() { - GroupsProxy::Idx(groups) => { - if groups.all().iter().any(|g| idx >= g.len() as IdxSize) { - self.oob_err()?; - } - - groups.first().iter().map(|f| *f + idx).collect_trusted() - } - GroupsProxy::Slice { groups, .. 
} => { - if groups.iter().any(|g| idx >= g[1]) { - self.oob_err()?; - } - - groups.iter().map(|g| g[0] + idx).collect_trusted() - } - }; - let taken = ac.flat_naive().take(&idx.into_inner())?; - ac.with_series(taken, true, Some(&self.expr))?; - ac.with_update_groups(UpdateGroups::WithGroupsLen); - Ok(ac) - } - } - } else { - let out = ac - .aggregated() - .list() - .unwrap() - .try_apply_amortized(|s| s.as_ref().take(idx))?; - - ac.with_series(out.into_series(), true, Some(&self.expr))?; - ac.with_update_groups(UpdateGroups::WithGroupsLen); - Ok(ac) - }; - } - }; - - let s = idx.cast(&DataType::List(Box::new(IDX_DTYPE)))?; - let idx = s.list().unwrap(); - - let mut taken = ac - .aggregated() - .list() - .unwrap() - .amortized_iter() - .zip(idx.amortized_iter()) - .map(|(s, idx)| { - s.and_then(|s| { - idx.map(|idx| { - let idx = idx.as_ref().idx().unwrap(); - s.as_ref().take(idx) - }) - }) - .transpose() - }) - .collect::>()?; - - taken.rename(ac.series().name()); - - ac.with_series(taken.into_series(), true, Some(&self.expr))?; - ac.with_update_groups(UpdateGroups::WithGroupsLen); - Ok(ac) - } - - fn to_field(&self, input_schema: &Schema) -> PolarsResult { - self.phys_expr.to_field(input_schema) - } - - fn is_valid_aggregation(&self) -> bool { - true - } -} diff --git a/polars/polars-time/src/truncate.rs b/polars/polars-time/src/truncate.rs deleted file mode 100644 index 6a9cb4943a4e8..0000000000000 --- a/polars/polars-time/src/truncate.rs +++ /dev/null @@ -1,42 +0,0 @@ -#[cfg(feature = "dtype-date")] -use polars_arrow::export::arrow::temporal_conversions::{MILLISECONDS, SECONDS_IN_DAY}; -use polars_arrow::time_zone::Tz; -use polars_core::prelude::*; - -use crate::prelude::*; - -pub trait PolarsTruncate { - fn truncate(&self, every: Duration, offset: Duration, tz: Option<&Tz>) -> PolarsResult - where - Self: Sized; -} - -#[cfg(feature = "dtype-datetime")] -impl PolarsTruncate for DatetimeChunked { - fn truncate(&self, every: Duration, offset: Duration, tz: 
Option<&Tz>) -> PolarsResult { - let w = Window::new(every, every, offset); - - let func = match self.time_unit() { - TimeUnit::Nanoseconds => Window::truncate_ns, - TimeUnit::Microseconds => Window::truncate_us, - TimeUnit::Milliseconds => Window::truncate_ms, - }; - - Ok(self - .try_apply(|t| func(&w, t, tz))? - .into_datetime(self.time_unit(), self.time_zone().clone())) - } -} - -#[cfg(feature = "dtype-date")] -impl PolarsTruncate for DateChunked { - fn truncate(&self, every: Duration, offset: Duration, _tz: Option<&Tz>) -> PolarsResult { - let w = Window::new(every, every, offset); - Ok(self - .try_apply(|t| { - const MSECS_IN_DAY: i64 = MILLISECONDS * SECONDS_IN_DAY; - Ok((w.truncate_ms(MSECS_IN_DAY * t as i64, None)? / MSECS_IN_DAY) as i32) - })? - .into_date()) - } -} diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 4be7e133bb57a..a6332fb65f291 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -28,9 +37,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" dependencies = [ "memchr", ] @@ -52,9 +61,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56fc6cf8dc8c4158eed8649f9b8b0ea1518eb62b544fe9490d66fa0b349eafe9" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" [[package]] name = 
"android-tzdata" @@ -99,8 +108,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.17.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e44f27e89e3edd8738a07c5e2c881efaa25e69be97a816d2df051685d460670c" +source = "git+https://github.com/jorgecarleitao/arrow2?rev=d5c78e7ba45fcebfbafd55a82ba2601ee3ea9617#d5c78e7ba45fcebfbafd55a82ba2601ee3ea9617" dependencies = [ "ahash", "arrow-format", @@ -150,18 +158,18 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.27", ] [[package]] name = "async-trait" -version = "0.1.68" +version = "0.1.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" +checksum = "cc6dde6e4ed435a4c1ee4e73592f5ba9da2151af10076cc04858746af9352d09" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.27", ] [[package]] @@ -193,6 +201,21 @@ dependencies = [ "snap", ] +[[package]] +name = "backtrace" +version = "0.3.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + [[package]] name = "base64" version = "0.21.2" @@ -228,9 +251,9 @@ dependencies = [ [[package]] name = "built" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96f9cdd34d6eb553f9ea20e5bf84abb7b13c729f113fc1d8e49dc00ad9fa8738" +checksum = "b99c4cdc7b2c2364182331055623bdf45254fcb679fea565c40c3c11c101889a" dependencies = [ "cargo-lock", "chrono", @@ -260,7 +283,7 @@ checksum = "fdde5c9cd29ebd706ce1b35600920a33550e402fc998a2e53ad3b42c3c47a192" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.27", ] [[package]] @@ -271,9 +294,9 @@ checksum = 
"89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" [[package]] name = "cargo-lock" -version = "8.0.3" +version = "9.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "031718ddb8f78aa5def78a09e90defe30151d1f6c672f937af4dd916429ed996" +checksum = "e11c675378efb449ed3ce8de78d75d0d80542fc98487c26aba28eb3b82feac72" dependencies = [ "semver", "serde", @@ -314,9 +337,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9cc2b23599e6d7479755f3594285efb3f74a1bdca7a7374948bc831e23a552" +checksum = "f1369bc6b9e9a7dfdae2055f6ec151fe9c554a9d23d357c0237cee2e25eaabb7" dependencies = [ "chrono", "chrono-tz-build", @@ -325,9 +348,9 @@ dependencies = [ [[package]] name = "chrono-tz-build" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9998fb9f7e9b2111641485bf8beb32f92945f97f92a3d061f744cfef335f751" +checksum = "e2f5ebdc942f57ed96d560a6d1a459bae5851102a25d5bf89dc04ae453e31ecf" dependencies = [ "parse-zoneinfo", "phf", @@ -426,14 +449,14 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.14" +version = "0.9.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.8.0", + "memoffset", "scopeguard", ] @@ -449,9 +472,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" dependencies = [ "cfg-if", ] @@ -474,42 +497,42 @@ dependencies = 
[ [[package]] name = "crossterm_winapi" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" dependencies = [ "winapi", ] [[package]] name = "dyn-clone" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b0cf012f1230e43cd00ebb729c6bb58707ecfa8ad08b52ef3a4ccd2697fc30" +checksum = "304e6508efa593091e97a9abbc10f90aa7ca635b6d2784feff3c89d41dd12272" [[package]] name = "either" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" [[package]] name = "enum_dispatch" -version = "0.3.11" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11f36e95862220b211a6e2aa5eca09b4fa391b13cd52ceb8035a24bf65a79de2" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.27", ] [[package]] name = "equivalent" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88bffebc5d80432c9b140ee17875ff173a8ab62faad5b257da912bd2f6c1c0a1" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "ethnum" @@ -556,9 +579,9 @@ checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" [[package]] name = "form_urlencoded" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +checksum = 
"a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" dependencies = [ "percent-encoding", ] @@ -619,7 +642,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.27", ] [[package]] @@ -654,9 +677,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.9" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if", "js-sys", @@ -666,21 +689,16 @@ dependencies = [ ] [[package]] -name = "ghost" -version = "0.1.9" +name = "gimli" +version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e77ac7b51b8e6313251737fcef4b1c01a2ea102bde68415b62c0ee9268fec357" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.18", -] +checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" [[package]] name = "git2" -version = "0.16.1" +version = "0.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf7f68c2995f392c49fffb4f95ae2c873297830eb25c6bc4c114ce8f4562acc" +checksum = "7b989d6a7ca95a362cf2cfc5ad688b3a467be1f87e480b8dad07fee8c79b0044" dependencies = [ "bitflags", "libc", @@ -703,9 +721,9 @@ checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" [[package]] name = "halfbrown" -version = "0.2.2" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f985624e90f861184145c13b736873a0f83cdb998a292dbb0653598ab03aecbf" +checksum = "5681137554ddff44396e5f149892c769d45301dd9aa19c51602a89ee214cb0ec" dependencies = [ "hashbrown 0.13.2", "serde", @@ -717,12 +735,6 @@ version = "2.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - [[package]] name = "hashbrown" version = "0.13.2" @@ -751,12 +763,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.2.6" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] +checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" [[package]] name = "hex" @@ -770,14 +779,14 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys", ] [[package]] name = "iana-time-zone" -version = "0.1.56" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -798,24 +807,14 @@ dependencies = [ [[package]] name = "idna" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" dependencies = [ "unicode-bidi", "unicode-normalization", ] -[[package]] -name = "indexmap" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" -dependencies = [ - "autocfg", - 
"hashbrown 0.12.3", -] - [[package]] name = "indexmap" version = "2.0.0" @@ -844,18 +843,15 @@ dependencies = [ [[package]] name = "inventory" -version = "0.3.6" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0539b5de9241582ce6bd6b0ba7399313560151e58c9aaf8b74b711b1bdce644" -dependencies = [ - "ghost", -] +checksum = "a53088c87cf71c9d4f3372a2cb9eea1e7b8a0b1bf8b7f7d23fe5b76dbb07e63b" [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" [[package]] name = "itoap" @@ -894,9 +890,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.63" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -986,9 +982,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.144" +version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" [[package]] name = "libflate" @@ -1012,9 +1008,9 @@ dependencies = [ [[package]] name = "libgit2-sys" -version = "0.14.2+1.5.1" +version = "0.15.2+1.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f3d95f6b51075fe9810a7ae22c7095f12b98005ab364d8544797a825ce946a4" +checksum = "a80df2e11fb4a61f4ba2ab42dbe7f74468da143f1a75c74e11dee7c813f694fa" dependencies = [ "cc", "libc", @@ -1040,9 +1036,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.9" +version = "1.1.12" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "56ee889ecc9568871456d42f603d6a0ce59ff328d291063a45cbdf0036baf6db" +checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" dependencies = [ "cc", "libc", @@ -1052,9 +1048,9 @@ dependencies = [ [[package]] name = "lock_api" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" dependencies = [ "autocfg", "scopeguard", @@ -1062,9 +1058,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.18" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "518ef76f2f87365916b142844c16d8fefd85039bc5699050210a7778ee1cd1de" +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" [[package]] name = "lz4" @@ -1111,15 +1107,6 @@ dependencies = [ "libc", ] -[[package]] -name = "memoffset" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" -dependencies = [ - "autocfg", -] - [[package]] name = "memoffset" version = "0.9.0" @@ -1156,7 +1143,7 @@ dependencies = [ "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -1233,9 +1220,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" dependencies = [ "autocfg", "libm", @@ -1243,9 +1230,9 @@ dependencies = [ [[package]] name = "num_cpus" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ "hermit-abi", "libc", @@ -1266,11 +1253,20 @@ dependencies = [ "rustc-hash", ] +[[package]] +name = "object" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" -version = "1.17.2" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "parking_lot" @@ -1290,7 +1286,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.7", + "parking_lot_core 0.9.8", ] [[package]] @@ -1302,22 +1298,22 @@ dependencies = [ "cfg-if", "instant", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "smallvec", "winapi", ] [[package]] name = "parking_lot_core" -version = "0.9.7" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.3.5", "smallvec", - "windows-sys 0.45.0", + "windows-targets", ] [[package]] @@ -1359,24 +1355,24 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +checksum = 
"9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "phf" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" dependencies = [ "phf_shared", ] [[package]] name = "phf_codegen" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56ac890c5e3ca598bbdeaa99964edb5b0258a583a9eb6ef4e89fc85d9224770" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" dependencies = [ "phf_generator", "phf_shared", @@ -1384,9 +1380,9 @@ dependencies = [ [[package]] name = "phf_generator" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" dependencies = [ "phf_shared", "rand", @@ -1394,18 +1390,18 @@ dependencies = [ [[package]] name = "phf_shared" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" dependencies = [ "siphasher", ] [[package]] name = "pin-project-lite" -version = "0.2.9" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +checksum = "4c40d25201921e5ff0c862a505c6557ea88568a4e3ace775ab55e93f2f4f9d57" [[package]] name = "pin-utils" @@ -1480,7 +1476,7 @@ dependencies = [ "comfy-table", "either", "hashbrown 0.14.0", - "indexmap 2.0.0", + "indexmap", "itoap", "ndarray", "num-traits", @@ -1553,7 +1549,7 @@ dependencies = [ 
"arrow2", "fallible-streaming-iterator", "hashbrown 0.14.0", - "indexmap 2.0.0", + "indexmap", "num-traits", "polars-arrow", "polars-error", @@ -1590,9 +1586,11 @@ dependencies = [ "argminmax", "arrow2", "base64", + "chrono", + "chrono-tz", "either", "hex", - "indexmap 2.0.0", + "indexmap", "jsonpath_lib", "memchr", "polars-arrow", @@ -1645,7 +1643,7 @@ dependencies = [ "regex", "serde", "smartstring", - "strum_macros 0.25.0", + "strum_macros 0.25.1", ] [[package]] @@ -1697,6 +1695,7 @@ dependencies = [ "hashbrown 0.14.0", "num-traits", "once_cell", + "polars-error", "rayon", "smartstring", "sysinfo", @@ -1710,20 +1709,21 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.59" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" dependencies = [ "unicode-ident", ] [[package]] name = "py-polars" -version = "0.18.7" +version = "0.18.9" dependencies = [ "ahash", "built", "ciborium", + "either", "jemallocator", "lexical-core", "libc", @@ -1745,15 +1745,15 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.19.0" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cffef52f74ec3b1a1baf295d9b8fcc3070327aefc39a6d00656b13c1d0b8885c" +checksum = "ffb88ae05f306b4bfcde40ac4a51dc0b05936a9207a4b75b798c7729c4258a59" dependencies = [ "cfg-if", "indoc", "inventory", "libc", - "memoffset 0.9.0", + "memoffset", "parking_lot 0.12.1", "pyo3-build-config", "pyo3-ffi", @@ -1763,9 +1763,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.19.0" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "713eccf888fb05f1a96eb78c0dbc51907fee42b3377272dc902eb38985f418d5" +checksum = 
"554db24f0b3c180a9c0b1268f91287ab3f17c162e15b54caaae5a6b3773396b0" dependencies = [ "once_cell", "target-lexicon", @@ -1779,9 +1779,9 @@ checksum = "be6d574e0f8cab2cdd1eeeb640cbf845c974519fa9e9b62fa9c08ecece0ca5de" [[package]] name = "pyo3-ffi" -version = "0.19.0" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b2ecbdcfb01cbbf56e179ce969a048fd7305a66d4cdf3303e0da09d69afe4c3" +checksum = "922ede8759e8600ad4da3195ae41259654b9c55da4f7eec84a0ccc7d067a70a4" dependencies = [ "libc", "pyo3-build-config", @@ -1789,9 +1789,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.19.0" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b78fdc0899f2ea781c463679b20cb08af9247febc8d052de941951024cd8aea0" +checksum = "8a5caec6a1dd355964a841fcbeeb1b89fe4146c87295573f94228911af3cc5a2" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -1801,9 +1801,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.19.0" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60da7b84f1227c3e2fe7593505de274dcf4c8928b4e0a1c23d551a14e4e80a0f" +checksum = "e0b78ccbb160db1556cdb6fd96c50334c5d4ec44dc5e0a968d0a1208fa0efa8b" dependencies = [ "proc-macro2", "quote", @@ -1812,9 +1812,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.28" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" dependencies = [ "proc-macro2", ] @@ -1896,15 +1896,36 @@ dependencies = [ "bitflags", ] +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + [[package]] 
name = "regex" -version = "1.8.3" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81ca098a9821bd52d6b24fd8b10bd081f47d39c22778cafaa75a2857a62c6390" +checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.2", + "regex-automata", + "regex-syntax 0.7.4", +] + +[[package]] +name = "regex-automata" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.7.4", ] [[package]] @@ -1915,9 +1936,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" +checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" [[package]] name = "rle-decode-fast" @@ -1925,6 +1946,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + [[package]] name = "rustc-hash" version = "1.1.0" @@ -1942,74 +1969,83 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.12" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" [[package]] name = "ryu" -version = "1.0.13" +version = "1.0.15" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" [[package]] name = "scopeguard" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" +checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" dependencies = [ "serde", ] [[package]] name = "seq-macro" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.163" +version = "1.0.176" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2" +checksum = "76dc28c9523c5d70816e393136b86d48909cfb27cecaa902d338c19ed47164dc" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.163" +version = "1.0.176" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" +checksum = "a4e7b8c5dc823e3b90651ff1d3808419cd14e5ad76de04feaf37da114e7a306f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.27", ] [[package]] name = "serde_json" -version = "1.0.96" +version = "1.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +checksum = "076066c5f1078eac5b722a31827a8832fe108bed65dfa75e233c89f8206e976c" dependencies = [ - "indexmap 1.9.3", + "indexmap", "itoa", "ryu", "serde", ] +[[package]] +name = "serde_spanned" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96426c9936fd7a0124915f9185ea1d20aa9445cc9821142f0a73bc9207a2e186" +dependencies = [ + "serde", +] + [[package]] name = "signal-hook" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" +checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801" dependencies = [ "libc", "signal-hook-registry", @@ -2037,9 +2073,8 @@ dependencies = [ [[package]] name = "simd-json" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3d0815e7ff0f1f05e09d4b029f86d8a330f0ab15b35b28736f3758325f59e14" +version = "0.10.0" +source = "git+https://github.com/ritchie46/simd-json?branch=initialize#946b316f686c6ad3050f694ea434248c38aa321d" dependencies = [ "ahash", "halfbrown", @@ -2074,9 +2109,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" [[package]] name = "smartstring" @@ -2163,15 +2198,15 @@ dependencies = [ [[package]] name = "strum_macros" -version = "0.25.0" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe9f3bd7d2e45dcc5e265fbb88d6513e4747d8ef9444cf01a533119bce28a157" +checksum = "6069ca09d878a33f883cc06aaa9718ede171841d3832450354410b718b097232" dependencies = [ "heck", "proc-macro2", "quote", "rustversion", - "syn 2.0.18", + 
"syn 2.0.27", ] [[package]] @@ -2187,9 +2222,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.18" +version = "2.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0" dependencies = [ "proc-macro2", "quote", @@ -2198,9 +2233,9 @@ dependencies = [ [[package]] name = "sysinfo" -version = "0.29.0" +version = "0.29.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02f1dc6930a439cc5d154221b5387d153f8183529b07c19aca24ea31e0a167e1" +checksum = "165d6d8539689e3d3bc8b98ac59541e1f21c7de7c85d60dc80e43ae0ed2113db" dependencies = [ "cfg-if", "core-foundation-sys", @@ -2218,28 +2253,28 @@ checksum = "06f6b473c37f9add4cf1df5b4d66a8ef58ab6c895f1a3b3f949cf3e21230140e" [[package]] name = "target-lexicon" -version = "0.12.7" +version = "0.12.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd1ba337640d60c3e96bc6f0638a939b9c9a7f2c316a1598c279828b3d1dc8c5" +checksum = "1d2faeef5759ab89935255b1a4cd98e0baf99d1085e37d36599c625dac49ae8e" [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = "611040a08a0439f8248d1990b111c95baa9c704c805fa1f62104b39655fd7f90" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.27", ] [[package]] @@ -2270,25 +2305,51 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" 
[[package]] name = "tokio" -version = "1.28.2" +version = "1.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" +checksum = "532826ff75199d5833b9d2c5fe410f29235e25704ee5f0ef599fb51c21f4a4da" dependencies = [ "autocfg", + "backtrace", "libc", "mio", "pin-project-lite", "socket2", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] name = "toml" -version = "0.5.11" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +checksum = "c17e963a819c331dcacd7ab957d80bc2b9a9c1e71c804826d2f283dd65306542" dependencies = [ "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.19.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8123f27e969974a3dfba720fdb560be359f57b44302d280ba72e76a74480e8a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", ] [[package]] @@ -2299,9 +2360,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.9" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" [[package]] name = "unicode-normalization" @@ -2326,9 +2387,9 @@ checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" [[package]] name = "url" -version = "2.3.1" +version = "2.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" dependencies = [ "form_urlencoded", "idna", @@ -2373,9 +2434,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -2383,24 +2444,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.27", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" dependencies = [ "cfg-if", "js-sys", @@ -2410,9 +2471,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2420,22 +2481,22 @@ dependencies = [ [[package]] name = 
"wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.27", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "wasm-timer" @@ -2454,9 +2515,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" dependencies = [ "js-sys", "wasm-bindgen", @@ -2490,16 +2551,7 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" dependencies = [ - "windows-targets 0.48.0", -] - -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", + "windows-targets", ] [[package]] @@ -2508,105 +2560,54 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.48.0", + "windows-targets", ] [[package]] name = "windows-targets" -version = "0.42.2" +version = "0.48.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows-targets" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" -dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - [[package]] name = "windows_aarch64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - [[package]] name = "windows_i686_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - [[package]] name = "windows_i686_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - [[package]] name = "windows_x86_64_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" @@ -2615,15 +2616,18 @@ checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" -version = "0.42.2" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] -name = "windows_x86_64_msvc" -version = "0.48.0" +name = "winnow" +version = "0.5.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +checksum = "25b5872fa2e10bd067ae946f927e726d7d603eaeb6e02fa6a350e0722d2b8c11" +dependencies = [ + "memchr", +] [[package]] name = "xxhash-rust" @@ -2633,18 +2637,18 @@ checksum = "735a71d46c4d68d71d4b24d03fdc2b98e38cea81730595801db779c04fe80d70" [[package]] name = "zstd" -version = "0.12.3+zstd.1.5.2" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "6.0.5+zstd.1.5.4" +version = "6.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d56d9e60b4b1758206c238a10165fbcae3ca37b01744e394c463463f6529d23b" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" dependencies = [ "libc", "zstd-sys", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 9cf2d39685f93..91281157eced4 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "0.18.7" +version = "0.18.9" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -16,16 +16,17 @@ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] } [dependencies] ahash = "0.8" ciborium = "0.2.0" +either = "1.8" lexical-core = "0.8" # todo: unfix when compilation problem is solved libc = "0.2" ndarray = "0.15" numpy = "0.19" once_cell = "1" -polars-algo = { path = "../polars/polars-algo", default-features = false } -polars-core = { path = "../polars/polars-core", features = ["python"], default-features = false } -polars-error = { path = "../polars/polars-error" } -polars-lazy = { path = "../polars/polars-lazy", features = ["python"], 
default-features = false } +polars-algo = { path = "../crates/polars-algo", default-features = false } +polars-core = { path = "../crates/polars-core", features = ["python"], default-features = false } +polars-error = { path = "../crates/polars-error" } +polars-lazy = { path = "../crates/polars-lazy", features = ["python"], default-features = false } pyo3 = { version = "0.19", features = ["abi3-py38", "extension-module", "multiple-pymethods"] } pyo3-built = { version = "0.4", optional = true } serde_json = { version = "1", optional = true } @@ -122,7 +123,7 @@ default = [ ] [dependencies.polars] -path = "../polars" +path = "../crates/polars" default-features = false features = [ "dynamic_groupby", @@ -192,4 +193,4 @@ lto = "fat" built = { version = "0.6", features = ["chrono", "git2"], optional = true } [patch.crates-io] -# simd-json = { git = "https://github.com/ritchie46/simd-json", branch = "alignment" } +simd-json = { git = "https://github.com/ritchie46/simd-json", branch = "initialize" } diff --git a/py-polars/docs/Makefile b/py-polars/docs/Makefile index 1579311a800a6..580ab347de930 100644 --- a/py-polars/docs/Makefile +++ b/py-polars/docs/Makefile @@ -16,6 +16,10 @@ help: .PHONY: help Makefile +clean: + @rm -rf source/reference/*/api/ + @rm -rf source/reference/api/ + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile diff --git a/py-polars/docs/source/conf.py b/py-polars/docs/source/conf.py index 6c4f0891e9a68..ffb71b2cfde26 100644 --- a/py-polars/docs/source/conf.py +++ b/py-polars/docs/source/conf.py @@ -235,14 +235,14 @@ def _minify_classpaths(s: str) -> str: ) -def process_signature(app, what, name, obj, opts, sig, ret): +def process_signature(app, what, name, obj, opts, sig, ret): # noqa: D103 return ( _minify_classpaths(sig) if sig else sig, _minify_classpaths(ret) if ret else ret, ) -def setup(app): +def setup(app): # noqa: D103 # TODO: a handful of methods do not seem to trigger the event for # some reason (possibly @overloads?) - investigate further... app.connect("autodoc-process-signature", process_signature) diff --git a/py-polars/docs/source/reference/dataframe/attributes.rst b/py-polars/docs/source/reference/dataframe/attributes.rst index a713562eaa4de..086cc41597eb7 100644 --- a/py-polars/docs/source/reference/dataframe/attributes.rst +++ b/py-polars/docs/source/reference/dataframe/attributes.rst @@ -8,6 +8,7 @@ Attributes DataFrame.columns DataFrame.dtypes + DataFrame.flags DataFrame.height DataFrame.schema DataFrame.shape diff --git a/py-polars/docs/source/reference/expressions/computation.rst b/py-polars/docs/source/reference/expressions/computation.rst index fcfe98a89e051..8289041b32c1c 100644 --- a/py-polars/docs/source/reference/expressions/computation.rst +++ b/py-polars/docs/source/reference/expressions/computation.rst @@ -15,6 +15,7 @@ Computation Expr.arctan Expr.arctanh Expr.arg_unique + Expr.cbrt Expr.cos Expr.cosh Expr.cumcount diff --git a/py-polars/docs/source/reference/expressions/functions.rst b/py-polars/docs/source/reference/expressions/functions.rst index 8755360027964..e7386f782700e 100644 --- a/py-polars/docs/source/reference/expressions/functions.rst +++ b/py-polars/docs/source/reference/expressions/functions.rst @@ -19,6 +19,8 @@ These functions are available from the polars module root and can be used as exp apply 
approx_unique arange + arctan2 + arctan2d arg_sort_by arg_where avg @@ -35,6 +37,7 @@ These functions are available from the polars module root and can be used as exp date datetime date_range + date_ranges duration element exclude @@ -72,6 +75,7 @@ These functions are available from the polars module root and can be used as exp tail time time_range + time_ranges var when zeros diff --git a/py-polars/docs/source/reference/io.rst b/py-polars/docs/source/reference/io.rst index 77e94cdfdca97..ae00950734a92 100644 --- a/py-polars/docs/source/reference/io.rst +++ b/py-polars/docs/source/reference/io.rst @@ -24,6 +24,7 @@ Feather/ IPC scan_ipc read_ipc_schema DataFrame.write_ipc + LazyFrame.sink_ipc Parquet ~~~~~~~ @@ -34,6 +35,7 @@ Parquet scan_parquet read_parquet_schema DataFrame.write_parquet + LazyFrame.sink_parquet Database ~~~~~~~~ diff --git a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst index a02adeff75ff6..e10bb5082bfeb 100644 --- a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst +++ b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst @@ -13,9 +13,6 @@ Miscellaneous LazyFrame.map LazyFrame.pipe LazyFrame.profile - LazyFrame.sink_ipc - LazyFrame.sink_parquet - Read/write logical plan ----------------------- diff --git a/py-polars/docs/source/reference/series/computation.rst b/py-polars/docs/source/reference/series/computation.rst index a42f91726d873..e8ed4c7a2e62b 100644 --- a/py-polars/docs/source/reference/series/computation.rst +++ b/py-polars/docs/source/reference/series/computation.rst @@ -15,6 +15,7 @@ Computation Series.arctanh Series.arg_true Series.arg_unique + Series.cbrt Series.cos Series.cosh Series.cummax diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 7bdde251608ba..ddc26de3b43cd 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -84,6 +84,8 @@ apply, approx_unique, arange, + arctan2, + 
arctan2d, arg_sort_by, arg_where, avg, @@ -102,6 +104,7 @@ cumsum_horizontal, date, date_range, + date_ranges, datetime, duration, element, @@ -140,6 +143,7 @@ tail, time, time_range, + time_ranges, var, when, zeros, @@ -282,10 +286,12 @@ "arg_where", "concat", "date_range", + "date_ranges", "element", "ones", "repeat", "time_range", + "time_ranges", "zeros", # polars.functions.aggregation "all", @@ -303,6 +309,8 @@ # polars.functions.lazy "apply", "arange", + "arctan2", + "arctan2d", "arg_sort_by", "avg", "coalesce", diff --git a/py-polars/polars/_reexport.py b/py-polars/polars/_reexport.py index 3fe2f44062ada..408fead781def 100644 --- a/py-polars/polars/_reexport.py +++ b/py-polars/polars/_reexport.py @@ -1,7 +1,7 @@ """Re-export Polars functionality to avoid cyclical imports.""" from polars.dataframe import DataFrame -from polars.expr import Expr +from polars.expr import Expr, When from polars.lazyframe import LazyFrame from polars.series import Series @@ -10,4 +10,5 @@ "Expr", "LazyFrame", "Series", + "When", ] diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index 2ef42ad14af3d..311f75d7ba92a 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -204,7 +204,9 @@ def save(cls, file: Path | str | None = None) -> str: Returns ------- - str : json string containing current Config options, or filepath where saved. + str + JSON string containing current Config options, or the path to the file where + the options are saved. 
""" environment_vars = { diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index be61f92c3e3f1..47aca8f8caef5 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -60,7 +60,7 @@ def from_dict( Returns ------- - :class:`DataFrame` + DataFrame Examples -------- @@ -120,7 +120,7 @@ def from_dicts( Returns ------- - :class:`DataFrame` + DataFrame Examples -------- @@ -224,7 +224,7 @@ def from_records( Returns ------- - :class:`DataFrame` + DataFrame Examples -------- @@ -488,7 +488,7 @@ def from_numpy( Returns ------- - :class:`DataFrame` + DataFrame Examples -------- @@ -559,7 +559,7 @@ def from_arrow( Returns ------- - :class:`DataFrame` or :class:`Series` + DataFrame or Series Examples -------- @@ -666,8 +666,8 @@ def from_pandas( Parameters ---------- - data: :class:`pandas.DataFrame`, :class:`pandas.Series`, :class:`pandas.DatetimeIndex` - Data represented as a pandas DataFrame, Series, or DatetimeIndex. + data : :class:`pandas.DataFrame` or :class:`pandas.Series` or :class:`pandas.Index` + Data represented as a pandas DataFrame, Series, or Index. schema_overrides : dict, default None Support override of inferred types for one or more columns. 
rechunk : bool, default True @@ -679,7 +679,7 @@ def from_pandas( Returns ------- - :class:`DataFrame` + DataFrame Examples -------- @@ -713,7 +713,7 @@ def from_pandas( 3 ] - """ # noqa: W505 + """ if isinstance(data, (pd.Series, pd.DatetimeIndex)): return pl.Series._from_pandas("", data, nan_to_null=nan_to_null) elif isinstance(data, pd.DataFrame): diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index f4fe79066e6ce..f4232751645d5 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -4,7 +4,6 @@ import contextlib import os import random -import warnings from collections import defaultdict from collections.abc import Sized from io import BytesIO, StringIO, TextIOWrapper @@ -87,12 +86,11 @@ from polars.utils._parse_expr_input import parse_as_expression from polars.utils._wrap import wrap_expr, wrap_ldf, wrap_s from polars.utils.convert import _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias +from polars.utils.deprecation import deprecated_alias, issue_deprecation_warning from polars.utils.various import ( _prepare_row_count_args, _process_null_values, can_create_dicts_with_pyarrow, - find_stacklevel, handle_projection_columns, is_bool_sequence, is_int_sequence, @@ -455,10 +453,6 @@ def _from_dict( Support type specification or override of one or more columns; note that any dtypes inferred from the columns param will be overridden. - Returns - ------- - DataFrame - """ return cls._from_pydf( dict_to_pydf(data, schema=schema, schema_overrides=schema_overrides) @@ -501,10 +495,6 @@ def _from_records( infer_schema_length How many rows to scan to determine the column type. - Returns - ------- - DataFrame - """ return cls._from_pydf( sequence_to_pydf( @@ -550,10 +540,6 @@ def _from_numpy( the orientation is inferred by matching the columns and data dimensions. If this does not yield conclusive results, column orientation is used. 
- Returns - ------- - DataFrame - """ return cls._from_pydf( numpy_to_pydf( @@ -596,10 +582,6 @@ def _from_arrow( rechunk : bool, default True Make sure that all data is in contiguous memory. - Returns - ------- - DataFrame - """ return cls._from_pydf( arrow_to_pydf( @@ -648,10 +630,6 @@ def _from_pandas( include_index : bool, default False Load any non-default pandas indexes as columns. - Returns - ------- - DataFrame - """ return cls._from_pydf( pandas_to_pydf( @@ -895,10 +873,6 @@ def _read_avro( n_rows Stop reading from Apache Avro file after reading ``n_rows``. - Returns - ------- - DataFrame - """ if isinstance(source, (str, Path)): source = normalise_filepath(source) @@ -942,10 +916,6 @@ def _read_ipc( memory_map Memory map the file - Returns - ------- - DataFrame - """ if isinstance(source, (str, Path)): source = normalise_filepath(source) @@ -1180,6 +1150,18 @@ def dtypes(self) -> list[PolarsDataType]: """ return self._df.dtypes() + @property + def flags(self) -> dict[str, dict[str, bool]]: + """ + Get flags that are set on the columns of this DataFrame. + + Returns + ------- + dict + Mapping from column names to column flags. + """ + return {name: self[name].flags for name in self.columns} + @property def schema(self) -> SchemaDict: """ @@ -2040,7 +2022,7 @@ def to_pandas( # noqa: D417 of null values. Subsequent operations on the resulting pandas DataFrame may trigger conversion to NumPy arrays if that operation is not supported by pyarrow compute functions. - kwargs + **kwargs Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. Returns @@ -2549,6 +2531,13 @@ def write_excel( hidden_columns: Sequence[str] | None = None, hide_gridlines: bool = False, sheet_zoom: int | None = None, + freeze_panes: ( + str + | tuple[int, int] + | tuple[str, int, int] + | tuple[int, int, int, int] + | None + ) = None, ) -> Workbook: """ Write frame data to a table in an Excel workbook/worksheet. 
@@ -2665,6 +2654,21 @@ def write_excel( Do not display any gridlines on the output worksheet. sheet_zoom : int Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + Notes ----- @@ -2980,6 +2984,12 @@ def write_excel( ) ws.autofit() + if freeze_panes: + if isinstance(freeze_panes, str): + ws.freeze_panes(freeze_panes) + else: + ws.freeze_panes(*freeze_panes) + if can_close: wb.close() return wb @@ -3141,10 +3151,11 @@ def write_parquet( file, compression, compression_level, statistics, row_group_size ) + @deprecated_alias(connection_uri="connection") def write_database( self, table_name: str, - connection_uri: str, + connection: str, *, if_exists: DbWriteMode = "fail", engine: DbWriteEngine = "sqlalchemy", @@ -3157,8 +3168,8 @@ def write_database( table_name Name of the table to create or append to in the target SQL database. If your table name contains special characters, it should be quoted. 
- connection_uri - Connection URI, for example: + connection + Connection URI string, for example: * "postgresql://user:pass@server:port/database" * "sqlite:////path/to/database.db" @@ -3184,10 +3195,8 @@ def write_database( f"Value for 'if_exists'={if_exists} was unexpected. " f"Choose one of: {'fail', 'replace', 'append'}." ) - with _open_adbc_connection(connection_uri) as conn: - cursor = conn.cursor() + with _open_adbc_connection(connection) as conn, conn.cursor() as cursor: cursor.adbc_ingest(table_name, self.to_arrow(), mode) - cursor.close() conn.commit() elif engine == "sqlalchemy": @@ -3218,7 +3227,7 @@ def write_database( # ensure conversion to pandas uses the pyarrow extension array option # so that we can make use of the sql/db export without copying data - engine_sa = create_engine(connection_uri) + engine_sa = create_engine(connection) self.to_pandas(use_pyarrow_extension_array=True).to_sql( name=table_name, schema=db_schema, @@ -3313,9 +3322,9 @@ def write_delta( ... ) # doctest: +SKIP """ - from polars.io.delta import check_if_delta_available, resolve_delta_lake_uri + from polars.io.delta import _check_if_delta_available, _resolve_delta_lake_uri - check_if_delta_available() + _check_if_delta_available() from deltalake.writer import ( try_get_deltatable, @@ -3326,7 +3335,7 @@ def write_delta( delta_write_options = {} if isinstance(target, (str, Path)): - target = resolve_delta_lake_uri(str(target), strict=False) + target = _resolve_delta_lake_uri(str(target), strict=False) unsupported_cols = {} unsupported_types = [Time, Categorical, Null] @@ -3416,7 +3425,7 @@ def transpose( *, include_header: bool = False, header_name: str = "column", - column_names: Iterable[str] | None = None, + column_names: str | Iterable[str] | None = None, ) -> Self: """ Transpose a DataFrame over the diagonal. @@ -3429,8 +3438,8 @@ def transpose( If `include_header` is set, this determines the name of the column that will be inserted. 
column_names - Optional iterable that yields column names. Will be used to - replace the columns in the DataFrame. + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. Notes ----- @@ -3502,20 +3511,34 @@ def transpose( │ 1 ┆ 2 ┆ 3 │ └─────────────┴─────────────┴─────────────┘ - """ - df = self._from_pydf(self._df.transpose(include_header, header_name)) - if column_names is not None: - names = [] - n = df.width - if include_header: - names.append(header_name) - n -= 1 + Use an existing column as the new column names - column_names = iter(column_names) - for _ in range(n): - names.append(next(column_names)) - df.columns = names - return df + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + """ + keep_names_as = header_name if include_header else None + if isinstance(column_names, Generator): + column_names = [next(column_names) for _ in range(self.height)] + return self._from_pydf(self._df.transpose(keep_names_as, column_names)) def reverse(self) -> DataFrame: """ @@ -4129,7 +4152,7 @@ def top_k( .collect( projection_pushdown=False, predicate_pushdown=False, - common_subplan_elimination=False, + comm_subplan_elim=False, slice_pushdown=True, ) ) @@ -4221,7 +4244,7 @@ def bottom_k( .collect( projection_pushdown=False, predicate_pushdown=False, - common_subplan_elimination=False, + comm_subplan_elim=False, 
slice_pushdown=True, ) ) @@ -5577,7 +5600,7 @@ def join( Returns ------- - Joined DataFrame + DataFrame See Also -------- @@ -5746,16 +5769,16 @@ def apply( │ 6 ┆ 24 │ └──────────┴──────────┘ - It is better to implement this with an expression: + However, it is much better to implement this with a native expression: >>> df.select( ... pl.col("foo") * 2, ... pl.col("bar") * 3, ... ) # doctest: +IGNORE_RESULT - Return a Series by mapping each row to a scalar: + Return a DataFrame with a single column by mapping each row to a scalar: - >>> df.apply(lambda t: (t[0] * 2 + t[1])) + >>> df.apply(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP shape: (3, 1) ┌───────┐ │ apply │ @@ -5767,11 +5790,15 @@ def apply( │ 14 │ └───────┘ - In this case it is better to use the following expression: + In this case it is better to use the following native expression: >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT """ + # TODO: + # from polars.utils.udfs import warn_on_inefficient_apply + # warn_on_inefficient_apply(function, columns=self.columns, apply_target="frame) + out, is_df = self._df.apply(function, return_dtype, inference_size) if is_df: return self._from_pydf(out) @@ -6020,7 +6047,8 @@ def drop_in_place(self, name: str) -> Series: Returns ------- - The dropped column. + Series + The dropped column. Examples -------- @@ -6248,6 +6276,7 @@ def fill_null( Returns ------- + DataFrame DataFrame with None values replaced by the filling strategy. See Also @@ -6327,11 +6356,12 @@ def fill_nan(self, value: Expr | int | float | None) -> DataFrame: Parameters ---------- value - Value to fill NaN with. + Value with which to replace NaN values. Returns ------- - DataFrame with NaN replaced with fill_value + DataFrame + DataFrame with NaN values replaced by the given value. 
Warnings -------- @@ -6525,12 +6555,11 @@ def pivot( columns = [columns] if aggregate_function is no_default: - warnings.warn( + issue_deprecation_warning( "In a future version of polars, the default `aggregate_function` " "will change from `'first'` to `None`. Please pass `'first'` to keep the " "current behaviour, or `None` to accept the new one.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.16.16", ) aggregate_function = "first" @@ -7272,7 +7301,8 @@ def with_columns( Returns ------- - A new DataFrame with the columns added. + DataFrame + A new DataFrame with the columns added. Notes ----- @@ -7934,7 +7964,8 @@ def unique( Returns ------- - DataFrame with unique rows. + DataFrame + DataFrame with unique rows. Warnings -------- @@ -8296,7 +8327,7 @@ def row( Returns ------- - Tuple (default) or dictionary of row values. + tuple (default) or dictionary of row values Notes ----- @@ -8420,7 +8451,7 @@ def rows( Returns ------- - A list of tuples (default) or dictionaries of row values. + list of tuples (default) or dictionaries of row values Examples -------- @@ -8661,7 +8692,7 @@ def iter_rows( Returns ------- - An iterator of tuples (default) or dictionaries (if named) of python row values. + iterator of tuples (default) or dictionaries (if named) of python row values Examples -------- diff --git a/py-polars/polars/dataframe/groupby.py b/py-polars/polars/dataframe/groupby.py index 573938c59a089..d655df4d324d9 100644 --- a/py-polars/polars/dataframe/groupby.py +++ b/py-polars/polars/dataframe/groupby.py @@ -4,7 +4,6 @@ import polars._reexport as pl from polars import functions as F -from polars.functions.whenthen import WhenThen, WhenThenThen from polars.utils.convert import _timedelta_to_pl_duration if TYPE_CHECKING: @@ -64,9 +63,7 @@ def __iter__(self) -> Self: """ Allows iteration over the groups of the groupby operation. - Returns - ------- - Iterator returning tuples of (name, data) for each group. 
+ Each group is represented by a tuple of (name, data). Examples -------- @@ -110,10 +107,7 @@ def __iter__(self) -> Self: # When grouping by a single column, group name is a single value # When grouping by multiple columns, group name is a tuple of values self._group_names: Iterator[object] | Iterator[tuple[object, ...]] - if ( - isinstance(self.by, (str, pl.Expr, WhenThen, WhenThenThen)) - and not self.more_by - ): + if isinstance(self.by, (str, pl.Expr)) and not self.more_by: self._group_names = iter(group_names.to_series()) else: self._group_names = group_names.iter_rows() @@ -328,7 +322,7 @@ def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: if isinstance(self.by, str): by = [self.by] - elif isinstance(self.by, Iterable) and all(isinstance(c, str) for c in self.by): # type: ignore[union-attr] + elif isinstance(self.by, Iterable) and all(isinstance(c, str) for c in self.by): by = list(self.by) # type: ignore[arg-type] else: raise TypeError("Cannot call `apply` when grouping by an expression.") @@ -845,6 +839,19 @@ def agg( *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr, ) -> DataFrame: + """ + Compute aggregations for each group of a groupby operation. + + Parameters + ---------- + *aggs + Aggregations to compute for each group of the groupby operation, + specified as positional arguments. + Accepts expression input. Strings are parsed as column names. + **named_aggs + Additional aggregations, specified as keyword arguments. + The resulting columns will be renamed to the keyword used. + """ return ( self.df.lazy() .groupby_rolling( @@ -1046,6 +1053,19 @@ def agg( *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr, ) -> DataFrame: + """ + Compute aggregations for each group of a groupby operation. + + Parameters + ---------- + *aggs + Aggregations to compute for each group of the groupby operation, + specified as positional arguments. + Accepts expression input. Strings are parsed as column names. 
+ **named_aggs + Additional aggregations, specified as keyword arguments. + The resulting columns will be renamed to the keyword used. + """ return ( self.df.lazy() .groupby_dynamic( diff --git a/py-polars/polars/datatypes/classes.py b/py-polars/polars/datatypes/classes.py index ec03b5584a264..70272232359c1 100644 --- a/py-polars/polars/datatypes/classes.py +++ b/py-polars/polars/datatypes/classes.py @@ -3,7 +3,7 @@ import contextlib from datetime import timezone from inspect import isclass -from typing import TYPE_CHECKING, Any, Callable, Iterator, Mapping, Sequence +from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Mapping, Sequence import polars.datatypes @@ -31,7 +31,7 @@ def __init__(self, method: Callable[..., Any] | None = None) -> None: def __get__(self, instance: Any, cls: type | None = None) -> Any: return self.fget(cls) # type: ignore[misc] - def getter(self, method: Callable[..., Any]) -> Any: + def getter(self, method: Callable[..., Any]) -> Any: # noqa: D102 self.fget = method return self @@ -46,25 +46,29 @@ def _string_repr(cls) -> str: return _dtype_str_repr(cls) def base_type(cls) -> PolarsDataType: + """Return the base type.""" return cls @classproperty def is_nested(self) -> bool: + """Check if this data type is nested.""" return False @classmethod def is_(cls, other: PolarsDataType) -> bool: + """Check if this DataType is the same as another DataType.""" return cls == other and hash(cls) == hash(other) @classmethod def is_not(cls, other: PolarsDataType) -> bool: + """Check if this DataType is NOT the same as another DataType.""" return not cls.is_(other) class DataType(metaclass=DataTypeClass): """Base class for all Polars data types.""" - def __new__(cls, *args: Any, **kwargs: Any) -> PolarsDataType: # type: ignore[misc] + def __new__(cls, *args: Any, **kwargs: Any) -> PolarsDataType: # type: ignore[misc] # noqa: D102 # this formulation allows for equivalent use of "pl.Type" and "pl.Type()", while # still respecting types 
that take initialisation params (eg: Duration/Datetime) if args or kwargs: @@ -95,6 +99,7 @@ def base_type(cls) -> DataTypeClass: @classproperty def is_nested(self) -> bool: + """Check if this data type is nested.""" return False @classinstmethod # type: ignore[arg-type] @@ -158,15 +163,30 @@ def _custom_reconstruct( class DataTypeGroup(frozenset): # type: ignore[type-arg] + """Group of data types.""" + _match_base_type: bool - def __new__(cls, items: Any, *, match_base_type: bool = True) -> DataTypeGroup: + def __new__( + cls, items: Iterable[DataType | DataTypeClass], *, match_base_type: bool = True + ) -> DataTypeGroup: + """ + Construct a DataTypeGroup. + + Parameters + ---------- + items : + iterable of data types + match_base_type: + match the base type + + """ for it in items: if not isinstance(it, (DataType, DataTypeClass)): raise TypeError( f"DataTypeGroup items must be dtypes; found {type(it).__name__!r}" ) - dtype_group = super().__new__(cls, items) + dtype_group = super().__new__(cls, items) # type: ignore[arg-type] dtype_group._match_base_type = match_base_type return dtype_group @@ -201,6 +221,7 @@ class NestedType(DataType): @classproperty def is_nested(self) -> bool: + """Check if this data type is nested.""" return True @@ -406,6 +427,8 @@ class Unknown(DataType): class List(NestedType): + """Nested list/array type with variable length of inner lists.""" + inner: PolarsDataType | None = None def __init__(self, inner: PolarsDataType | PythonDataType): @@ -466,6 +489,8 @@ def __repr__(self) -> str: class Array(NestedType): + """Nested list/array type with fixed length of inner arrays.""" + inner: PolarsDataType | None = None width: int @@ -524,6 +549,8 @@ def __repr__(self) -> str: class Field: + """Definition of a single field within a `Struct` DataType.""" + def __init__(self, name: str, dtype: PolarsDataType): """ Definition of a single field within a `Struct` DataType. 
@@ -551,6 +578,8 @@ def __repr__(self) -> str: class Struct(NestedType): + """Struct composite type.""" + def __init__(self, fields: Sequence[Field] | SchemaDict): """ Struct composite type. diff --git a/py-polars/polars/datatypes/convert.py b/py-polars/polars/datatypes/convert.py index ec06b8bbe44ca..9b4f1a2a985c8 100644 --- a/py-polars/polars/datatypes/convert.py +++ b/py-polars/polars/datatypes/convert.py @@ -73,7 +73,7 @@ T = TypeVar("T") -def cache(function: Callable[..., T]) -> T: +def cache(function: Callable[..., T]) -> T: # noqa: D103 # need this to satisfy mypy issue with "@property/@cache combination" # See: https://github.com/python/mypy/issues/5858 return functools.lru_cache()(function) # type: ignore[return-value] @@ -98,7 +98,10 @@ def cache(function: Callable[..., T]) -> T: @functools.lru_cache(16) -def map_py_type_to_dtype(python_dtype: PythonDataType | type[object]) -> PolarsDataType: +def _map_py_type_to_dtype( + python_dtype: PythonDataType | type[object], +) -> PolarsDataType: + """Convert Python data type to Polars data type.""" if python_dtype is float: return Float64 if python_dtype is int: @@ -134,14 +137,14 @@ def map_py_type_to_dtype(python_dtype: PythonDataType | type[object]) -> PolarsD if hasattr(python_dtype, "__origin__") and hasattr(python_dtype, "__args__"): base_type = python_dtype.__origin__ if base_type is not None: - dtype = map_py_type_to_dtype(base_type) + dtype = _map_py_type_to_dtype(base_type) nested = python_dtype.__args__ if len(nested) == 1: nested = nested[0] return ( dtype if nested is None - else dtype(map_py_type_to_dtype(nested)) # type: ignore[operator] + else dtype(_map_py_type_to_dtype(nested)) # type: ignore[operator] ) raise TypeError("Invalid type") @@ -424,7 +427,7 @@ def py_type_to_dtype( if is_polars_dtype(data_type): return data_type try: - return map_py_type_to_dtype(data_type) + return _map_py_type_to_dtype(data_type) except (KeyError, TypeError): # pragma: no cover if not raise_unmatched: return None 
diff --git a/py-polars/polars/dependencies.py b/py-polars/polars/dependencies.py index 13288b724538f..2c3e0c33faa6a 100644 --- a/py-polars/polars/dependencies.py +++ b/py-polars/polars/dependencies.py @@ -116,8 +116,9 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]: Returns ------- - tuple[Module, bool]: a lazy-loading module and a boolean indicating if the - requested/underlying module exists (if not, the returned module is a proxy). + tuple of (Module, bool) + A lazy-loading module and a boolean indicating if the requested/underlying + module exists (if not, the returned module is a proxy). """ # check if module is LOADED diff --git a/py-polars/polars/exceptions.py b/py-polars/polars/exceptions.py index 7f83f92ebafd0..fd866c623d5df 100644 --- a/py-polars/polars/exceptions.py +++ b/py-polars/polars/exceptions.py @@ -86,6 +86,14 @@ class ChronoFormatWarning(Warning): """ +class PolarsInefficientApplyWarning(Warning): + """ + Warning raised when a potentially slow `apply` operation is performed. + + Suggestion of what to replace slow pattern with will also be shown. 
+ """ + + __all__ = [ "ArrowError", "ColumnNotFoundError", @@ -95,6 +103,7 @@ class ChronoFormatWarning(Warning): "InvalidOperationError", "NoDataError", "NoRowsReturnedError", + "PolarsInefficientApplyWarning", "PolarsPanicError", "RowsError", "SchemaError", diff --git a/py-polars/polars/expr/__init__.py b/py-polars/polars/expr/__init__.py index 9adda5f14d0f2..e541ba8746f1d 100644 --- a/py-polars/polars/expr/__init__.py +++ b/py-polars/polars/expr/__init__.py @@ -1,5 +1,7 @@ from polars.expr.expr import Expr +from polars.expr.whenthen import When __all__ = [ "Expr", + "When", ] diff --git a/py-polars/polars/expr/binary.py b/py-polars/polars/expr/binary.py index b6b55f5a1eae3..67e2e807c7c8d 100644 --- a/py-polars/polars/expr/binary.py +++ b/py-polars/polars/expr/binary.py @@ -28,7 +28,8 @@ def contains(self, literal: bytes) -> Expr: Returns ------- - Boolean mask + Expr + Expression of data type :class:`Boolean`. See Also -------- @@ -74,7 +75,8 @@ def ends_with(self, suffix: bytes) -> Expr: Returns ------- - Boolean mask + Expr + Expression of data type :class:`Boolean`. See Also -------- @@ -120,7 +122,8 @@ def starts_with(self, prefix: bytes) -> Expr: Returns ------- - Boolean mask + Expr + Expression of data type :class:`Boolean`. See Also -------- @@ -188,7 +191,9 @@ def encode(self, encoding: TransferEncoding) -> Expr: Returns ------- - Binary array with values encoded using provided encoding + Expr + Expression of data type :class:`Utf8` with values encoded using provided + encoding. 
Examples -------- @@ -211,6 +216,7 @@ def encode(self, encoding: TransferEncoding) -> Expr: │ yellow ┆ [binary data] ┆ ffff00 │ │ blue ┆ [binary data] ┆ 0000ff │ └────────┴───────────────┴──────────────────┘ + """ if encoding == "hex": return wrap_expr(self._pyexpr.bin_hex_encode()) diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 993853da0f646..d0a9248ebd7cc 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -9,7 +9,7 @@ from polars.utils._parse_expr_input import parse_as_expression from polars.utils._wrap import wrap_expr from polars.utils.convert import _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias +from polars.utils.deprecation import deprecated_alias if TYPE_CHECKING: from datetime import timedelta @@ -30,12 +30,14 @@ def truncate( self, every: str | timedelta, offset: str | timedelta | None = None, + *, + use_earliest: bool | None = None, ) -> Expr: """ Divide the date/datetime range into buckets. - Each date/datetime is mapped to the start of its bucket. Note that weekly - buckets start on Monday. + Each date/datetime is mapped to the start of its bucket using the corresponding + local datetime. Note that weekly buckets start on Monday. Parameters ---------- @@ -43,6 +45,11 @@ def truncate( Every interval start and period length offset Offset the window + use_earliest + Determine how to deal with ambiguous datetimes: + - None (default): raise; + - True: use the earliest datetime; + - False: use the latest datetime. Notes ----- @@ -75,7 +82,8 @@ def truncate( Returns ------- - Date/Datetime series + Expr + Expression of data type :class:`Date` or :class:`Datetime`. 
Examples -------- @@ -139,6 +147,56 @@ def truncate( │ 2001-01-01 01:00:00 ┆ 2001-01-01 01:00:00 │ └─────────────────────┴─────────────────────┘ + If crossing daylight savings time boundaries, you may want to use + `use_earliest` and combine with :func:`~polars.Series.dt.dst_offset` + and :func:`~polars.when`: + + >>> df = ( + ... pl.date_range( + ... datetime(2020, 10, 25, 0), + ... datetime(2020, 10, 25, 2), + ... "30m", + ... eager=True, + ... time_zone="Europe/London", + ... ) + ... .dt.offset_by("15m") + ... .to_frame() + ... ) + >>> df + shape: (7, 1) + ┌─────────────────────────────┐ + │ date │ + │ --- │ + │ datetime[μs, Europe/London] │ + ╞═════════════════════════════╡ + │ 2020-10-25 00:15:00 BST │ + │ 2020-10-25 00:45:00 BST │ + │ 2020-10-25 01:15:00 BST │ + │ 2020-10-25 01:45:00 BST │ + │ 2020-10-25 01:15:00 GMT │ + │ 2020-10-25 01:45:00 GMT │ + │ 2020-10-25 02:15:00 GMT │ + └─────────────────────────────┘ + + >>> df.select( + ... pl.when(pl.col("date").dt.dst_offset() == pl.duration(hours=1)) + ... .then(pl.col("date").dt.truncate("30m", use_earliest=True)) + ... .otherwise(pl.col("date").dt.truncate("30m", use_earliest=False)) + ... ) + shape: (7, 1) + ┌─────────────────────────────┐ + │ date │ + │ --- │ + │ datetime[μs, Europe/London] │ + ╞═════════════════════════════╡ + │ 2020-10-25 00:00:00 BST │ + │ 2020-10-25 00:30:00 BST │ + │ 2020-10-25 01:00:00 BST │ + │ 2020-10-25 01:30:00 BST │ + │ 2020-10-25 01:00:00 GMT │ + │ 2020-10-25 01:30:00 GMT │ + │ 2020-10-25 02:00:00 GMT │ + └─────────────────────────────┘ """ if offset is None: offset = "0ns" @@ -147,6 +205,7 @@ def truncate( self._pyexpr.dt_truncate( _timedelta_to_pl_duration(every), _timedelta_to_pl_duration(offset), + use_earliest, ) ) @@ -199,7 +258,8 @@ def round( Returns ------- - Date/Datetime series + Expr + Expression of data type :class:`Date` or :class:`Datetime`. 
Warnings -------- @@ -447,7 +507,8 @@ def year(self) -> Expr: Returns ------- - Year as Int32 + Expr + Expression of data type :class:`Int32`. Examples -------- @@ -493,7 +554,8 @@ def is_leap_year(self) -> Expr: Returns ------- - Leap year info as Boolean + Expr + Expression of data type :class:`Boolean`. Examples -------- @@ -540,7 +602,8 @@ def iso_year(self) -> Expr: Returns ------- - ISO Year as Int32 + Expr + Expression of data type :class:`Int32`. Examples -------- @@ -586,7 +649,8 @@ def quarter(self) -> Expr: Returns ------- - Quarter as UInt32 + Expr + Expression of data type :class:`UInt32`. Examples -------- @@ -633,7 +697,8 @@ def month(self) -> Expr: Returns ------- - Month as UInt32 + Expr + Expression of data type :class:`UInt32`. Examples -------- @@ -680,7 +745,8 @@ def week(self) -> Expr: Returns ------- - Week number as UInt32 + Expr + Expression of data type :class:`UInt32`. Examples -------- @@ -726,7 +792,8 @@ def weekday(self) -> Expr: Returns ------- - Week day as UInt32 + Expr + Expression of data type :class:`UInt32`. See Also -------- @@ -784,7 +851,8 @@ def day(self) -> Expr: Returns ------- - Day as UInt32 + Expr + Expression of data type :class:`UInt32`. See Also -------- @@ -842,7 +910,8 @@ def ordinal_day(self) -> Expr: Returns ------- - Day as UInt32 + Expr + Expression of data type :class:`UInt32`. See Also -------- @@ -890,12 +959,45 @@ def ordinal_day(self) -> Expr: return wrap_expr(self._pyexpr.dt_ordinal_day()) def time(self) -> Expr: + """ + Extract time. + + Applies to Datetime columns only; fails on Date. + + Returns + ------- + Expr + Expression of data type :class:`Time`. + + """ return wrap_expr(self._pyexpr.dt_time()) def date(self) -> Expr: + """ + Extract date from date(time). + + Applies to Date and Datetime columns. + + Returns + ------- + Expr + Expression of data type :class:`Date`. + + """ return wrap_expr(self._pyexpr.dt_date()) def datetime(self) -> Expr: + """ + Return datetime. 
+ + Applies to Datetime columns. + + Returns + ------- + Expr + Expression of data type :class:`Datetime`. + + """ return wrap_expr(self._pyexpr.dt_datetime()) def hour(self) -> Expr: @@ -908,7 +1010,8 @@ def hour(self) -> Expr: Returns ------- - Hour as UInt32 + Expr + Expression of data type :class:`UInt32`. Examples -------- @@ -954,7 +1057,8 @@ def minute(self) -> Expr: Returns ------- - Minute as UInt32 + Expr + Expression of data type :class:`UInt32`. Examples -------- @@ -1007,7 +1111,8 @@ def second(self, *, fractional: bool = False) -> Expr: Returns ------- - Second as UInt32 (or Float64) + Expr + Expression of data type :class:`UInt32` or :class:`Float64`. Examples -------- @@ -1102,7 +1207,8 @@ def millisecond(self) -> Expr: Returns ------- - Milliseconds as UInt32 + Expr + Expression of data type :class:`UInt32`. """ return wrap_expr(self._pyexpr.dt_millisecond()) @@ -1115,7 +1221,8 @@ def microsecond(self) -> Expr: Returns ------- - Microseconds as UInt32 + Expr + Expression of data type :class:`UInt32`. Examples -------- @@ -1164,7 +1271,8 @@ def nanosecond(self) -> Expr: Returns ------- - Nanoseconds as UInt32 + Expr + Expression of data type :class:`UInt32`. """ return wrap_expr(self._pyexpr.dt_nanosecond()) @@ -1256,7 +1364,7 @@ def timestamp(self, time_unit: TimeUnit = "us") -> Expr: def with_time_unit(self, time_unit: TimeUnit) -> Expr: """ - Set time unit of a Series of dtype Datetime or Duration. + Set time unit of an expression of dtype Datetime or Duration. This does not modify underlying data, and should be used to fix an incorrect time unit. @@ -1264,7 +1372,7 @@ def with_time_unit(self, time_unit: TimeUnit) -> Expr: Parameters ---------- time_unit : {'ns', 'us', 'ms'} - Unit of time for the ``Datetime`` Series. + Unit of time for the ``Datetime`` expression. 
Examples -------- @@ -1307,7 +1415,7 @@ def cast_time_unit(self, time_unit: TimeUnit) -> Expr: Parameters ---------- time_unit : {'ns', 'us', 'ms'} - Time unit for the ``Datetime`` Series. + Time unit for the ``Datetime`` expression. Examples -------- @@ -1342,12 +1450,12 @@ def cast_time_unit(self, time_unit: TimeUnit) -> Expr: def convert_time_zone(self, time_zone: str) -> Expr: """ - Convert to given time zone for a Series of type Datetime. + Convert to given time zone for an expression of type Datetime. Parameters ---------- time_zone - Time zone for the `Datetime` Series. + Time zone for the `Datetime` expression. Examples -------- @@ -1388,7 +1496,7 @@ def replace_time_zone( self, time_zone: str | None, *, use_earliest: bool | None = None ) -> Expr: """ - Replace time zone for a Series of type Datetime. + Replace time zone for an expression of type Datetime. Different from ``convert_time_zone``, this will also modify the underlying timestamp and will ignore the original time zone. @@ -1396,11 +1504,12 @@ def replace_time_zone( Parameters ---------- time_zone - Time zone for the `Datetime` Series. Pass `None` to unset time zone. + Time zone for the `Datetime` expression. Pass `None` to unset time zone. use_earliest - If localizing an ambiguous datetime (say, due to daylight saving time), - determine whether to localize to the earliest datetime or not. - If None (the default), then ambiguous datetimes will raise. + Determine how to deal with ambiguous datetimes: + - None (default): raise; + - True: use the earliest datetime; + - False: use the latest datetime. Examples -------- @@ -1487,7 +1596,8 @@ def days(self) -> Expr: Returns ------- - A series of dtype Int64 + Expr + Expression of data type :class:`Int64`. Examples -------- @@ -1525,7 +1635,8 @@ def hours(self) -> Expr: Returns ------- - A series of dtype Int64 + Expr + Expression of data type :class:`Int64`. 
Examples -------- @@ -1564,7 +1675,8 @@ def minutes(self) -> Expr: Returns ------- - A series of dtype Int64 + Expr + Expression of data type :class:`Int64`. Examples -------- @@ -1603,7 +1715,8 @@ def seconds(self) -> Expr: Returns ------- - A series of dtype Int64 + Expr + Expression of data type :class:`Int64`. Examples -------- @@ -1646,7 +1759,8 @@ def milliseconds(self) -> Expr: Returns ------- - A series of dtype Int64 + Expr + Expression of data type :class:`Int64`. Examples -------- @@ -1693,7 +1807,8 @@ def microseconds(self) -> Expr: Returns ------- - A series of dtype Int64 + Expr + Expression of data type :class:`Int64`. Examples -------- @@ -1740,7 +1855,8 @@ def nanoseconds(self) -> Expr: Returns ------- - A series of dtype Int64 + Expr + Expression of data type :class:`Int64`. Examples -------- @@ -1817,7 +1933,8 @@ def offset_by(self, by: str) -> Expr: Returns ------- - Date/Datetime expression + Expr + Expression of data type :class:`Date` or :class:`Datetime`. Examples -------- @@ -1879,7 +1996,8 @@ def month_start(self) -> Expr: Returns ------- - Date/Datetime expression + Expr + Expression of data type :class:`Date` or :class:`Datetime`. Notes ----- @@ -1925,7 +2043,8 @@ def month_end(self) -> Expr: Returns ------- - Date/Datetime expression + Expr + Expression of data type :class:`Date` or :class:`Datetime`. Notes ----- @@ -1975,7 +2094,8 @@ def base_utc_offset(self) -> Expr: Returns ------- - Duration expression + Expr + Expression of data type :class:`Duration`. See Also -------- @@ -2009,7 +2129,8 @@ def dst_offset(self) -> Expr: Returns ------- - Duration expression + Expr + Expression of data type :class:`Duration`. 
See Also -------- diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index f113a20cba299..f923c169e6b48 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -5,6 +5,7 @@ import operator import os import random +import warnings from datetime import timedelta from functools import partial, reduce from typing import ( @@ -35,6 +36,7 @@ ) from polars.dependencies import _check_for_numpy from polars.dependencies import numpy as np +from polars.exceptions import PolarsInefficientApplyWarning, PolarsPanicError from polars.expr.array import ExprArrayNameSpace from polars.expr.binary import ExprBinaryNameSpace from polars.expr.categorical import ExprCatNameSpace @@ -48,7 +50,11 @@ parse_as_list_of_expressions, ) from polars.utils.convert import _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias, warn_closed_future_change +from polars.utils.deprecation import ( + deprecated, + deprecated_alias, + warn_closed_future_change, +) from polars.utils.meta import threadpool_size from polars.utils.various import sphinx_accessor @@ -322,7 +328,8 @@ def any(self, drop_nulls: bool = True) -> Self: Returns ------- - Boolean literal + Expr + Expression of data type :class:`Boolean`. Examples -------- @@ -374,7 +381,8 @@ def all(self, drop_nulls: bool = True) -> Self: Returns ------- - Boolean literal + Expr + Expression of data type :class:`Boolean`. Examples -------- @@ -464,7 +472,29 @@ def sqrt(self) -> Self: └──────────┘ """ - return self**0.5 + return self._from_pyexpr(self._pyexpr.sqrt()) + + def cbrt(self) -> Self: + """ + Compute the cube root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + """ + return self._from_pyexpr(self._pyexpr.cbrt()) def log10(self) -> Self: """ @@ -512,12 +542,12 @@ def exp(self) -> Self: def alias(self, name: str) -> Self: """ - Rename the output of an expression. + Rename the expression. Parameters ---------- name - New name. + The new name. See Also -------- @@ -527,41 +557,228 @@ def alias(self, name: str) -> Self: Examples -------- + Rename an expression to avoid overwriting an existing column. + >>> df = pl.DataFrame( ... { ... "a": [1, 2, 3], - ... "b": ["a", "b", None], + ... "b": ["x", "y", "z"], ... } ... ) - >>> df - shape: (3, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪══════╡ - │ 1 ┆ a │ - │ 2 ┆ b │ - │ 3 ┆ null │ - └─────┴──────┘ - >>> df.select( - ... pl.col("a").alias("bar"), - ... pl.col("b").alias("foo"), + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), ... ) - shape: (3, 2) - ┌─────┬──────┐ - │ bar ┆ foo │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪══════╡ - │ 1 ┆ a │ - │ 2 ┆ b │ - │ 3 ┆ null │ - └─────┴──────┘ + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... 
) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ """ return self._from_pyexpr(self._pyexpr.alias(name)) + def map_alias(self, function: Callable[[str], str]) -> Self: + """ + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + """ + return self._from_pyexpr(self._pyexpr.map_alias(function)) + + def prefix(self, prefix: str) -> Self: + """ + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + """ + return self._from_pyexpr(self._pyexpr.prefix(prefix)) + + def suffix(self, suffix: str) -> Self: + """ + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + """ + return self._from_pyexpr(self._pyexpr.suffix(suffix)) + + def keep_name(self) -> Self: + """ + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + """ + return self._from_pyexpr(self._pyexpr.keep_name()) + def exclude( self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], @@ -683,50 +900,6 @@ def exclude( f"Invalid input for `exclude`. Expected `str` or `DataType`, got {type(columns)!r}" ) - def keep_name(self) -> Self: - """ - Keep the original root name of the expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2], - ... "b": [3, 4], - ... } - ... ) - - Keep original column name to undo an alias operation. - - >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 3 │ - │ 18 ┆ 4 │ - └─────┴─────┘ - - Prevent - "DuplicateError: Column with name: 'literal' has more than one occurrences" - errors. - - >>> df.select((pl.lit(10) / pl.all()).keep_name()) - shape: (2, 2) - ┌──────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════════╡ - │ 10.0 ┆ 3.333333 │ - │ 5.0 ┆ 2.5 │ - └──────┴──────────┘ - - """ - return self._from_pyexpr(self._pyexpr.keep_name()) - def pipe( self, function: Callable[Concatenate[Expr, P], T], @@ -778,166 +951,6 @@ def pipe( ''' return function(self, *args, **kwargs) - def prefix(self, prefix: str) -> Self: - """ - Add a prefix to the root column name of the expression. - - Parameters - ---------- - prefix - Prefix to add to root column name. - - See Also - -------- - alias - map_alias - suffix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... } - ... 
) - >>> df - shape: (5, 4) - ┌─────┬────────┬─────┬────────┐ - │ A ┆ fruits ┆ B ┆ cars │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi │ - │ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 5 ┆ banana ┆ 1 ┆ beetle │ - └─────┴────────┴─────┴────────┘ - >>> df.select( - ... pl.all(), - ... pl.all().reverse().prefix("reverse_"), - ... ) - shape: (5, 8) - ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ - └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ - - """ # noqa: W505 - return self._from_pyexpr(self._pyexpr.prefix(prefix)) - - def suffix(self, suffix: str) -> Self: - """ - Add a suffix to the root column name of the expression. - - Parameters - ---------- - suffix - Suffix to add to root column name. - - See Also - -------- - alias - map_alias - prefix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... } - ... 
) - >>> df - shape: (5, 4) - ┌─────┬────────┬─────┬────────┐ - │ A ┆ fruits ┆ B ┆ cars │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi │ - │ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 5 ┆ banana ┆ 1 ┆ beetle │ - └─────┴────────┴─────┴────────┘ - >>> df.select( - ... pl.all(), - ... pl.all().reverse().suffix("_reverse"), - ... ) - shape: (5, 8) - ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ - └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ - - """ # noqa: W505 - return self._from_pyexpr(self._pyexpr.suffix(suffix)) - - def map_alias(self, function: Callable[[str], str]) -> Self: - """ - Rename the output of an expression by mapping a function over the root name. - - Parameters - ---------- - function - Function that maps root name to new name. - - See Also - -------- - alias - prefix - suffix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2], - ... "B": [3, 4], - ... } - ... ) - - >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( - ... pl.all().map_alias( - ... # Remove "_reverse" suffix and convert to lower case. - ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() - ... ) - ... 
) - shape: (2, 4) - ┌───────────┬───────────┬─────┬─────┐ - │ A_reverse ┆ B_reverse ┆ a ┆ b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═══════════╪═══════════╪═════╪═════╡ - │ 2 ┆ 4 ┆ 2 ┆ 4 │ - │ 1 ┆ 3 ┆ 1 ┆ 3 │ - └───────────┴───────────┴─────┴─────┘ - - - """ - return self._from_pyexpr(self._pyexpr.map_alias(function)) - def is_not(self) -> Self: """ Negate a boolean expression. @@ -1040,8 +1053,8 @@ def is_finite(self) -> Self: Returns ------- - out - Series of type Boolean + Expr + Expression of data type :class:`Boolean`. Examples -------- @@ -1071,8 +1084,8 @@ def is_infinite(self) -> Self: Returns ------- - out - Series of type Boolean + Expr + Expression of data type :class:`Boolean`. Examples -------- @@ -2012,7 +2025,7 @@ def arg_sort(self, *, descending: bool = False, nulls_last: bool = False) -> Sel Returns ------- Expr - Series of dtype UInt32. + Expression of data type :class:`UInt32`. Examples -------- @@ -2274,7 +2287,8 @@ def take( Returns ------- - Values taken by index + Expr + Expression of the same data type. Examples -------- @@ -3167,7 +3181,8 @@ def is_first(self) -> Self: Returns ------- - Boolean Series + Expr + Expression of data type :class:`Boolean`. Examples -------- @@ -3348,9 +3363,10 @@ def cut( self._pyexpr.cut(breaks, labels, left_closed, include_breaks) ) + @deprecated_alias(probs="q") def qcut( self, - probs: list[float], + q: list[float] | int, labels: list[str] | None = None, left_closed: bool = False, allow_duplicates: bool = False, @@ -3361,9 +3377,9 @@ def qcut( Parameters ---------- - probs - Probabilities for which to find the corresponding quantiles - For p in probs, we assume 0 <= p <= 1 + q + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. labels Labels to assign to bins. If given, the length must be len(probs) + 1. If computing over groups this must be set for now. 
@@ -3399,6 +3415,23 @@ def qcut( │ b ┆ 8 ┆ (4.5, inf] │ │ b ┆ 9 ┆ (4.5, inf] │ └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut(2)) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"]).over("g")) shape: (10, 3) ┌─────┬─────┬─────┐ @@ -3451,10 +3484,9 @@ def qcut( │ b ┆ 9 ┆ {inf,"(4.5, inf]"} │ └─────┴─────┴───────────────────────┘ """ + expr_f = self._pyexpr.qcut_uniform if isinstance(q, int) else self._pyexpr.qcut return self._from_pyexpr( - self._pyexpr.qcut( - probs, labels, left_closed, allow_duplicates, include_breaks - ) + expr_f(q, labels, left_closed, allow_duplicates, include_breaks) ) def rle(self) -> Self: @@ -3463,7 +3495,8 @@ def rle(self) -> Self: Returns ------- - A Struct Series containing "lengths" and "values" Fields + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". Examples -------- @@ -3726,7 +3759,7 @@ def apply( In a selection context, the function is applied by row. - >>> df.with_columns( + >>> df.with_columns( # doctest: +SKIP ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), ... 
) shape: (4, 3) @@ -3771,20 +3804,39 @@ def apply( """ # input x: Series of type list containing the group values + from polars.utils.udfs import warn_on_inefficient_apply + + try: + root_names = self.meta.root_names() + except PolarsPanicError: + # no root names for pl.col('*') + pass + else: + if root_names: + warn_on_inefficient_apply( + function, columns=root_names, apply_target="expr" + ) + if pass_name: def wrap_f(x: Series) -> Series: # pragma: no cover def inner(s: Series) -> Series: # pragma: no cover return function(s.alias(x.name)) - return x.apply(inner, return_dtype=return_dtype, skip_nulls=skip_nulls) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PolarsInefficientApplyWarning) + return x.apply( + inner, return_dtype=return_dtype, skip_nulls=skip_nulls + ) else: def wrap_f(x: Series) -> Series: # pragma: no cover - return x.apply( - function, return_dtype=return_dtype, skip_nulls=skip_nulls - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PolarsInefficientApplyWarning) + return x.apply( + function, return_dtype=return_dtype, skip_nulls=skip_nulls + ) if strategy == "thread_local": return self.map(wrap_f, agg_list=True, return_dtype=return_dtype) @@ -3856,13 +3908,14 @@ def flatten(self) -> Self: def explode(self) -> Self: """ - Explode a list Series. + Explode a list expression. This means that every item is expanded to a new row. Returns ------- - Exploded Series of same dtype + Expr + Expression with the data type of the list elements. See Also -------- @@ -4027,12 +4080,12 @@ def limit(self, n: int | Expr = 10) -> Self: def and_(self, *others: Any) -> Self: """ - Method equivalent of logical "and" operator ``expr & other & ...``. + Method equivalent of bitwise "and" operator ``expr & other & ...``. Parameters ---------- *others - One or more logical boolean expressions to evaluate/combine. + One or more integer or boolean expressions to evaluate/combine. 
Examples -------- @@ -4071,12 +4124,12 @@ def and_(self, *others: Any) -> Self: def or_(self, *others: Any) -> Self: """ - Method equivalent of logical "or" operator ``expr | other | ...``. + Method equivalent of bitwise "or" operator ``expr | other | ...``. Parameters ---------- *others - One or more logical boolean expressions to evaluate/combine. + One or more integer or boolean expressions to evaluate/combine. Examples -------- @@ -4655,7 +4708,7 @@ def pow(self, exponent: int | float | None | Series | Expr) -> Self: def xor(self, other: Any) -> Self: """ - Method equivalent of logical exclusive-or operator ``expr ^ other``. + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. Parameters ---------- @@ -4719,7 +4772,8 @@ def is_in(self, other: Expr | Collection[Any] | Series) -> Self: Returns ------- - Expr that evaluates to a Boolean Series. + Expr + Expression of data type :class:`Boolean`. Examples -------- @@ -4763,7 +4817,9 @@ def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: Returns ------- - Series of type List + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. Examples -------- @@ -4811,7 +4867,8 @@ def is_between( Returns ------- - Expr that evaluates to a Boolean Series. + Expr + Expression of data type :class:`Boolean`. Examples -------- @@ -5011,18 +5068,17 @@ def inspect(s: Series) -> Series: # pragma: no cover def interpolate(self, method: InterpolationMethod = "linear") -> Self: """ - Fill nulls with linear interpolation over missing values. - - Can also be used to regrid data to a new grid - see examples below. + Fill null values using interpolation. Parameters ---------- - method : {'linear', 'linear'} - Interpolation method + method : {'linear', 'nearest'} + Interpolation method. Examples -------- - >>> # Fill nulls with linear interpolation + Fill null values using linear interpolation. + >>> df = pl.DataFrame( ... { ... 
"a": [1, None, 3], @@ -5040,6 +5096,23 @@ def interpolate(self, method: InterpolationMethod = "linear") -> Self: │ 2 ┆ NaN │ │ 3 ┆ 3.0 │ └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + >>> df_original_grid = pl.DataFrame( ... { ... "grid_points": [1, 3, 10], @@ -7397,7 +7470,8 @@ def sin(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7421,7 +7495,8 @@ def cos(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7445,7 +7520,8 @@ def tan(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7469,7 +7545,8 @@ def arcsin(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7493,7 +7570,8 @@ def arccos(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7517,7 +7595,8 @@ def arctan(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7541,7 +7620,8 @@ def sinh(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7565,7 +7645,8 @@ def cosh(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7589,7 +7670,8 @@ def tanh(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. 
Examples -------- @@ -7613,7 +7695,8 @@ def arcsinh(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7637,7 +7720,8 @@ def arccosh(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7661,7 +7745,8 @@ def arctanh(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7685,7 +7770,8 @@ def degrees(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7717,7 +7803,8 @@ def radians(self) -> Self: Returns ------- - Series of dtype Float64 + Expr + Expression of data type :class:`Float64`. Examples -------- @@ -7755,9 +7842,10 @@ def reshape(self, dimensions: tuple[int, ...]) -> Self: Returns ------- Expr - If a single dimension is given, results in a flat Series of shape (len,). - If a multiple dimensions are given, results in a Series of Lists with shape - (rows, cols). + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). Examples -------- @@ -8206,7 +8294,8 @@ def value_counts(self, *, multithreaded: bool = False, sort: bool = False) -> Se Returns ------- - Dtype Struct + Expr + Expression of data type :class:`Struct`. Examples -------- @@ -8476,15 +8565,23 @@ def shrink_dtype(self) -> Self: """ return self._from_pyexpr(self._pyexpr.shrink_dtype()) + @deprecated( + "This method now does nothing. It has been superseded by the" + " `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically" + " caches expressions that are equal.", + version="0.18.9", + ) def cache(self) -> Self: """ Cache this expression so that it only is executed once per context. 
- This can actually hurt performance and can have a lot of contention. - It is advised not to use it until actually benchmarked on your problem. + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. """ - return self._from_pyexpr(self._pyexpr.cache()) + return self def map_dict( self, @@ -8505,6 +8602,7 @@ def map_dict( Dictionary containing the before/after values to map. default Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. Use ``pl.first()``, to keep the original value. return_dtype Set return dtype to override automatic return dtype determination. @@ -8830,6 +8928,9 @@ def inner_with_default(s: Series) -> Series: is_keys=False, ) + default_parsed = self._from_pyexpr( + parse_as_expression(default, str_as_lit=True) + ) return ( ( df.lazy() @@ -8849,7 +8950,7 @@ def inner_with_default(s: Series) -> Series: .select( F.when(F.col(is_remapped_column).is_not_null()) .then(F.col(remap_value_column)) - .otherwise(default) + .otherwise(default_parsed) .alias(column) ) ) diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 731a7f27757b7..ce1e1d6a3da36 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -7,7 +7,7 @@ from polars import functions as F from polars.utils._parse_expr_input import parse_as_expression from polars.utils._wrap import wrap_expr -from polars.utils.decorators import deprecated_alias +from polars.utils.deprecation import deprecated_alias if TYPE_CHECKING: from datetime import date, datetime, time @@ -437,7 +437,8 @@ def contains( Returns ------- - Boolean mask + Expr + Expression of data type :class:`Boolean`. 
Examples -------- @@ -471,7 +472,8 @@ def join(self, separator: str) -> Expr: Returns ------- - Series of dtype Utf8 + Expr + Expression of data type :class:`Utf8`. Examples -------- @@ -496,7 +498,9 @@ def arg_min(self) -> Expr: Returns ------- - Series of dtype UInt32/UInt64 (depending on compilation) + Expr + Expression of data type :class:`UInt32` or :class:`UInt64` + (depending on compilation). Examples -------- @@ -525,7 +529,9 @@ def arg_max(self) -> Expr: Returns ------- - Series of dtype UInt32/UInt64 (depending on compilation) + Expr + Expression of data type :class:`UInt32` or :class:`UInt64` + (depending on compilation). Examples -------- @@ -704,7 +710,8 @@ def explode(self) -> Expr: Returns ------- - Exploded column with the datatype of the list elements. + Expr + Expression with the data type of the list elements. See Also -------- diff --git a/py-polars/polars/expr/meta.py b/py-polars/polars/expr/meta.py index 9c4e518228540..3ac0568bf0769 100644 --- a/py-polars/polars/expr/meta.py +++ b/py-polars/polars/expr/meta.py @@ -59,9 +59,10 @@ def pop(self) -> list[Expr]: Returns ------- - A list of expressions which in most cases will have a unit length. - This is not the case when an expression has multiple inputs. - For instance in a ``fold`` expression. + list of Expr + A list of expressions which in most cases will have a unit length. + This is not the case when an expression has multiple inputs. + For instance in a ``fold`` expression. 
""" return [wrap_expr(e) for e in self._pyexpr.meta_pop()] diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index e4ef6fe1bee2f..21ad167422814 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -7,7 +7,7 @@ from polars.exceptions import ChronoFormatWarning from polars.utils._parse_expr_input import parse_as_expression from polars.utils._wrap import wrap_expr -from polars.utils.decorators import deprecated_alias +from polars.utils.deprecation import deprecated_alias, issue_deprecation_warning from polars.utils.various import find_stacklevel if TYPE_CHECKING: @@ -136,14 +136,13 @@ def to_datetime( """ _validate_format_argument(format) if utc is not None: - warnings.warn( + issue_deprecation_warning( "The `utc` argument is now a no-op and has no effect. " "You can safely remove it. " "Offset-naive strings are parsed as ``pl.Datetime(time_unit)``, " "and offset-aware strings are converted to " '``pl.Datetime(time_unit, "UTC")``.', - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.17.15", ) return wrap_expr( self._pyexpr.str_to_datetime( @@ -431,7 +430,8 @@ def concat(self, delimiter: str = "-") -> Expr: Returns ------- - Series of dtype Utf8 + Expr + Expression of data type :class:`Utf8`. Examples -------- @@ -989,10 +989,11 @@ def json_extract( def json_path_match(self, json_path: str) -> Expr: """ - Extract the first match of json string with provided JSONPath expression. + Extract the first match of JSON string with the provided JSONPath expression. - Throw errors if encounter invalid json strings. - All return value will be casted to Utf8 regardless of the original value. + Throws errors if invalid JSON strings are encountered. + All return values will be cast to :class:`Utf8` regardless of the original + value. Documentation on JSONPath standard can be found `here `_. @@ -1004,8 +1005,9 @@ def json_path_match(self, json_path: str) -> Expr: Returns ------- - Utf8 array. 
Contain null if original value is null or the json_path return - nothing. + Expr + Expression of data type :class:`Utf8`. Contains null values if original + value is null or the json_path returns nothing. Examples -------- @@ -1062,7 +1064,8 @@ def encode(self, encoding: TransferEncoding) -> Expr: Returns ------- - Utf8 array with values encoded using provided encoding + Expr + Expression of data type :class:`Utf8`. Examples -------- @@ -1135,7 +1138,9 @@ def extract(self, pattern: str, group_index: int = 1) -> Expr: Returns ------- - Utf8 array. Contain null if original value is null or regex capture nothing. + Expr + Expression of data type :class:`Utf8`. Contains null values if original + value is null or the regex captures nothing. Examples -------- @@ -1229,7 +1234,8 @@ def extract_all(self, pattern: str | Expr) -> Expr: Returns ------- - List[Utf8] + Expr + Expression of data type ``List(Utf8)``. Examples -------- @@ -1263,7 +1269,9 @@ def count_match(self, pattern: str) -> Expr: Returns ------- - UInt32 array. Contain null if original value is null or regex capture nothing. + Expr + Expression of data type :class:`UInt32`. Contains null values if the + original value is null or the regex captures nothing. Examples -------- @@ -1312,7 +1320,8 @@ def split(self, by: str, *, inclusive: bool = False) -> Expr: Returns ------- - List of Utf8 type + Expr + Expression of data type :class:`Utf8`. """ if inclusive: @@ -1336,6 +1345,12 @@ def split_exact(self, by: str, n: int, *, inclusive: bool = False) -> Expr: inclusive If True, include the split character/string in the results. + Returns + ------- + Expr + Expression of data type :class:`Struct` with fields of data type + :class:`Utf8`. 
+ Examples -------- >>> df = pl.DataFrame({"x": ["a_1", None, "c", "d_4"]}) @@ -1378,10 +1393,6 @@ def split_exact(self, by: str, n: int, *, inclusive: bool = False) -> Expr: │ d_4 ┆ d ┆ 4 │ └──────┴────────────┴─────────────┘ - Returns - ------- - Struct of Utf8 type - """ if inclusive: return wrap_expr(self._pyexpr.str_split_exact_inclusive(by, n)) @@ -1402,6 +1413,12 @@ def splitn(self, by: str, n: int) -> Expr: n Max number of items to return. + Returns + ------- + Expr + Expression of data type :class:`Struct` with fields of data type + :class:`Utf8`. + Examples -------- >>> df = pl.DataFrame({"s": ["foo bar", None, "foo-bar", "foo bar baz"]}) @@ -1441,10 +1458,6 @@ def splitn(self, by: str, n: int) -> Expr: │ foo bar baz ┆ foo ┆ bar baz │ └─────────────┴────────────┴─────────────┘ - Returns - ------- - Struct of Utf8 type - """ return wrap_expr(self._pyexpr.str_splitn(by, n)) @@ -1582,7 +1595,7 @@ def slice(self, offset: int, length: int | None = None) -> Expr: Returns ------- Expr - Series of dtype Utf8. + Expression of data type :class:`Utf8`. Examples -------- @@ -1628,7 +1641,8 @@ def explode(self) -> Expr: Returns ------- - Exploded column with string datatype. + Expr + Expression of data type :class:`Utf8`. Examples -------- @@ -1669,7 +1683,8 @@ def parse_int(self, radix: int = 2, *, strict: bool = True) -> Expr: Returns ------- - Expr : Series of parsed integers in i32 format + Expr + Expression of data type :class:`Int32`. 
Examples -------- diff --git a/py-polars/polars/expr/whenthen.py b/py-polars/polars/expr/whenthen.py new file mode 100644 index 0000000000000..36d071b4a6b7d --- /dev/null +++ b/py-polars/polars/expr/whenthen.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import polars.functions as F +from polars.expr.expr import Expr +from polars.utils._parse_expr_input import parse_as_expression +from polars.utils._wrap import wrap_expr +from polars.utils.deprecation import deprecated_alias, issue_deprecation_warning + +if TYPE_CHECKING: + from polars.polars import PyExpr + from polars.type_aliases import IntoExpr + + +class When: + """ + Utility class for the `when-then-otherwise` expression. + + Represents the initial state of the expression after ``pl.when(...)`` is called. + + In this state, ``then`` must be called to continue to finish the expression. + + """ + + def __init__(self, when: Any): + self._when = when + + @deprecated_alias(expr="statement") + def then(self, statement: IntoExpr) -> Then: + """ + Attach a statement to the corresponding condition. + + Parameters + ---------- + statement + The statement to apply if the corresponding condition is true. + Accepts expression input. Non-expression inputs are parsed as literals. + + """ + if isinstance(statement, str): + _warn_for_deprecated_string_input_behavior(statement) + statement_pyexpr = parse_as_expression(statement, str_as_lit=True) + return Then(self._when.then(statement_pyexpr)) + + +class Then(Expr): + """ + Utility class for the `when-then-otherwise` expression. + + Represents the state of the expression after ``pl.when(...).then(...)`` is called. 
+ + """ + + def __init__(self, then: Any): + self._then = then + + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Expr: # type: ignore[override] + return wrap_expr(pyexpr) + + @property + def _pyexpr(self) -> PyExpr: + return self._then.otherwise(F.lit(None)._pyexpr) + + @deprecated_alias(predicate="condition") + def when(self, condition: IntoExpr) -> ChainedWhen: + """ + Add a condition to the `when-then-otherwise` expression. + + Parameters + ---------- + condition + The condition for applying the subsequent statement. + Accepts a boolean expression. String input is parsed as a column name. + + """ + condition_pyexpr = parse_as_expression(condition) + return ChainedWhen(self._then.when(condition_pyexpr)) + + @deprecated_alias(expr="statement") + def otherwise(self, statement: IntoExpr) -> Expr: + """ + Define a default for the `when-then-otherwise` expression. + + Parameters + ---------- + statement + The statement to apply if all conditions are false. + Accepts expression input. Non-expression inputs are parsed as literals. + + """ + if isinstance(statement, str): + _warn_for_deprecated_string_input_behavior(statement) + statement_pyexpr = parse_as_expression(statement, str_as_lit=True) + return wrap_expr(self._then.otherwise(statement_pyexpr)) + + +class ChainedWhen(Expr): + """ + Utility class for the `when-then-otherwise` expression. + + Represents the state of the expression after an additional ``when`` is called. + + In this state, ``then`` must be called to continue to finish the expression. + + """ + + def __init__(self, chained_when: Any): + self._chained_when = chained_when + + @deprecated_alias(expr="statement") + def then(self, statement: IntoExpr) -> ChainedThen: + """ + Attach a statement to the corresponding condition. + + Parameters + ---------- + statement + The statement to apply if the corresponding condition is true. + Accepts expression input. Non-expression inputs are parsed as literals. 
+ + """ + if isinstance(statement, str): + _warn_for_deprecated_string_input_behavior(statement) + statement_pyexpr = parse_as_expression(statement, str_as_lit=True) + return ChainedThen(self._chained_when.then(statement_pyexpr)) + + +class ChainedThen(Expr): + """ + Utility class for the `when-then-otherwise` expression. + + Represents the state of the expression after an additional ``then`` is called. + + """ + + def __init__(self, chained_then: Any): + self._chained_then = chained_then + + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Expr: # type: ignore[override] + return wrap_expr(pyexpr) + + @property + def _pyexpr(self) -> PyExpr: + return self._chained_then.otherwise(F.lit(None)._pyexpr) + + @deprecated_alias(predicate="condition") + def when(self, condition: IntoExpr) -> ChainedWhen: + """ + Add another condition to the `when-then-otherwise` expression. + + Parameters + ---------- + condition + The condition for applying the subsequent statement. + Accepts a boolean expression. String input is parsed as a column name. + + """ + condition_pyexpr = parse_as_expression(condition) + return ChainedWhen(self._chained_then.when(condition_pyexpr)) + + @deprecated_alias(expr="statement") + def otherwise(self, statement: IntoExpr) -> Expr: + """ + Define a default for the `when-then-otherwise` expression. + + Parameters + ---------- + statement + The statement to apply if all conditions are false. + Accepts expression input. Non-expression inputs are parsed as literals. + + """ + if isinstance(statement, str): + _warn_for_deprecated_string_input_behavior(statement) + statement_pyexpr = parse_as_expression(statement, str_as_lit=True) + return wrap_expr(self._chained_then.otherwise(statement_pyexpr)) + + +def _warn_for_deprecated_string_input_behavior(input: str) -> None: + issue_deprecation_warning( + "in a future version, string input will be parsed as a column name rather than a string literal." 
+ f" To silence this warning, pass the input as an expression instead: `pl.lit({input!r})`", + version="0.18.9", + ) diff --git a/py-polars/polars/functions/__init__.py b/py-polars/polars/functions/__init__.py index eddb2cc5070df..136293a28610c 100644 --- a/py-polars/polars/functions/__init__.py +++ b/py-polars/polars/functions/__init__.py @@ -26,6 +26,8 @@ from polars.functions.lazy import ( apply, approx_unique, + arctan2, + arctan2d, arg_sort_by, arg_where, avg, @@ -61,7 +63,15 @@ tail, var, ) -from polars.functions.range import arange, date_range, int_range, int_ranges, time_range +from polars.functions.range import ( + arange, + date_range, + date_ranges, + int_range, + int_ranges, + time_range, + time_ranges, +) from polars.functions.repeat import ones, repeat, zeros from polars.functions.whenthen import when @@ -85,14 +95,18 @@ "arg_where", "concat", "date_range", + "date_ranges", "element", "ones", "repeat", "time_range", + "time_ranges", "zeros", # polars.functions.lazy "apply", "arange", + "arctan2", + "arctan2d", "arg_sort_by", "avg", "coalesce", diff --git a/py-polars/polars/functions/aggregation/vertical.py b/py-polars/polars/functions/aggregation/vertical.py index 85b3cc698dac2..c0115a7ac82c3 100644 --- a/py-polars/polars/functions/aggregation/vertical.py +++ b/py-polars/polars/functions/aggregation/vertical.py @@ -1,12 +1,10 @@ from __future__ import annotations -import warnings from typing import TYPE_CHECKING, Any, Iterable, overload import polars._reexport as pl import polars.functions as F -from polars.utils.decorators import deprecated_alias -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import deprecated_alias, issue_deprecation_warning if TYPE_CHECKING: from polars import Expr, Series @@ -91,10 +89,9 @@ def all( if exprs is None: return F.col("*") elif isinstance(exprs, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `all` is deprecated. 
Use `Series.all()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.7", ) return exprs.all() elif isinstance(exprs, str): @@ -163,10 +160,9 @@ def any( """ if not more_exprs: if isinstance(exprs, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `any` is deprecated. Use `Series.any()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.7", ) return exprs.any() elif isinstance(exprs, str): @@ -257,10 +253,9 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A """ if not more_exprs: if isinstance(exprs, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `max` is deprecated. Use `Series.max()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.7", ) return exprs.max() elif isinstance(exprs, str): @@ -353,10 +348,9 @@ def min( """ if not more_exprs: if isinstance(exprs, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `min` is deprecated. Use `Series.min()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.7", ) return exprs.min() elif isinstance(exprs, str): @@ -450,10 +444,9 @@ def sum( """ if not more_exprs: if isinstance(exprs, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `sum` is deprecated. Use `Series.sum()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.7", ) return exprs.sum() elif isinstance(exprs, str): @@ -524,10 +517,9 @@ def cumsum( """ if not more_exprs: if isinstance(exprs, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `cumsum` is deprecated. 
Use `Series.cumsum()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.7", ) return exprs.cumsum() elif isinstance(exprs, str): @@ -538,8 +530,7 @@ def cumsum( def _warn_for_deprecated_horizontal_use(name: str) -> None: - warnings.warn( + issue_deprecation_warning( f"using `{name}` for horizontal computation is deprecated. Use `{name}_horizontal` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.7", ) diff --git a/py-polars/polars/functions/as_datatype.py b/py-polars/polars/functions/as_datatype.py index d0c09179761f3..72fa9b622b515 100644 --- a/py-polars/polars/functions/as_datatype.py +++ b/py-polars/polars/functions/as_datatype.py @@ -1,7 +1,6 @@ from __future__ import annotations import contextlib -import warnings from typing import TYPE_CHECKING, Iterable, overload from polars import functions as F @@ -11,7 +10,7 @@ parse_as_list_of_expressions, ) from polars.utils._wrap import wrap_expr -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import issue_deprecation_warning with contextlib.suppress(ImportError): # Module not available when building docs import polars.polars as plr @@ -55,7 +54,8 @@ def datetime_( Returns ------- - Expr of type `pl.Datetime` + Expr + Expression of data type :class:`Datetime`. """ year_expr = parse_as_expression(year) @@ -103,7 +103,8 @@ def date_( Returns ------- - Expr of type pl.Date + Expr + Expression of data type :class:`Date`. """ return datetime_(year, month, day).cast(Date).alias("date") @@ -131,7 +132,8 @@ def time_( Returns ------- - Expr of type pl.Date + Expr + Expression of data type :class:`Date`. """ epoch_start = (1970, 1, 1) @@ -158,7 +160,8 @@ def duration( Returns ------- - Expr of type `pl.Duration` + Expr + Expression of data type :class:`Duration`. 
Examples -------- @@ -372,11 +375,10 @@ def struct( """ if "exprs" in named_exprs: - warnings.warn( + issue_deprecation_warning( "passing expressions to `struct` using the keyword argument `exprs` is" " deprecated. Use positional syntax instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.1", ) first_input = named_exprs.pop("exprs") pyexprs = parse_as_list_of_expressions(first_input, *exprs, **named_exprs) diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 3503a3b0b7913..0550bee581572 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1,7 +1,6 @@ from __future__ import annotations import contextlib -import warnings from datetime import date, datetime, time, timedelta from typing import TYPE_CHECKING, Any, Callable, Iterable, Sequence, overload @@ -27,7 +26,7 @@ _time_to_pl_time, _timedelta_to_pl_timedelta, ) -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import deprecated_alias, issue_deprecation_warning with contextlib.suppress(ImportError): # Module not available when building docs import polars.polars as plr @@ -322,10 +321,9 @@ def count(column: str | Series | None = None) -> Expr | int: return wrap_expr(plr.count()) if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `count` is deprecated. Use `Series.len()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) return column.len() return col(column).count() @@ -384,10 +382,9 @@ def std(column: str | Series, ddof: int = 1) -> Expr | float | None: """ if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `std` is deprecated. 
Use `Series.std()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) return column.std(ddof) return col(column).std(ddof) @@ -433,10 +430,9 @@ def var(column: str | Series, ddof: int = 1) -> Expr | float | None: """ if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `var` is deprecated. Use `Series.var()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) return column.var(ddof) return col(column).var(ddof) @@ -471,10 +467,9 @@ def mean(column: str | Series) -> Expr | float | None: """ if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `mean` is deprecated. Use `Series.mean()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) return column.mean() return col(column).mean() @@ -540,10 +535,9 @@ def median(column: str | Series) -> Expr | float | int | None: """ if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `median` is deprecated. Use `Series.median()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) return column.median() return col(column).median() @@ -578,10 +572,9 @@ def n_unique(column: str | Series) -> Expr | int: """ if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `n_unique` is deprecated. Use `Series.n_unique()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) return column.n_unique() return col(column).n_unique() @@ -673,10 +666,9 @@ def first(column: str | Series | None = None) -> Expr | Any: return wrap_expr(plr.first()) if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `first` is deprecated. 
Use `series[0]` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) if column.len() > 0: return column[0] @@ -739,10 +731,9 @@ def last(column: str | Series | None = None) -> Expr: return wrap_expr(plr.last()) if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `last` is deprecated. Use `series[-1]` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) if column.len() > 0: return column[-1] @@ -799,10 +790,9 @@ def head(column: str | Series, n: int = 10) -> Expr | Series: """ if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `head` is deprecated. Use `Series.head()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) return column.head(n) return col(column).head(n) @@ -856,10 +846,9 @@ def tail(column: str | Series, n: int = 10) -> Expr | Series: """ if isinstance(column, pl.Series): - warnings.warn( + issue_deprecation_warning( "passing a Series to `tail` is deprecated. Use `Series.tail()` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) return column.tail(n) return col(column).tail(n) @@ -1114,6 +1103,7 @@ def map( Returns ------- Expr + Expression with the data type given by ``return_dtype``. Examples -------- @@ -1190,6 +1180,7 @@ def apply( Returns ------- Expr + Expression with the data type given by ``return_dtype``. Examples -------- @@ -1214,7 +1205,9 @@ def apply( Calculate product of ``a``. - >>> df.with_columns(pl.col("a").apply(lambda x: x * x).alias("product_a")) + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").apply(lambda x: x * x).alias("product_a") + ... ) shape: (4, 3) ┌─────┬─────┬───────────┐ │ a ┆ b ┆ product_a │ @@ -1550,6 +1543,100 @@ def cumreduce( return wrap_expr(plr.cumreduce(function, exprs)) +def arctan2(y: str | Expr, x: str | Expr) -> Expr: + """ + Compute two argument arctan in radians. 
+ + Returns the angle (in radians) in the plane between the + positive x-axis and the ray from the origin to (x,y). + + Parameters + ---------- + y + Column name or Expression. + x + Column name or Expression. + + Examples + -------- + >>> import math + >>> twoRootTwo = math.sqrt(2) / 2 + >>> df = pl.DataFrame( + ... { + ... "y": [twoRootTwo, -twoRootTwo, twoRootTwo, -twoRootTwo], + ... "x": [twoRootTwo, twoRootTwo, -twoRootTwo, -twoRootTwo], + ... } + ... ) + >>> df.select( + ... pl.arctan2d("y", "x").alias("atan2d"), pl.arctan2("y", "x").alias("atan2") + ... ) + shape: (4, 2) + ┌────────┬───────────┐ + │ atan2d ┆ atan2 │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞════════╪═══════════╡ + │ 45.0 ┆ 0.785398 │ + │ -45.0 ┆ -0.785398 │ + │ 135.0 ┆ 2.356194 │ + │ -135.0 ┆ -2.356194 │ + └────────┴───────────┘ + + """ + if isinstance(y, str): + y = col(y) + if isinstance(x, str): + x = col(x) + return wrap_expr(plr.arctan2(y._pyexpr, x._pyexpr)) + + +def arctan2d(y: str | Expr, x: str | Expr) -> Expr: + """ + Compute two argument arctan in degrees. + + Returns the angle (in degrees) in the plane between the positive x-axis + and the ray from the origin to (x,y). + + Parameters + ---------- + y + Column name or Expression. + x + Column name or Expression. + + Examples + -------- + >>> import math + >>> twoRootTwo = math.sqrt(2) / 2 + >>> df = pl.DataFrame( + ... { + ... "y": [twoRootTwo, -twoRootTwo, twoRootTwo, -twoRootTwo], + ... "x": [twoRootTwo, twoRootTwo, -twoRootTwo, -twoRootTwo], + ... } + ... ) + >>> df.select( + ... pl.arctan2d("y", "x").alias("atan2d"), pl.arctan2("y", "x").alias("atan2") + ... 
) + shape: (4, 2) + ┌────────┬───────────┐ + │ atan2d ┆ atan2 │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞════════╪═══════════╡ + │ 45.0 ┆ 0.785398 │ + │ -45.0 ┆ -0.785398 │ + │ 135.0 ┆ 2.356194 │ + │ -135.0 ┆ -2.356194 │ + └────────┴───────────┘ + + """ + if isinstance(y, str): + y = col(y) + if isinstance(x, str): + x = col(x) + return wrap_expr(plr.arctan2d(y._pyexpr, x._pyexpr)) + + def exclude( columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType, @@ -1719,6 +1806,7 @@ def arg_sort_by( return wrap_expr(plr.arg_sort_by(exprs, descending)) +@deprecated_alias(common_subplan_elimination="comm_subplan_elim") def collect_all( lazy_frames: Sequence[LazyFrame], *, @@ -1728,13 +1816,14 @@ def collect_all( simplify_expression: bool = True, no_optimization: bool = False, slice_pushdown: bool = True, - common_subplan_elimination: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, streaming: bool = False, ) -> list[DataFrame]: """ Collect multiple LazyFrames at the same time. - This runs all the computation graphs in parallel on Polars threadpool. + This runs all the computation graphs in parallel on the Polars threadpool. Parameters ---------- @@ -1752,21 +1841,25 @@ def collect_all( Turn off optimizations. slice_pushdown Slice pushdown optimization. - common_subplan_elimination + comm_subplan_elim Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. streaming Run parts of the query in a streaming fashion (this is in an alpha state) Returns ------- - List[DataFrame] + list of DataFrames + The collected DataFrames, returned in the same order as the input LazyFrames. 
""" if no_optimization: predicate_pushdown = False projection_pushdown = False slice_pushdown = False - common_subplan_elimination = False + comm_subplan_elim = False + comm_subexpr_elim = False prepared = [] @@ -1777,7 +1870,8 @@ def collect_all( projection_pushdown, simplify_expression, slice_pushdown, - common_subplan_elimination, + comm_subplan_elim, + comm_subexpr_elim, streaming, ) prepared.append(ldf) diff --git a/py-polars/polars/functions/range.py b/py-polars/polars/functions/range.py index 0c9c79c19a07f..1e4cdddb535d3 100644 --- a/py-polars/polars/functions/range.py +++ b/py-polars/polars/functions/range.py @@ -1,21 +1,17 @@ from __future__ import annotations import contextlib -import warnings from datetime import time, timedelta from typing import TYPE_CHECKING, overload -import polars._reexport as pl from polars import functions as F from polars.datatypes import Int64 from polars.utils._parse_expr_input import parse_as_expression -from polars.utils._wrap import wrap_expr, wrap_s +from polars.utils._wrap import wrap_expr from polars.utils.convert import ( - _time_to_pl_time, _timedelta_to_pl_duration, ) -from polars.utils.decorators import deprecated_alias -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import deprecated_alias, issue_deprecation_warning with contextlib.suppress(ImportError): # Module not available when building docs import polars.polars as plr @@ -99,8 +95,8 @@ def arange( dtype Data type of the resulting column. Defaults to ``Int64``. eager - Evaluate immediately and return a ``Series``. If set to ``False`` (default), - return an expression instead. + Evaluate immediately and return a ``Series``. + If set to ``False`` (default), return an expression instead. 
See Also -------- @@ -121,14 +117,13 @@ def arange( """ # This check is not water-proof, but we cannot check for literal expressions here if not (isinstance(start, int) and isinstance(end, int)): - warnings.warn( + issue_deprecation_warning( " `arange` has been replaced by two new functions:" " `int_range` for generating a single range," " and `int_ranges` for generating a list column with multiple ranges." " `arange` will remain available as an alias for `int_range`, which means its behaviour will change." " To silence this warning, use either of the new functions.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.5", ) start = parse_as_expression(start) @@ -201,12 +196,13 @@ def int_range( dtype Data type of the range. Defaults to ``Int64``. eager - Evaluate immediately and return a ``Series``. If set to ``False`` (default), - return an expression instead. + Evaluate immediately and return a ``Series``. + If set to ``False`` (default), return an expression instead. Returns ------- - Column of data type ``Int64``. + Expr or Series + Column of data type :class:`Int64`. See Also -------- @@ -292,12 +288,13 @@ def int_ranges( dtype Integer data type of the ranges. Defaults to ``Int64``. eager - Evaluate immediately and return a ``Series``. If set to ``False`` (default), - return an expression instead. + Evaluate immediately and return a ``Series``. + If set to ``False`` (default), return an expression instead. Returns ------- - Column of data type ``List(dtype)``. + Expr or Series + Column of data type ``List(dtype)``. 
See Also -------- @@ -330,8 +327,8 @@ def int_ranges( @overload def date_range( - start: date | datetime | Expr | str, - end: date | datetime | Expr | str, + start: date | datetime | IntoExpr, + end: date | datetime | IntoExpr, interval: str | timedelta = ..., *, closed: ClosedInterval = ..., @@ -345,8 +342,8 @@ def date_range( @overload def date_range( - start: date | datetime | Expr | str, - end: date | datetime | Expr | str, + start: date | datetime | IntoExpr, + end: date | datetime | IntoExpr, interval: str | timedelta = ..., *, closed: ClosedInterval = ..., @@ -360,8 +357,8 @@ def date_range( @overload def date_range( - start: date | datetime | Expr | str, - end: date | datetime | Expr | str, + start: date | datetime | IntoExpr, + end: date | datetime | IntoExpr, interval: str | timedelta = ..., *, closed: ClosedInterval = ..., @@ -375,8 +372,8 @@ def date_range( @deprecated_alias(low="start", high="end") def date_range( - start: date | datetime | Expr | str, - end: date | datetime | Expr | str, + start: date | datetime | IntoExpr, + end: date | datetime | IntoExpr, interval: str | timedelta = "1d", *, closed: ClosedInterval = "both", @@ -386,39 +383,45 @@ def date_range( name: str | None = None, ) -> Series | Expr: """ - Create a range of type `Datetime` (or `Date`). + Generate a date range. Parameters ---------- start - Lower bound of the date range, given as a date, datetime, Expr, or column name. + Lower bound of the date range. end - Upper bound of the date range, given as a date, datetime, Expr, or column name. + Upper bound of the date range. interval - Interval of the range periods; can be a python timedelta object like - ``timedelta(days=10)`` or a polars duration string, such as ``3d12h4m25s`` - (representing 3 days, 12 hours, 4 minutes, and 25 seconds). Append - ``_saturating`` to the interval string to restrict resulting invalid dates to - valid ranges. 
+ Interval of the range periods, specified as a Python ``timedelta`` object + or a Polars duration string like ``1h30m25s``. + + Append ``_saturating`` to the interval string to restrict resulting invalid + dates to valid ranges. It is common to attempt to create a month-end date series by using the "1mo" offset string with a start date at the end of the month. This will not produce the desired results. See Note #2 below for further information. closed : {'both', 'left', 'right', 'none'} - Define whether the temporal window interval is closed or not. + Define which sides of the range are closed (inclusive). time_unit : {None, 'ns', 'us', 'ms'} - Set the time unit. Only takes effect if output is of ``Datetime`` type. - time_zone: - Optional timezone. Only takes effect if output is of ``Datetime`` type. + Time unit of the resulting ``Datetime`` data type. + Only takes effect if the output column is of type ``Datetime``. + time_zone + Time zone of the resulting ``Datetime`` data type. + Only takes effect if the output column is of type ``Datetime``. eager - Evaluate immediately and return a ``Series``. If set to ``False`` (default), - return an expression instead. + Evaluate immediately and return a ``Series``. + If set to ``False`` (default), return an expression instead. name Name of the output column. .. deprecated:: 0.18.0 This argument is deprecated. Use the ``alias`` method instead. + Returns + ------- + Expr or Series + Column of data type :class:`Date` or :class:`Datetime`. Notes ----- @@ -460,13 +463,9 @@ def date_range( 2023-05-31 ] - Returns - ------- - A Series of type `Datetime` or `Date`. 
- Examples -------- - Using polars duration string to specify the interval: + Using Polars duration string to specify the interval: >>> from datetime import date >>> pl.date_range(date(2022, 1, 1), date(2022, 3, 1), "1mo", eager=True) @@ -478,7 +477,7 @@ def date_range( 2022-03-01 ] - Using `timedelta` object to specify the interval: + Using ``timedelta`` object to specify the interval: >>> from datetime import datetime, timedelta >>> pl.date_range( @@ -500,7 +499,7 @@ def date_range( 1985-01-10 00:00:00 ] - Specify a time zone + Specifying a time zone: >>> pl.date_range( ... datetime(2022, 1, 1), @@ -519,11 +518,9 @@ def date_range( Combine with ``month_end`` to get the last day of the month: - >>> ( - ... pl.date_range( - ... datetime(2022, 1, 1), datetime(2022, 3, 1), "1mo", eager=True - ... ).dt.month_end() - ... ) + >>> pl.date_range( + ... datetime(2022, 1, 1), datetime(2022, 3, 1), "1mo", eager=True + ... ).dt.month_end() shape: (3,) Series: 'date' [datetime[μs]] [ @@ -534,47 +531,169 @@ def date_range( """ if name is not None: - warnings.warn( + issue_deprecation_warning( "the `name` argument is deprecated. 
Use the `alias` method instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.0", ) - if isinstance(interval, timedelta): - interval = _timedelta_to_pl_duration(interval) - elif " " in interval: - interval = interval.replace(" ", "") + interval = _parse_interval_argument(interval) + if time_unit is None and "ns" in interval: + time_unit = "ns" - time_unit_: TimeUnit | None - if time_unit is not None: - time_unit_ = time_unit - elif "ns" in interval: - time_unit_ = "ns" - else: - time_unit_ = None - - start_pl = parse_as_expression(start) - end_pl = parse_as_expression(end) - dt_range = wrap_expr( - plr.date_range_lazy(start_pl, end_pl, interval, closed, time_unit_, time_zone) + start_pyexpr = parse_as_expression(start) + end_pyexpr = parse_as_expression(end) + result = wrap_expr( + plr.date_range(start_pyexpr, end_pyexpr, interval, closed, time_unit, time_zone) ) + if name is not None: - dt_range = dt_range.alias(name) + result = result.alias(name) + + if eager: + s = F.select(result).to_series() + if s.len() == 1: + s = s.explode().set_sorted() + else: + _warn_for_deprecation_date_range() + return s + + _warn_for_deprecation_date_range() + + return result + + +@overload +def date_ranges( + start: date | datetime | IntoExpr, + end: date | datetime | IntoExpr, + interval: str | timedelta = ..., + *, + closed: ClosedInterval = ..., + time_unit: TimeUnit | None = ..., + time_zone: str | None = ..., + eager: Literal[False] = ..., +) -> Expr: + ... + + +@overload +def date_ranges( + start: date | datetime | IntoExpr, + end: date | datetime | IntoExpr, + interval: str | timedelta = ..., + *, + closed: ClosedInterval = ..., + time_unit: TimeUnit | None = ..., + time_zone: str | None = ..., + eager: Literal[True], +) -> Series: + ... 
+ + +@overload +def date_ranges( + start: date | datetime | IntoExpr, + end: date | datetime | IntoExpr, + interval: str | timedelta = ..., + *, + closed: ClosedInterval = ..., + time_unit: TimeUnit | None = ..., + time_zone: str | None = ..., + eager: bool, +) -> Series | Expr: + ... - if ( - not eager - or isinstance(start_pl, (str, pl.Expr)) - or isinstance(end_pl, (str, pl.Expr)) - ): - return dt_range - res = F.select(dt_range).to_series().explode().set_sorted() - return res + +def date_ranges( + start: date | datetime | IntoExpr, + end: date | datetime | IntoExpr, + interval: str | timedelta = "1d", + *, + closed: ClosedInterval = "both", + time_unit: TimeUnit | None = None, + time_zone: str | None = None, + eager: bool = False, +) -> Series | Expr: + """ + Create a column of date ranges. + + Parameters + ---------- + start + Lower bound of the date range. + end + Upper bound of the date range. + interval + Interval of the range periods, specified as a Python ``timedelta`` object + or a Polars duration string like ``1h30m25s``. + + Append ``_saturating`` to the interval string to restrict resulting invalid + dates to valid ranges. + + It is common to attempt to create a month-end date series by using the "1mo" + offset string with a start date at the end of the month. This will not produce + the desired results. See Note #2 below for further information. + closed : {'both', 'left', 'right', 'none'} + Define which sides of the range are closed (inclusive). + time_unit : {None, 'ns', 'us', 'ms'} + Time unit of the resulting ``Datetime`` data type. + Only takes effect if the output column is of type ``Datetime``. + time_zone + Time zone of the resulting ``Datetime`` data type. + Only takes effect if the output column is of type ``Datetime``. + eager + Evaluate immediately and return a ``Series``. + If set to ``False`` (default), return an expression instead. + + Returns + ------- + Expr or Series + Column of data type ``List(Date)`` or ``List(Datetime)``. 
+ + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "start": [date(2022, 1, 1), date(2022, 1, 2)], + ... "end": date(2022, 1, 3), + ... } + ... ) + >>> df.with_columns(pl.date_ranges("start", "end")) + shape: (2, 3) + ┌────────────┬────────────┬───────────────────────────────────┐ + │ start ┆ end ┆ date_range │ + │ --- ┆ --- ┆ --- │ + │ date ┆ date ┆ list[date] │ + ╞════════════╪════════════╪═══════════════════════════════════╡ + │ 2022-01-01 ┆ 2022-01-03 ┆ [2022-01-01, 2022-01-02, 2022-01… │ + │ 2022-01-02 ┆ 2022-01-03 ┆ [2022-01-02, 2022-01-03] │ + └────────────┴────────────┴───────────────────────────────────┘ + + """ + interval = _parse_interval_argument(interval) + if time_unit is None and "ns" in interval: + time_unit = "ns" + + start_pyexpr = parse_as_expression(start) + end_pyexpr = parse_as_expression(end) + + result = wrap_expr( + plr.date_ranges( + start_pyexpr, end_pyexpr, interval, closed, time_unit, time_zone + ) + ) + + if eager: + return F.select(result).to_series() + + return result @overload def time_range( - start: time | Expr | str | None = ..., - end: time | Expr | str | None = ..., + start: time | IntoExpr | None = ..., + end: time | IntoExpr | None = ..., interval: str | timedelta = ..., *, closed: ClosedInterval = ..., @@ -586,8 +705,8 @@ def time_range( @overload def time_range( - start: time | Expr | str | None = ..., - end: time | Expr | str | None = ..., + start: time | IntoExpr | None = ..., + end: time | IntoExpr | None = ..., interval: str | timedelta = ..., *, closed: ClosedInterval = ..., @@ -599,8 +718,8 @@ def time_range( @overload def time_range( - start: time | Expr | str | None = ..., - end: time | Expr | str | None = ..., + start: time | IntoExpr | None = ..., + end: time | IntoExpr | None = ..., interval: str | timedelta = ..., *, closed: ClosedInterval = ..., @@ -611,8 +730,8 @@ def time_range( def time_range( - start: time | Expr | str | None = None, - end: time | Expr | str | None 
= None, + start: time | IntoExpr | None = None, + end: time | IntoExpr | None = None, interval: str | timedelta = "1h", *, closed: ClosedInterval = "both", @@ -620,25 +739,24 @@ name: str | None = None, ) -> Series | Expr: """ - Create a range of type `Time`. + Generate a time range. Parameters ---------- start - Lower bound of the time range, given as a time, Expr, or column name. - If omitted, will default to ``time(0,0,0,0)``. + Lower bound of the time range. + If omitted, defaults to ``time(0,0,0,0)``. end - Upper bound of the time range, given as a time, Expr, or column name. - If omitted, will default to ``time(23,59,59,999999)``. + Upper bound of the time range. + If omitted, defaults to ``time(23,59,59,999999)``. interval - Interval of the range periods; can be a python timedelta object like - ``timedelta(minutes=10)`` or a polars duration string, such as ``1h30m25s`` - (representing 1 hour, 30 minutes, and 25 seconds). + Interval of the range periods, specified as a Python ``timedelta`` object + or a Polars duration string like ``1h30m25s``. closed : {'both', 'left', 'right', 'none'} - Define whether the temporal window interval is closed or not. + Define which sides of the range are closed (inclusive). eager - Evaluate immediately and return a ``Series``. If set to ``False`` (default), - return an expression instead. + Evaluate immediately and return a ``Series``. + If set to ``False`` (default), return an expression instead. name Name of the output column. @@ -647,12 +765,15 @@ Returns ------- - A Series of type `Time`. + Expr or Series + Column of data type :class:`Time`. - Examples + See Also -------- - Create a Series that starts at 14:00, with intervals of 3 hours and 15 mins: + time_ranges : Create a column of time ranges. + Examples + -------- >>> from datetime import time >>> pl.time_range( ... 
start=time(14, 0), @@ -668,77 +789,182 @@ def time_range( 23:45:00 ] - Generate a DataFrame with two columns made of eager ``time_range`` Series, - and create a third column using ``time_range`` in expression context: + """ + if name is not None: + issue_deprecation_warning( + "the `name` argument is deprecated. Use the `alias` method instead.", + version="0.18.0", + ) - >>> lf = pl.LazyFrame( + interval = _parse_interval_argument(interval) + for unit in ("y", "mo", "w", "d"): + if unit in interval: + raise ValueError(f"invalid interval unit for time_range: found {unit!r}") + + if start is None: + start = time(0, 0, 0) + if end is None: + end = time(23, 59, 59, 999999) + + start_pyexpr = parse_as_expression(start) + end_pyexpr = parse_as_expression(end) + + result = wrap_expr(plr.time_range(start_pyexpr, end_pyexpr, interval, closed)) + + if name is not None: + result = result.alias(name) + + if eager: + s = F.select(result).to_series() + if s.len() == 1: + s = s.explode().set_sorted() + else: + _warn_for_deprecation_time_range() + return s + + _warn_for_deprecation_time_range() + + return result + + +@overload +def time_ranges( + start: time | IntoExpr | None = ..., + end: time | IntoExpr | None = ..., + interval: str | timedelta = ..., + *, + closed: ClosedInterval = ..., + eager: Literal[False] = ..., +) -> Expr: + ... + + +@overload +def time_ranges( + start: time | IntoExpr | None = ..., + end: time | IntoExpr | None = ..., + interval: str | timedelta = ..., + *, + closed: ClosedInterval = ..., + eager: Literal[True], +) -> Series: + ... + + +@overload +def time_ranges( + start: time | IntoExpr | None = ..., + end: time | IntoExpr | None = ..., + interval: str | timedelta = ..., + *, + closed: ClosedInterval = ..., + eager: bool, +) -> Series | Expr: + ... 
+ + +def time_ranges( + start: time | IntoExpr | None = None, + end: time | IntoExpr | None = None, + interval: str | timedelta = "1h", + *, + closed: ClosedInterval = "both", + eager: bool = False, +) -> Series | Expr: + """ + Create a column of time ranges. + + Parameters + ---------- + start + Lower bound of the time range. + If omitted, defaults to ``time(0, 0, 0, 0)``. + end + Upper bound of the time range. + If omitted, defaults to ``time(23, 59, 59, 999999)``. + interval + Interval of the range periods, specified as a Python ``timedelta`` object + or a Polars duration string like ``1h30m25s``. + closed : {'both', 'left', 'right', 'none'} + Define which sides of the range are closed (inclusive). + eager + Evaluate immediately and return a ``Series``. + If set to ``False`` (default), return an expression instead. + + Returns + ------- + Expr or Series + Column of data type ``List(Time)``. + + See Also + -------- + time_range : Generate a single time range. + + Examples + -------- + >>> from datetime import time + >>> df = pl.DataFrame( ... { - ... "start": pl.time_range(interval="6h", eager=True), - ... "stop": pl.time_range(start=time(2, 59), interval="5h59m", eager=True), + ... "start": [time(9, 0), time(10, 0)], + ... "end": time(11, 0), ... } - ... ).with_columns( - ... intervals=pl.time_range("start", "stop", interval="1h29m", eager=False) ... 
) - >>> lf.collect() - shape: (4, 3) + >>> df.with_columns(pl.time_ranges("start", "end")) + shape: (2, 3) ┌──────────┬──────────┬────────────────────────────────┐ - │ start ┆ stop ┆ intervals │ + │ start ┆ end ┆ time_range │ │ --- ┆ --- ┆ --- │ │ time ┆ time ┆ list[time] │ ╞══════════╪══════════╪════════════════════════════════╡ - │ 00:00:00 ┆ 02:59:00 ┆ [00:00:00, 01:29:00, 02:58:00] │ - │ 06:00:00 ┆ 08:58:00 ┆ [06:00:00, 07:29:00, 08:58:00] │ - │ 12:00:00 ┆ 14:57:00 ┆ [12:00:00, 13:29:00] │ - │ 18:00:00 ┆ 20:56:00 ┆ [18:00:00, 19:29:00] │ + │ 09:00:00 ┆ 11:00:00 ┆ [09:00:00, 10:00:00, 11:00:00] │ + │ 10:00:00 ┆ 11:00:00 ┆ [10:00:00, 11:00:00] │ └──────────┴──────────┴────────────────────────────────┘ """ - if name is not None: - warnings.warn( - "the `name` argument is deprecated. Use the `alias` method instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), - ) - - if isinstance(interval, timedelta): - interval = _timedelta_to_pl_duration(interval) - elif " " in interval: - interval = interval.replace(" ", "").lower() - + interval = _parse_interval_argument(interval) for unit in ("y", "mo", "w", "d"): if unit in interval: raise ValueError(f"invalid interval unit for time_range: found {unit!r}") - default_start = time(0, 0, 0) - default_end = time(23, 59, 59, 999999) - if ( - not eager - or isinstance(start, (str, pl.Expr)) - or isinstance(end, (str, pl.Expr)) - ): - start_expr = ( - F.lit(default_start)._pyexpr - if start is None - else parse_as_expression(start) - ) + if start is None: + start = time(0, 0, 0) + if end is None: + end = time(23, 59, 59, 999999) - end_expr = ( - F.lit(default_end)._pyexpr if end is None else parse_as_expression(end) - ) + start_pyexpr = parse_as_expression(start) + end_pyexpr = parse_as_expression(end) - tm_expr = wrap_expr(plr.time_range_lazy(start_expr, end_expr, interval, closed)) - if name is not None: - tm_expr = tm_expr.alias(name) - return tm_expr - else: - tm_srs = wrap_s( - plr.time_range_eager( - 
_time_to_pl_time(default_start if start is None else start), - _time_to_pl_time(default_end if end is None else end), - interval, - closed, - ) - ) - if name is not None: - tm_srs = tm_srs.alias(name) - return tm_srs + result = wrap_expr(plr.time_ranges(start_pyexpr, end_pyexpr, interval, closed)) + + if eager: + return F.select(result).to_series() + + return result + + +def _parse_interval_argument(interval: str | timedelta) -> str: + """Parse the interval argument as a Polars duration string.""" + if isinstance(interval, timedelta): + return _timedelta_to_pl_duration(interval) + + if " " in interval: + interval = interval.replace(" ", "") + return interval.lower() + + +def _warn_for_deprecation_date_range() -> None: + issue_deprecation_warning( + "behavior of `date_range` will change in a future version." + " The result will be a single range of type Date or Datetime instead of List." + " Use the new `date_ranges` function to retain the old functionality and silence this warning.", + version="0.18.9", + ) + + +def _warn_for_deprecation_time_range() -> None: + issue_deprecation_warning( + "behavior of `time_range` will change in a future version." + " The result will be a single range of type Time instead of List." 
+ " Use the new `date_ranges` function to retain the old functionality and silence this warning.", + version="0.18.9", + ) diff --git a/py-polars/polars/functions/repeat.py b/py-polars/polars/functions/repeat.py index 99ef7e4ba41d4..61ae46c0ea5e6 100644 --- a/py-polars/polars/functions/repeat.py +++ b/py-polars/polars/functions/repeat.py @@ -1,14 +1,13 @@ from __future__ import annotations import contextlib -import warnings from typing import TYPE_CHECKING, overload from polars import functions as F from polars.datatypes import Float64 from polars.utils._parse_expr_input import parse_as_expression from polars.utils._wrap import wrap_expr -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import issue_deprecation_warning with contextlib.suppress(ImportError): # Module not available when building docs import polars.polars as plr @@ -18,17 +17,13 @@ from typing import Literal from polars import Expr, Series - from polars.type_aliases import ( - IntoExpr, - PolarsDataType, - PolarsExprType, - ) + from polars.type_aliases import IntoExpr, PolarsDataType @overload def repeat( value: IntoExpr | None, - n: int | PolarsExprType, + n: int | Expr, *, dtype: PolarsDataType | None = ..., eager: Literal[False] = ..., @@ -40,7 +35,7 @@ def repeat( @overload def repeat( value: IntoExpr | None, - n: int | PolarsExprType, + n: int | Expr, *, dtype: PolarsDataType | None = ..., eager: Literal[True], @@ -52,7 +47,7 @@ def repeat( @overload def repeat( value: IntoExpr | None, - n: int | PolarsExprType, + n: int | Expr, *, dtype: PolarsDataType | None = ..., eager: bool, @@ -63,7 +58,7 @@ def repeat( def repeat( value: IntoExpr | None, - n: int | PolarsExprType, + n: int | Expr, *, dtype: PolarsDataType | None = None, eager: bool = False, @@ -126,10 +121,9 @@ def repeat( """ if name is not None: - warnings.warn( + issue_deprecation_warning( "the `name` argument is deprecated. 
Use the `alias` method instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.0", ) if isinstance(n, int): @@ -145,7 +139,7 @@ def repeat( @overload def ones( - n: int | PolarsExprType, + n: int | Expr, dtype: PolarsDataType = ..., *, eager: Literal[False] = ..., @@ -155,7 +149,7 @@ def ones( @overload def ones( - n: int | PolarsExprType, + n: int | Expr, dtype: PolarsDataType = ..., *, eager: Literal[True], @@ -165,7 +159,7 @@ def ones( @overload def ones( - n: int | PolarsExprType, + n: int | Expr, dtype: PolarsDataType = ..., *, eager: bool, @@ -174,7 +168,7 @@ def ones( def ones( - n: int | PolarsExprType, + n: int | Expr, dtype: PolarsDataType = Float64, *, eager: bool = False, @@ -221,7 +215,7 @@ def ones( @overload def zeros( - n: int | PolarsExprType, + n: int | Expr, dtype: PolarsDataType = ..., *, eager: Literal[False] = ..., @@ -231,7 +225,7 @@ def zeros( @overload def zeros( - n: int | PolarsExprType, + n: int | Expr, dtype: PolarsDataType = ..., *, eager: Literal[True], @@ -241,7 +235,7 @@ def zeros( @overload def zeros( - n: int | PolarsExprType, + n: int | Expr, dtype: PolarsDataType = ..., *, eager: bool, @@ -250,7 +244,7 @@ def zeros( def zeros( - n: int | PolarsExprType, + n: int | Expr, dtype: PolarsDataType = Float64, *, eager: bool = False, diff --git a/py-polars/polars/functions/whenthen.py b/py-polars/polars/functions/whenthen.py index 961887b393c2f..e32a9ff727393 100644 --- a/py-polars/polars/functions/whenthen.py +++ b/py-polars/polars/functions/whenthen.py @@ -1,22 +1,23 @@ from __future__ import annotations import contextlib -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING +import polars._reexport as pl from polars.utils._parse_expr_input import parse_as_expression -from polars.utils._wrap import wrap_expr +from polars.utils.deprecation import deprecated_alias with contextlib.suppress(ImportError): # Module not available when building docs - from polars.polars import when as _when + import 
polars.polars as plr if TYPE_CHECKING: - from polars import Expr from polars.type_aliases import IntoExpr -def when(expr: IntoExpr) -> When: +@deprecated_alias(expr="condition") +def when(condition: IntoExpr) -> pl.When: """ - Start a "when, then, otherwise" expression. + Start a `when-then-otherwise` expression. Expression similar to an `if-else` statement in Python. Always initiated by a `pl.when().then()`. Optionally followed by chaining @@ -25,6 +26,12 @@ def when(expr: IntoExpr) -> When: appended at the end. If not appended, and none of the conditions are `True`, `None` will be returned. + Parameters + ---------- + condition + The condition for applying the subsequent statement. + Accepts a boolean expression. String input is parsed as a column name. + Examples -------- Below we add a column with the value 1, where column "foo" > 2 and the value -1 @@ -90,96 +97,6 @@ def when(expr: IntoExpr) -> When: │ 4 ┆ 0 ┆ 1 │ └─────┴─────┴──────┘ - """ - expr = parse_as_expression(expr) - pywhen = _when(expr) - return When(pywhen) - - -class When: - """Utility class. See the `when` function.""" - - def __init__(self, pywhen: Any): - self._pywhen = pywhen - - def then(self, expr: IntoExpr) -> WhenThen: - """ - Values to return in case of the predicate being `True`. - - See Also - -------- - pl.when : Documentation for `when, then, otherwise` - - """ - expr = parse_as_expression(expr, str_as_lit=True) - pywhenthen = self._pywhen.then(expr) - return WhenThen(pywhenthen) - - -class WhenThen: - """Utility class. See the `when` function.""" - - def __init__(self, pywhenthen: Any): - self._pywhenthen = pywhenthen - - def when(self, predicate: IntoExpr) -> WhenThenThen: - """Start another "when, then, otherwise" layer.""" - predicate = parse_as_expression(predicate) - return WhenThenThen(self._pywhenthen.when(predicate)) - - def otherwise(self, expr: IntoExpr) -> Expr: - """ - Values to return in case of the predicate being `False`. 
- - See Also - -------- - pl.when : Documentation for `when, then, otherwise` - - """ - expr = parse_as_expression(expr, str_as_lit=True) - return wrap_expr(self._pywhenthen.otherwise(expr)) - - def __getattr__(self, item: str) -> Any: - expr = self.otherwise(None) - return getattr(expr, item) - - -class WhenThenThen: - """Utility class. See the `when` function.""" - - def __init__(self, pywhenthenthen: Any): - self.pywhenthenthen = pywhenthenthen - - def when(self, predicate: IntoExpr) -> WhenThenThen: - """Start another "when, then, otherwise" layer.""" - predicate = parse_as_expression(predicate) - return WhenThenThen(self.pywhenthenthen.when(predicate)) - - def then(self, expr: IntoExpr) -> WhenThenThen: - """ - Values to return in case of the predicate being `True`. - - See Also - -------- - pl.when : Documentation for `when, then, otherwise` - - """ - expr = parse_as_expression(expr, str_as_lit=True) - return WhenThenThen(self.pywhenthenthen.then(expr)) - - def otherwise(self, expr: IntoExpr) -> Expr: - """ - Values to return in case of the predicate being `False`. 
- - See Also - -------- - pl.when : Documentation for `when, then, otherwise` - - """ - expr = parse_as_expression(expr, str_as_lit=True) - return wrap_expr(self.pywhenthenthen.otherwise(expr)) - - def __getattr__(self, item: str) -> Any: - expr = self.otherwise(None) - return getattr(expr, item) + condition_pyexpr = parse_as_expression(condition) + return pl.When(plr.when(condition_pyexpr)) diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 742bfdeb0239c..3446bae1c70d1 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -88,8 +88,8 @@ def managed_file(file: Any) -> Iterator[Any]: finally: pass - has_non_utf8_non_utf8_lossy_encoding = ( - encoding not in {"utf8", "utf8-lossy"} if encoding else False + has_utf8_utf8_lossy_encoding = ( + encoding in {"utf8", "utf8-lossy"} if encoding else True ) encoding_str = encoding if encoding else "utf8" @@ -98,7 +98,7 @@ def managed_file(file: Any) -> Iterator[Any]: check_not_dir = not use_pyarrow if isinstance(file, bytes): - if has_non_utf8_non_utf8_lossy_encoding: + if not has_utf8_utf8_lossy_encoding: return _check_empty( BytesIO(file.decode(encoding_str).encode("utf8")), context="bytes", @@ -114,7 +114,7 @@ def managed_file(file: Any) -> Iterator[Any]: ) if isinstance(file, BytesIO): - if has_non_utf8_non_utf8_lossy_encoding: + if not has_utf8_utf8_lossy_encoding: return _check_empty( BytesIO(file.read().decode(encoding_str).encode("utf8")), context="BytesIO", @@ -129,7 +129,7 @@ def managed_file(file: Any) -> Iterator[Any]: ) if isinstance(file, Path): - if has_non_utf8_non_utf8_lossy_encoding: + if not has_utf8_utf8_lossy_encoding: return _check_empty( BytesIO(file.read_bytes().decode(encoding_str).encode("utf8")), context=f"Path ({file!r})", @@ -145,17 +145,33 @@ def managed_file(file: Any) -> Iterator[Any]: if _FSSPEC_AVAILABLE: from fsspec.utils import infer_storage_options - if not has_non_utf8_non_utf8_lossy_encoding: - if 
infer_storage_options(file)["protocol"] == "file": + # check if it is a local file + if infer_storage_options(file)["protocol"] == "file": + # (lossy) utf8 + if has_utf8_utf8_lossy_encoding: return managed_file(normalise_filepath(file, check_not_dir)) + # decode first + with Path(file).open(encoding=encoding_str) as f: + return _check_empty( + BytesIO(f.read().encode("utf8")), context=f"{file!r}" + ) + # non-local file + if "*" in file: + raise ValueError( + "globbing patterns not supported when scanning non-local files" + ) kwargs["encoding"] = encoding return fsspec.open(file, **kwargs) + # todo! add azure/ gcp/ ? + if file.startswith("s3://"): + raise ImportError("fsspec needs to be installed to read files from s3") + if isinstance(file, list) and bool(file) and all(isinstance(f, str) for f in file): if _FSSPEC_AVAILABLE: from fsspec.utils import infer_storage_options - if not has_non_utf8_non_utf8_lossy_encoding: + if has_utf8_utf8_lossy_encoding: if all(infer_storage_options(f)["protocol"] == "file" for f in file): return managed_file( [normalise_filepath(f, check_not_dir) for f in file] @@ -165,7 +181,7 @@ def managed_file(file: Any) -> Iterator[Any]: if isinstance(file, str): file = normalise_filepath(file, check_not_dir) - if has_non_utf8_non_utf8_lossy_encoding: + if not has_utf8_utf8_lossy_encoding: with Path(file).open(encoding=encoding_str) as f: return _check_empty( BytesIO(f.read().encode("utf8")), context=f"{file!r}" diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 62dda710dec19..f534cb93627d6 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -26,6 +26,8 @@ class BatchedCsvReader: + """Read a CSV file in batches.""" + def __init__( self, source: str | Path, @@ -126,7 +128,7 @@ def next_batches(self, n: int) -> list[DataFrame] | None: Returns ------- - Sequence of DataFrames + list of DataFrames """ batches = self._reader.next_batches(n) 
diff --git a/py-polars/polars/io/database.py b/py-polars/polars/io/database.py index d1501d48d5394..474db87d168f9 100644 --- a/py-polars/polars/io/database.py +++ b/py-polars/polars/io/database.py @@ -6,15 +6,17 @@ from typing import TYPE_CHECKING, Any from polars.convert import from_arrow +from polars.utils.deprecation import deprecated_alias if TYPE_CHECKING: from polars import DataFrame from polars.type_aliases import DbReadEngine +@deprecated_alias(connection_uri="connection") def read_database( query: list[str] | str, - connection_uri: str, + connection: str, *, partition_on: str | None = None, partition_range: tuple[int, int] | None = None, @@ -29,7 +31,7 @@ def read_database( ---------- query Raw SQL query (or queries). - connection_uri + connection A connectorx or ADBC connection URI that starts with the backend's driver name, for example: @@ -112,7 +114,7 @@ def read_database( if engine == "connectorx": return _read_sql_connectorx( query, - connection_uri, + connection, partition_on=partition_on, partition_range=partition_range, partition_num=partition_num, @@ -121,7 +123,7 @@ def read_database( elif engine == "adbc": if not isinstance(query, str): raise ValueError("Only a single SQL query string is accepted for adbc.") - return _read_sql_adbc(query, connection_uri) + return _read_sql_adbc(query, connection) else: raise ValueError(f"Engine {engine!r} not implemented; use connectorx or adbc.") @@ -154,20 +156,19 @@ def _read_sql_connectorx( def _read_sql_adbc(query: str, connection_uri: str) -> DataFrame: - with _open_adbc_connection(connection_uri) as conn: - cursor = conn.cursor() + with _open_adbc_connection(connection_uri) as conn, conn.cursor() as cursor: cursor.execute(query) tbl = cursor.fetch_arrow_table() - cursor.close() return from_arrow(tbl) # type: ignore[return-value] def _open_adbc_connection(connection_uri: str) -> Any: driver_name = connection_uri.split(":", 1)[0].lower() - # note: existing URI driver prefixes currently map 1:1 with - # the 
adbc module suffix; update this map if that changes. - module_suffix_map: dict[str, str] = {} + # map uri prefix to module when not 1:1 + module_suffix_map: dict[str, str] = { + "postgres": "postgresql", + } try: module_suffix = module_suffix_map.get(driver_name, driver_name) module_name = f"adbc_driver_{module_suffix}.dbapi" diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index eb52e66f88b2e..f76a60784570c 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -126,7 +126,7 @@ def read_delta( if pyarrow_options is None: pyarrow_options = {} - resolved_uri = resolve_delta_lake_uri(source) + resolved_uri = _resolve_delta_lake_uri(source) dl_tbl = _get_delta_lake_table( table_path=resolved_uri, @@ -254,7 +254,7 @@ def scan_delta( if pyarrow_options is None: pyarrow_options = {} - resolved_uri = resolve_delta_lake_uri(source) + resolved_uri = _resolve_delta_lake_uri(source) dl_tbl = _get_delta_lake_table( table_path=resolved_uri, version=version, @@ -266,7 +266,7 @@ def scan_delta( return scan_pyarrow_dataset(pa_ds) -def resolve_delta_lake_uri(table_uri: str, strict: bool = True) -> str: +def _resolve_delta_lake_uri(table_uri: str, strict: bool = True) -> str: parsed_result = urlparse(table_uri) resolved_uri = str( @@ -285,19 +285,15 @@ def _get_delta_lake_table( delta_table_options: dict[str, Any] | None = None, ) -> deltalake.DeltaTable: """ - Initialise a Delta lake table for use in read and scan operations. + Initialize a Delta lake table for use in read and scan operations. Notes ----- Make sure to install deltalake>=0.8.0. Read the documentation `here `_. 
- Returns - ------- - DeltaTable - """ - check_if_delta_available() + _check_if_delta_available() if delta_table_options is None: delta_table_options = {} @@ -312,7 +308,7 @@ def _get_delta_lake_table( return dl_tbl -def check_if_delta_available() -> None: +def _check_if_delta_available() -> None: if not _DELTALAKE_AVAILABLE: raise ImportError( "deltalake is not installed. Please run `pip install deltalake>=0.9.0`." diff --git a/py-polars/polars/io/excel/_write_utils.py b/py-polars/polars/io/excel/_write_utils.py index 30a29dec74b85..e724aa52f448a 100644 --- a/py-polars/polars/io/excel/_write_utils.py +++ b/py-polars/polars/io/excel/_write_utils.py @@ -112,7 +112,7 @@ def _xl_apply_conditional_formats( for cols, formats in conditional_formats.items(): if not isinstance(cols, str) and len(cols) == 1: - cols = list(cols)[0] + cols = next(iter(cols)) if isinstance(formats, (str, dict)): formats = [formats] @@ -124,7 +124,7 @@ def _xl_apply_conditional_formats( else: col_range = _xl_column_multi_range(df, table_start, cols, has_header) if " " in col_range: - col = list(cols)[0] + col = next(iter(cols)) fmt["multi_range"] = col_range col_range = _xl_column_range(df, table_start, col, has_header) diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 47a42c4edf9d1..b711dbeab0e79 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -121,7 +121,8 @@ def read_ipc_schema(source: str | BinaryIO | Path | bytes) -> dict[str, PolarsDa Returns ------- - Dictionary mapping column names to datatypes + dict + Dictionary mapping column names to datatypes """ if isinstance(source, (str, Path)): diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 7382807f8e7cf..ea2381b9dce1f 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -146,7 +146,8 @@ def read_parquet_schema( Returns ------- - Dictionary mapping 
column names to datatypes + dict + Dictionary mapping column names to datatypes """ if isinstance(source, (str, Path)): diff --git a/py-polars/polars/io/pyarrow_dataset/functions.py b/py-polars/polars/io/pyarrow_dataset/functions.py index 9fed5ee4b1366..0c3e450dd025f 100644 --- a/py-polars/polars/io/pyarrow_dataset/functions.py +++ b/py-polars/polars/io/pyarrow_dataset/functions.py @@ -1,10 +1,9 @@ from __future__ import annotations -import warnings from typing import TYPE_CHECKING from polars.io.pyarrow_dataset.anonymous_scan import _scan_pyarrow_dataset -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import deprecated_name if TYPE_CHECKING: from polars import LazyFrame @@ -56,6 +55,7 @@ def scan_pyarrow_dataset( return _scan_pyarrow_dataset(source, allow_pyarrow_filter=allow_pyarrow_filter) +@deprecated_name(new_name="scan_pyarrow_dataset", version="0.16.10") def scan_ds(ds: pa.dataset.Dataset, *, allow_pyarrow_filter: bool = True) -> LazyFrame: """ Scan a pyarrow dataset. 
@@ -96,10 +96,4 @@ def scan_ds(ds: pa.dataset.Dataset, *, allow_pyarrow_filter: bool = True) -> Laz └───────┴────────┴────────────┘ """ - warnings.warn( - "`scan_ds` has been renamed; this" - " redirect is temporary, please use `scan_pyarrow_dataset` instead", - category=DeprecationWarning, - stacklevel=find_stacklevel(), - ) return scan_pyarrow_dataset(ds, allow_pyarrow_filter=allow_pyarrow_filter) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index f4ac1d5dced52..71a9c86cfdf74 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2,7 +2,6 @@ import contextlib import os -import warnings from datetime import date, datetime, time, timedelta from io import BytesIO, StringIO from pathlib import Path @@ -55,11 +54,11 @@ ) from polars.utils._wrap import wrap_df, wrap_expr from polars.utils.convert import _timedelta_to_pl_duration +from polars.utils.deprecation import deprecated_alias, issue_deprecation_warning from polars.utils.various import ( _in_notebook, _prepare_row_count_args, _process_null_values, - find_stacklevel, normalise_filepath, ) @@ -824,6 +823,7 @@ def pipe( """ return function(self, *args, **kwargs) + @deprecated_alias(common_subplan_elimination="comm_subplan_elim") def explain( self, *, @@ -833,7 +833,8 @@ def explain( projection_pushdown: bool = True, simplify_expression: bool = True, slice_pushdown: bool = True, - common_subplan_elimination: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, streaming: bool = False, ) -> str: """ @@ -858,8 +859,10 @@ def explain( Run simplify expressions optimization. slice_pushdown Slice pushdown optimization. - common_subplan_elimination + comm_subplan_elim Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. 
streaming Run parts of the query in a streaming fashion (this is in an alpha state) @@ -883,12 +886,14 @@ def explain( projection_pushdown, simplify_expression, slice_pushdown, - common_subplan_elimination, + comm_subplan_elim, + comm_subexpr_elim, streaming, ) return ldf.describe_optimized_plan() return self._ldf.describe_plan() + @deprecated_alias(common_subplan_elimination="comm_subplan_elim") def show_graph( self, *, @@ -902,7 +907,8 @@ def show_graph( projection_pushdown: bool = True, simplify_expression: bool = True, slice_pushdown: bool = True, - common_subplan_elimination: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, streaming: bool = False, ) -> str | None: """ @@ -930,8 +936,10 @@ def show_graph( Run simplify expressions optimization. slice_pushdown Slice pushdown optimization. - common_subplan_elimination + comm_subplan_elim Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. streaming Run parts of the query in a streaming fashion (this is in an alpha state) @@ -955,7 +963,8 @@ def show_graph( projection_pushdown, simplify_expression, slice_pushdown, - common_subplan_elimination, + comm_subplan_elim, + comm_subexpr_elim, streaming, ) @@ -1303,6 +1312,7 @@ def bottom_k( self._ldf.bottom_k(k, by, descending, nulls_last, maintain_order) ) + @deprecated_alias(common_subplan_elimination="comm_subplan_elim") def profile( self, *, @@ -1312,7 +1322,8 @@ def profile( simplify_expression: bool = True, no_optimization: bool = False, slice_pushdown: bool = True, - common_subplan_elimination: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, show_plot: bool = False, truncate_nodes: int = 0, figsize: tuple[int, int] = (18, 8), @@ -1341,8 +1352,10 @@ def profile( Turn off (certain) optimizations. slice_pushdown Slice pushdown optimization. 
- common_subplan_elimination + comm_subplan_elim Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. show_plot Show a gantt chart of the profiling result truncate_nodes @@ -1397,7 +1410,8 @@ def profile( projection_pushdown, simplify_expression, slice_pushdown, - common_subplan_elimination, + comm_subplan_elim, + comm_subexpr_elim, streaming, ) df, timings = ldf.profile() @@ -1446,6 +1460,7 @@ def profile( return df, timings + @deprecated_alias(common_subplan_elimination="comm_subplan_elim") def collect( self, *, @@ -1455,7 +1470,8 @@ def collect( simplify_expression: bool = True, no_optimization: bool = False, slice_pushdown: bool = True, - common_subplan_elimination: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, streaming: bool = False, ) -> DataFrame: """ @@ -1478,8 +1494,10 @@ def collect( Turn off (certain) optimizations. slice_pushdown Slice pushdown optimization. - common_subplan_elimination + comm_subplan_elim Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. 
streaming Run parts of the query in a streaming fashion (this is in an alpha state) @@ -1513,10 +1531,11 @@ def collect( predicate_pushdown = False projection_pushdown = False slice_pushdown = False - common_subplan_elimination = False + comm_subplan_elim = False + comm_subexpr_elim = False if streaming: - common_subplan_elimination = False + comm_subplan_elim = False ldf = self._ldf.optimization_toggle( type_coercion, @@ -1524,7 +1543,8 @@ def collect( projection_pushdown, simplify_expression, slice_pushdown, - common_subplan_elimination, + comm_subplan_elim, + comm_subexpr_elim, streaming, ) return wrap_df(ldf.collect()) @@ -1616,7 +1636,8 @@ def sink_parquet( projection_pushdown, simplify_expression, slice_pushdown, - cse=False, + comm_subplan_elim=False, + comm_subexpr_elim=False, streaming=True, ) return lf.sink_parquet( @@ -1691,7 +1712,8 @@ def sink_ipc( projection_pushdown, simplify_expression, slice_pushdown, - cse=False, + comm_subplan_elim=False, + comm_subexpr_elim=False, streaming=True, ) return lf.sink_ipc( @@ -1700,6 +1722,7 @@ def sink_ipc( maintain_order=maintain_order, ) + @deprecated_alias(common_subplan_elimination="comm_subplan_elim") def fetch( self, n_rows: int = 500, @@ -1710,7 +1733,8 @@ def fetch( simplify_expression: bool = True, no_optimization: bool = False, slice_pushdown: bool = True, - common_subplan_elimination: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, streaming: bool = False, ) -> DataFrame: """ @@ -1740,8 +1764,10 @@ def fetch( Turn off optimizations. slice_pushdown Slice pushdown optimization - common_subplan_elimination + comm_subplan_elim Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. 
streaming Run parts of the query in a streaming fashion (this is in an alpha state) @@ -1774,7 +1800,8 @@ def fetch( predicate_pushdown = False projection_pushdown = False slice_pushdown = False - common_subplan_elimination = False + comm_subplan_elim = False + comm_subexpr_elim = False lf = self._ldf.optimization_toggle( type_coercion, @@ -1782,7 +1809,8 @@ def fetch( projection_pushdown, simplify_expression, slice_pushdown, - common_subplan_elimination, + comm_subplan_elim, + comm_subexpr_elim, streaming, ) return wrap_df(lf.fetch(n_rows)) @@ -2059,11 +2087,10 @@ def select( structify = bool(int(os.environ.get("POLARS_AUTO_STRUCTIFY", 0))) if "exprs" in named_exprs: - warnings.warn( + issue_deprecation_warning( "passing expressions to `select` using the keyword argument `exprs` is" " deprecated. Use positional syntax instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.1", ) first_input = named_exprs.pop("exprs") pyexprs = parse_as_list_of_expressions( @@ -3057,7 +3084,8 @@ def with_columns( Returns ------- - A new LazyFrame with the columns added. + LazyFrame + A new LazyFrame with the columns added. Notes ----- @@ -3185,11 +3213,10 @@ def with_columns( structify = bool(int(os.environ.get("POLARS_AUTO_STRUCTIFY", 0))) if "exprs" in named_exprs: - warnings.warn( + issue_deprecation_warning( "passing expressions to `with_columns` using the keyword argument" " `exprs` is deprecated. Use positional syntax instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.1", ) first_input = named_exprs.pop("exprs") pyexprs = parse_as_list_of_expressions( @@ -4331,7 +4358,8 @@ def unique( Returns ------- - DataFrame with unique rows. + LazyFrame + LazyFrame with unique rows. 
Warnings -------- diff --git a/py-polars/polars/lazyframe/groupby.py b/py-polars/polars/lazyframe/groupby.py index a75726cde9ecd..674471d7ef12e 100644 --- a/py-polars/polars/lazyframe/groupby.py +++ b/py-polars/polars/lazyframe/groupby.py @@ -1,12 +1,11 @@ from __future__ import annotations -import warnings from typing import TYPE_CHECKING, Callable, Iterable from polars import functions as F from polars.utils._parse_expr_input import parse_as_list_of_expressions from polars.utils._wrap import wrap_ldf -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import issue_deprecation_warning if TYPE_CHECKING: from polars import DataFrame, LazyFrame @@ -142,11 +141,10 @@ def agg( ) if "aggs" in named_aggs: - warnings.warn( + issue_deprecation_warning( "passing expressions to `agg` using the keyword argument `aggs` is" " deprecated. Use positional syntax instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.1", ) first_input = named_aggs.pop("aggs") pyexprs = parse_as_list_of_expressions(first_input, *aggs, **named_aggs) diff --git a/py-polars/polars/series/_numpy.py b/py-polars/polars/series/_numpy.py index bc0e5fb69d299..8cec5445e6c6b 100644 --- a/py-polars/polars/series/_numpy.py +++ b/py-polars/polars/series/_numpy.py @@ -46,7 +46,8 @@ def _ptr_to_numpy(ptr: int, len: int, ptr_type: Any) -> np.ndarray[Any, Any]: Returns ------- - View of memory block as numpy array. + numpy.ndarray + View of memory block as numpy array. """ ptr_ctype = ctypes.cast(ptr, ctypes.POINTER(ptr_type)) diff --git a/py-polars/polars/series/binary.py b/py-polars/polars/series/binary.py index 49382d0a3015f..9511f7e850672 100644 --- a/py-polars/polars/series/binary.py +++ b/py-polars/polars/series/binary.py @@ -30,7 +30,8 @@ def contains(self, literal: bytes) -> Series: Returns ------- - Boolean mask + Series + Series of data type :class:`Boolean`. 
""" @@ -81,6 +82,7 @@ def encode(self, encoding: TransferEncoding) -> Series: Returns ------- - Binary array with values encoded using provided encoding + Series + Series of data type :class:`Boolean`. """ diff --git a/py-polars/polars/series/datetime.py b/py-polars/polars/series/datetime.py index eb75835fd6983..2b0620d1b4ebe 100644 --- a/py-polars/polars/series/datetime.py +++ b/py-polars/polars/series/datetime.py @@ -6,7 +6,7 @@ from polars.series.utils import expr_dispatch from polars.utils._wrap import wrap_s from polars.utils.convert import _to_python_date, _to_python_datetime -from polars.utils.decorators import deprecated_alias +from polars.utils.deprecation import deprecated_alias if TYPE_CHECKING: import datetime as dt @@ -221,7 +221,8 @@ def year(self) -> Series: Returns ------- - Year part as Int32 + Series + Series of data type :class:`Int32`. Examples -------- @@ -254,7 +255,8 @@ def is_leap_year(self) -> Series: Returns ------- - Leap year info as Boolean + Series + Series of data type :class:`Boolean`. Examples -------- @@ -292,7 +294,8 @@ def iso_year(self) -> Series: Returns ------- - ISO year as Int32 + Series + Series of data type :class:`Int32`. Examples -------- @@ -317,7 +320,8 @@ def quarter(self) -> Series: Returns ------- - Quarter as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -357,7 +361,8 @@ def month(self) -> Series: Returns ------- - Month part as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -397,7 +402,8 @@ def week(self) -> Series: Returns ------- - Week number as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -436,7 +442,8 @@ def weekday(self) -> Series: Returns ------- - Weekday as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -482,7 +489,8 @@ def day(self) -> Series: Returns ------- - Day part as UInt32 + Series + Series of data type :class:`UInt32`. 
Examples -------- @@ -524,7 +532,8 @@ def ordinal_day(self) -> Series: Returns ------- - Ordinal day as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -559,7 +568,8 @@ def time(self) -> Series: Returns ------- - Time Series + Series + Series of data type :class:`Time`. Examples -------- @@ -589,7 +599,8 @@ def date(self) -> Series: Returns ------- - Date Series + Series + Series of data type :class:`Date`. Examples -------- @@ -619,7 +630,8 @@ def datetime(self) -> Series: Returns ------- - Datetime Series + Series + Series of data type :class:`Datetime`. Examples -------- @@ -651,7 +663,8 @@ def hour(self) -> Series: Returns ------- - Hour part as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -690,7 +703,8 @@ def minute(self) -> Series: Returns ------- - Minute part as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -734,7 +748,8 @@ def second(self, *, fractional: bool = False) -> Series: Returns ------- - Second part as UInt32 (or Float64) + Series + Series of data type :class:`UInt32` or :class:`Float64`. Examples -------- @@ -795,7 +810,8 @@ def millisecond(self) -> Series: Returns ------- - Millisecond part as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -842,7 +858,8 @@ def microsecond(self) -> Series: Returns ------- - Microsecond part as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -889,7 +906,8 @@ def nanosecond(self) -> Series: Returns ------- - Nanosecond part as UInt32 + Series + Series of data type :class:`UInt32`. Examples -------- @@ -1139,9 +1157,10 @@ def replace_time_zone( time_zone Time zone for the `Datetime` Series. Pass `None` to unset time zone. use_earliest - If localizing an ambiguous datetime (say, due to daylight saving time), - determine whether to localize to the earliest datetime or not. - If None (the default), then ambiguous datetimes will raise. 
+ Determine how to deal with ambiguous datetimes: + - None (default): raise; + - True: use the earliest datetime; + - False: use the latest datetime. Examples -------- @@ -1227,7 +1246,8 @@ def days(self) -> Series: Returns ------- - A series of dtype Int64 + Series + Series of data type :class:`Int64`. Examples -------- @@ -1260,7 +1280,8 @@ def hours(self) -> Series: Returns ------- - A series of dtype Int64 + Series + Series of data type :class:`Int64`. Examples -------- @@ -1295,7 +1316,8 @@ def minutes(self) -> Series: Returns ------- - A series of dtype Int64 + Series + Series of data type :class:`Int64`. Examples -------- @@ -1330,7 +1352,8 @@ def seconds(self) -> Series: Returns ------- - A series of dtype Int64 + Series + Series of data type :class:`Int64`. Examples -------- @@ -1367,7 +1390,8 @@ def milliseconds(self) -> Series: Returns ------- - A series of dtype Int64 + Series + Series of data type :class:`Int64`. Examples -------- @@ -1403,7 +1427,8 @@ def microseconds(self) -> Series: Returns ------- - A series of dtype Int64 + Series + Series of data type :class:`Int64`. Examples -------- @@ -1439,7 +1464,8 @@ def nanoseconds(self) -> Series: Returns ------- - A series of dtype Int64 + Series + Series of data type :class:`Int64`. Examples -------- @@ -1506,7 +1532,8 @@ def offset_by(self, by: str) -> Series: Returns ------- - Date/Datetime expression + Series + Series of data type :class:`Date` or :class:`Datetime`. Examples -------- @@ -1567,12 +1594,14 @@ def truncate( self, every: str | dt.timedelta, offset: str | dt.timedelta | None = None, + *, + use_earliest: bool | None = None, ) -> Series: """ Divide the date/ datetime range into buckets. - Each date/datetime is mapped to the start of its bucket. Note that weekly - buckets start on Monday. + Each date/datetime is mapped to the start of its bucket using the corresponding + local datetime. Note that weekly buckets start on Monday. 
Parameters ---------- @@ -1580,6 +1609,11 @@ def truncate( Every interval start and period length offset Offset the window + use_earliest + Determine how to deal with ambiguous datetimes: + - None (default): raise; + - True: use the earliest datetime; + - False: use the latest datetime. Notes ----- @@ -1612,7 +1646,8 @@ def truncate( Returns ------- - Date/Datetime series + Series + Series of data type :class:`Date` or :class:`Datetime`. Examples -------- @@ -1679,6 +1714,46 @@ def truncate( 2001-01-01 01:00:00 ] + If crossing daylight savings time boundaries, you may want to use + `use_earliest` and combine with :func:`~polars.Series.dt.dst_offset` + and :func:`~polars.when`: + + >>> ser = pl.date_range( + ... datetime(2020, 10, 25, 0), + ... datetime(2020, 10, 25, 2), + ... "30m", + ... eager=True, + ... time_zone="Europe/London", + ... ).dt.offset_by("15m") + >>> ser + shape: (7,) + Series: 'date' [datetime[μs, Europe/London]] + [ + 2020-10-25 00:15:00 BST + 2020-10-25 00:45:00 BST + 2020-10-25 01:15:00 BST + 2020-10-25 01:45:00 BST + 2020-10-25 01:15:00 GMT + 2020-10-25 01:45:00 GMT + 2020-10-25 02:15:00 GMT + ] + + >>> pl.select( + ... pl.when(ser.dt.dst_offset() == pl.duration(hours=1)) + ... .then(ser.dt.truncate("30m", use_earliest=True)) + ... .otherwise(ser.dt.truncate("30m", use_earliest=False)) + ... )["date"] + shape: (7,) + Series: 'date' [datetime[μs, Europe/London]] + [ + 2020-10-25 00:00:00 BST + 2020-10-25 00:30:00 BST + 2020-10-25 01:00:00 BST + 2020-10-25 01:30:00 BST + 2020-10-25 01:00:00 GMT + 2020-10-25 01:30:00 GMT + 2020-10-25 02:00:00 GMT + ] """ def round( @@ -1730,7 +1805,8 @@ def round( Returns ------- - Date/Datetime series + Series + Series of data type :class:`Date` or :class:`Datetime`. Warnings -------- @@ -1829,7 +1905,8 @@ def month_start(self) -> Series: Returns ------- - Date/Datetime series + Series + Series of data type :class:`Date` or :class:`Datetime`. 
Notes ----- @@ -1859,7 +1936,8 @@ def month_end(self) -> Series: Returns ------- - Date/Datetime series. + Series + Series of data type :class:`Date` or :class:`Datetime`. Notes ----- @@ -1893,7 +1971,8 @@ def base_utc_offset(self) -> Series: Returns ------- - Duration Series + Series + Series of data type :class:`Duration`. See Also -------- @@ -1931,7 +2010,8 @@ def dst_offset(self) -> Series: Returns ------- - Duration Series + Series + Series of data type :class:`Duration`. See Also -------- diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index a9d5e06c5cc03..0374847052049 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -5,7 +5,7 @@ from polars import functions as F from polars.series.utils import expr_dispatch from polars.utils._wrap import wrap_s -from polars.utils.decorators import deprecated_alias +from polars.utils.deprecation import deprecated_alias if TYPE_CHECKING: from datetime import date, datetime, time @@ -211,7 +211,8 @@ def join(self, separator: str) -> Series: Returns ------- - Series of dtype Utf8 + Series + Series of data type :class:`Utf8`. Examples -------- @@ -243,7 +244,8 @@ def contains(self, item: float | str | bool | int | date | datetime) -> Series: Returns ------- - Boolean mask + Series + Series of data type :class:`Boolean`. """ @@ -253,7 +255,9 @@ def arg_min(self) -> Series: Returns ------- - Series of dtype UInt32/UInt64 (depending on compilation) + Series + Series of data type :class:`UInt32` or :class:`UInt64` + (depending on compilation). """ @@ -263,7 +267,9 @@ def arg_max(self) -> Series: Returns ------- - Series of dtype UInt32/UInt64 (depending on compilation) + Series + Series of data type :class:`UInt32` or :class:`UInt64` + (depending on compilation). """ @@ -404,7 +410,8 @@ def explode(self) -> Series: Returns ------- - Exploded column with the datatype of the list elements. + Series + Series with the data type of the list elements. 
See Also -------- diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 594a6ef54ba46..6f55be80295d8 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -3,7 +3,6 @@ import contextlib import math import os -import warnings from datetime import date, datetime, time, timedelta from typing import ( TYPE_CHECKING, @@ -89,11 +88,10 @@ _datetime_to_pl_timestamp, _time_to_pl_time, ) -from polars.utils.decorators import deprecated_alias +from polars.utils.deprecation import deprecated_alias, issue_deprecation_warning from polars.utils.meta import get_index_type from polars.utils.various import ( _is_generator, - find_stacklevel, parse_version, range_to_series, range_to_slice, @@ -375,7 +373,8 @@ def flags(self) -> dict[str, bool]: Returns ------- - Dictionary containing the flag name and the value + dict + Dictionary containing the flag name and the value """ out = { @@ -411,11 +410,10 @@ def shape(self) -> tuple[int]: @property def time_unit(self) -> TimeUnit | None: """Get the time unit of underlying Datetime Series as {"ns", "us", "ms"}.""" - warnings.warn( + issue_deprecation_warning( "`Series.time_unit` is deprecated and will be removed in a future version," " please use `Series.dtype.time_unit` instead", - category=DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.17.5", ) return self._s.time_unit() @@ -1196,13 +1194,30 @@ def sqrt(self) -> Series: """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, drop_nulls: bool = True) -> bool | None: """ Check if any boolean value in the column is `True`. Returns ------- - Boolean literal + Series + Series of data type :class:`Boolean`. 
""" return self.to_frame().select(F.col(self.name).any(drop_nulls)).to_series()[0] @@ -1213,7 +1228,8 @@ def all(self, drop_nulls: bool = True) -> bool | None: Returns ------- - Boolean literal + Series + Series of data type :class:`Boolean`. """ return self.to_frame().select(F.col(self.name).all(drop_nulls)).to_series()[0] @@ -1298,7 +1314,8 @@ def describe( Returns ------- - Dictionary with summary statistics of a Series. + DataFrame + Mapping with summary statistics of a Series. Examples -------- @@ -1631,10 +1648,8 @@ def cut( include_breaks == True category_label Name given to the category column. Only used if series == False - maintain_order - Keep the order of the original `Series`. Only used if series == False series - If True, return the a categorical series in the data's original order. + If True, return a categorical Series in the data's original order. left_closed Whether intervals should be [) instead of (] include_breaks @@ -1722,26 +1737,27 @@ def cut( return res.struct.rename_fields([break_point_label, category_label]) return res + @deprecated_alias(quantiles="q") def qcut( self, - quantiles: list[float], + q: list[float] | int, *, labels: list[str] | None = None, break_point_label: str = "break_point", category_label: str = "category", - series: bool = False, + series: bool = True, left_closed: bool = False, allow_duplicates: bool = False, include_breaks: bool = False, ) -> DataFrame | Series: """ - Bin continuous values into discrete categories based on their quantiles. + Discretize continuous values into discrete categories based on their quantiles. Parameters ---------- - quantiles - List of quantiles to create. - We expect quantiles ``0.0 <= quantile <= 1`` + q + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. labels Labels to assign to the quantiles. If given the length of labels must be len(breaks) + 1. 
@@ -1750,10 +1766,8 @@ def qcut( include_breaks == True category_label Name given to the category column. Only used if series == False. - maintain_order - Keep the order of the original `Series`. Only used if series == False. series - If True, return a categorical series in the data's original order + If True, return a categorical Series in the data's original order left_closed Whether intervals should be [) instead of (] allow_duplicates @@ -1777,6 +1791,19 @@ def qcut( Examples -------- >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut(2, series=True) + shape: (8,) + Series: 'a' [cat] + [ + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + ] >>> a.qcut([0.0, 0.25, 0.75], series=False) shape: (8, 3) ┌─────┬─────────────┬───────────────┐ @@ -1821,14 +1848,13 @@ def qcut( ] """ n = self._s.name() - if not series: # "Old style" always includes breaks return ( self.to_frame() .with_columns( F.col(n) - .qcut(quantiles, labels, left_closed, allow_duplicates, True) + .qcut(q, labels, left_closed, allow_duplicates, True) .alias(n + "_bin") ) .unnest(n + "_bin") @@ -1837,9 +1863,7 @@ def qcut( res = ( self.to_frame() .select( - F.col(n).qcut( - quantiles, labels, left_closed, allow_duplicates, include_breaks - ) + F.col(n).qcut(q, labels, left_closed, allow_duplicates, include_breaks) ) .to_series() ) @@ -1853,7 +1877,8 @@ def rle(self) -> Series: Returns ------- - A Struct Series containing "lengths" and "values" Fields + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". Examples -------- @@ -1885,8 +1910,11 @@ def rle_id(self) -> Series: Returns ------- - Series + Series + See Also + -------- + rle Examples -------- @@ -2073,17 +2101,24 @@ def cumulative_eval( def alias(self, name: str) -> Series: """ - Return a copy of the Series with a new alias/name. + Rename the series. Parameters ---------- name - New name. + The new name. 
Examples -------- - >>> srs = pl.Series("x", [1, 2, 3]) - >>> new_aliased_srs = srs.alias("y") + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: 'b' [i64] + [ + 1 + 2 + 3 + ] """ s = self.clone() @@ -2118,12 +2153,11 @@ def rename(self, name: str, *, in_place: bool | None = None) -> Series: # if 'in_place' is not None, this indicates that the parameter was # explicitly set by the caller, and we should warn against it (use # of NoDefault only applies when one of the valid values is None). - warnings.warn( + issue_deprecation_warning( "the `in_place` parameter is deprecated and will be removed in a future" " version; note that renaming is a shallow-copy operation with" " essentially zero cost.", - category=DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.17.15", ) if in_place: self._s.rename(name) @@ -2372,12 +2406,11 @@ def append(self, other: Series, *, append_chunks: bool | None = None) -> Self: """ if append_chunks is not None: - warnings.warn( + issue_deprecation_warning( "the `append_chunks` argument will be removed and `append` will change" " to always behave like `append_chunks=True` (the previous default)." " For the behavior of `append_chunks=False`, use `Series.extend`.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.8", ) else: append_chunks = True @@ -2759,7 +2792,7 @@ def arg_min(self) -> int | None: Returns ------- - Integer + int Examples -------- @@ -2776,7 +2809,7 @@ def arg_max(self) -> int | None: Returns ------- - Integer + int Examples -------- @@ -2917,7 +2950,8 @@ def is_null(self) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. Examples -------- @@ -2940,7 +2974,8 @@ def is_not_null(self) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. Examples -------- @@ -2963,7 +2998,8 @@ def is_finite(self) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. 
Examples -------- @@ -2986,7 +3022,8 @@ def is_infinite(self) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. Examples -------- @@ -3009,7 +3046,8 @@ def is_nan(self) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. Examples -------- @@ -3033,7 +3071,8 @@ def is_not_nan(self) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. Examples -------- @@ -3057,7 +3096,8 @@ def is_in(self, other: Series | Collection[Any]) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. Examples -------- @@ -3107,7 +3147,8 @@ def arg_true(self) -> Series: Returns ------- - UInt32 Series + Series + Series of data type :class:`UInt32`. Examples -------- @@ -3128,7 +3169,8 @@ def is_unique(self) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. Examples -------- @@ -3151,7 +3193,8 @@ def is_first(self) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. """ @@ -3161,7 +3204,8 @@ def is_duplicated(self) -> Series: Returns ------- - Boolean Series + Series + Series of data type :class:`Boolean`. Examples -------- @@ -3186,7 +3230,8 @@ def explode(self) -> Series: Returns ------- - Exploded Series of same dtype + Series + Series with the data type of the list elements. See Also -------- @@ -3585,9 +3630,18 @@ def to_numpy( use_pyarrow: bool = True, ) -> np.ndarray[Any, Any]: """ - Convert this Series to numpy. This operation clones data but is completely safe. + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: - If you want a zero-copy view and know what you are doing, use `.view()`. + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can't be zero-copied. 
+ + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. Parameters ---------- @@ -3894,7 +3948,8 @@ def set_at_idx( Returns ------- - the series mutated + Series + The mutated series. Notes ----- @@ -4502,7 +4557,7 @@ def apply( Examples -------- >>> s = pl.Series("a", [1, 2, 3]) - >>> s.apply(lambda x: x + 10) + >>> s.apply(lambda x: x + 10) # doctest: +SKIP shape: (3,) Series: 'a' [i64] [ @@ -4516,10 +4571,14 @@ def apply( Series """ + from polars.utils.udfs import warn_on_inefficient_apply + if return_dtype is None: pl_return_dtype = None else: pl_return_dtype = py_type_to_dtype(return_dtype) + + warn_on_inefficient_apply(function, columns=[self.name], apply_target="series") return self._from_pyseries( self._s.apply_lambda(function, pl_return_dtype, skip_nulls) ) @@ -4589,7 +4648,7 @@ def zip_with(self, mask: Series, other: Series) -> Self: Returns ------- - New Series + Series Examples -------- @@ -5363,12 +5422,12 @@ def reinterpret(self, *, signed: bool = True) -> Series: def interpolate(self, method: InterpolationMethod = "linear") -> Series: """ - Interpolate intermediate values. The interpolation method is linear. + Fill null values using interpolation. Parameters ---------- method : {'linear', 'nearest'} - Interpolation method + Interpolation method. Examples -------- @@ -5841,9 +5900,10 @@ def reshape(self, dimensions: tuple[int, ...]) -> Series: Returns ------- Series - If a single dimension is given, results in a flat Series of shape (len,). - If a multiple dimensions are given, results in a Series of Lists with shape - (rows, cols). + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). 
See Also -------- diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index 7826cf5a9e31d..7956bd0fd3678 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -1,13 +1,11 @@ from __future__ import annotations -import warnings from typing import TYPE_CHECKING from polars import functions as F from polars.series.utils import expr_dispatch from polars.utils._wrap import wrap_s -from polars.utils.decorators import deprecated_alias -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import deprecated_alias, issue_deprecation_warning if TYPE_CHECKING: from polars import Expr, Series @@ -262,14 +260,13 @@ def strptime( ] """ if utc is not None: - warnings.warn( + issue_deprecation_warning( "The `utc` argument is now a no-op and has no effect. " "You can safely remove it. " "Offset-naive strings are parsed as ``pl.Datetime(time_unit)``, " "and offset-aware strings are converted to " '``pl.Datetime(time_unit, "UTC")``.', - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.17.15", ) s = wrap_s(self._s) return ( @@ -331,7 +328,8 @@ def lengths(self) -> Series: Returns ------- - Series[u32] + Series + Series of data type :class:`UInt32`. Examples -------- @@ -354,7 +352,8 @@ def n_chars(self) -> Series: Returns ------- - Series[u32] + Series + Series of data type :class:`UInt32`. Notes ----- @@ -387,7 +386,8 @@ def concat(self, delimiter: str = "-") -> Series: Returns ------- - Series of dtype Utf8 + Series + Series of data type :class:`Utf8`. Examples -------- @@ -436,7 +436,8 @@ def contains( Returns ------- - Boolean mask + Series + Series of data type :class:`Boolean`. Examples -------- @@ -543,7 +544,8 @@ def encode(self, encoding: TransferEncoding) -> Series: Returns ------- - Utf8 array with values encoded using provided encoding + Series + Series of data type :class:`Utf8`. 
Examples -------- @@ -612,8 +614,9 @@ def json_path_match(self, json_path: str) -> Series: Returns ------- - Utf8 array. Contain null if original value is null or the json_path return - nothing. + Series + Series of data type :class:`Utf8`. Contains null values if the original + value is null or the json_path returns nothing. Examples -------- @@ -647,6 +650,12 @@ def extract(self, pattern: str, group_index: int = 1) -> Series: Group 0 mean the whole pattern, first group begin at index 1 Default to the first capture group + Returns + ------- + Series + Series of data type :class:`Utf8`. Contains null values if the original + value is null or regex captures nothing. + Notes ----- To modify regular expression behaviour (such as multi-line matching) @@ -671,10 +680,6 @@ def extract(self, pattern: str, group_index: int = 1) -> Series: `_ for additional information about the use of inline expression modifiers. - Returns - ------- - Utf8 array. Contain null if original value is null or regex capture nothing. - Examples -------- >>> s = pl.Series( @@ -749,7 +754,8 @@ def extract_all(self, pattern: str | Series) -> Series: Returns ------- - List[Utf8] + Series + Series of data type ``List(Utf8)``. Examples -------- @@ -776,7 +782,9 @@ def count_match(self, pattern: str) -> Series: Returns ------- - UInt32 array. Contain null if original value is null or regex capture nothing. + Series + Series of data type :class:`UInt32`. Contains null values if the original + value is null or if the regex captures nothing. Examples -------- @@ -805,7 +813,8 @@ def split(self, by: str, *, inclusive: bool = False) -> Series: Returns ------- - List of Utf8 type + Series + Series of data type ``List(Utf8)``. """ @@ -864,7 +873,8 @@ def split_exact(self, by: str, n: int, *, inclusive: bool = False) -> Series: Returns ------- - Struct of Utf8 type + Series + Series of data type :class:`Struct` with fields of data type :class:`Utf8`. 
""" @@ -921,7 +931,8 @@ def splitn(self, by: str, n: int) -> Series: Returns ------- - Struct of Utf8 type + Series + Series of data type :class:`Struct` with fields of data type :class:`Utf8`. """ s = wrap_s(self._s) @@ -1268,7 +1279,7 @@ def slice(self, offset: int, length: int | None = None) -> Series: Returns ------- Series - Series of dtype Utf8. + Series of data type :class:`Struct` with fields of data type :class:`Utf8`. Examples -------- @@ -1303,7 +1314,8 @@ def explode(self) -> Series: Returns ------- - Exploded column with string datatype. + Series + Series of data type :class:`Utf8`. Examples -------- @@ -1340,7 +1352,8 @@ def parse_int(self, radix: int = 2, *, strict: bool = True) -> Series: Returns ------- - Series of parsed integers in i32 format + Series + Series of data type :class:`Int32`. Examples -------- diff --git a/py-polars/polars/series/utils.py b/py-polars/polars/series/utils.py index b585c5228e07a..bbb00ececb606 100644 --- a/py-polars/polars/series/utils.py +++ b/py-polars/polars/series/utils.py @@ -138,7 +138,7 @@ def get_ffi_func( name: str, dtype: PolarsDataType, obj: PySeries ) -> Callable[..., Any] | None: """ - Dynamically obtain the proper ffi function/ method. + Dynamically obtain the proper FFI function/ method. Parameters ---------- @@ -153,7 +153,8 @@ def get_ffi_func( Returns ------- - ffi function, or None if not found + callable or None + FFI function, or None if not found. 
""" ffi_name = dtype_to_ffiname(dtype) diff --git a/py-polars/polars/sql/context.py b/py-polars/polars/sql/context.py index 5ec88b4080071..486405b4af320 100644 --- a/py-polars/polars/sql/context.py +++ b/py-polars/polars/sql/context.py @@ -13,7 +13,7 @@ from polars.lazyframe import LazyFrame from polars.type_aliases import FrameType from polars.utils._wrap import wrap_ldf -from polars.utils.decorators import deprecated_alias, redirect +from polars.utils.deprecation import deprecated_alias, redirect from polars.utils.various import _get_stack_locals with contextlib.suppress(ImportError): # Module not available when building docs diff --git a/py-polars/polars/string_cache.py b/py-polars/polars/string_cache.py index 928c6d22a0d39..9af6335df8380 100644 --- a/py-polars/polars/string_cache.py +++ b/py-polars/polars/string_cache.py @@ -1,10 +1,9 @@ from __future__ import annotations import contextlib -import warnings from typing import TYPE_CHECKING -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import deprecated_name with contextlib.suppress(ImportError): # Module not available when building docs from polars.polars import enable_string_cache as _enable_string_cache @@ -119,6 +118,7 @@ def enable_string_cache(enable: bool) -> None: _enable_string_cache(enable) +@deprecated_name(new_name="enable_string_cache", version="0.17.0") def toggle_string_cache(toggle: bool) -> None: """ Enable (or disable) the global string cache. @@ -129,12 +129,6 @@ def toggle_string_cache(toggle: bool) -> None: .. 
deprecated:: 0.17.0 """ - warnings.warn( - "`toggle_string_cache` has been renamed; this" - " redirect is temporary, please use `enable_string_cache` instead", - category=DeprecationWarning, - stacklevel=find_stacklevel(), - ) enable_string_cache(toggle) diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index 931fa827cc011..459656ac8374f 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -25,7 +25,6 @@ from polars.dependencies import numpy as np from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa - from polars.functions.whenthen import WhenThen, WhenThenThen from polars.selectors import _selector_proxy_ if sys.version_info >= (3, 10): @@ -61,15 +60,12 @@ ] SchemaDict: TypeAlias = Mapping[str, PolarsDataType] -# Types that qualify as expressions (eg: for use in 'select', 'with_columns'...) -PolarsExprType: TypeAlias = Union["Expr", "WhenThen", "WhenThenThen"] - # literal types that are allowed in expressions (auto-converted to pl.lit) PythonLiteral: TypeAlias = Union[ str, int, float, bool, date, time, datetime, timedelta, bytes, Decimal, List[Any] ] -IntoExpr: TypeAlias = Union[PolarsExprType, PythonLiteral, "Series", None] +IntoExpr: TypeAlias = Union["Expr", PythonLiteral, "Series", None] ComparisonOperator: TypeAlias = Literal["eq", "neq", "gt", "lt", "gt_eq", "lt_eq"] # selector type diff --git a/py-polars/polars/utils/_construction.py b/py-polars/polars/utils/_construction.py index 25efe25d6a32f..832e9f04b4e3b 100644 --- a/py-polars/polars/utils/_construction.py +++ b/py-polars/polars/utils/_construction.py @@ -154,8 +154,8 @@ def nt_unpack(obj: Any) -> Any: def series_to_pyseries(name: str, values: Series) -> PySeries: - """Construct a PySeries from a Polars Series.""" - py_s = values._s + """Construct a new PySeries from a Polars Series.""" + py_s = values._s.clone() py_s.rename(name) return py_s @@ -1410,7 +1410,7 @@ def series_to_pydf( schema or 
series_name, schema_overrides=schema_overrides, n_expected=1 ) if schema_overrides: - new_dtype = list(schema_overrides.values())[0] + new_dtype = next(iter(schema_overrides.values())) if new_dtype != data.dtype: data_series[0] = data_series[0].cast(new_dtype, True) diff --git a/py-polars/polars/utils/_parse_expr_input.py b/py-polars/polars/utils/_parse_expr_input.py index b2671ff46bb75..f8dff1f86e601 100644 --- a/py-polars/polars/utils/_parse_expr_input.py +++ b/py-polars/polars/utils/_parse_expr_input.py @@ -1,13 +1,12 @@ from __future__ import annotations -import warnings from datetime import date, datetime, time, timedelta from typing import TYPE_CHECKING, Iterable import polars._reexport as pl from polars import functions as F from polars.exceptions import ComputeError -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import issue_deprecation_warning if TYPE_CHECKING: from polars import Expr @@ -55,12 +54,11 @@ def _parse_regular_inputs( and isinstance(inputs[0], Iterable) and not isinstance(inputs[0], (str, pl.Series)) ): - warnings.warn( + issue_deprecation_warning( "In the next breaking release, combining list input and positional input will result in an error." " To silence this warning, either unpack the list , or append the positional inputs to the list first." " The resulting behavior will be identical", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.4", ) input_list = _first_input_to_list(inputs[0]) @@ -72,13 +70,12 @@ def _first_input_to_list( inputs: IntoExpr | Iterable[IntoExpr], ) -> list[IntoExpr]: if inputs is None: - warnings.warn( + issue_deprecation_warning( "In the next breaking release, passing `None` as the first expression input will evaluate to `lit(None)`," " rather than be ignored." 
" To silence this warning, either pass no arguments or an empty list to retain the current behavior," " or pass `lit(None)` to opt into the new behavior.", - DeprecationWarning, - stacklevel=find_stacklevel(), + version="0.18.0", ) return [] elif not isinstance(inputs, Iterable) or isinstance(inputs, (str, pl.Series)): @@ -132,8 +129,6 @@ def parse_as_expression( elif isinstance(input, list): expr = F.lit(pl.Series("", [input])) structify = False - elif isinstance(input, (F.whenthen.WhenThen, F.whenthen.WhenThenThen)): - expr = input.otherwise(None) # implicitly add the null branch. else: raise TypeError( f"did not expect value {input!r} of type {type(input)}, maybe disambiguate with" diff --git a/py-polars/polars/utils/convert.py b/py-polars/polars/utils/convert.py index a102573930bfc..892943429ed48 100644 --- a/py-polars/polars/utils/convert.py +++ b/py-polars/polars/utils/convert.py @@ -30,13 +30,13 @@ elif _ZONEINFO_AVAILABLE: from backports.zoneinfo._zoneinfo import ZoneInfo - def get_zoneinfo(key: str) -> ZoneInfo: + def get_zoneinfo(key: str) -> ZoneInfo: # noqa: D103 pass else: @lru_cache(None) - def get_zoneinfo(key: str) -> ZoneInfo: + def get_zoneinfo(key: str) -> ZoneInfo: # noqa: D103 return zoneinfo.ZoneInfo(key) diff --git a/py-polars/polars/utils/decorators.py b/py-polars/polars/utils/deprecation.py similarity index 69% rename from py-polars/polars/utils/decorators.py rename to py-polars/polars/utils/deprecation.py index 37107ed5c7229..649582b9cdea7 100644 --- a/py-polars/polars/utils/decorators.py +++ b/py-polars/polars/utils/deprecation.py @@ -19,21 +19,35 @@ T = TypeVar("T") -def deprecated_alias(**aliases: str) -> Callable[[Callable[P, T]], Callable[P, T]]: +def issue_deprecation_warning(message: str, *, version: str) -> None: """ - Deprecate a function or method argument. + Issue a deprecation warning. - Decorator for deprecated function and method arguments. 
Use as follows: + Parameters + ---------- + message + The message associated with the warning. + version + The Polars version number in which the warning is first issued. + This argument is used to help developers determine when to remove the + deprecated functionality. - @deprecated_alias(old_arg='new_arg') - def myfunc(new_arg): - ... """ + warnings.warn(message, DeprecationWarning, stacklevel=find_stacklevel()) + + +def deprecated( + message: str, *, version: str +) -> Callable[[Callable[P, T]], Callable[P, T]]: + """Decorator to mark a function as deprecated.""" def deco(function: Callable[P, T]) -> Callable[P, T]: @wraps(function) def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: - _rename_kwargs(function.__name__, kwargs, aliases) + issue_deprecation_warning( + f"`{function.__name__}` is deprecated and will be removed in a future version. {message}", + version=version, + ) return function(*args, **kwargs) return wrapper @@ -41,31 +55,36 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: return deco -def warn_closed_future_change() -> Callable[[Callable[P, T]], Callable[P, T]]: +def deprecated_name( + new_name: str, *, version: str +) -> Callable[[Callable[P, T]], Callable[P, T]]: """ - Warn that user should pass in 'closed' as default value will change. + Decorator to mark a function as deprecated due to being renamed. - Decorator for rolling function. Use as follows: + Notes + ----- + For deprecating renamed class methods, use the ``redirect`` class decorator instead. + + """ + return deprecated(f"It has been renamed to `{new_name}`.", version=version) + + +def deprecated_alias(**aliases: str) -> Callable[[Callable[P, T]], Callable[P, T]]: + """ + Decorator to mark function arguments as deprecated due to being renamed. + + Use as follows:: + + @deprecated_alias(old_arg='new_arg') + def myfunc(new_arg): + ... - @warn_closed_future_change() - def myfunc(): - ... 
""" def deco(function: Callable[P, T]) -> Callable[P, T]: @wraps(function) def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: - # we only warn if 'by' is passed in, otherwise 'closed' is not used - if (kwargs.get("by") is not None) and ("closed" not in kwargs): - warnings.warn( - message=( - "The default argument for closed, 'left', will be changed to 'right' in the future." - "Fix this warning by explicitly passing in a value for closed" - ), - category=FutureWarning, - stacklevel=find_stacklevel(), - ) - + _rename_kwargs(function.__name__, kwargs, aliases) return function(*args, **kwargs) return wrapper @@ -81,29 +100,62 @@ def _rename_kwargs( """ Rename the keyword arguments of a function. - Helper function for deprecating function and method arguments. + Helper function for deprecating function arguments. + """ for alias, new in aliases.items(): if alias in kwargs: if new in kwargs: raise TypeError( - f"{func_name} received both {alias} and {new} as arguments!" - f" {alias} is deprecated, use {new} instead." + f"`{func_name}` received both `{alias}` and `{new}` as arguments." + f" `{alias}` is deprecated, use `{new}` instead." ) - warnings.warn( - message=( - f"`{alias}` is deprecated as an argument to `{func_name}`; use" - f" `{new}` instead." - ), - category=DeprecationWarning, - stacklevel=find_stacklevel(), + issue_deprecation_warning( + f"`{alias}` is deprecated as an argument to `{func_name}`;" + f" use `{new}` instead.", + version="", ) kwargs[new] = kwargs.pop(alias) +def redirect( + from_to: dict[str, str | tuple[str, dict[str, Any]]] +) -> Callable[[type[T]], type[T]]: + """ + Class decorator allowing deprecation/transition from one method name to another. + + The parameters must be the same (unless they are being renamed, in which case + you can use this in conjunction with @deprecated_alias). If you need to redirect + with custom kwargs, can redirect to a method name and associated kwargs dict. 
+ + """ + + def _redirecting_getattr_(obj: T, item: Any) -> Any: + if isinstance(item, str) and item in from_to: + new_item = from_to[item] + new_item_name = new_item if isinstance(new_item, str) else new_item[0] + issue_deprecation_warning( + f"`{type(obj).__name__}.{item}` has been renamed;" + f" this redirect is temporary, please use `.{new_item_name}` instead", + version="", + ) + item = new_item_name + + attr = obj.__getattribute__(item) + if isinstance(new_item, tuple): + attr = partial(attr, **new_item[1]) + return attr + + def _cls_(cls: type[T]) -> type[T]: + # note: __getattr__ is only invoked if item isn't found on the class + cls.__getattr__ = _redirecting_getattr_ # type: ignore[attr-defined] + return cls + + return _cls_ + + def deprecate_nonkeyword_arguments( - allowed_args: list[str] | None = None, - message: str | None = None, + allowed_args: list[str] | None = None, message: str | None = None, *, version: str ) -> Callable[[Callable[P, T]], Callable[P, T]]: """ Decorator to deprecate the use of non-keyword arguments of a function. @@ -117,6 +169,11 @@ def deprecate_nonkeyword_arguments( default value. message Optionally overwrite the default warning message. + version + The Polars version number in which the warning is first issued. + This argument is used to help developers determine when to remove the + deprecated functionality. 
+ """ def decorate(function: Callable[P, T]) -> Callable[P, T]: @@ -158,7 +215,7 @@ def decorate(function: Callable[P, T]) -> Callable[P, T]: @wraps(function) def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: if len(args) > num_allowed_args: - warnings.warn(msg, DeprecationWarning, stacklevel=find_stacklevel()) + issue_deprecation_warning(msg, version=version) return function(*args, **kwargs) wrapper.__signature__ = new_sig # type: ignore[attr-defined] @@ -184,37 +241,34 @@ def _format_argument_list(allowed_args: list[str]) -> str: return f" except for {args} and {last!r}" -def redirect( - from_to: dict[str, str | tuple[str, dict[str, Any]]] -) -> Callable[[type[T]], type[T]]: +def warn_closed_future_change() -> Callable[[Callable[P, T]], Callable[P, T]]: """ - Class decorator allowing deprecation/transition from one method name to another. + Warn that user should pass in 'closed' as default value will change. + + Decorator for rolling function. Use as follows:: + + @warn_closed_future_change() + def myfunc(): + ... - The parameters must be the same (unless they are being renamed, in which case - you can use this in conjunction with @deprecated_alias). If you need to redirect - with custom kwargs, can redirect to a method name and associated kwargs dict. 
""" - def _redirecting_getattr_(obj: T, item: Any) -> Any: - if isinstance(item, str) and item in from_to: - new_item = from_to[item] - new_item_name = new_item if isinstance(new_item, str) else new_item[0] - warnings.warn( - f"`{type(obj).__name__}.{item}` has been renamed; this" - f" redirect is temporary, please use `.{new_item_name}` instead", - category=DeprecationWarning, - stacklevel=find_stacklevel(), - ) - item = new_item_name + def deco(function: Callable[P, T]) -> Callable[P, T]: + @wraps(function) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: + # we only warn if 'by' is passed in, otherwise 'closed' is not used + if (kwargs.get("by") is not None) and ("closed" not in kwargs): + warnings.warn( + message=( + "The default argument for closed, 'left', will be changed to 'right' in the future." + "Fix this warning by explicitly passing in a value for closed" + ), + category=FutureWarning, + stacklevel=find_stacklevel(), + ) - attr = obj.__getattribute__(item) - if isinstance(new_item, tuple): - attr = partial(attr, **new_item[1]) - return attr + return function(*args, **kwargs) - def _cls_(cls: type[T]) -> type[T]: - # note: __getattr__ is only invoked if item isn't found on the class - cls.__getattr__ = _redirecting_getattr_ # type: ignore[attr-defined] - return cls + return wrapper - return _cls_ + return deco diff --git a/py-polars/polars/utils/meta.py b/py-polars/polars/utils/meta.py index 8bb2324a8273c..7572a701c034d 100644 --- a/py-polars/polars/utils/meta.py +++ b/py-polars/polars/utils/meta.py @@ -2,10 +2,9 @@ from __future__ import annotations import contextlib -import warnings from typing import TYPE_CHECKING -from polars.utils.various import find_stacklevel +from polars.utils.deprecation import deprecated_name with contextlib.suppress(ImportError): # Module not available when building docs from polars.polars import get_index_type as _get_index_type @@ -21,20 +20,16 @@ def get_index_type() -> DataTypeClass: Returns ------- - UInt32 in 
regular Polars, UInt64 in bigidx Polars. + DataType + :class:`UInt32` in regular Polars, :class:`UInt64` in bigidx Polars. """ return _get_index_type() +@deprecated_name(new_name="get_index_type", version="16.12") def get_idx_type() -> DataTypeClass: """Get the datatype used for Polars indexing.""" - warnings.warn( - "`get_idx_type` has been renamed; this" - " redirect is temporary, please use `get_index_type` instead", - category=DeprecationWarning, - stacklevel=find_stacklevel(), - ) return get_index_type() diff --git a/py-polars/polars/utils/udfs.py b/py-polars/polars/utils/udfs.py new file mode 100644 index 0000000000000..32716fc440864 --- /dev/null +++ b/py-polars/polars/utils/udfs.py @@ -0,0 +1,759 @@ +"""Utilities related to user defined functions (such as those passed to `apply`).""" +from __future__ import annotations + +import dis +import inspect +import re +import sys +import warnings +from bisect import bisect_left +from collections import defaultdict +from dis import get_instructions +from inspect import signature +from itertools import count, zip_longest +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, NamedTuple, Union + +if TYPE_CHECKING: + from dis import Instruction + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + + +class StackValue(NamedTuple): + operator: str + operator_arity: int + left_operand: str + right_operand: str + + +ApplyTarget: TypeAlias = Literal["expr", "frame", "series"] +StackEntry: TypeAlias = Union[str, StackValue] + +_MIN_PY311 = sys.version_info >= (3, 11) + + +class OpNames: + BINARY = { + "BINARY_ADD": "+", + "BINARY_AND": "&", + "BINARY_FLOOR_DIVIDE": "//", + "BINARY_MODULO": "%", + "BINARY_MULTIPLY": "*", + "BINARY_OR": "|", + "BINARY_POWER": "**", + "BINARY_SUBTRACT": "-", + "BINARY_TRUE_DIVIDE": "/", + "BINARY_XOR": "^", + } + CALL = {"CALL"} if _MIN_PY311 else {"CALL_FUNCTION", "CALL_METHOD"} + 
CONTROL_FLOW = ( + { + "POP_JUMP_FORWARD_IF_FALSE": "&", + "POP_JUMP_FORWARD_IF_TRUE": "|", + "JUMP_IF_FALSE_OR_POP": "&", + "JUMP_IF_TRUE_OR_POP": "|", + } + if _MIN_PY311 + else { + "POP_JUMP_IF_FALSE": "&", + "POP_JUMP_IF_TRUE": "|", + "JUMP_IF_FALSE_OR_POP": "&", + "JUMP_IF_TRUE_OR_POP": "|", + } + ) + LOAD_VALUES = frozenset(("LOAD_CONST", "LOAD_DEREF", "LOAD_FAST", "LOAD_GLOBAL")) + LOAD_ATTR = {"LOAD_ATTR"} if _MIN_PY311 else {"LOAD_METHOD"} + LOAD = LOAD_VALUES | {"LOAD_METHOD", "LOAD_ATTR"} + SYNTHETIC = { + "POLARS_EXPRESSION": 1, + } + UNARY = { + "UNARY_NEGATIVE": "-", + "UNARY_POSITIVE": "+", + "UNARY_NOT": "~", + } + PARSEABLE_OPS = ( + {"BINARY_OP", "COMPARE_OP", "CONTAINS_OP", "IS_OP"} + | set(UNARY) + | set(CONTROL_FLOW) + | set(SYNTHETIC) + | LOAD_VALUES + ) + UNARY_VALUES = frozenset(UNARY.values()) + + +# numpy functions that we can map to native expressions +_NUMPY_MODULE_ALIASES = frozenset(("np", "numpy")) +_NUMPY_FUNCTIONS = frozenset( + ("cbrt", "cos", "cosh", "sin", "sinh", "sqrt", "tan", "tanh") +) + +# python functions that we can map to native expressions +_PYTHON_CASTS_MAP = {"float": "Float64", "int": "Int64", "str": "Utf8"} +_PYTHON_BUILTINS = frozenset(_PYTHON_CASTS_MAP) | {"abs"} +_PYTHON_METHODS_MAP = { + "lower": "str.to_lowercase", + "title": "str.to_titlecase", + "upper": "str.to_uppercase", +} + + +def _get_all_caller_variables() -> dict[str, Any]: + """Get all local and global variables from caller's frame.""" + pkg_dir = Path(__file__).parent.parent + + # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow + frame = inspect.currentframe() + n = 0 + while frame: + fname = inspect.getfile(frame) + if fname.startswith(str(pkg_dir)): + frame = frame.f_back + n += 1 + else: + break + if frame is None: + return {} + return {**frame.f_locals, **frame.f_globals} + + +class BytecodeParser: + """Introspect UDF bytecode and determine if we can rewrite as native expression.""" + + _can_rewrite: dict[str, bool] + 
_apply_target_name: str | None = None + + def __init__(self, function: Callable[[Any], Any], apply_target: ApplyTarget): + try: + original_instructions = get_instructions(function) + except TypeError: + # in case we hit something that can't be disassembled (eg: code object + # unavailable, like a bare numpy ufunc that isn't in a lambda/function) + original_instructions = iter([]) + + self._can_rewrite = {} + self._function = function + self._apply_target = apply_target + self._param_name = self._get_param_name(function) + self._rewritten_instructions = RewrittenInstructions( + instructions=original_instructions, + ) + + @staticmethod + def _get_param_name(function: Callable[[Any], Any]) -> str | None: + """Return single function parameter name.""" + try: + # note: we do not parse/handle functions with > 1 params + sig = signature(function) + return ( + next(iter(parameters.keys())) + if len(parameters := sig.parameters) == 1 + else None + ) + except ValueError: + return None + + def _inject_nesting( + self, + expression_blocks: dict[int, str], + logical_instructions: list[Instruction], + ) -> list[tuple[int, str]]: + """Inject nesting boundaries into expression blocks (as parentheses).""" + if logical_instructions: + # reconstruct nesting boundaries for mixed and/or ops by associating + # control flow jump offsets with their target expression blocks and + # injecting appropriate parentheses + combined_offset_idxs = set() + if len({inst.opname for inst in logical_instructions}) > 1: + block_offsets: list[int] = list(expression_blocks.keys()) + previous_logical_opname = "" + for i, inst in enumerate(logical_instructions): + # operator precedence means that we can combine logically connected + # 'and' blocks into one (depending on follow-on logic) and should + # parenthesise nested 'or' blocks + logical_op = OpNames.CONTROL_FLOW[inst.opname] + start = block_offsets[bisect_left(block_offsets, inst.offset) - 1] + if previous_logical_opname == ( + 
"POP_JUMP_FORWARD_IF_FALSE" + if _MIN_PY311 + else "POP_JUMP_IF_FALSE" + ): + # combine logical '&' blocks (and update start/block_offsets) + prev = block_offsets[bisect_left(block_offsets, start) - 1] + expression_blocks[prev] += f" & {expression_blocks.pop(start)}" + combined_offset_idxs.add(i - 1) + block_offsets.remove(start) + start = prev + + if logical_op == "|": + # parenthesise connected 'or' blocks + end = block_offsets[bisect_left(block_offsets, inst.argval) - 1] + if not (start == 0 and end == block_offsets[-1]): + expression_blocks[start] = "(" + expression_blocks[start] + expression_blocks[end] += ")" + + previous_logical_opname = inst.opname + + for i, inst in enumerate(logical_instructions): + if i not in combined_offset_idxs: + expression_blocks[inst.offset] = OpNames.CONTROL_FLOW[inst.opname] + + return sorted(expression_blocks.items()) + + def _get_target_name(self, col: str, expression: str) -> str: + """The name of the object against which the 'apply' is being invoked.""" + if self._apply_target_name is not None: + return self._apply_target_name + else: + col_expr = f'pl.col("{col}")' + if self._apply_target == "expr": + return col_expr + elif self._apply_target == "series": + # note: handle overlapping name from global variables; fallback + # through "s", "srs", "series" and (finally) srs0 -> srsN... 
+ search_expr = expression.replace(col_expr, "") + for name in ("s", "srs", "series"): + if not re.search(rf"\b{name}\b", search_expr): + self._apply_target_name = name + return name + n = count() + while True: + name = f"srs{next(n)}" + if not re.search(rf"\b{name}\b", search_expr): + self._apply_target_name = name + return name + + raise NotImplementedError(f"TODO: apply_target = {self._apply_target!r}") + + @property + def apply_target(self) -> ApplyTarget: + """The apply target, eg: one of 'expr', 'frame', or 'series'.""" + return self._apply_target + + def can_rewrite(self) -> bool: + """ + Determine if bytecode indicates only simple binary ops and/or comparisons. + + Note that `lambda x: x` is inefficient, but we ignore it because it is not + guaranteed that using the equivalent bare constant value will return the + same output. (Hopefully nobody is writing lambdas like that anyway...) + """ + if (can_rewrite := self._can_rewrite.get(self._apply_target, None)) is not None: + return can_rewrite + else: + self._can_rewrite[self._apply_target] = False + if self._rewritten_instructions and self._param_name is not None: + self._can_rewrite[self._apply_target] = len( + self._rewritten_instructions + ) >= 2 and all( + inst.opname in OpNames.PARSEABLE_OPS + for inst in self._rewritten_instructions + ) + + return self._can_rewrite[self._apply_target] + + def dis(self) -> None: + """Print disassembled function bytecode.""" + dis.dis(self._function) + + @property + def function(self) -> Callable[[Any], Any]: + """The function being parsed.""" + return self._function + + @property + def original_instructions(self) -> list[Instruction]: + """The original bytecode instructions from the function we are parsing.""" + return list(get_instructions(self._function)) + + @property + def param_name(self) -> str | None: + """The parameter name of the function being parsed.""" + return self._param_name + + @property + def rewritten_instructions(self) -> list[Instruction]: + """The 
rewritten bytecode instructions from the function we are parsing.""" + return list(self._rewritten_instructions) + + def to_expression(self, col: str) -> str | None: + """Translate postfix bytecode instructions to polars expression/string.""" + self._apply_target_name = None + if not self.can_rewrite() or self._param_name is None: + return None + + # decompose bytecode into logical 'and'/'or' expression blocks (if present) + control_flow_blocks = defaultdict(list) + logical_instructions = [] + jump_offset = 0 + for idx, inst in enumerate(self._rewritten_instructions): + if inst.opname in OpNames.CONTROL_FLOW: + jump_offset = self._rewritten_instructions[idx + 1].offset + logical_instructions.append(inst) + else: + control_flow_blocks[jump_offset].append(inst) + + # convert each block to a polars expression string + expression_strings = self._inject_nesting( + { + offset: InstructionTranslator( + instructions=ops, + apply_target=self._apply_target, + ).to_expression( + col=col, + param_name=self._param_name, + depth=int(bool(logical_instructions)), + ) + for offset, ops in control_flow_blocks.items() + }, + logical_instructions, + ) + polars_expr = " ".join(expr for _offset, expr in expression_strings) + + # note: if no 'pl.col' in the expression, it likely represents a compound + # constant value (e.g. `lambda x: CONST + 123`), so we don't want to warn + if "pl.col(" not in polars_expr: + return None + elif self._apply_target == "series": + return polars_expr.replace( + f'pl.col("{col}")', + self._get_target_name(col, polars_expr), + ) + else: + return polars_expr + + def warn( + self, + col: str, + suggestion_override: str | None = None, + udf_override: str | None = None, + ) -> None: + """Generate warning that suggests an equivalent native polars expression.""" + # Import these here so that udfs can be imported without polars installed. 
+ + from polars.exceptions import PolarsInefficientApplyWarning + from polars.utils.various import ( + find_stacklevel, + in_terminal_that_supports_colour, + ) + + suggested_expression = suggestion_override or self.to_expression(col) + + if suggested_expression is not None: + target_name = self._get_target_name(col, suggested_expression) + func_name = udf_override or self._function.__name__ or "..." + if func_name == "": + func_name = f"lambda {self._param_name}: ..." + + addendum = ( + 'Note: in list.eval context, pl.col("") should be written as pl.element()' + if 'pl.col("")' in suggested_expression + else "" + ) + if self._apply_target == "expr": + apitype = "expressions" + clsname = "Expr" + else: + apitype = "series" + clsname = "Series" + + before_after_suggestion = ( + ( + f" \033[31m- {target_name}.apply({func_name})\033[0m\n" + f" \033[32m+ {suggested_expression}\033[0m\n{addendum}" + ) + if in_terminal_that_supports_colour() + else ( + f" - {target_name}.apply({func_name})\n" + f" + {suggested_expression}\n{addendum}" + ) + ) + warnings.warn( + f"\n{clsname}.apply is significantly slower than the native {apitype} API.\n" + "Only use if you absolutely CANNOT implement your logic otherwise.\n" + "In this case, you can replace your `apply` with the following:\n" + f"{before_after_suggestion}", + PolarsInefficientApplyWarning, + stacklevel=find_stacklevel(), + ) + + +class InstructionTranslator: + """Translates Instruction bytecode to a polars expression string.""" + + def __init__(self, instructions: list[Instruction], apply_target: ApplyTarget): + self._stack = self._to_intermediate_stack(instructions, apply_target) + + def to_expression(self, col: str, param_name: str, depth: int) -> str: + """Convert intermediate stack to polars expression string.""" + return self._expr(self._stack, col, param_name, depth) + + @classmethod + def op(cls, inst: Instruction) -> str: + """Convert bytecode instruction to suitable intermediate op string.""" + if inst.opname in 
OpNames.CONTROL_FLOW: + return OpNames.CONTROL_FLOW[inst.opname] + elif inst.argrepr: + return inst.argrepr + elif inst.opname == "IS_OP": + return "is not" if inst.argval else "is" + elif inst.opname == "CONTAINS_OP": + return "not in" if inst.argval else "in" + elif inst.opname in OpNames.UNARY: + return OpNames.UNARY[inst.opname] + else: + raise AssertionError( + "Unrecognised opname; please report a bug to https://github.com/pola-rs/polars/issues " + "with the content of function you were passing to `apply` and the " + f"following instruction object:\n{inst}" + ) + + @classmethod + def _expr(cls, value: StackEntry, col: str, param_name: str, depth: int) -> str: + """Take stack entry value and convert to polars expression string.""" + if isinstance(value, StackValue): + op = value.operator + e1 = cls._expr(value.left_operand, col, param_name, depth + 1) + if value.operator_arity == 1: + if op not in OpNames.UNARY_VALUES: + if not e1.startswith("pl.col("): + # support use of consts as numpy/builtin params, eg: + # "np.sin(3) + np.cos(x)", or "len('const_string') + len(x)" + pfx = "np." 
if op in _NUMPY_FUNCTIONS else "" + return f"{pfx}{op}({e1})" + + call = "" if op.endswith(")") else "()" + return f"{e1}.{op}{call}" + return f"{op}{e1}" + else: + e2 = cls._expr(value.right_operand, col, param_name, depth + 1) + if op in ("is", "is not") and value[2] == "None": + not_ = "" if op == "is" else "not_" + return f"{e1}.is_{not_}null()" + elif op in ("in", "not in"): + not_ = "" if op == "in" else "~" + return ( + f"{not_}({e1}.is_in({e2}))" + if " " in e1 + else f"{not_}{e1}.is_in({e2})" + ) + else: + expr = f"{e1} {op} {e2}" + return f"({expr})" if depth else expr + + elif value == param_name: + return f'pl.col("{col}")' + + return value + + def _to_intermediate_stack( + self, instructions: list[Instruction], apply_target: ApplyTarget + ) -> StackEntry: + """Take postfix bytecode and convert to an intermediate natural-order stack.""" + if apply_target in ("expr", "series"): + stack: list[StackEntry] = [] + for inst in instructions: + stack.append( + inst.argrepr + if inst.opname in OpNames.LOAD + else ( + StackValue( + operator=self.op(inst), + operator_arity=1, + left_operand=stack.pop(), # type: ignore[arg-type] + right_operand=None, # type: ignore[arg-type] + ) + if ( + inst.opname in OpNames.UNARY + or OpNames.SYNTHETIC.get(inst.opname) == 1 + ) + else StackValue( + operator=self.op(inst), + operator_arity=2, + left_operand=stack.pop(-2), # type: ignore[arg-type] + right_operand=stack.pop(-1), # type: ignore[arg-type] + ) + ) + ) + return stack[0] + + # TODO: frame apply (account for BINARY_SUBSCR) + # TODO: series apply (rewrite col expr as series) + raise NotImplementedError(f"TODO: {apply_target!r} apply") + + +class RewrittenInstructions: + """ + Standalone class that applies Instruction rewrite/filtering rules. 
+ + This significantly simplifies subsequent parsing by injecting + synthetic POLARS_EXPRESSION ops into the Instruction stream for + easy identification/translation and separates the parsing logic + from the identification of expression translation opportunities. + """ + + _ignored_ops = frozenset(["COPY_FREE_VARS", "PRECALL", "RESUME", "RETURN_VALUE"]) + + def __init__(self, instructions: Iterator[Instruction]): + self._rewritten_instructions = self._rewrite( + self._upgrade_instruction(inst) + for inst in instructions + if inst.opname not in self._ignored_ops + ) + + def __len__(self) -> int: + return len(self._rewritten_instructions) + + def __iter__(self) -> Iterator[Instruction]: + return iter(self._rewritten_instructions) + + def __getitem__(self, item: Any) -> Instruction: + return self._rewritten_instructions[item] + + def _matches( + self, + idx: int, + *, + opnames: list[set[str]], + argvals: list[set[Any] | frozenset[Any] | dict[Any, Any]] | None, + ) -> list[Instruction]: + """ + Check if a sequence of Instructions matches the specified ops/argvals. + + Parameters + ---------- + idx + The index of the first instruction to check. + opnames + The full opname sequence that defines a match. + argvals + Associated argvals that must also match (in same position as opnames). + """ + n_required_ops, argvals = len(opnames), argvals or [] + instructions = self._instructions[idx : idx + n_required_ops] + if len(instructions) == n_required_ops and all( + inst.opname in match_opnames + and (match_argval is None or inst.argval in match_argval) + for inst, match_opnames, match_argval in zip_longest( + instructions, opnames, argvals + ) + ): + return instructions + return [] + + def _rewrite(self, instructions: Iterator[Instruction]) -> list[Instruction]: + """ + Apply rewrite rules, potentially injecting synthetic operations. 
+ + Rules operate on the instruction stream and can examine/modify + it as needed, pushing updates into "updated_instructions" and + returning True/False to indicate if any changes were made. + """ + self._instructions = list(instructions) + updated_instructions: list[Instruction] = [] + idx = 0 + while idx < len(self._instructions): + inst, increment = self._instructions[idx], 1 + if inst.opname not in OpNames.LOAD or not any( + (increment := apply_rewrite(idx, updated_instructions)) + for apply_rewrite in ( + # add any other rewrite methods here + self._rewrite_functions, + self._rewrite_methods, + self._rewrite_builtins, + self._rewrite_lookups, + ) + ): + updated_instructions.append(inst) + idx += increment or 1 + return updated_instructions + + def _rewrite_builtins( + self, idx: int, updated_instructions: list[Instruction] + ) -> int: + """Replace builtin function calls with a synthetic POLARS_EXPRESSION op.""" + if matching_instructions := self._matches( + idx, + opnames=[{"LOAD_GLOBAL"}, {"LOAD_FAST", "LOAD_CONST"}, OpNames.CALL], + argvals=[_PYTHON_BUILTINS], + ): + inst1, inst2 = matching_instructions[:2] + if (argval := inst1.argval) in _PYTHON_CASTS_MAP: + dtype = _PYTHON_CASTS_MAP[argval] + argval = f"cast(pl.{dtype})" + + synthetic_call = inst1._replace( + opname="POLARS_EXPRESSION", + argval=argval, + argrepr=argval, + offset=inst2.offset, + ) + # POLARS_EXPRESSION is mapped as a unary op, so switch instruction order + operand = inst2._replace(offset=inst1.offset) + updated_instructions.extend((operand, synthetic_call)) + + return len(matching_instructions) + + def _rewrite_lookups( + self, idx: int, updated_instructions: list[Instruction] + ) -> int: + """Replace dictionary lookups with a synthetic POLARS_EXPRESSION op.""" + if matching_instructions := self._matches( + idx, + opnames=[{"LOAD_GLOBAL"}, {"LOAD_FAST"}, {"BINARY_SUBSCR"}], + argvals=[], + ): + inst1, inst2 = matching_instructions[:2] + variables = _get_all_caller_variables() + if 
isinstance(variables.get(argval := inst1.argval, None), dict): + argval = f"map_dict({inst1.argval})" + else: + return 0 + + synthetic_call = inst1._replace( + opname="POLARS_EXPRESSION", + argval=argval, + argrepr=argval, + offset=inst2.offset, + ) + # POLARS_EXPRESSION is mapped as a unary op, so switch instruction order + operand = inst2._replace(offset=inst1.offset) + updated_instructions.extend((operand, synthetic_call)) + + return len(matching_instructions) + + def _rewrite_functions( + self, idx: int, updated_instructions: list[Instruction] + ) -> int: + """Replace numpy/json function calls with a synthetic POLARS_EXPRESSION op.""" + if matching_instructions := self._matches( + idx, + opnames=[ + {"LOAD_GLOBAL"}, + OpNames.LOAD_ATTR, + {"LOAD_FAST", "LOAD_CONST"}, + OpNames.CALL, + ], + argvals=[ + _NUMPY_MODULE_ALIASES | {"json"}, + _NUMPY_FUNCTIONS | {"loads"}, + ], + ): + inst1, inst2, inst3 = matching_instructions[:3] + expr_name = "str.json_extract" if inst1.argval == "json" else inst2.argval + synthetic_call = inst1._replace( + opname="POLARS_EXPRESSION", + argval=expr_name, + argrepr=expr_name, + offset=inst3.offset, + ) + # POLARS_EXPRESSION is mapped as a unary op, so switch instruction order + operand = inst3._replace(offset=inst1.offset) + updated_instructions.extend((operand, synthetic_call)) + + return len(matching_instructions) + + def _rewrite_methods( + self, idx: int, updated_instructions: list[Instruction] + ) -> int: + """Replace python method calls with synthetic POLARS_EXPRESSION op.""" + if matching_instructions := self._matches( + idx, + opnames=[{"LOAD_METHOD"}, OpNames.CALL], + argvals=[_PYTHON_METHODS_MAP], + ): + inst = matching_instructions[0] + expr_name = _PYTHON_METHODS_MAP[inst.argval] + synthetic_call = inst._replace( + opname="POLARS_EXPRESSION", argval=expr_name, argrepr=expr_name + ) + updated_instructions.append(synthetic_call) + + return len(matching_instructions) + + @staticmethod + def _upgrade_instruction(inst: 
Instruction) -> Instruction: + """Rewrite any older binary opcodes using py 3.11 'BINARY_OP' instead.""" + if not _MIN_PY311 and inst.opname in OpNames.BINARY: + inst = inst._replace( + argrepr=OpNames.BINARY[inst.opname], + opname="BINARY_OP", + ) + return inst + + +def _is_raw_function(function: Callable[[Any], Any]) -> tuple[str, str]: + """Identify translatable calls that aren't wrapped inside a lambda/function.""" + try: + func_module = function.__class__.__module__ + func_name = function.__name__ + + # numpy function calls + if func_module == "numpy" and func_name in _NUMPY_FUNCTIONS: + return "np", f"{func_name}()" + + # python function calls + elif func_module == "builtins": + if func_name in _PYTHON_CASTS_MAP: + return "builtins", f"cast(pl.{_PYTHON_CASTS_MAP[func_name]})" + elif func_name == "loads": + import json # double-check since it is referenced via 'builtins' + + if function is json.loads: + return "json", "str.json_extract()" + + except AttributeError: + pass + + return "", "" + + +def warn_on_inefficient_apply( + function: Callable[[Any], Any], columns: list[str], apply_target: ApplyTarget +) -> None: + """ + Generate ``PolarsInefficientApplyWarning`` on poor usage of ``apply`` func. + + Parameters + ---------- + function + The function passed to ``apply``. + columns + The column names of the original object; in the case of an ``Expr`` this + will be a list of length 1 containing the expression's root name. + apply_target + The target of the ``apply`` call. One of ``"expr"``, ``"frame"``, + or ``"series"``. 
+ """ + if apply_target == "frame": + raise NotImplementedError("TODO: 'frame' and 'series' apply-function parsing") + + # note: we only consider simple functions with a single col/param + if not (col := columns and columns[0]): + return None + + # the parser introspects function bytecode to determine if we can + # rewrite as a much more optimal native polars expression instead + parser = BytecodeParser(function, apply_target) + if parser.can_rewrite(): + parser.warn(col) + else: + # handle bare numpy/json functions + module, suggestion = _is_raw_function(function) + if module and suggestion: + fn = function.__name__ + parser.warn( + col, + suggestion_override=f'pl.col("{col}").{suggestion}', + udf_override=fn if module == "builtins" else f"{module}.{fn}", + ) + + +__all__ = [ + "BytecodeParser", + "warn_on_inefficient_apply", +] diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index d6820aaf3b233..2032dc8e15c1e 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -1,6 +1,7 @@ from __future__ import annotations import inspect +import os import re import sys import warnings @@ -182,10 +183,15 @@ def can_create_dicts_with_pyarrow(dtypes: Sequence[PolarsDataType]) -> bool: def normalise_filepath(path: str | Path, check_not_directory: bool = True) -> str: """Create a string path, expanding the home directory if present.""" - path = Path(path).expanduser() - if check_not_directory and path.exists() and path.is_dir(): + # don't use pathlib here as it modifies slashes (s3:// -> s3:/) + path = os.path.expanduser(path) # noqa: PTH111 + if ( + check_not_directory + and os.path.exists(path) # noqa: PTH110 + and os.path.isdir(path) # noqa: PTH112 + ): raise IsADirectoryError(f"Expected a file path; {path!r} is a directory") - return str(path) + return path def parse_version(version: Sequence[str | int]) -> tuple[int, ...]: @@ -320,7 +326,7 @@ def str_duration_(td: str | None) -> int | None: NS = 
TypeVar("NS") -class sphinx_accessor(property): +class sphinx_accessor(property): # noqa: D101 def __get__( # type: ignore[override] self, instance: Any, @@ -352,20 +358,19 @@ def __repr__(self) -> str: def find_stacklevel() -> int: """ - Find the first place in the stack that is not inside polars (tests notwithstanding). + Find the first place in the stack that is not inside polars. Taken from: https://github.com/pandas-dev/pandas/blob/ab89c53f48df67709a533b6a95ce3d911871a0a8/pandas/util/_exceptions.py#L30-L51 """ pkg_dir = Path(pl.__file__).parent - test_dir = pkg_dir / "tests" # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow frame = inspect.currentframe() n = 0 while frame: fname = inspect.getfile(frame) - if fname.startswith(str(pkg_dir)) and not fname.startswith(str(test_dir)): + if fname.startswith(str(pkg_dir)): frame = frame.f_back n += 1 else: @@ -374,10 +379,13 @@ def find_stacklevel() -> int: def _get_stack_locals( - of_type: type | tuple[type, ...] | None = None, n_objects: int | None = None + of_type: type | tuple[type, ...] | None = None, + n_objects: int | None = None, + n_frames: int | None = None, + named: str | tuple[str, ...] | None = None, ) -> dict[str, Any]: """ - Retrieve f_locals from all stack frames (starting from the current frame). + Retrieve f_locals from all (or the last 'n') stack frames from the calling location. Parameters ---------- @@ -385,18 +393,36 @@ def _get_stack_locals( Only return objects of this type. n_objects If specified, return only the most recent ``n`` matching objects. + n_frames + If specified, look at objects in the last ``n`` stack frames only. + named + If specified, only return objects matching the given name(s). 
""" + if isinstance(named, str): + named = (named,) + objects = {} + examined_frames = 0 + if n_frames is None: + n_frames = sys.maxsize stack_frame = getattr(inspect.currentframe(), "f_back", None) - while stack_frame: + + while stack_frame and examined_frames < n_frames: local_items = list(stack_frame.f_locals.items()) for nm, obj in reversed(local_items): - if nm not in objects and (not of_type or isinstance(obj, of_type)): + if ( + nm not in objects + and (named is None or (nm in named)) + and (of_type is None or isinstance(obj, of_type)) + ): objects[nm] = obj if n_objects is not None and len(objects) >= n_objects: return objects + stack_frame = stack_frame.f_back + examined_frames += 1 + return objects @@ -406,3 +432,24 @@ def _polars_warn(msg: str) -> None: msg, stacklevel=find_stacklevel(), ) + + +def in_terminal_that_supports_colour() -> bool: + """ + Determine (within reason) if we are in an interactive terminal that supports color. + + Note: this is not exhaustive, but it covers a lot (most?) of the common cases. + """ + if hasattr(sys.stdout, "isatty"): + # can enhance as necessary, but this is a reasonable start + return ( + sys.stdout.isatty() + and ( + sys.platform != "win32" + or "ANSICON" in os.environ + or "WT_SESSION" in os.environ + or os.environ.get("TERM_PROGRAM") == "vscode" + or os.environ.get("TERM") == "xterm-256color" + ) + ) or os.environ.get("PYCHARM_HOSTED") == "1" + return False diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index b6a615424d50f..11bb614160091 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -144,12 +144,9 @@ ignore = [ # pycodestyle # TODO: Remove errors below to further improve docstring linting # Ordered from most common to least common errors. 
- "D105", - "D100", - "D103", - "D102", - "D104", - "D101", + "D105", # Missing docstring in magic method + "D100", # Missing docstring in public module + "D104", # Missing docstring in public package ] [tool.ruff.pycodestyle] @@ -165,6 +162,7 @@ strict = true "polars/datatypes.py" = ["B019"] "tests/**/*.py" = ["D100", "D103", "B018"] "polars/utils/show_versions.py" = ["D301"] +"polars/utils/udfs.py" = ["RUF012"] [tool.pytest.ini_options] addopts = [ diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index 96d05a38d3703..77bba6758d666 100644 --- a/py-polars/requirements-dev.txt +++ b/py-polars/requirements-dev.txt @@ -16,13 +16,14 @@ SQLAlchemy xlsx2csv XlsxWriter adbc_driver_sqlite; python_version >= '3.9' and platform_system != 'Windows' -connectorx==0.3.2a5; python_version >= '3.8' # Latest full release is broken - unpin when 0.3.2 released +connectorx==0.3.2a7 # Latest full release is broken - unpin when 0.3.2 released cloudpickle +fsspec # Tooling -hypothesis==6.79.4; python_version < '3.8' -hypothesis==6.80.0; python_version >= '3.8' +hypothesis==6.82.0 maturin==1.1.0 +patchelf; platform_system == 'Linux' # Extra dependency for maturin, only for Linux pytest==7.4.0 pytest-cov==4.1.0 pytest-xdist==3.3.1 diff --git a/py-polars/requirements-lint.txt b/py-polars/requirements-lint.txt index 3348d30ed8f57..ddf13052477e7 100644 --- a/py-polars/requirements-lint.txt +++ b/py-polars/requirements-lint.txt @@ -1,5 +1,5 @@ -black==23.3.0 +black==23.7.0 blackdoc==0.3.8 mypy==1.4.1 -ruff==0.0.275 -typos==1.15.9 +ruff==0.0.278 +typos==1.16.1 diff --git a/py-polars/scripts/check_stacklevels.py b/py-polars/scripts/check_stacklevels.py index 2ff14283ea010..5bcdca3bd57a6 100644 --- a/py-polars/scripts/check_stacklevels.py +++ b/py-polars/scripts/check_stacklevels.py @@ -15,12 +15,12 @@ EXCLUDE = frozenset(["polars/utils/polars_version.py"]) -class StackLevelChecker(NodeVisitor): +class StackLevelChecker(NodeVisitor): # noqa: D101 def __init__(self, 
file) -> None: self.file = file self.violations = set() - def visit_Call(self, node: ast.Call) -> None: + def visit_Call(self, node: ast.Call) -> None: # noqa: D102 for keyword in node.keywords: if keyword.arg == "stacklevel" and isinstance(keyword.value, ast.Constant): self.violations.add( diff --git a/py-polars/src/apply/lazy.rs b/py-polars/src/apply/lazy.rs index c73b0bbc9fd39..63778acb54fec 100644 --- a/py-polars/src/apply/lazy.rs +++ b/py-polars/src/apply/lazy.rs @@ -137,7 +137,7 @@ pub fn map_single( pub(crate) fn call_lambda_with_series_slice( py: Python, - s: &mut [Series], + s: &[Series], lambda: &PyObject, polars_module: &PyObject, ) -> PyObject { diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs index ff63d360a75dd..39bf68907b2eb 100644 --- a/py-polars/src/conversion.rs +++ b/py-polars/src/conversion.rs @@ -804,9 +804,16 @@ impl<'s> FromPyObject<'s> for Wrap> { } fn get_object(ob: &PyAny) -> PyResult> { - // this is slow, but hey don't use objects - let v = &ObjectValue { inner: ob.into() }; - Ok(Wrap(AnyValue::ObjectOwned(OwnedObject(v.to_boxed())))) + #[cfg(feature = "object")] + { + // this is slow, but hey don't use objects + let v = &ObjectValue { inner: ob.into() }; + Ok(Wrap(AnyValue::ObjectOwned(OwnedObject(v.to_boxed())))) + } + #[cfg(not(feature = "object"))] + { + panic!("activate object") + } } // TYPE key @@ -1355,6 +1362,7 @@ impl FromPyObject<'_> for Wrap { } } +#[cfg(feature = "ipc")] impl FromPyObject<'_> for Wrap { fn extract(ob: &PyAny) -> PyResult { let parsed = match ob.extract::<&str>()? 
{ diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 11aeb760cb600..ccc6c3fb639ff 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -1,6 +1,7 @@ use std::io::BufWriter; use std::ops::Deref; +use either::Either; use numpy::IntoPyArray; use polars::frame::row::{rows_to_schema_supertypes, Row}; #[cfg(feature = "avro")] @@ -1356,17 +1357,20 @@ impl PyDataFrame { Ok(hash.into_series().into()) } - pub fn transpose(&self, include_header: bool, names: &str) -> PyResult { - let mut df = self.df.transpose().map_err(PyPolarsErr::from)?; - if include_header { - let s = Utf8Chunked::from_iter_values( - names, - self.df.get_columns().iter().map(|s| s.name()), - ) - .into_series(); - df.insert_at_idx(0, s).unwrap(); - } - Ok(df.into()) + #[pyo3(signature = (keep_names_as, column_names))] + pub fn transpose(&self, keep_names_as: Option<&str>, column_names: &PyAny) -> PyResult { + let new_col_names = if let Ok(name) = column_names.extract::>() { + Some(Either::Right(name)) + } else if let Ok(name) = column_names.extract::() { + Some(Either::Left(name)) + } else { + None + }; + Ok(self + .df + .transpose(keep_names_as, new_col_names) + .map_err(PyPolarsErr::from)? 
+ .into()) } pub fn upsample( &self, diff --git a/py-polars/src/expr/datetime.rs b/py-polars/src/expr/datetime.rs index b04cc0e1c2e30..2369b0b93eee9 100644 --- a/py-polars/src/expr/datetime.rs +++ b/py-polars/src/expr/datetime.rs @@ -50,14 +50,16 @@ impl PyExpr { .into() } - #[cfg(feature = "timezones")] - #[allow(deprecated)] - fn dt_tz_localize(&self, time_zone: String) -> Self { - self.inner.clone().dt().tz_localize(time_zone).into() - } - - fn dt_truncate(&self, every: &str, offset: &str) -> Self { - self.inner.clone().dt().truncate(every, offset).into() + fn dt_truncate(&self, every: String, offset: String, use_earliest: Option) -> Self { + self.inner + .clone() + .dt() + .truncate(TruncateOptions { + every, + offset, + use_earliest, + }) + .into() } fn dt_month_start(&self) -> Self { diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index f7a0f8f793522..960733f8353ce 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -207,6 +207,27 @@ impl PyExpr { .qcut(probs, labels, left_closed, allow_duplicates, include_breaks) .into() } + #[pyo3(signature = (n_bins, labels, left_closed, allow_duplicates, include_breaks))] + #[cfg(feature = "cutqcut")] + fn qcut_uniform( + &self, + n_bins: usize, + labels: Option>, + left_closed: bool, + allow_duplicates: bool, + include_breaks: bool, + ) -> Self { + self.inner + .clone() + .qcut_uniform( + n_bins, + labels, + left_closed, + allow_duplicates, + include_breaks, + ) + .into() + } #[cfg(feature = "rle")] fn rle(&self) -> Self { @@ -471,6 +492,11 @@ impl PyExpr { self.clone().inner.arctan().into() } + #[cfg(feature = "trigonometry")] + fn arctan2(&self, y: Self) -> Self { + self.clone().inner.arctan2(y.inner).into() + } + #[cfg(feature = "trigonometry")] fn sinh(&self) -> Self { self.clone().inner.sinh().into() @@ -556,6 +582,14 @@ impl PyExpr { self.clone().inner.pow(exponent.inner).into() } + fn sqrt(&self) -> Self { + self.clone().inner.sqrt().into() + } + + fn 
cbrt(&self) -> Self { + self.clone().inner.cbrt().into() + } + fn cumsum(&self, reverse: bool) -> Self { self.clone().inner.cumsum(reverse).into() } @@ -1146,8 +1180,4 @@ impl PyExpr { }; self.inner.clone().set_sorted_flag(is_sorted).into() } - - fn cache(&self) -> Self { - self.inner.clone().cache().into() - } } diff --git a/py-polars/src/expr/string.rs b/py-polars/src/expr/string.rs index 3dc76dc2c1a96..aa8d7330c24bc 100644 --- a/py-polars/src/expr/string.rs +++ b/py-polars/src/expr/string.rs @@ -84,6 +84,7 @@ impl PyExpr { self.inner.clone().str().to_lowercase().into() } + #[cfg(feature = "nightly")] fn str_to_titlecase(&self) -> Self { self.inner.clone().str().to_titlecase().into() } diff --git a/py-polars/src/functions/eager.rs b/py-polars/src/functions/eager.rs index 49b38ccb7011f..b46a26be42598 100644 --- a/py-polars/src/functions/eager.rs +++ b/py-polars/src/functions/eager.rs @@ -1,10 +1,9 @@ -use polars::{functions, time}; +use polars::functions; use polars_core::prelude::*; use pyo3::prelude::*; -use crate::conversion::{get_df, get_series, Wrap}; +use crate::conversion::{get_df, get_series}; use crate::error::PyPolarsErr; -use crate::prelude::{ClosedWindow, Duration}; use crate::{PyDataFrame, PySeries}; #[pyfunction] @@ -92,15 +91,3 @@ pub fn hor_concat_df(dfs: &PyAny) -> PyResult { let df = functions::hor_concat_df(&dfs).map_err(PyPolarsErr::from)?; Ok(df.into()) } - -#[pyfunction] -pub fn time_range_eager( - start: i64, - stop: i64, - every: &str, - closed: Wrap, -) -> PyResult { - let time_range = time::time_range_impl("time", start, stop, Duration::parse(every), closed.0) - .map_err(PyPolarsErr::from)?; - Ok(time_range.into_series().into()) -} diff --git a/py-polars/src/functions/lazy.rs b/py-polars/src/functions/lazy.rs index aafaff286385b..904d4813837e1 100644 --- a/py-polars/src/functions/lazy.rs +++ b/py-polars/src/functions/lazy.rs @@ -1,7 +1,6 @@ use polars::lazy::dsl; use polars::lazy::dsl::Expr; use polars::prelude::*; -use 
polars_core::datatypes::TimeZone; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{PyBool, PyBytes, PyFloat, PyInt, PyString}; @@ -9,9 +8,7 @@ use pyo3::types::{PyBool, PyBytes, PyFloat, PyInt, PyString}; use crate::apply::lazy::binary_lambda; use crate::conversion::{get_lf, Wrap}; use crate::expr::ToExprs; -use crate::prelude::{ - vec_extract_wrapped, ClosedWindow, DataType, DatetimeArgs, Duration, DurationArgs, ObjectValue, -}; +use crate::prelude::{vec_extract_wrapped, DataType, DatetimeArgs, DurationArgs, ObjectValue}; use crate::{apply, PyDataFrame, PyExpr, PyLazyFrame, PyPolarsErr, PySeries}; macro_rules! set_unwrapped_or_0 { @@ -163,6 +160,16 @@ pub fn cov(a: PyExpr, b: PyExpr) -> PyExpr { dsl::cov(a.inner, b.inner).into() } +#[pyfunction] +pub fn arctan2(y: PyExpr, x: PyExpr) -> PyExpr { + y.inner.arctan2(x.inner).into() +} + +#[pyfunction] +pub fn arctan2d(y: PyExpr, x: PyExpr) -> PyExpr { + y.inner.arctan2(x.inner).degrees().into() +} + #[pyfunction] pub fn cumfold(acc: PyExpr, lambda: PyObject, exprs: Vec, include_init: bool) -> PyExpr { let exprs = exprs.to_exprs(); @@ -179,29 +186,6 @@ pub fn cumreduce(lambda: PyObject, exprs: Vec) -> PyExpr { dsl::cumreduce_exprs(func, exprs).into() } -#[pyfunction] -pub fn date_range_lazy( - start: PyExpr, - end: PyExpr, - every: &str, - closed: Wrap, - time_unit: Option>, - time_zone: Option, -) -> PyExpr { - let start = start.inner; - let end = end.inner; - let every = Duration::parse(every); - dsl::functions::date_range( - start, - end, - every, - closed.0, - time_unit.map(|x| x.0), - time_zone, - ) - .into() -} - #[pyfunction] pub fn datetime( year: PyExpr, @@ -426,19 +410,6 @@ pub fn spearman_rank_corr(a: PyExpr, b: PyExpr, ddof: u8, propagate_nans: bool) } } -#[pyfunction] -pub fn time_range_lazy( - start: PyExpr, - end: PyExpr, - every: &str, - closed: Wrap, -) -> PyExpr { - let start = start.inner; - let end = end.inner; - let every = Duration::parse(every); - 
dsl::functions::time_range(start, end, every, closed.0).into() -} - #[pyfunction] #[cfg(feature = "sql")] pub fn sql_expr(sql: &str) -> PyResult { diff --git a/py-polars/src/functions/range.rs b/py-polars/src/functions/range.rs index 74c037dfecff1..7bdb621168467 100644 --- a/py-polars/src/functions/range.rs +++ b/py-polars/src/functions/range.rs @@ -34,3 +34,55 @@ pub fn int_ranges(start: PyExpr, end: PyExpr, step: i64, dtype: Wrap) result.into() } + +#[pyfunction] +pub fn date_range( + start: PyExpr, + end: PyExpr, + every: &str, + closed: Wrap, + time_unit: Option>, + time_zone: Option, +) -> PyExpr { + let start = start.inner; + let end = end.inner; + let every = Duration::parse(every); + let closed = closed.0; + let time_unit = time_unit.map(|x| x.0); + dsl::date_range(start, end, every, closed, time_unit, time_zone).into() +} + +#[pyfunction] +pub fn date_ranges( + start: PyExpr, + end: PyExpr, + every: &str, + closed: Wrap, + time_unit: Option>, + time_zone: Option, +) -> PyExpr { + let start = start.inner; + let end = end.inner; + let every = Duration::parse(every); + let closed = closed.0; + let time_unit = time_unit.map(|x| x.0); + dsl::date_ranges(start, end, every, closed, time_unit, time_zone).into() +} + +#[pyfunction] +pub fn time_range(start: PyExpr, end: PyExpr, every: &str, closed: Wrap) -> PyExpr { + let start = start.inner; + let end = end.inner; + let every = Duration::parse(every); + let closed = closed.0; + dsl::time_range(start, end, every, closed).into() +} + +#[pyfunction] +pub fn time_ranges(start: PyExpr, end: PyExpr, every: &str, closed: Wrap) -> PyExpr { + let start = start.inner; + let end = end.inner; + let every = Duration::parse(every); + let closed = closed.0; + dsl::time_ranges(start, end, every, closed).into() +} diff --git a/py-polars/src/functions/whenthen.rs b/py-polars/src/functions/whenthen.rs index 7813f27126d53..f5d1d231dce18 100644 --- a/py-polars/src/functions/whenthen.rs +++ b/py-polars/src/functions/whenthen.rs @@ 
-4,71 +4,76 @@ use pyo3::prelude::*; use crate::PyExpr; #[pyfunction] -pub fn when(predicate: PyExpr) -> When { - When { predicate } +pub fn when(condition: PyExpr) -> PyWhen { + PyWhen { + inner: dsl::when(condition.inner), + } +} + +#[pyclass] +#[derive(Clone)] +pub struct PyWhen { + inner: dsl::When, } #[pyclass] #[derive(Clone)] -pub struct When { - predicate: PyExpr, +pub struct PyThen { + inner: dsl::Then, } #[pyclass] #[derive(Clone)] -pub struct WhenThen { - predicate: PyExpr, - then: PyExpr, +pub struct PyChainedWhen { + inner: dsl::ChainedWhen, } #[pyclass] #[derive(Clone)] -pub struct WhenThenThen { - inner: dsl::WhenThenThen, +pub struct PyChainedThen { + inner: dsl::ChainedThen, } #[pymethods] -impl When { - fn then(&self, expr: PyExpr) -> WhenThen { - WhenThen { - predicate: self.predicate.clone(), - then: expr, +impl PyWhen { + fn then(&self, statement: PyExpr) -> PyThen { + PyThen { + inner: self.inner.clone().then(statement.inner), } } } #[pymethods] -impl WhenThen { - fn when(&self, predicate: PyExpr) -> WhenThenThen { - let e = dsl::when(self.predicate.inner.clone()) - .then(self.then.inner.clone()) - .when(predicate.inner); - WhenThenThen { inner: e } +impl PyThen { + fn when(&self, condition: PyExpr) -> PyChainedWhen { + PyChainedWhen { + inner: self.inner.clone().when(condition.inner), + } } - fn otherwise(&self, expr: PyExpr) -> PyExpr { - dsl::ternary_expr( - self.predicate.inner.clone(), - self.then.inner.clone(), - expr.inner, - ) - .into() + fn otherwise(&self, statement: PyExpr) -> PyExpr { + self.inner.clone().otherwise(statement.inner).into() } } #[pymethods] -impl WhenThenThen { - fn when(&self, predicate: PyExpr) -> Self { - Self { - inner: self.inner.clone().when(predicate.inner), +impl PyChainedWhen { + fn then(&self, statement: PyExpr) -> PyChainedThen { + PyChainedThen { + inner: self.inner.clone().then(statement.inner), } } - fn then(&self, expr: PyExpr) -> Self { - Self { - inner: self.inner.clone().then(expr.inner), +} + 
+#[pymethods] +impl PyChainedThen { + fn when(&self, condition: PyExpr) -> PyChainedWhen { + PyChainedWhen { + inner: self.inner.clone().when(condition.inner), } } - fn otherwise(&self, expr: PyExpr) -> PyExpr { - self.inner.clone().otherwise(expr.inner).into() + + fn otherwise(&self, statement: PyExpr) -> PyExpr { + self.inner.clone().otherwise(statement.inner).into() } } diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index a44905d50099a..8aa8518f8af36 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -334,7 +334,8 @@ impl PyLazyFrame { projection_pushdown: bool, simplify_expr: bool, slice_pushdown: bool, - cse: bool, + comm_subplan_elim: bool, + comm_subexpr_elim: bool, streaming: bool, ) -> Self { let ldf = self.ldf.clone(); @@ -348,7 +349,8 @@ impl PyLazyFrame { #[cfg(feature = "cse")] { - ldf = ldf.with_common_subplan_elimination(cse); + ldf = ldf.with_comm_subplan_elim(comm_subplan_elim); + ldf = ldf.with_comm_subexpr_elim(comm_subexpr_elim); } ldf.into() diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs index 7974b72c2586b..be90ac391c135 100644 --- a/py-polars/src/lib.rs +++ b/py-polars/src/lib.rs @@ -88,8 +88,6 @@ fn polars(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::eager::hor_concat_df)) .unwrap(); - m.add_wrapped(wrap_pyfunction!(functions::eager::time_range_eager)) - .unwrap(); // Functions - range m.add_wrapped(wrap_pyfunction!(functions::range::arange)) @@ -98,6 +96,14 @@ fn polars(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::range::int_ranges)) .unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::range::date_range)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::range::date_ranges)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::range::time_range)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::range::time_ranges)) + .unwrap(); // Functions - aggregation 
m.add_wrapped(wrap_pyfunction!(functions::aggregation::all_horizontal)) @@ -140,7 +146,9 @@ fn polars(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::lazy::cumreduce)) .unwrap(); - m.add_wrapped(wrap_pyfunction!(functions::lazy::date_range_lazy)) + m.add_wrapped(wrap_pyfunction!(functions::lazy::arctan2)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::lazy::arctan2d)) .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::lazy::datetime)) .unwrap(); @@ -174,8 +182,6 @@ fn polars(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::lazy::spearman_rank_corr)) .unwrap(); - m.add_wrapped(wrap_pyfunction!(functions::lazy::time_range_lazy)) - .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::whenthen::when)) .unwrap(); diff --git a/py-polars/src/series/construction.rs b/py-polars/src/series/construction.rs index 898a0f8ae0439..1d7ecfd9135c7 100644 --- a/py-polars/src/series/construction.rs +++ b/py-polars/src/series/construction.rs @@ -231,19 +231,25 @@ impl PySeries { val: Vec>, _strict: bool, ) -> PyResult { - let val = vec_extract_wrapped(val); - let out = Series::new(name, &val); - match out.dtype() { - DataType::List(list_inner) => { - let out = out - .cast(&DataType::Array( - Box::new(inner.map(|dt| dt.0).unwrap_or(*list_inner.clone())), - width, - )) - .map_err(PyPolarsErr::from)?; - Ok(out.into()) + if val.is_empty() { + let series = + Series::new_empty(name, &DataType::Array(Box::new(inner.unwrap().0), width)); + Ok(series.into()) + } else { + let val = vec_extract_wrapped(val); + let series = Series::new(name, &val); + match series.dtype() { + DataType::List(list_inner) => { + let series = series + .cast(&DataType::Array( + Box::new(inner.map(|dt| dt.0).unwrap_or(*list_inner.clone())), + width, + )) + .map_err(PyPolarsErr::from)?; + Ok(series.into()) + } + _ => Err(PyValueError::new_err("could not create Array from input")), } - _ => 
Err(PyValueError::new_err("could not create Array from input")), } } diff --git a/py-polars/src/series/set_at_idx.rs b/py-polars/src/series/set_at_idx.rs index f47b1fa8283ba..c6f65e1b3bdf3 100644 --- a/py-polars/src/series/set_at_idx.rs +++ b/py-polars/src/series/set_at_idx.rs @@ -47,52 +47,52 @@ fn set_at_idx(mut s: Series, idx: &Series, values: &Series) -> PolarsResult { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.i8()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::Int16 => { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.i16()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::Int32 => { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.i32()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::Int64 => { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.i64()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::UInt8 => { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.u8()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::UInt16 => { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.u16()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::UInt32 => { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.u32()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::UInt64 => { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.u64()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::Float32 
=> { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.f32()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::Float64 => { let ca: &mut ChunkedArray = mutable_s.as_mut(); let values = values.f64()?; - std::mem::take(ca).set_at_idx2(idx, values.into_iter()) + std::mem::take(ca).set_at_idx2(idx, values) } DataType::Boolean => { let ca = s.bool()?; diff --git a/py-polars/tests/__init__.py b/py-polars/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/py-polars/tests/docs/run_doctest.py b/py-polars/tests/docs/run_doctest.py index 7ac692a3bea31..f071cfe91e7c4 100644 --- a/py-polars/tests/docs/run_doctest.py +++ b/py-polars/tests/docs/run_doctest.py @@ -75,8 +75,11 @@ def modules_in_path(p: Path) -> Iterator[ModuleType]: OutputChecker = doctest.OutputChecker - class CustomOutputChecker(OutputChecker): + class IgnoreResultOutputChecker(OutputChecker): + """Python doctest output checker with support for IGNORE_RESULT.""" + def check_output(self, want: str, got: str, optionflags: Any) -> bool: + """Return True iff the actual output from an example matches the output.""" if IGNORE_RESULT_ALL: return True if IGNORE_RESULT & optionflags: @@ -84,7 +87,7 @@ def check_output(self, want: str, got: str, optionflags: Any) -> bool: else: return OutputChecker.check_output(self, want, got, optionflags) - doctest.OutputChecker = CustomOutputChecker # type: ignore[misc] + doctest.OutputChecker = IgnoreResultOutputChecker # type: ignore[misc] # We want to be relaxed about whitespace, but strict on True vs 1 doctest.NORMALIZE_WHITESPACE = True diff --git a/py-polars/tests/parametric/test_series.py b/py-polars/tests/parametric/test_series.py index 18defd02e6fca..ba91662ded756 100644 --- a/py-polars/tests/parametric/test_series.py +++ b/py-polars/tests/parametric/test_series.py @@ -18,7 +18,7 @@ def alpha_guard(**decay_param: float) -> bool: """Protects against 
unnecessary noise in small number regime.""" - if not list(decay_param.values())[0]: + if not next(iter(decay_param.values())): return True alpha = _prepare_alpha(**decay_param) return ((1 - alpha) if round(alpha) else alpha) > 1e-6 diff --git a/py-polars/tests/test_udfs.py b/py-polars/tests/test_udfs.py new file mode 100644 index 0000000000000..68044b89d4fc1 --- /dev/null +++ b/py-polars/tests/test_udfs.py @@ -0,0 +1,171 @@ +""" +Minimal test of the BytecodeParser class. + +This can be run without polars installed, and so can be easily run in CI +over all supported Python versions. + +All that needs to be installed is numpy and pytest. + +Usage: + + $ PYTHONPATH=polars/utils pytest tests/test_udfs.py + +Running it without `PYTHONPATH` set will result in the test being skipped. +""" +import json +from typing import Any, Callable + +import numpy +import numpy as np +import pytest + +MY_CONSTANT = 3 +MY_DICT = {1: "1", 2: "2", 3: "3"} +MY_LIST = [1, 2, 3] + +# column_name, function, expected_suggestion +TEST_CASES = [ + # --------------------------------------------- + # numeric expr: math, comparison, logic ops + # --------------------------------------------- + ("a", lambda x: x + 1 - (2 / 3), '(pl.col("a") + 1) - 0.6666666666666666'), + ("a", lambda x: x // 1 % 2, '(pl.col("a") // 1) % 2'), + ("a", lambda x: x & True, 'pl.col("a") & True'), + ("a", lambda x: x | False, 'pl.col("a") | False'), + ("a", lambda x: abs(x) != 3, 'pl.col("a").abs() != 3'), + ("a", lambda x: int(x) > 1, 'pl.col("a").cast(pl.Int64) > 1'), + ("a", lambda x: not (x > 1) or x == 2, '~(pl.col("a") > 1) | (pl.col("a") == 2)'), + ("a", lambda x: x is None, 'pl.col("a") is None'), + ("a", lambda x: x is not None, 'pl.col("a") is not None'), + ( + "a", + lambda x: ((x * -x) ** x) * 1.0, + '((pl.col("a") * -pl.col("a")) ** pl.col("a")) * 1.0', + ), + ( + "a", + lambda x: 1.0 * (x * (x**x)), + '1.0 * (pl.col("a") * (pl.col("a") ** pl.col("a")))', + ), + ( + "a", + lambda x: (x / x) + ((x * x) - x), 
+ '(pl.col("a") / pl.col("a")) + ((pl.col("a") * pl.col("a")) - pl.col("a"))', + ), + ( + "a", + lambda x: (10 - x) / (((x * 4) - x) // (2 + (x * (x - 1)))), + '(10 - pl.col("a")) / (((pl.col("a") * 4) - pl.col("a")) // (2 + (pl.col("a") * (pl.col("a") - 1))))', + ), + ("a", lambda x: x in (2, 3, 4), 'pl.col("a").is_in((2, 3, 4))'), + ("a", lambda x: x not in (2, 3, 4), '~pl.col("a").is_in((2, 3, 4))'), + ( + "a", + lambda x: x in (1, 2, 3, 4, 3) and x % 2 == 0 and x > 0, + 'pl.col("a").is_in((1, 2, 3, 4, 3)) & ((pl.col("a") % 2) == 0) & (pl.col("a") > 0)', + ), + ("a", lambda x: MY_CONSTANT + x, 'MY_CONSTANT + pl.col("a")'), + ("a", lambda x: 0 + numpy.cbrt(x), '0 + pl.col("a").cbrt()'), + ("a", lambda x: np.sin(x) + 1, 'pl.col("a").sin() + 1'), + ( + "a", # note: functions operate on consts + lambda x: np.sin(3.14159265358979) + (x - 1) + abs(-3), + '(np.sin(3.14159265358979) + (pl.col("a") - 1)) + abs(-3)', + ), + ( + "a", + lambda x: (float(x) * int(x)) // 2, + '(pl.col("a").cast(pl.Float64) * pl.col("a").cast(pl.Int64)) // 2', + ), + # --------------------------------------------- + # logical 'and/or' (validate nesting levels) + # --------------------------------------------- + ( + "a", + lambda x: x > 1 or (x == 1 and x == 2), + '(pl.col("a") > 1) | (pl.col("a") == 1) & (pl.col("a") == 2)', + ), + ( + "a", + lambda x: (x > 1 or x == 1) and x == 2, + '((pl.col("a") > 1) | (pl.col("a") == 1)) & (pl.col("a") == 2)', + ), + ( + "a", + lambda x: x > 2 or x != 3 and x not in (0, 1, 4), + '(pl.col("a") > 2) | (pl.col("a") != 3) & ~pl.col("a").is_in((0, 1, 4))', + ), + ( + "a", + lambda x: x > 1 and x != 2 or x % 2 == 0 and x < 3, + '(pl.col("a") > 1) & (pl.col("a") != 2) | ((pl.col("a") % 2) == 0) & (pl.col("a") < 3)', + ), + ( + "a", + lambda x: x > 1 and (x != 2 or x % 2 == 0) and x < 3, + '(pl.col("a") > 1) & ((pl.col("a") != 2) | ((pl.col("a") % 2) == 0)) & (pl.col("a") < 3)', + ), + # --------------------------------------------- + # string expr: case/cast ops 
+ # --------------------------------------------- + ("b", lambda x: str(x).title(), 'pl.col("b").cast(pl.Utf8).str.to_titlecase()'), + ( + "b", + lambda x: x.lower() + ":" + x.upper() + ":" + x.title(), + '(((pl.col("b").str.to_lowercase() + \':\') + pl.col("b").str.to_uppercase()) + \':\') + pl.col("b").str.to_titlecase()', + ), + # --------------------------------------------- + # json expr: load/extract + # --------------------------------------------- + ("c", lambda x: json.loads(x), 'pl.col("c").str.json_extract()'), + # --------------------------------------------- + # map_dict + # --------------------------------------------- + ("a", lambda x: MY_DICT[x], 'pl.col("a").map_dict(MY_DICT)'), +] + +NOOP_TEST_CASES = [ + lambda x: x, + lambda x, y: x + y, + lambda x: x[0] + 1, + lambda x: MY_LIST[x], + lambda x: MY_DICT[1], +] + + +@pytest.mark.parametrize( + ("col", "func", "expected"), + TEST_CASES, +) +def test_bytecode_parser_expression( + col: str, func: Callable[[Any], Any], expected: str +) -> None: + try: + import udfs # type: ignore[import] + except ModuleNotFoundError as exc: + assert "No module named 'udfs'" in str(exc) # noqa: PT017 + # Skip test if udfs can't be imported because it's not in the path. + # Prefer this over importorskip, so that if `udfs` can't be + # imported for some other reason, then the test + # won't be skipped. + return + bytecode_parser = udfs.BytecodeParser(func, apply_target="expr") + result = bytecode_parser.to_expression(col) + assert result == expected + + +@pytest.mark.parametrize( + "func", + NOOP_TEST_CASES, +) +def test_bytecode_parser_expression_noop(func: Callable[[Any], Any]) -> None: + try: + import udfs + except ModuleNotFoundError as exc: + assert "No module named 'udfs'" in str(exc) # noqa: PT017 + # Skip test if udfs can't be imported because it's not in the path. + # Prefer this over importorskip, so that if `udfs` can't be + # imported for some other reason, then the test + # won't be skipped. 
+ return + assert not udfs.BytecodeParser(func, apply_target="expr").can_rewrite() diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 6739a9a6d72f0..8da2f7dccf8a0 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -3656,3 +3656,15 @@ def test_sum_empty_column_names() -> None: {"x": [0], "y": [0]}, schema={"x": pl.UInt32, "y": pl.UInt32} ) assert_frame_equal(df.sum(), expected) + + +def test_flags() -> None: + df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + assert df.flags == { + "a": {"SORTED_ASC": False, "SORTED_DESC": False}, + "b": {"SORTED_ASC": False, "SORTED_DESC": False}, + } + assert df.set_sorted("a").flags == { + "a": {"SORTED_ASC": True, "SORTED_DESC": False}, + "b": {"SORTED_ASC": False, "SORTED_DESC": False}, + } diff --git a/py-polars/tests/unit/datatypes/test_array.py b/py-polars/tests/unit/datatypes/test_array.py index fc4a5e24b91e2..1fc308fba53a9 100644 --- a/py-polars/tests/unit/datatypes/test_array.py +++ b/py-polars/tests/unit/datatypes/test_array.py @@ -37,6 +37,19 @@ def test_array_construction() -> None: assert s.dtype == dtype assert s.to_list() == payload + # create using schema + df = pl.DataFrame( + schema={ + "a": pl.Array(width=3, inner=pl.Float32), + "b": pl.Array(width=5, inner=pl.Datetime("ms")), + } + ) + assert df.dtypes == [ + pl.Array(width=3, inner=pl.Float32), + pl.Array(width=5, inner=pl.Datetime("ms")), + ] + assert df.rows() == [] + def test_array_in_groupby() -> None: df = pl.DataFrame( diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 24a892e3d325a..9db461143d58c 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -77,20 +77,6 @@ def test_cat_to_dummies() -> None: } -def test_comp_categorical_lit_dtype() -> None: - df = pl.DataFrame( - data={"column": ["a", "b", "e"], 
"values": [1, 5, 9]}, - schema=[("column", pl.Categorical), ("more", pl.Int32)], - ) - - assert df.with_columns( - pl.when(pl.col("column") == "e") - .then("d") - .otherwise(pl.col("column")) - .alias("column") - ).dtypes == [pl.Categorical, pl.Int32] - - def test_categorical_describe_3487() -> None: # test if we don't err df = pl.DataFrame({"cats": ["a", "b"]}) diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 0171aed01b21e..d9af8d8296b0b 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -463,3 +463,16 @@ def test_struct_with_nulls_as_list() -> None: ] ] } + + +def test_list_amortized_iter_clear_settings_10126() -> None: + out = ( + pl.DataFrame({"a": [[1], [1], [2]], "b": [[1, 2], [1, 3], [4]]}) + .explode("a") + .groupby("a") + .agg(pl.col("b").flatten()) + .with_columns(pl.col("b").list.unique()) + .sort("a") + ) + + assert out.to_dict(False) == {"a": [1, 2], "b": [[1, 2, 3], [4]]} diff --git a/py-polars/tests/unit/datatypes/test_object.py b/py-polars/tests/unit/datatypes/test_object.py index d2d0d484333b6..570366d2e2c80 100644 --- a/py-polars/tests/unit/datatypes/test_object.py +++ b/py-polars/tests/unit/datatypes/test_object.py @@ -5,22 +5,6 @@ import polars as pl -def test_object_when_then_4702() -> None: - # please don't ever do this - x = pl.DataFrame({"Row": [1, 2], "Type": [pl.Date, pl.UInt8]}) - - assert x.with_columns( - pl.when(pl.col("Row") == 1) - .then(pl.lit(pl.UInt16, allow_object=True)) - .otherwise(pl.lit(pl.UInt8, allow_object=True)) - .alias("New_Type") - ).to_dict(False) == { - "Row": [1, 2], - "Type": [pl.Date, pl.UInt8], - "New_Type": [pl.UInt16, pl.UInt8], - } - - def test_object_empty_filter_5911() -> None: df = pl.DataFrame( data=[ diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 72e9316448d51..1fc7bf5b27b2b 100644 --- 
a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -857,3 +857,9 @@ def test_struct_null_count_strict_cast() -> None: s = pl.Series([{"a": None}]).cast(pl.Struct({"a": pl.Categorical})) assert s.dtype == pl.Struct([pl.Field("a", pl.Categorical)]) assert s.to_list() == [{"a": None}] + + +def test_struct_get_field_by_index() -> None: + df = pl.DataFrame({"val": [{"a": 1, "b": 2}]}) + expected = {"b": [2]} + assert df.select(pl.all().struct[1]).to_dict(as_series=False) == expected diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index a1c451219fc01..d514a6c41a31c 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -1891,7 +1891,7 @@ def test_iso_year() -> None: assert pl.Series([date(2022, 1, 1)]).dt.iso_year()[0] == 2021 -def test_replace_timezone() -> None: +def test_replace_time_zone() -> None: ny = ZoneInfo("America/New_York") assert pl.DataFrame({"a": [datetime(2022, 9, 25, 14)]}).with_columns( pl.col("a").dt.replace_time_zone("America/New_York").alias("b") @@ -1910,7 +1910,7 @@ def test_replace_timezone() -> None: ) @pytest.mark.parametrize("from_tz", ["Asia/Seoul", None]) @pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) -def test_replace_timezone_from_to( +def test_replace_time_zone_from_to( from_tz: str, to_tz: str, tzinfo: timezone | ZoneInfo, @@ -1957,16 +1957,14 @@ def test_strptime_with_invalid_tz() -> None: def test_utc_deprecation() -> None: - with pytest.warns( - DeprecationWarning, - match="The `utc` argument is now a no-op and has no effect. You can safely remove it", + with pytest.deprecated_call( + match="The `utc` argument is now a no-op and has no effect. 
You can safely remove it" ): pl.Series(["2020-01-01 03:00:00"]).str.strptime( pl.Datetime("us"), "%Y-%m-%d %H:%M:%S", utc=True ) - with pytest.warns( - DeprecationWarning, - match="The `utc` argument is now a no-op and has no effect. You can safely remove it", + with pytest.deprecated_call( + match="The `utc` argument is now a no-op and has no effect. You can safely remove it" ): pl.Series(["2020-01-01 03:00:00"]).str.to_datetime( "%Y-%m-%d %H:%M:%S", utc=True @@ -2414,6 +2412,48 @@ def test_truncate_by_multiple_weeks() -> None: } +def test_truncate_use_earliest() -> None: + ser = pl.date_range( + date(2020, 10, 25), + datetime(2020, 10, 25, 2), + "30m", + eager=True, + time_zone="Europe/London", + ).dt.offset_by("15m") + df = ser.to_frame() + df = df.with_columns( + use_earliest=pl.col("date").dt.dst_offset() == pl.duration(hours=1) + ) + result = df.select( + pl.when(pl.col("use_earliest")) + .then(pl.col("date").dt.truncate("30m", use_earliest=True)) + .otherwise(pl.col("date").dt.truncate("30m", use_earliest=False)) + ) + expected = pl.date_range( + date(2020, 10, 25), + datetime(2020, 10, 25, 2), + "30m", + eager=True, + time_zone="Europe/London", + ).to_frame() + assert_frame_equal(result, expected) + + +def test_truncate_ambiguous() -> None: + ser = pl.date_range( + date(2020, 10, 25), + datetime(2020, 10, 25, 2), + "30m", + eager=True, + time_zone="Europe/London", + ).dt.offset_by("15m") + with pytest.raises( + ComputeError, + match="datetime '2020-10-25 01:00:00' is ambiguous in time zone 'Europe/London'", + ): + ser.dt.truncate("30m") + + def test_round_by_week() -> None: df = pl.DataFrame( { diff --git a/py-polars/tests/unit/functions/test_range.py b/py-polars/tests/unit/functions/test_range.py index 184e333afc58b..bf9fb9de0819b 100644 --- a/py-polars/tests/unit/functions/test_range.py +++ b/py-polars/tests/unit/functions/test_range.py @@ -171,9 +171,7 @@ def test_date_range() -> None: time_unit=time_unit, eager=True, ) - with pytest.warns( - 
DeprecationWarning, match="`Series.time_unit` is deprecated.*" - ): + with pytest.deprecated_call(match="`Series.time_unit` is deprecated.*"): assert rng.time_unit == time_unit assert rng.shape == (13,) assert rng.dt[0] == datetime(2020, 1, 1) @@ -234,18 +232,18 @@ def test_date_range_precision(time_unit: TimeUnit | None, expected_micros: int) def test_range_invalid_unit() -> None: - with pytest.raises(pl.PolarsPanicError, match="'D' not supported"): + with pytest.raises(pl.PolarsPanicError, match="'x' not supported"): pl.date_range( start=datetime(2021, 12, 16), end=datetime(2021, 12, 16, 3), - interval="1D", + interval="1X", eager=True, ) def test_date_range_lazy_with_literals() -> None: df = pl.DataFrame({"misc": ["x"]}).with_columns( - pl.date_range( + pl.date_ranges( date(2000, 1, 1), date(2023, 8, 31), interval="987d", @@ -303,15 +301,18 @@ def test_date_range_lazy_time_zones_invalid() -> None: def test_date_range_lazy_with_expressions( low: str | pl.Expr, high: str | pl.Expr ) -> None: - ldf = ( - pl.DataFrame({"start": [date(2015, 6, 30)], "stop": [date(2022, 12, 31)]}) - .with_columns( - pl.date_range(low, high, interval="678d", eager=False).alias("dts") - ) - .lazy() + lf = pl.LazyFrame( + { + "start": [date(2015, 6, 30)], + "stop": [date(2022, 12, 31)], + } ) - assert ldf.collect().rows() == [ + result = lf.with_columns( + pl.date_ranges(low, high, interval="678d", eager=False).alias("dts") + ) + + assert result.collect().rows() == [ ( date(2015, 6, 30), date(2022, 12, 31), @@ -325,20 +326,16 @@ def test_date_range_lazy_with_expressions( ) ] - assert pl.DataFrame( + df = pl.DataFrame( { "start": [date(2000, 1, 1), date(2022, 6, 1)], "stop": [date(2000, 1, 2), date(2022, 6, 2)], } - ).with_columns( - pl.date_range( - low, - high, - interval="1d", - ).alias("dts") - ).to_dict( - False - ) == { + ) + + result_df = df.with_columns(pl.date_ranges(low, high, interval="1d").alias("dts")) + + assert result_df.to_dict(False) == { "start": [date(2000, 1, 1), 
date(2022, 6, 1)], "stop": [date(2000, 1, 2), date(2022, 6, 2)], "dts": [ @@ -347,20 +344,16 @@ def test_date_range_lazy_with_expressions( ], } - assert pl.DataFrame( + df = pl.DataFrame( { "start": [datetime(2000, 1, 1), datetime(2022, 6, 1)], "stop": [datetime(2000, 1, 2), datetime(2022, 6, 2)], } - ).with_columns( - pl.date_range( - low, - high, - interval="1d", - ).alias("dts") - ).to_dict( - False - ) == { + ) + + result_df = df.with_columns(pl.date_ranges(low, high, interval="1d").alias("dts")) + + assert result_df.to_dict(False) == { "start": [datetime(2000, 1, 1, 0, 0), datetime(2022, 6, 1, 0, 0)], "stop": [datetime(2000, 1, 2, 0, 0), datetime(2022, 6, 2, 0, 0)], "dts": [ @@ -379,7 +372,7 @@ def test_date_range_single_row_lazy_7110() -> None: } ) result = df.with_columns( - pl.date_range( + pl.date_ranges( start=pl.col("from"), end=pl.col("to"), interval="1d", @@ -554,60 +547,146 @@ def test_date_range_name() -> None: result_eager = pl.date_range(date(2020, 1, 1), date(2020, 1, 3), eager=True) assert result_eager.name == expected_name - result_lazy = pl.select( - pl.date_range(date(2020, 1, 1), date(2020, 1, 3), eager=False) - ).to_series() + with pytest.deprecated_call(): + result_lazy = pl.select( + pl.date_range(date(2020, 1, 1), date(2020, 1, 3), eager=False) + ).to_series() assert result_lazy.name == expected_name -def test_time_range_lit() -> None: - for eager in (True, False): +def test_date_ranges_eager() -> None: + start = pl.Series([date(2022, 1, 1), date(2022, 1, 2)]) + end = pl.Series([date(2022, 1, 4), date(2022, 1, 3)]) + + result = pl.date_ranges(start, end, eager=True) + + expected = pl.Series( + "date_range", + [ + [date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 3), date(2022, 1, 4)], + [date(2022, 1, 2), date(2022, 1, 3)], + ], + ) + assert_series_equal(result, expected) + + +def test_date_range_eager_explode() -> None: + start = pl.Series([date(2022, 1, 1)]) + end = pl.Series([date(2022, 1, 3)]) + + result = pl.date_range(start, end, 
eager=True) + + expected = pl.Series("date", [date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 3)]) + assert_series_equal(result, expected) + + +def test_date_range_deprecated_eager() -> None: + start = pl.Series([date(2022, 1, 1), date(2022, 1, 2)]) + end = pl.Series([date(2022, 1, 4), date(2022, 1, 3)]) + + with pytest.deprecated_call(): + result = pl.date_range(start, end, eager=True) + + expected = pl.Series( + "date", + [ + [date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 3), date(2022, 1, 4)], + [date(2022, 1, 2), date(2022, 1, 3)], + ], + ) + assert_series_equal(result, expected) + + +def test_time_range_lit_lazy() -> None: + with pytest.deprecated_call(): tm = pl.select( pl.time_range( start=time(1, 2, 3), end=time(23, 59, 59), interval="5h45m10s333ms", closed="right", - eager=eager, - ).alias("tm") - ) - if not eager: - tm = tm.select(pl.col("tm").explode()) - assert tm["tm"].to_list() == [ - time(6, 47, 13, 333000), - time(12, 32, 23, 666000), - time(18, 17, 33, 999000), - ] - - # validate unset start/end - tm = pl.select( - pl.time_range( - interval="5h45m10s333ms", - eager=eager, ).alias("tm") ) - if not eager: - tm = tm.select(pl.col("tm").explode()) - assert tm["tm"].to_list() == [ - time(0, 0), - time(5, 45, 10, 333000), - time(11, 30, 20, 666000), - time(17, 15, 30, 999000), - time(23, 0, 41, 332000), - ] + tm = tm.select(pl.col("tm").explode()) + assert tm["tm"].to_list() == [ + time(6, 47, 13, 333000), + time(12, 32, 23, 666000), + time(18, 17, 33, 999000), + ] + # validate unset start/end + with pytest.deprecated_call(): + tm = pl.select(pl.time_range(interval="5h45m10s333ms").alias("tm")) + tm = tm.select(pl.col("tm").explode()) + assert tm["tm"].to_list() == [ + time(0, 0), + time(5, 45, 10, 333000), + time(11, 30, 20, 666000), + time(17, 15, 30, 999000), + time(23, 0, 41, 332000), + ] + + with pytest.deprecated_call(): tm = pl.select( pl.time_range( - start=pl.lit(time(23, 59, 59, 999980)), - interval="10000ns", - eager=eager, + 
start=pl.lit(time(23, 59, 59, 999980)), interval="10000ns" ).alias("tm") ) + tm = tm.select(pl.col("tm").explode()) + assert tm["tm"].to_list() == [ + time(23, 59, 59, 999980), + time(23, 59, 59, 999990), + ] + + +def test_time_range_lit_eager() -> None: + eager = True + tm = pl.select( + pl.time_range( + start=time(1, 2, 3), + end=time(23, 59, 59), + interval="5h45m10s333ms", + closed="right", + eager=eager, + ).alias("tm") + ) + if not eager: tm = tm.select(pl.col("tm").explode()) - assert tm["tm"].to_list() == [ - time(23, 59, 59, 999980), - time(23, 59, 59, 999990), - ] + assert tm["tm"].to_list() == [ + time(6, 47, 13, 333000), + time(12, 32, 23, 666000), + time(18, 17, 33, 999000), + ] + + # validate unset start/end + tm = pl.select( + pl.time_range( + interval="5h45m10s333ms", + eager=eager, + ).alias("tm") + ) + if not eager: + tm = tm.select(pl.col("tm").explode()) + assert tm["tm"].to_list() == [ + time(0, 0), + time(5, 45, 10, 333000), + time(11, 30, 20, 666000), + time(17, 15, 30, 999000), + time(23, 0, 41, 332000), + ] + + tm = pl.select( + pl.time_range( + start=pl.lit(time(23, 59, 59, 999980)), + interval="10000ns", + eager=eager, + ).alias("tm") + ) + tm = tm.select(pl.col("tm").explode()) + assert tm["tm"].to_list() == [ + time(23, 59, 59, 999980), + time(23, 59, 59, 999990), + ] def test_time_range_expr() -> None: @@ -616,9 +695,7 @@ def test_time_range_expr() -> None: "start": pl.time_range(interval="6h", eager=True), "stop": pl.time_range(start=time(2, 59), interval="5h59m", eager=True), } - ).with_columns( - intervals=pl.time_range("start", pl.col("stop"), interval="1h29m", eager=False) - ) + ).with_columns(intervals=pl.time_ranges("start", pl.col("stop"), interval="1h29m")) # shape: (4, 3) # ┌──────────┬──────────┬────────────────────────────────┐ # │ start ┆ stop ┆ intervals │ @@ -643,7 +720,10 @@ def test_time_range_name() -> None: result_eager = pl.time_range(time(10), time(12), eager=True) assert result_eager.name == expected_name - 
result_lazy = pl.select(pl.time_range(time(10), time(12), eager=False)).to_series() + with pytest.deprecated_call(): + result_lazy = pl.select( + pl.time_range(time(10), time(12), eager=False) + ).to_series() assert result_lazy.name == expected_name @@ -712,7 +792,7 @@ def test_date_range_schema( .lazy() ) result = df.with_columns( - pl.date_range( + pl.date_ranges( pl.col("start"), pl.col("end"), time_zone=input_time_zone, @@ -787,7 +867,7 @@ def test_date_range_schema_no_upcast( ) -> None: df = pl.DataFrame({"start": [date(2020, 1, 1)], "end": [date(2020, 1, 3)]}).lazy() result = df.with_columns( - pl.date_range( + pl.date_ranges( pl.col("start"), pl.col("end"), interval=interval, @@ -883,7 +963,7 @@ def test_date_range_schema_upcasts_to_datetime( ) -> None: df = pl.DataFrame({"start": [date(2020, 1, 1)], "end": [date(2020, 1, 3)]}).lazy() result = df.with_columns( - pl.date_range( + pl.date_ranges( pl.col("start"), pl.col("end"), interval=interval, @@ -921,11 +1001,11 @@ def test_date_range_no_alias_schema_9037() -> None: df = pl.DataFrame( {"start": [datetime(2020, 1, 1)], "end": [datetime(2020, 1, 2)]} ).lazy() - result = df.with_columns(pl.date_range(pl.col("start"), pl.col("end"))) + result = df.with_columns(pl.date_ranges(pl.col("start"), pl.col("end"))) expected_schema = { "start": pl.Datetime(time_unit="us", time_zone=None), "end": pl.Datetime(time_unit="us", time_zone=None), - "date": pl.List(pl.Datetime(time_unit="us", time_zone=None)), + "date_range": pl.List(pl.Datetime(time_unit="us", time_zone=None)), } assert result.schema == expected_schema assert result.collect().schema == expected_schema @@ -933,9 +1013,7 @@ def test_date_range_no_alias_schema_9037() -> None: def test_time_range_schema() -> None: df = pl.DataFrame({"start": [time(1)], "end": [time(1, 30)]}).lazy() - result = df.with_columns( - pl.time_range(pl.col("start"), pl.col("end")).alias("time_range") - ) + result = df.with_columns(pl.time_ranges(pl.col("start"), pl.col("end"))) 
expected_schema = {"start": pl.Time, "end": pl.Time, "time_range": pl.List(pl.Time)} assert result.schema == expected_schema assert result.collect().schema == expected_schema @@ -943,7 +1021,50 @@ def test_time_range_schema() -> None: def test_time_range_no_alias_schema_9037() -> None: df = pl.DataFrame({"start": [time(1)], "end": [time(1, 30)]}).lazy() - result = df.with_columns(pl.time_range(pl.col("start"), pl.col("end"))) - expected_schema = {"start": pl.Time, "end": pl.Time, "time": pl.List(pl.Time)} + result = df.with_columns(pl.time_ranges(pl.col("start"), pl.col("end"))) + expected_schema = {"start": pl.Time, "end": pl.Time, "time_range": pl.List(pl.Time)} assert result.schema == expected_schema assert result.collect().schema == expected_schema + + +def test_time_ranges_eager() -> None: + start = pl.Series([time(9, 0), time(10, 0)]) + end = pl.Series([time(12, 0), time(11, 0)]) + + result = pl.time_ranges(start, end, eager=True) + + expected = pl.Series( + "time_range", + [ + [time(9, 0), time(10, 0), time(11, 0), time(12, 0)], + [time(10, 0), time(11, 0)], + ], + ) + assert_series_equal(result, expected) + + +def test_time_range_eager_explode() -> None: + start = pl.Series([time(9, 0)]) + end = pl.Series([time(11, 0)]) + + result = pl.time_range(start, end, eager=True) + + expected = pl.Series("time", [time(9, 0), time(10, 0), time(11, 0)]) + assert_series_equal(result, expected) + + +def test_time_range_deprecated_eager() -> None: + start = pl.Series([time(9, 0), time(10, 0)]) + end = pl.Series([time(12, 0), time(11, 0)]) + + with pytest.deprecated_call(): + result = pl.time_range(start, end, eager=True) + + expected = pl.Series( + "time", + [ + [time(9, 0), time(10, 0), time(11, 0), time(12, 0)], + [time(10, 0), time(11, 0)], + ], + ) + assert_series_equal(result, expected) diff --git a/py-polars/tests/unit/functions/test_whenthen.py b/py-polars/tests/unit/functions/test_whenthen.py new file mode 100644 index 0000000000000..b55192975f7e7 --- /dev/null 
+++ b/py-polars/tests/unit/functions/test_whenthen.py @@ -0,0 +1,258 @@ +from datetime import datetime + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal, assert_series_equal + + +def test_when_then() -> None: + df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + + expr = pl.when(pl.col("a") < 3).then(pl.lit("x")) + + result = df.select( + expr.otherwise(pl.lit("y")).alias("a"), + expr.alias("b"), + ) + + expected = pl.DataFrame( + { + "a": ["x", "x", "y", "y", "y"], + "b": ["x", "x", None, None, None], + } + ) + assert_frame_equal(result, expected) + + +def test_when_then_chained() -> None: + df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + + expr = ( + pl.when(pl.col("a") < 3) + .then(pl.lit("x")) + .when(pl.col("a") > 4) + .then(pl.lit("z")) + ) + + result = df.select( + expr.otherwise(pl.lit("y")).alias("a"), + expr.alias("b"), + ) + + expected = pl.DataFrame( + { + "a": ["x", "x", "y", "y", "z"], + "b": ["x", "x", None, None, "z"], + } + ) + assert_frame_equal(result, expected) + + +def test_when_then_invalid_chains() -> None: + with pytest.raises(AttributeError): + pl.when("a").when("b") # type: ignore[attr-defined] + with pytest.raises(AttributeError): + pl.when("a").otherwise(2) # type: ignore[attr-defined] + with pytest.raises(AttributeError): + pl.when("a").then(1).then(2) # type: ignore[attr-defined] + with pytest.raises(AttributeError): + pl.when("a").then(1).otherwise(2).otherwise(3) # type: ignore[attr-defined] + with pytest.raises(AttributeError): + pl.when("a").then(1).when("b").when("c") # type: ignore[attr-defined] + with pytest.raises(AttributeError): + pl.when("a").then(1).when("b").otherwise("2") # type: ignore[attr-defined] + with pytest.raises(AttributeError): + pl.when("a").then(1).when("b").then(2).when("c").when("d") # type: ignore[attr-defined] + + +def test_when_then_implicit_none() -> None: + df = pl.DataFrame( + { + "team": ["A", "A", "A", "B", "B", "C"], + "points": [11, 8, 10, 6, 6, 5], + } + ) + + result = 
df.select( + pl.when(pl.col("points") > 7).then(pl.lit("Foo")), + pl.when(pl.col("points") > 7).then(pl.lit("Foo")).alias("bar"), + ) + + expected = pl.DataFrame( + { + "literal": ["Foo", "Foo", "Foo", None, None, None], + "bar": ["Foo", "Foo", "Foo", None, None, None], + } + ) + assert_frame_equal(result, expected) + + +def test_when_then_empty_list_5547() -> None: + out = pl.DataFrame({"a": []}).select([pl.when(pl.col("a") > 1).then([1])]) + assert out.shape == (0, 1) + assert out.dtypes == [pl.List(pl.Int64)] + + +def test_nested_when_then_and_wildcard_expansion_6284() -> None: + df = pl.DataFrame( + { + "1": ["a", "b"], + "2": ["c", "d"], + } + ) + + out0 = df.with_columns( + pl.when(pl.any_horizontal(pl.all() == "a")) + .then(pl.lit("a")) + .otherwise( + pl.when(pl.any_horizontal(pl.all() == "d")) + .then(pl.lit("d")) + .otherwise(None) + ) + .alias("result") + ) + + out1 = df.with_columns( + pl.when(pl.any_horizontal(pl.all() == "a")) + .then(pl.lit("a")) + .when(pl.any_horizontal(pl.all() == "d")) + .then(pl.lit("d")) + .otherwise(None) + .alias("result") + ) + + assert_frame_equal(out0, out1) + assert out0.to_dict(False) == { + "1": ["a", "b"], + "2": ["c", "d"], + "result": ["a", "d"], + } + + +def test_list_zip_with_logical_type() -> None: + df = pl.DataFrame( + { + "start": [datetime(2023, 1, 1, 1, 1, 1), datetime(2023, 1, 1, 1, 1, 1)], + "stop": [datetime(2023, 1, 1, 1, 3, 1), datetime(2023, 1, 1, 1, 4, 1)], + "use": [1, 0], + } + ) + + df = df.with_columns( + pl.date_ranges( + pl.col("start"), pl.col("stop"), interval="1h", eager=False, closed="left" + ).alias("interval_1"), + pl.date_ranges( + pl.col("start"), pl.col("stop"), interval="1h", eager=False, closed="left" + ).alias("interval_2"), + ) + + out = df.select( + pl.when(pl.col("use") == 1) + .then(pl.col("interval_2")) + .otherwise(pl.col("interval_1")) + .alias("interval_new") + ) + assert out.dtypes == [pl.List(pl.Datetime(time_unit="us", time_zone=None))] + + +def 
test_type_coercion_when_then_otherwise_2806() -> None: + out = ( + pl.DataFrame({"names": ["foo", "spam", "spam"], "nrs": [1, 2, 3]}) + .select( + [ + pl.when(pl.col("names") == "spam") + .then(pl.col("nrs") * 2) + .otherwise(pl.lit("other")) + .alias("new_col"), + ] + ) + .to_series() + ) + expected = pl.Series("new_col", ["other", "4", "6"]) + assert out.to_list() == expected.to_list() + + # test it remains float32 + assert ( + pl.Series("a", [1.0, 2.0, 3.0], dtype=pl.Float32) + .to_frame() + .select(pl.when(pl.col("a") > 2.0).then(pl.col("a")).otherwise(0.0)) + ).to_series().dtype == pl.Float32 + + +def test_when_then_edge_cases_3994() -> None: + df = pl.DataFrame(data={"id": [1, 1], "type": [2, 2]}) + + # this tests if lazy correctly assigns the list schema to the column aggregation + assert ( + df.lazy() + .groupby(["id"]) + .agg(pl.col("type")) + .with_columns( + pl.when(pl.col("type").list.lengths() == 0) + .then(pl.lit(None)) + .otherwise(pl.col("type")) + .keep_name() + ) + .collect() + ).to_dict(False) == {"id": [1], "type": [[2, 2]]} + + # this tests ternary with an empty argument + assert ( + df.filter(pl.col("id") == 42) + .groupby(["id"]) + .agg(pl.col("type")) + .with_columns( + pl.when(pl.col("type").list.lengths() == 0) + .then(pl.lit(None)) + .otherwise(pl.col("type")) + .keep_name() + ) + ).to_dict(False) == {"id": [], "type": []} + + +def test_object_when_then_4702() -> None: + # please don't ever do this + x = pl.DataFrame({"Row": [1, 2], "Type": [pl.Date, pl.UInt8]}) + + assert x.with_columns( + pl.when(pl.col("Row") == 1) + .then(pl.lit(pl.UInt16, allow_object=True)) + .otherwise(pl.lit(pl.UInt8, allow_object=True)) + .alias("New_Type") + ).to_dict(False) == { + "Row": [1, 2], + "Type": [pl.Date, pl.UInt8], + "New_Type": [pl.UInt16, pl.UInt8], + } + + +def test_comp_categorical_lit_dtype() -> None: + df = pl.DataFrame( + data={"column": ["a", "b", "e"], "values": [1, 5, 9]}, + schema=[("column", pl.Categorical), ("more", pl.Int32)], + ) + + 
assert df.with_columns( + pl.when(pl.col("column") == "e") + .then(pl.lit("d")) + .otherwise(pl.col("column")) + .alias("column") + ).dtypes == [pl.Categorical, pl.Int32] + + +def test_when_then_deprecated_string_input() -> None: + df = pl.DataFrame( + { + "a": [True, False], + "b": [1, 2], + "c": [3, 4], + } + ) + + with pytest.deprecated_call(): + result = df.select(pl.when("a").then("b").otherwise("c").alias("when")) + + expected = pl.Series("when", ["b", "c"]) + assert_series_equal(result.to_series(), expected) diff --git a/py-polars/tests/unit/io/test_cloud.py b/py-polars/tests/unit/io/test_cloud.py new file mode 100644 index 0000000000000..f4ed20ec19555 --- /dev/null +++ b/py-polars/tests/unit/io/test_cloud.py @@ -0,0 +1,13 @@ +import pytest + +import polars as pl + + +def test_err_on_s3_glob() -> None: + with pytest.raises( + ValueError, + match=r"globbing patterns not supported when scanning non-local files", + ): + pl.scan_parquet( + "s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-1.*.parquet" + ) diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 8f065bdff851d..27c3d0ac0acdd 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1368,3 +1368,12 @@ def test_write_csv_stdout_stderr(capsys: pytest.CaptureFixture[str]) -> None: "2,csv,2023-01-02\n" "3,stdout,2023-01-03\n" ) + + +def test_csv_9929() -> None: + df = pl.DataFrame({"nrs": [1, 2, 3]}) + f = io.BytesIO() + df.write_csv(f) + f.seek(0) + with pytest.raises(pl.NoDataError): + pl.read_csv(f, skip_rows=10**6) diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py index 4466a57761c60..a4a77560a74f7 100644 --- a/py-polars/tests/unit/io/test_database.py +++ b/py-polars/tests/unit/io/test_database.py @@ -106,7 +106,7 @@ def test_read_database( create_temp_sqlite_db(test_db) df = pl.read_database( - connection_uri=f"sqlite:///{test_db}", + connection=f"sqlite:///{test_db}", 
query="SELECT * FROM test_data", engine=engine, ) @@ -154,7 +154,7 @@ def test_read_database_exceptions( ) -> None: with pytest.raises(errclass, match=err): pl.read_database( - connection_uri=f"{database}://test", + connection=f"{database}://test", query=query, engine=engine, ) @@ -198,7 +198,6 @@ def test_write_database( engine: DbWriteEngine, mode: DbWriteMode, sample_df: pl.DataFrame, tmp_path: Path ) -> None: tmp_path.mkdir(exist_ok=True) - tmp_db = f"test_{engine}.db" test_db = str(tmp_path / tmp_db) @@ -208,22 +207,20 @@ def test_write_database( sample_df.write_database( table_name=f"main.{tbl_name}", - connection_uri=f"sqlite:///{test_db}", + connection=f"sqlite:///{test_db}", if_exists="replace", engine=engine, ) - if mode == "append": sample_df.write_database( table_name=f'"main".{tbl_name}', - connection_uri=f"sqlite:///{test_db}", + connection=f"sqlite:///{test_db}", if_exists="append", engine=engine, ) sample_df = pl.concat([sample_df, sample_df]) result = pl.read_database(f"SELECT * FROM {tbl_name}", f"sqlite:///{test_db}") - sample_df = sample_df.with_columns(pl.col("date").cast(pl.Utf8)) assert_frame_equal(sample_df, result) @@ -234,7 +231,7 @@ def test_write_database( ): with pytest.raises(ValueError): sample_df.write_database( - connection_uri=f"sqlite:///{test_db}", + connection=f"sqlite:///{test_db}", engine=engine, **invalid_params, # type: ignore[arg-type] ) diff --git a/py-polars/tests/unit/io/test_excel.py b/py-polars/tests/unit/io/test_excel.py index d33cbf4a8f9ac..23b164a7cb680 100644 --- a/py-polars/tests/unit/io/test_excel.py +++ b/py-polars/tests/unit/io/test_excel.py @@ -297,3 +297,28 @@ def test_excel_write_multiple_tables() -> None: ) assert table_names == {f"Frame{n}" for n in range(4)} assert pl.read_excel(xls, sheet_name="sheet3").rows() == [] + + +def test_excel_freeze_panes() -> None: + from xlsxwriter import Workbook + + # note: checks that empty tables don't error on write + df1 = pl.DataFrame(schema={"colx": pl.Date, "coly": 
pl.Utf8, "colz": pl.Float64}) + df2 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.Utf8, "colz": pl.Float64}) + df3 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.Utf8, "colz": pl.Float64}) + + xls = BytesIO() + + # use all three freeze_pane notations + with Workbook(xls) as wb: + df1.write_excel(workbook=wb, worksheet="sheet1", freeze_panes=(1, 0)) + df2.write_excel(workbook=wb, worksheet="sheet2", freeze_panes=(1, 0, 3, 4)) + df3.write_excel(workbook=wb, worksheet="sheet3", freeze_panes=("B2")) + + table_names: set[str] = set() + for sheet in ("sheet1", "sheet2", "sheet3"): + table_names.update( + tbl["name"] for tbl in wb.get_worksheet_by_name(sheet).tables + ) + assert table_names == {f"Frame{n}" for n in range(3)} + assert pl.read_excel(xls, sheet_name="sheet3").rows() == [] diff --git a/py-polars/tests/unit/io/test_json.py b/py-polars/tests/unit/io/test_json.py index f762e045a404d..5d1ebf1e3e79a 100644 --- a/py-polars/tests/unit/io/test_json.py +++ b/py-polars/tests/unit/io/test_json.py @@ -199,3 +199,17 @@ def test_ndjson_ignore_errors() -> None: [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}], ], } + + +def test_write_json_duration() -> None: + df = pl.DataFrame( + { + "a": pl.Series( + [91762939, 91762890, 6020836], dtype=pl.Duration(time_unit="ms") + ) + } + ) + assert ( + df.write_json(row_oriented=True) + == '[{"a":"P1DT5362.939S"},{"a":"P1DT5362.890S"},{"a":"PT6020.836S"}]' + ) diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py index 2eaa730b0bc8d..ad1c13489bc4c 100644 --- a/py-polars/tests/unit/io/test_lazy_csv.py +++ b/py-polars/tests/unit/io/test_lazy_csv.py @@ -25,7 +25,7 @@ def test_scan_csv(io_files_path: Path) -> None: def test_scan_csv_no_cse_deadlock(io_files_path: Path) -> None: dfs = [pl.scan_csv(io_files_path / "small.csv")] * (pl.threadpool_size() + 1) - pl.concat(dfs, parallel=True).collect(common_subplan_elimination=False) + pl.concat(dfs, 
parallel=True).collect(comm_subplan_elim=False) def test_scan_empty_csv(io_files_path: Path) -> None: diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index bc461c3adecc8..1c7848ff3c6f3 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -376,3 +376,16 @@ def test_glob_n_rows(io_files_path: Path) -> None: "fats_g": [0.5, 6.0], "sugars_g": [2, 2], } + + +@pytest.mark.write_disk() +def test_parquet_statistics_filter_9925(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + file_path = tmp_path / "codes.parquet" + df = pl.DataFrame({"code": [300964, 300972, 500_000, 26]}) + df.write_parquet(file_path, statistics=True) + + q = pl.scan_parquet(file_path).filter( + (pl.col("code").floordiv(100_000)).is_in([0, 3]) + ) + assert q.collect().to_dict(False) == {"code": [300964, 300972, 26]} diff --git a/py-polars/tests/unit/namespaces/test_datetime.py b/py-polars/tests/unit/namespaces/test_datetime.py index fb528381da62e..dec496f274fbe 100644 --- a/py-polars/tests/unit/namespaces/test_datetime.py +++ b/py-polars/tests/unit/namespaces/test_datetime.py @@ -159,7 +159,12 @@ def test_local_time_sortedness(time_zone: str | None) -> None: def test_offset_by_sortedness( time_zone: str | None, offset: str, expected: bool ) -> None: - ser = (pl.Series([datetime(2022, 1, 1, 23)]).dt.replace_time_zone(time_zone)).sort() + # create 2 values, as a single value is always sorted + ser = ( + pl.Series( + [datetime(2022, 1, 1, 22), datetime(2022, 1, 1, 22)] + ).dt.replace_time_zone(time_zone) + ).sort() result = ser.dt.offset_by(offset) assert result.flags["SORTED_ASC"] == expected assert result.flags["SORTED_DESC"] is False diff --git a/py-polars/tests/unit/namespaces/test_list.py b/py-polars/tests/unit/namespaces/test_list.py index 882e719785077..43e8077b0b2ae 100644 --- a/py-polars/tests/unit/namespaces/test_list.py +++ b/py-polars/tests/unit/namespaces/test_list.py @@ 
-508,3 +508,14 @@ def test_list_set_operations() -> None: exp = [[2, 3], [3, 1], [3]] assert r1 == exp assert r2 == exp + + +def test_list_take_oob_10079() -> None: + df = pl.DataFrame( + { + "a": [[1, 2, 3], [], [None, 3], [5, 6, 7]], + "b": [["2"], ["3"], [None], ["3", "Hi"]], + } + ) + with pytest.raises(pl.ComputeError, match="take indices are out of bounds"): + df.select(pl.col("a").take(999)) diff --git a/py-polars/tests/unit/namespaces/test_string.py b/py-polars/tests/unit/namespaces/test_string.py index 2de43032cc6d3..1a05327d4ab88 100644 --- a/py-polars/tests/unit/namespaces/test_string.py +++ b/py-polars/tests/unit/namespaces/test_string.py @@ -723,3 +723,17 @@ def test_titlecase() -> None: "And\tA\t Tab", ] } + + +def test_string_replace_with_nulls_10124() -> None: + df = pl.DataFrame({"col1": ["S", "S", "S", None, "S", "S", "S", "S"]}) + + assert df.select( + pl.col("col1"), + pl.col("col1").str.replace("S", "O", n=1).alias("n_1"), + pl.col("col1").str.replace("S", "O", n=3).alias("n_3"), + ).to_dict(False) == { + "col1": ["S", "S", "S", None, "S", "S", "S", "S"], + "n_1": ["O", "O", "O", None, "O", "O", "O", "O"], + "n_3": ["O", "O", "O", None, "O", "O", "O", "O"], + } diff --git a/py-polars/tests/unit/operations/test_apply.py b/py-polars/tests/unit/operations/test_apply.py index f9105b0dc6580..840a2754830ea 100644 --- a/py-polars/tests/unit/operations/test_apply.py +++ b/py-polars/tests/unit/operations/test_apply.py @@ -9,6 +9,7 @@ import pytest import polars as pl +from polars.exceptions import PolarsInefficientApplyWarning from polars.testing import assert_frame_equal @@ -94,9 +95,12 @@ def test_apply_infer_list() -> None: def test_apply_arithmetic_consistency() -> None: df = pl.DataFrame({"A": ["a", "a"], "B": [2, 3]}) - assert df.groupby("A").agg(pl.col("B").apply(lambda x: x + 1.0))["B"].to_list() == [ - [3.0, 4.0] - ] + with pytest.warns( + PolarsInefficientApplyWarning, match="In this case, you can replace" + ): + assert 
df.groupby("A").agg(pl.col("B").apply(lambda x: x + 1.0))[ + "B" + ].to_list() == [[3.0, 4.0]] def test_apply_struct() -> None: @@ -168,18 +172,28 @@ def test_datelike_identity() -> None: def test_apply_list_anyvalue_fallback() -> None: import json - df = pl.DataFrame({"text": ['[{"x": 1, "y": 2}, {"x": 3, "y": 4}]']}) - assert df.select(pl.col("text").apply(json.loads)).to_dict(False) == { - "text": [[{"x": 1, "y": 2}, {"x": 3, "y": 4}]] - } + with pytest.warns( + PolarsInefficientApplyWarning, + match=r'(?s)replace your `apply` with.*pl.col\("text"\).str.json_extract()', + ): + df = pl.DataFrame({"text": ['[{"x": 1, "y": 2}, {"x": 3, "y": 4}]']}) + assert df.select(pl.col("text").apply(json.loads)).to_dict(False) == { + "text": [[{"x": 1, "y": 2}, {"x": 3, "y": 4}]] + } - # starts with empty list '[]' - df = pl.DataFrame( - {"text": ["[]", '[{"x": 1, "y": 2}, {"x": 3, "y": 4}]', '[{"x": 1, "y": 2}]']} - ) - assert df.select(pl.col("text").apply(json.loads)).to_dict(False) == { - "text": [[], [{"x": 1, "y": 2}, {"x": 3, "y": 4}], [{"x": 1, "y": 2}]] - } + # starts with empty list '[]' + df = pl.DataFrame( + { + "text": [ + "[]", + '[{"x": 1, "y": 2}, {"x": 3, "y": 4}]', + '[{"x": 1, "y": 2}]', + ] + } + ) + assert df.select(pl.col("text").apply(json.loads)).to_dict(False) == { + "text": [[], [{"x": 1, "y": 2}, {"x": 3, "y": 4}], [{"x": 1, "y": 2}]] + } def test_apply_all_types() -> None: @@ -241,25 +255,29 @@ def test_apply_skip_nulls() -> None: def test_apply_object_dtypes() -> None: - assert pl.DataFrame( - {"a": pl.Series([1, 2, "a", 4, 5], dtype=pl.Object)} - ).with_columns( - [ - pl.col("a").apply(lambda x: x * 2, return_dtype=pl.Object), - pl.col("a") - .apply(lambda x: isinstance(x, (int, float)), return_dtype=pl.Boolean) - .alias("is_numeric1"), - pl.col("a") - .apply(lambda x: isinstance(x, (int, float))) - .alias("is_numeric_infer"), - ] - ).to_dict( - False - ) == { - "a": [2, 4, "aa", 8, 10], - "is_numeric1": [True, True, False, True, True], - 
"is_numeric_infer": [True, True, False, True, True], - } + with pytest.warns( + PolarsInefficientApplyWarning, + match=r"(?s)replace your `apply` with.*lambda x:", + ): + assert pl.DataFrame( + {"a": pl.Series([1, 2, "a", 4, 5], dtype=pl.Object)} + ).with_columns( + [ + pl.col("a").apply(lambda x: x * 2, return_dtype=pl.Object), + pl.col("a") + .apply(lambda x: isinstance(x, (int, float)), return_dtype=pl.Boolean) + .alias("is_numeric1"), + pl.col("a") + .apply(lambda x: isinstance(x, (int, float))) + .alias("is_numeric_infer"), + ] + ).to_dict( + False + ) == { + "a": [2, 4, "aa", 8, 10], + "is_numeric1": [True, True, False, True, True], + "is_numeric_infer": [True, True, False, True, True], + } def test_apply_explicit_list_output_type() -> None: @@ -276,15 +294,19 @@ def test_apply_explicit_list_output_type() -> None: def test_apply_dict() -> None: - df = pl.DataFrame({"Col": ['{"A":"Value1"}', '{"B":"Value2"}']}) - assert df.select(pl.col("Col").apply(json.loads)).to_dict(False) == { - "Col": [{"A": "Value1", "B": None}, {"A": None, "B": "Value2"}] - } - assert pl.DataFrame( - {"Col": ['{"A":"Value1", "B":"Value2"}', '{"B":"Value3"}']} - ).select(pl.col("Col").apply(json.loads)).to_dict(False) == { - "Col": [{"A": "Value1", "B": "Value2"}, {"A": None, "B": "Value3"}] - } + with pytest.warns( + PolarsInefficientApplyWarning, + match=r'(?s)replace your `apply` with.*pl.col\("abc"\).str.json_extract()', + ): + df = pl.DataFrame({"abc": ['{"A":"Value1"}', '{"B":"Value2"}']}) + assert df.select(pl.col("abc").apply(json.loads)).to_dict(False) == { + "abc": [{"A": "Value1", "B": None}, {"A": None, "B": "Value2"}] + } + assert pl.DataFrame( + {"abc": ['{"A":"Value1", "B":"Value2"}', '{"B":"Value3"}']} + ).select(pl.col("abc").apply(json.loads)).to_dict(False) == { + "abc": [{"A": "Value1", "B": "Value2"}, {"A": None, "B": "Value3"}] + } def test_apply_pass_name() -> None: diff --git a/py-polars/tests/unit/operations/test_arithmetic.py 
b/py-polars/tests/unit/operations/test_arithmetic.py index 5f7d5abb231fd..fed4bb7aaba8d 100644 --- a/py-polars/tests/unit/operations/test_arithmetic.py +++ b/py-polars/tests/unit/operations/test_arithmetic.py @@ -170,19 +170,36 @@ def test_fused_arithm() -> None: q = df.lazy().select(pl.lit(1) * pl.lit(2) - pl.col("c")) assert """(2) - (col("c")""" in q.explain() - # 8752 - df = pl.DataFrame({"x": pl.Series(values=[0, 0])}) - q = df.lazy().with_columns((0 + 2.5 * (0.5 + pl.col("x"))).alias("compute")) - assert q.collect()["compute"][0] == 1.25 - assert "0.0.fma" in q.explain() - - -def test_fused_arithm_9009() -> None: - q = pl.LazyFrame({"a": [1, 2], "b": [3, 4]}) - q = q.select((pl.col("b") * 2 + 3).over("a")) - - assert """3.fma([col("b"), 2]).alias("b")""" in q.explain() - assert q.collect()["b"].to_list() == [9, 11] + # Check if fused is turned off for literals see: #9857 + for expr in [ + pl.col("c") * 2 + 5, + pl.col("c") * 2 + pl.col("c"), + pl.col("c") * 2 - 5, + pl.col("c") * 2 - pl.col("c"), + 5 - pl.col("c") * 2, + pl.col("c") - pl.col("c") * 2, + ]: + q = df.lazy().select(expr) + assert all( + el not in q.explain() for el in ["fms", "fsm", "fma"] + ), f"Fused Arithmetic applied on literal {expr}: {q.explain()}" + + +def test_literal_no_upcast() -> None: + df = pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Float32)}) + + q = ( + df.lazy() + .select( + (pl.col("a") * -5 + 2).alias("fma"), + (2 - pl.col("a") * 5).alias("fsm"), + (pl.col("a") * 5 - 2).alias("fms"), + ) + .collect() + ) + assert set(q.schema.values()) == { + pl.Float32 + }, "Literal * Column (Float32) should not lead upcast" def test_boolean_addition() -> None: diff --git a/py-polars/tests/unit/operations/test_groupby_rolling.py b/py-polars/tests/unit/operations/test_groupby_rolling.py index 13de2008754a9..d06bdbed75ecf 100644 --- a/py-polars/tests/unit/operations/test_groupby_rolling.py +++ b/py-polars/tests/unit/operations/test_groupby_rolling.py @@ -1,6 +1,6 @@ from __future__ import 
annotations -from datetime import datetime +from datetime import date, datetime from typing import TYPE_CHECKING, Any import pytest @@ -50,13 +50,8 @@ def apply(df: pl.DataFrame) -> pl.DataFrame: def test_rolling_groupby_overlapping_groups() -> None: - # this first aggregates overlapping groups - # so they cannot be naively flattened - df = pl.DataFrame( - { - "a": [41, 60, 37, 51, 52, 39, 40], - } - ) + # this first aggregates overlapping groups so they cannot be naively flattened + df = pl.DataFrame({"a": [41, 60, 37, 51, 52, 39, 40]}) assert_series_equal( ( @@ -67,7 +62,7 @@ def test_rolling_groupby_overlapping_groups() -> None: period="5i", ) .agg( - # the apply to trigger the apply on the expression engine + # trigger the apply on the expression engine pl.col("a") .apply(lambda x: x) .sum() @@ -263,3 +258,42 @@ def test_groupby_rolling_dynamic_sortedness_check() -> None: match=r"argument in operation 'groupby_rolling' is not explicitly sorted", ): df.groupby_rolling("idx", period="2i").agg(pl.col("idx").alias("idx1")) + + +def test_groupby_rolling_empty_groups_9973() -> None: + dt1 = date(2001, 1, 1) + dt2 = date(2001, 1, 2) + + data = pl.DataFrame( + { + "id": ["A", "A", "B", "B", "C", "C"], + "date": [dt1, dt2, dt1, dt2, dt1, dt2], + "value": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + } + ).sort(by=["id", "date"]) + + expected = pl.DataFrame( + { + "id": ["A", "A", "B", "B", "C", "C"], + "date": [ + date(2001, 1, 1), + date(2001, 1, 2), + date(2001, 1, 1), + date(2001, 1, 2), + date(2001, 1, 1), + date(2001, 1, 2), + ], + "value": [[2.0], [], [4.0], [], [6.0], []], + } + ) + + out = data.groupby_rolling( + index_column="date", + by="id", + period="2d", + offset="1d", + closed="left", + check_sorted=True, + ).agg(pl.col("value")) + + assert_frame_equal(out, expected) diff --git a/py-polars/tests/unit/operations/test_inefficient_apply.py b/py-polars/tests/unit/operations/test_inefficient_apply.py new file mode 100644 index 0000000000000..6412ba2569fa0 --- /dev/null +++ 
b/py-polars/tests/unit/operations/test_inefficient_apply.py @@ -0,0 +1,206 @@ +from __future__ import annotations + +import json +import re +from typing import Any, Callable + +import numpy +import pytest + +import polars as pl +from polars.exceptions import PolarsInefficientApplyWarning +from polars.testing import assert_frame_equal, assert_series_equal +from polars.utils.udfs import _NUMPY_FUNCTIONS, BytecodeParser +from tests.test_udfs import MY_CONSTANT, MY_DICT, MY_LIST, NOOP_TEST_CASES, TEST_CASES + +EVAL_ENVIRONMENT = { + "np": numpy, + "pl": pl, + "MY_CONSTANT": MY_CONSTANT, + "MY_DICT": MY_DICT, + "MY_LIST": MY_LIST, +} + + +@pytest.mark.parametrize( + "func", + NOOP_TEST_CASES, +) +def test_parse_invalid_function(func: Callable[[Any], Any]) -> None: + # functions we don't (yet?) offer suggestions for + assert not BytecodeParser(func, apply_target="expr").can_rewrite() + + +@pytest.mark.parametrize( + ("col", "func", "expr_repr"), + TEST_CASES, +) +def test_parse_apply_functions( + col: str, func: Callable[[Any], Any], expr_repr: str +) -> None: + with pytest.warns( + PolarsInefficientApplyWarning, + match=r"(?s)Expr\.apply.*In this case, you can replace", + ): + parser = BytecodeParser(func, apply_target="expr") + suggested_expression = parser.to_expression(col) + assert suggested_expression == expr_repr + + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["AB", "cd", "eF"], + "c": ['{"a": 1}', '{"b": 2}', '{"c": 3}'], + } + ) + result_frame = df.select( + x=col, + y=eval(suggested_expression, EVAL_ENVIRONMENT), + ) + expected_frame = df.select( + x=pl.col(col), + y=pl.col(col).apply(func), + ) + assert_frame_equal(result_frame, expected_frame) + + +def test_parse_apply_raw_functions() -> None: + lf = pl.LazyFrame({"a": [1, 2, 3]}) + + # test bare 'numpy' functions + for func_name in _NUMPY_FUNCTIONS: + func = getattr(numpy, func_name) + + # note: we can't parse/rewrite raw numpy functions... 
+ parser = BytecodeParser(func, apply_target="expr") + assert not parser.can_rewrite() + + # ...but we ARE still able to warn + with pytest.warns( + PolarsInefficientApplyWarning, + match=rf"(?s)Expr\.apply.*In this case, you can replace.*np\.{func_name}", + ): + df1 = lf.select(pl.col("a").apply(func)).collect() + df2 = lf.select(getattr(pl.col("a"), func_name)()).collect() + assert_frame_equal(df1, df2) + + # test bare 'json.loads' + result_frames = [] + with pytest.warns( + PolarsInefficientApplyWarning, + match=r"(?s)Expr\.apply.*In this case, you can replace.*\.str\.json_extract", + ): + for expr in ( + pl.col("value").str.json_extract(), + pl.col("value").apply(json.loads), + ): + result_frames.append( + pl.LazyFrame({"value": ['{"a":1, "b": true, "c": "xx"}', None]}) + .select(extracted=expr) + .unnest("extracted") + .collect() + ) + + assert_frame_equal(*result_frames) + + # test primitive python casts + for py_cast, pl_dtype in ((str, pl.Utf8), (int, pl.Int64), (float, pl.Float64)): + with pytest.warns( + PolarsInefficientApplyWarning, + match=rf'(?s)replace.*pl\.col\("a"\)\.cast\(pl\.{pl_dtype.__name__}\)', + ): + assert_frame_equal( + lf.select(pl.col("a").apply(py_cast)).collect(), + lf.select(pl.col("a").cast(pl_dtype)).collect(), + ) + + +def test_parse_apply_miscellaneous() -> None: + # note: can also identify inefficient functions and methods as well as lambdas + class Test: + def x10(self, x: pl.Expr) -> pl.Expr: + return x * 10 + + parser = BytecodeParser(Test().x10, apply_target="expr") + suggested_expression = parser.to_expression(col="colx") + assert suggested_expression == 'pl.col("colx") * 10' + + # note: all constants - should not create a warning/suggestion + suggested_expression = BytecodeParser( + lambda x: MY_CONSTANT + 42, apply_target="expr" + ).to_expression(col="colx") + assert suggested_expression is None + + # literals as method parameters + with pytest.warns( + PolarsInefficientApplyWarning, + 
match=r"(?s)Series\.apply.*replace.*\(np\.cos\(3\) \+ s\) - abs\(-1\)", + ): + pl_series = pl.Series("srs", [0, 1, 2, 3, 4]) + assert_series_equal( + pl_series.apply(lambda x: numpy.cos(3) + x - abs(-1)), + numpy.cos(3) + pl_series - 1, + ) + + # if 's' is already the name of a global variable then the series alias + # used in the user warning will fall back (in priority order) through + # various aliases until it finds one that is available. + s, srs, series = -1, 0, 1 + expr1 = BytecodeParser(lambda x: x + s, apply_target="series") + expr2 = BytecodeParser(lambda x: srs + x + s, apply_target="series") + expr3 = BytecodeParser(lambda x: srs + x + s - x + series, apply_target="series") + + assert expr1.to_expression(col="srs") == "srs + s" + assert expr2.to_expression(col="srs") == "(srs + series) + s" + assert expr3.to_expression(col="srs") == "(((srs + srs0) + s) - srs0) + series" + + +@pytest.mark.parametrize( + ("data", "func", "expr_repr"), + [ + ( + [1, 2, 3], + lambda x: str(x), + "s.cast(pl.Utf8)", + ), + ( + [-20, -12, -5, 0, 5, 12, 20], + lambda x: (abs(x) != 12) and (x > 10 or x < -10 or x == 0), + "(s.abs() != 12) & ((s > 10) | ((s < -10) | (s == 0)))", + ), + ], +) +def test_parse_apply_series( + data: list[Any], func: Callable[[Any], Any], expr_repr: str +) -> None: + # expression/series generate same warning, with 's' as the series placeholder + with pytest.warns( + PolarsInefficientApplyWarning, match=r"(?s)Series\.apply.*s\.\w+\(" + ): + s = pl.Series("srs", data) + + parser = BytecodeParser(func, apply_target="series") + suggested_expression = parser.to_expression(s.name) + assert suggested_expression == expr_repr + + expected_series = s.apply(func) + result_series = eval(suggested_expression) + assert_series_equal(expected_series, result_series) + + +def test_expr_exact_warning_message() -> None: + msg = re.escape( + "\n" + "Expr.apply is significantly slower than the native expressions API.\n" + "Only use if you absolutely CANNOT implement your 
logic otherwise.\n" + "In this case, you can replace your `apply` with the following:\n" + ' - pl.col("a").apply(lambda x: ...)\n' + ' + pl.col("a") + 1\n' + ) + # Check the EXACT warning message. If modifying the message in the future, + # please make sure to keep the `^` and `$`, + # and to keep the assertion on `len(warnings)`. + with pytest.warns(PolarsInefficientApplyWarning, match=rf"^{msg}$") as warnings: + df = pl.DataFrame({"a": [1, 2, 3]}) + df.select(pl.col("a").apply(lambda x: x + 1)) + assert len(warnings) == 1 diff --git a/py-polars/tests/unit/operations/test_is_in.py b/py-polars/tests/unit/operations/test_is_in.py index 8d283e4d8a6f4..078438759b0ad 100644 --- a/py-polars/tests/unit/operations/test_is_in.py +++ b/py-polars/tests/unit/operations/test_is_in.py @@ -5,6 +5,7 @@ import pytest import polars as pl +from polars.testing import assert_series_equal def test_struct_logical_is_in() -> None: @@ -28,10 +29,10 @@ def test_struct_logical_is_in() -> None: def test_is_in_bool() -> None: - bool_value_to_filter_on = {True, None} + vals = [True, None] df = pl.DataFrame({"A": [True, False, None]}) - assert df.filter(pl.col("A").is_in(bool_value_to_filter_on)).to_dict(False) == { - "A": [True, False] + assert df.select(pl.col("A").is_in(vals)).to_dict(False) == { + "A": [True, False, None] } @@ -101,3 +102,10 @@ def test_is_in_series() -> None: with pytest.raises(pl.ComputeError, match=r"cannot compare"): df.select(pl.col("b").is_in(["x", "x"])) + + # check we don't shallow-copy and accidentally modify 'a' (see: #10072) + a = pl.Series("a", [1, 2]) + b = pl.Series("b", [1, 3]).is_in(a) + + assert a.name == "a" + assert_series_equal(b, pl.Series("b", [True, False])) diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index f0e20d52251e9..ce5c77e9a841b 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -359,6 +359,16 @@ def test_with_pd( joined = 
dfb.join(dfa, on="b", how="anti") assert not joined["a"].flags["SORTED_ASC"] + # streaming left join + df1 = pl.DataFrame({"x": [1, 2, 3, 4], "y": [2, 4, 6, 6]}).set_sorted("x") + df2 = pl.DataFrame({"x": [4, 2, 3, 1], "z": [1, 4, 9, 1]}) + assert ( + df1.lazy() + .join(df2.lazy(), on="x", how="left") + .collect(streaming=True)["x"] + .flags["SORTED_ASC"] + ) + def test_jit_sort_joins() -> None: n = 200 diff --git a/py-polars/tests/unit/operations/test_join_asof.py b/py-polars/tests/unit/operations/test_join_asof.py index 9a86ca28cf219..953255c3f974a 100644 --- a/py-polars/tests/unit/operations/test_join_asof.py +++ b/py-polars/tests/unit/operations/test_join_asof.py @@ -441,6 +441,31 @@ def test_asof_join_nearest_by() -> None: out = df1.join_asof(df2, on="asof_key", by="group", strategy="nearest") assert_frame_equal(out, expected) + a = pl.DataFrame( + { + "code": [676, 35, 676, 676, 676], + "time": [364360, 364370, 364380, 365400, 367440], + } + ) + b = pl.DataFrame( + { + "code": [676, 676, 35, 676, 676], + "time": [364000, 365000, 365000, 366000, 367000], + "price": [1.0, 2.0, 50, 3.0, None], + } + ) + + expected = pl.DataFrame( + { + "code": [676, 35, 676, 676, 676], + "time": [364360, 364370, 364380, 365400, 367440], + "price": [1.0, 50.0, 1.0, 2.0, None], + } + ) + + out = a.join_asof(b, by="code", on="time", strategy="nearest") + assert_frame_equal(out, expected) + def test_asof_join_nearest_by_date() -> None: df1 = pl.DataFrame( diff --git a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py index 5f5ca96b2062b..26c8ea277fc0a 100644 --- a/py-polars/tests/unit/operations/test_pivot.py +++ b/py-polars/tests/unit/operations/test_pivot.py @@ -302,9 +302,8 @@ def test_pivot_negative_duration() -> None: def test_aggregate_function_deprecation_warning() -> None: df = pl.DataFrame({"a": [1, 2], "b": ["foo", "foo"], "c": ["x", "x"]}) - with pytest.warns( - DeprecationWarning, - match="the default `aggregate_function` will 
change from `'first'` to `None`", + with pytest.deprecated_call( + match="the default `aggregate_function` will change from `'first'` to `None`" ): df.pivot("a", "b", "c") diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index 81096064da745..423c5b0212e36 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -17,7 +17,7 @@ from backports.zoneinfo._zoneinfo import ZoneInfo import polars as pl -from polars.testing import assert_frame_equal +from polars.testing import assert_frame_equal, assert_series_equal if TYPE_CHECKING: from polars.type_aliases import ClosedInterval @@ -819,3 +819,22 @@ def test_rolling_empty_window_9406() -> None: ] ), ) + + +def test_rolling_weighted_quantile_10031() -> None: + assert_series_equal( + pl.Series([1, 2]).rolling_median(window_size=2, weights=[0, 1]), + pl.Series([None, 2.0]), + ) + + assert_series_equal( + pl.Series([1, 2, 3, 5]).rolling_quantile(0.7, "linear", 3, [0.1, 0.3, 0.6]), + pl.Series([None, None, 2.55, 4.1]), + ) + + assert_series_equal( + pl.Series([1, 2, 3, 5, 8]).rolling_quantile( + 0.7, "linear", 4, [0.1, 0.2, 0, 0.3] + ), + pl.Series([None, None, None, 3.5, 5.5]), + ) diff --git a/py-polars/tests/unit/operations/test_sort.py b/py-polars/tests/unit/operations/test_sort.py index 0d1cb06ecad05..7333e453744e0 100644 --- a/py-polars/tests/unit/operations/test_sort.py +++ b/py-polars/tests/unit/operations/test_sort.py @@ -513,7 +513,7 @@ def get_str_ints_df(n: int) -> pl.DataFrame: strs = pl.Series("strs", random.choices(string.ascii_lowercase, k=n)) strs = pl.select( pl.when(strs == "a") - .then("") + .then(pl.lit("")) .when(strs == "b") .then(None) .otherwise(strs) @@ -534,7 +534,7 @@ def test_sort_row_fmt() -> None: df_pd = df.to_pandas() for descending in [True, False]: - pl.testing.assert_frame_equal( + assert_frame_equal( df.sort(["strs", "vals"], nulls_last=True, descending=descending), 
pl.from_pandas( df_pd.sort_values(["strs", "vals"], ascending=not descending) @@ -705,3 +705,19 @@ def test_top_k_9385() -> None: assert pl.LazyFrame({"b": [True, False]}).sort(["b"]).slice(0, 1).collect()[ "b" ].to_list() == [False] + + +def test_sorted_flag_partition_by() -> None: + assert ( + pl.DataFrame({"one": [1, 2, 3], "two": ["a", "a", "b"]}) + .set_sorted("one") + .partition_by("two", maintain_order=True)[0]["one"] + .flags["SORTED_ASC"] + ) + + +def test_sorted_flag_singletons() -> None: + assert pl.DataFrame({"x": [1]})["x"].flags["SORTED_ASC"] + assert pl.DataFrame({"x": ["a"]})["x"].flags["SORTED_ASC"] + assert pl.DataFrame({"x": [True]})["x"].flags["SORTED_ASC"] + assert pl.DataFrame({"x": [None]})["x"].flags["SORTED_ASC"] diff --git a/py-polars/tests/unit/operations/test_statistics.py b/py-polars/tests/unit/operations/test_statistics.py index e052862b9b065..435c8827ab08e 100644 --- a/py-polars/tests/unit/operations/test_statistics.py +++ b/py-polars/tests/unit/operations/test_statistics.py @@ -103,7 +103,7 @@ def test_qcut() -> None: + ["(0.25, inf]"] * 2, } ) - out = cast(pl.DataFrame, input.qcut([0.0, 0.25, 0.75])) + out = cast(pl.DataFrame, input.qcut([0.0, 0.25, 0.75], series=False)) out_s = cast(pl.Series, input.qcut([0.0, 0.25, 0.75], series=True)) assert_frame_equal(out, exp, check_dtype=False) assert_series_equal( diff --git a/py-polars/tests/unit/operations/test_transpose.py b/py-polars/tests/unit/operations/test_transpose.py index 1211016d9f861..149fde29ac301 100644 --- a/py-polars/tests/unit/operations/test_transpose.py +++ b/py-polars/tests/unit/operations/test_transpose.py @@ -147,3 +147,11 @@ def test_transpose_logical_data() -> None: } ) assert_frame_equal(result, expected) + + +def test_err_transpose_object() -> None: + class CustomObject: + pass + + with pytest.raises(pl.InvalidOperationError): + pl.DataFrame([CustomObject()]).transpose() diff --git a/py-polars/tests/unit/series/test_series.py 
b/py-polars/tests/unit/series/test_series.py index 624147a8b69a7..4b3dce6c071cb 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -23,7 +23,7 @@ UInt64, Unknown, ) -from polars.exceptions import ShapeError +from polars.exceptions import PolarsInefficientApplyWarning, ShapeError from polars.testing import assert_frame_equal, assert_series_equal from polars.utils._construction import iterable_to_pyseries @@ -980,13 +980,15 @@ def test_fill_nan() -> None: def test_apply() -> None: - a = pl.Series("a", [1, 2, None]) - b = a.apply(lambda x: x**2) - assert list(b) == [1, 4, None] + with pytest.warns(PolarsInefficientApplyWarning): + a = pl.Series("a", [1, 2, None]) + b = a.apply(lambda x: x**2) + assert list(b) == [1, 4, None] - a = pl.Series("a", ["foo", "bar", None]) - b = a.apply(lambda x: x + "py") - assert list(b) == ["foopy", "barpy", None] + with pytest.warns(PolarsInefficientApplyWarning): + a = pl.Series("a", ["foo", "bar", None]) + b = a.apply(lambda x: x + "py") + assert list(b) == ["foopy", "barpy", None] b = a.apply(lambda x: len(x), return_dtype=pl.Int32) assert list(b) == [3, 3, None] @@ -1262,6 +1264,9 @@ def test_mode() -> None: ) assert pl.Series([1.0, 2.0, 3.0, 2.0]).mode().item() == 2.0 + # sorted data + assert pl.int_range(0, 3, eager=True).mode().to_list() == [2, 1, 0] + def test_rank() -> None: s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) @@ -1366,6 +1371,15 @@ def test_sqrt() -> None: ) +def test_cbrt() -> None: + s = pl.Series("a", [1, 2]) + assert_series_equal(s.cbrt(), pl.Series("a", [1.0, np.cbrt(2)])) + df = pl.DataFrame([s]) + assert_series_equal( + df.select(pl.col("a").cbrt())["a"], pl.Series("a", [1.0, np.cbrt(2)]) + ) + + def test_range() -> None: s1 = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) assert_series_equal(s1[2:5], s1[range(2, 5)]) diff --git a/py-polars/tests/unit/streaming/test_streaming.py b/py-polars/tests/unit/streaming/test_streaming.py index d0ae74dcfa015..1aef2a7c3fa0f 
100644 --- a/py-polars/tests/unit/streaming/test_streaming.py +++ b/py-polars/tests/unit/streaming/test_streaming.py @@ -8,6 +8,7 @@ import pytest import polars as pl +from polars.exceptions import PolarsInefficientApplyWarning from polars.testing import assert_frame_equal, assert_series_equal if TYPE_CHECKING: @@ -326,15 +327,19 @@ def test_tree_validation_streaming() -> None: def test_streaming_apply(monkeypatch: Any, capfd: Any) -> None: monkeypatch.setenv("POLARS_VERBOSE", "1") + q = pl.DataFrame({"a": [1, 2]}).lazy() - ( - q.select(pl.col("a").apply(lambda x: x * 2, return_dtype=pl.Int64)).collect( - streaming=True + with pytest.warns( + PolarsInefficientApplyWarning, match="In this case, you can replace" + ): + ( + q.select(pl.col("a").apply(lambda x: x * 2, return_dtype=pl.Int64)).collect( + streaming=True + ) ) - ) - (_, err) = capfd.readouterr() - assert "df -> projection -> ordered_sink" in err + (_, err) = capfd.readouterr() + assert "df -> projection -> ordered_sink" in err def test_streaming_ternary() -> None: @@ -688,3 +693,28 @@ def test_streaming_groupby_list_9758() -> None: .to_dict(False) == payload ) + + +@pytest.mark.write_disk() +def test_streaming_10115(tmp_path: Path) -> None: + in_path = tmp_path / "in.parquet" + out_path = tmp_path / "out.parquet" + + # this fails if the schema will be incorrectly due to the projection + # pushdown + (pl.DataFrame([{"x": 1, "y": "foo"}]).write_parquet(in_path)) + + joiner = pl.LazyFrame([{"y": "foo", "z": "_"}]) + + ( + pl.scan_parquet(in_path) + .join(joiner, how="left", on="y") + .select("x", "y", "z") + .sink_parquet(out_path) # + ) + + assert pl.read_parquet(out_path).to_dict(False) == { + "x": [1], + "y": ["foo"], + "z": ["_"], + } diff --git a/py-polars/tests/unit/test_arity.py b/py-polars/tests/unit/test_arity.py index 3e663d4a43a75..46eed154e09bb 100644 --- a/py-polars/tests/unit/test_arity.py +++ b/py-polars/tests/unit/test_arity.py @@ -1,41 +1,4 @@ -from datetime import datetime - import polars as 
pl -from polars.testing import assert_frame_equal - - -def test_nested_when_then_and_wildcard_expansion_6284() -> None: - df = pl.DataFrame( - { - "1": ["a", "b"], - "2": ["c", "d"], - } - ) - - out0 = df.with_columns( - pl.when(pl.any_horizontal(pl.all() == "a")) - .then("a") - .otherwise( - pl.when(pl.any_horizontal(pl.all() == "d")).then("d").otherwise(None) - ) - .alias("result") - ) - - out1 = df.with_columns( - pl.when(pl.any_horizontal(pl.all() == "a")) - .then("a") - .when(pl.any_horizontal(pl.all() == "d")) - .then("d") - .otherwise(None) - .alias("result") - ) - - assert_frame_equal(out0, out1) - assert out0.to_dict(False) == { - "1": ["a", "b"], - "2": ["c", "d"], - "result": ["a", "d"], - } def test_expression_literal_series_order() -> None: @@ -44,30 +7,3 @@ def test_expression_literal_series_order() -> None: assert df.select(pl.col("a") + s).to_dict(False) == {"a": [2, 4, 6]} assert df.select(pl.lit(s) + pl.col("a")).to_dict(False) == {"": [2, 4, 6]} - - -def test_list_zip_with_logical_type() -> None: - df = pl.DataFrame( - { - "start": [datetime(2023, 1, 1, 1, 1, 1), datetime(2023, 1, 1, 1, 1, 1)], - "stop": [datetime(2023, 1, 1, 1, 3, 1), datetime(2023, 1, 1, 1, 4, 1)], - "use": [1, 0], - } - ) - - df = df.with_columns( - pl.date_range( - pl.col("start"), pl.col("stop"), interval="1h", eager=False, closed="left" - ).alias("interval_1"), - pl.date_range( - pl.col("start"), pl.col("stop"), interval="1h", eager=False, closed="left" - ).alias("interval_2"), - ) - - out = df.select( - pl.when(pl.col("use") == 1) - .then(pl.col("interval_2")) - .otherwise(pl.col("interval_1")) - .alias("interval_new") - ) - assert out.dtypes == [pl.List(pl.Datetime(time_unit="us", time_zone=None))] diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index 2573ec5155da3..c10b22b32ebe5 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -1,10 +1,12 @@ import re from datetime import date from tempfile import 
NamedTemporaryFile +from typing import Any import pytest import polars as pl +from polars.testing import assert_frame_equal def test_cse_rename_cross_join_5405() -> None: @@ -14,7 +16,7 @@ def test_cse_rename_cross_join_5405() -> None: out = left.join(right.rename({"B": "C"}), on=["A", "C"], how="left") - assert out.collect(common_subplan_elimination=True).to_dict(False) == { + assert out.collect(comm_subplan_elim=True).to_dict(False) == { "C": [3, 3, 4, 4], "A": [1, 2, 1, 2], "D": [5, None, None, 6], @@ -53,9 +55,9 @@ def test_cse_schema_6081() -> None: ) result = df.join(min_value_by_group, on=["date", "id"], how="left") - assert result.collect( - common_subplan_elimination=True, projection_pushdown=True - ).to_dict(False) == { + assert result.collect(comm_subplan_elim=True, projection_pushdown=True).to_dict( + False + ) == { "date": [date(2022, 12, 12), date(2022, 12, 12), date(2022, 12, 13)], "id": [1, 1, 5], "value": [1, 2, 2], @@ -99,7 +101,7 @@ def test_cse_9630() -> None: intersected_df2 = all_subsections.join(df2, on="key") assert intersected_df1.join(intersected_df2, on=["key"], how="left").collect( - common_subplan_elimination=True + comm_subplan_elim=True ).to_dict(False) == { "key": [1], "value": [[1, 2]], @@ -132,3 +134,116 @@ def test_schema_row_count_cse() -> None: "A_right": [["Gr1", "Gr1"]], } csv_a.close() + + +def test_cse_expr_selection_context(monkeypatch: Any, capfd: Any) -> None: + monkeypatch.setenv("POLARS_VERBOSE", "1") + q = pl.LazyFrame( + { + "a": [1, 2, 3, 4], + "b": [1, 2, 3, 4], + "c": [1, 2, 3, 4], + } + ) + + derived = (pl.col("a") * pl.col("b")).sum() + derived2 = derived * derived + + exprs = [ + derived.alias("d1"), + (derived * pl.col("c").sum() - 1).alias("foo"), + derived2.alias("d2"), + (derived2 * 10).alias("d3"), + ] + + assert q.select(exprs).collect(comm_subexpr_elim=True).to_dict(False) == { + "d1": [30], + "foo": [299], + "d2": [900], + "d3": [9000], + } + assert 
q.with_columns(exprs).collect(comm_subexpr_elim=True).to_dict(False) == { + "a": [1, 2, 3, 4], + "b": [1, 2, 3, 4], + "c": [1, 2, 3, 4], + "d1": [30, 30, 30, 30], + "foo": [299, 299, 299, 299], + "d2": [900, 900, 900, 900], + "d3": [9000, 9000, 9000, 9000], + } + + out = capfd.readouterr().out + assert "run ProjectionExec with 2 CSE" in out + assert "run StackExec with 2 CSE" in out + + +def test_cse_expr_selection_streaming(monkeypatch: Any, capfd: Any) -> None: + monkeypatch.setenv("POLARS_VERBOSE", "1") + q = pl.LazyFrame( + { + "a": [1, 2, 3, 4], + "b": [1, 2, 3, 4], + "c": [1, 2, 3, 4], + } + ) + + derived = pl.col("a") * pl.col("b") + derived2 = derived * derived + + exprs = [ + derived.alias("d1"), + derived2.alias("d2"), + (derived2 * 10).alias("d3"), + ] + + assert q.select(exprs).collect(comm_subexpr_elim=True, streaming=True).to_dict( + False + ) == {"d1": [1, 4, 9, 16], "d2": [1, 16, 81, 256], "d3": [10, 160, 810, 2560]} + assert q.with_columns(exprs).collect( + comm_subexpr_elim=True, streaming=True + ).to_dict(False) == { + "a": [1, 2, 3, 4], + "b": [1, 2, 3, 4], + "c": [1, 2, 3, 4], + "d1": [1, 4, 9, 16], + "d2": [1, 16, 81, 256], + "d3": [10, 160, 810, 2560], + } + err = capfd.readouterr().err + assert "df -> projection[cse] -> ordered_sink" in err + assert "df -> hstack[cse] -> ordered_sink" in err + + +def test_cse_expr_groupby() -> None: + q = pl.LazyFrame( + { + "a": [1, 2, 3, 4], + "b": [1, 2, 3, 4], + "c": [1, 2, 3, 4], + } + ) + + derived = pl.col("a") * pl.col("b") + + q = ( + q.groupby("a") + .agg(derived.sum().alias("sum"), derived.min().alias("min")) + .sort("min") + ) + + assert "__POLARS_CSER" in q.explain(comm_subexpr_elim=True, optimized=True) + + s = q.explain( + comm_subexpr_elim=True, optimized=True, streaming=True, comm_subplan_elim=False + ) + # check if it uses CSE_expr + # and is a complete pipeline + assert "__POLARS_CSER" in s + assert s.startswith("--- PIPELINE") + + expected = pl.DataFrame( + {"a": [1, 2, 3, 4], "sum": [1, 
4, 9, 16], "min": [1, 4, 9, 16]} + ) + for streaming in [True, False]: + out = q.collect(comm_subexpr_elim=True, streaming=streaming) + assert_frame_equal(out, expected) diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py index f7a1eb956d90c..d88b1f9faf417 100644 --- a/py-polars/tests/unit/test_exprs.py +++ b/py-polars/tests/unit/test_exprs.py @@ -976,23 +976,6 @@ def test_tail() -> None: assert df.select(pl.col("a").tail(pl.count() / 2)).to_dict(False) == {"a": [4, 5]} -def test_cache_expr(monkeypatch: Any, capfd: Any) -> None: - monkeypatch.setenv("POLARS_VERBOSE", "1") - df = pl.DataFrame( - { - "x": [3, 3, 3, 5, 8], - } - ) - x = (pl.col("x") * 10).cache() - - assert (df.groupby(1).agg([x * x * x])).to_dict(False) == { - "literal": [1], - "x": [[27000, 27000, 27000, 125000, 512000]], - } - _, err = capfd.readouterr() - assert """cache hit: [(col("x")) * (10)].cache()""" in err - - @pytest.mark.parametrize( ("const", "dtype"), [ diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index 377ccb624b55c..3bd4fb82ce49b 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -14,6 +14,7 @@ import polars as pl from polars import lit, when from polars.datatypes import FLOAT_DTYPES +from polars.exceptions import PolarsInefficientApplyWarning from polars.testing import assert_frame_equal from polars.testing.asserts import assert_series_equal @@ -88,14 +89,16 @@ def test_apply() -> None: assert_frame_equal(new, expected) assert_frame_equal(new.collect(), expected.collect()) - for strategy in ["thread_local", "threading"]: - ldf = pl.LazyFrame({"a": [1, 2, 3] * 20, "b": [1.0, 2.0, 3.0] * 20}) - new = ldf.with_columns( - pl.col("a").apply(lambda s: s * 2, strategy=strategy).alias("foo") # type: ignore[arg-type] - ) - - expected = ldf.clone().with_columns((pl.col("a") * 2).alias("foo")) - assert_frame_equal(new.collect(), expected.collect()) + with pytest.warns( + 
PolarsInefficientApplyWarning, match="In this case, you can replace" + ): + for strategy in ["thread_local", "threading"]: + ldf = pl.LazyFrame({"a": [1, 2, 3] * 20, "b": [1.0, 2.0, 3.0] * 20}) + new = ldf.with_columns( + pl.col("a").apply(lambda s: s * 2, strategy=strategy).alias("foo") # type: ignore[arg-type] + ) + expected = ldf.clone().with_columns((pl.col("a") * 2).alias("foo")) + assert_frame_equal(new.collect(), expected.collect()) def test_add_eager_column() -> None: diff --git a/py-polars/tests/unit/test_predicates.py b/py-polars/tests/unit/test_predicates.py index 7a26c3d82ea7f..75f1a6d140d21 100644 --- a/py-polars/tests/unit/test_predicates.py +++ b/py-polars/tests/unit/test_predicates.py @@ -24,23 +24,6 @@ def test_predicate_4906() -> None: ).collect().to_dict(False) == {"dt": [date(2022, 9, 10), date(2022, 9, 20)]} -def test_when_then_implicit_none() -> None: - df = pl.DataFrame( - { - "team": ["A", "A", "A", "B", "B", "C"], - "points": [11, 8, 10, 6, 6, 5], - } - ) - - assert df.select( - pl.when(pl.col("points") > 7).then("Foo"), - pl.when(pl.col("points") > 7).then("Foo").alias("bar"), - ).to_dict(False) == { - "literal": ["Foo", "Foo", "Foo", None, None, None], - "bar": ["Foo", "Foo", "Foo", None, None, None], - } - - def test_predicate_null_block_asof_join() -> None: left = ( pl.DataFrame( @@ -108,12 +91,6 @@ def test_streaming_empty_df() -> None: assert result.to_dict(False) == {"a": [], "b": [], "b_right": []} -def test_when_then_empty_list_5547() -> None: - out = pl.DataFrame({"a": []}).select([pl.when(pl.col("a") > 1).then([1])]) - assert out.shape == (0, 1) - assert out.dtypes == [pl.List(pl.Int64)] - - def test_predicate_strptime_6558() -> None: assert ( pl.DataFrame({"date": ["2022-01-03", "2020-01-04", "2021-02-03", "2019-01-04"]}) @@ -169,3 +146,15 @@ def test_predicate_pushdown_cumsum_9566() -> None: q = df.lazy().sort(["B", "A"]).filter(pl.col("A").is_in([8, 2]).cumsum() == 1) assert q.collect()["A"].to_list() == [8, 9, 0, 1] + + +def 
test_predicate_pushdown_join_fill_null_10058() -> None: + ids = pl.LazyFrame({"id": [0, 1, 2]}) + filters = pl.LazyFrame({"id": [0, 1], "filter": [True, False]}) + + assert ( + ids.join(filters, how="left", on="id") + .filter(pl.col("filter").fill_null(True)) + .collect() + .to_dict(False)["id"] + ) == [0, 2] diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index fdc3bd62d54c5..e0390eeef3249 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -288,3 +288,38 @@ def test_join_suffix_collision_9562() -> None: assert df.lazy().join( other_df.lazy(), how="inner", left_on="ham", right_on="ham", suffix="m" ).select("ham").collect().to_dict(False) == {"ham": ["a", "b"]} + + +def test_projection_join_names_9955() -> None: + batting = pl.DataFrame( + { + "playerID": ["abercda01"], + "yearID": [1871], + "lgID": ["NA"], + } + ).lazy() + + awards_players = pl.DataFrame( + { + "playerID": ["bondto01"], + "yearID": [1877], + "lgID": ["NL"], + } + ).lazy() + + right = awards_players.filter(pl.col("lgID") == "NL").select("playerID") + + q = batting.join( + right, + left_on=[pl.col("playerID")], + right_on=[pl.col("playerID")], + how="inner", + ) + + q = q.select(batting.columns) + + assert q.collect().schema == { + "playerID": pl.Utf8, + "yearID": pl.Int64, + "lgID": pl.Utf8, + } diff --git a/py-polars/tests/unit/test_queries.py b/py-polars/tests/unit/test_queries.py index 6f658157c275c..946c0f42e6f63 100644 --- a/py-polars/tests/unit/test_queries.py +++ b/py-polars/tests/unit/test_queries.py @@ -30,30 +30,6 @@ def test_sort_by_bools() -> None: assert out.shape == (3, 4) -def test_type_coercion_when_then_otherwise_2806() -> None: - out = ( - pl.DataFrame({"names": ["foo", "spam", "spam"], "nrs": [1, 2, 3]}) - .select( - [ - pl.when(pl.col("names") == "spam") - .then(pl.col("nrs") * 2) - .otherwise(pl.lit("other")) - .alias("new_col"), - ] - ) - .to_series() - ) - expected = 
pl.Series("new_col", ["other", "4", "6"]) - assert out.to_list() == expected.to_list() - - # test it remains float32 - assert ( - pl.Series("a", [1.0, 2.0, 3.0], dtype=pl.Float32) - .to_frame() - .select(pl.when(pl.col("a") > 2.0).then(pl.col("a")).otherwise(0.0)) - ).to_series().dtype == pl.Float32 - - def test_repeat_expansion_in_groupby() -> None: out = ( pl.DataFrame({"g": [1, 2, 2, 3, 3, 3]}) @@ -300,37 +276,6 @@ def map_expr(name: str) -> pl.Expr: } -def test_when_then_edge_cases_3994() -> None: - df = pl.DataFrame(data={"id": [1, 1], "type": [2, 2]}) - - # this tests if lazy correctly assigns the list schema to the column aggregation - assert ( - df.lazy() - .groupby(["id"]) - .agg(pl.col("type")) - .with_columns( - pl.when(pl.col("type").list.lengths() == 0) - .then(pl.lit(None)) - .otherwise(pl.col("type")) - .keep_name() - ) - .collect() - ).to_dict(False) == {"id": [1], "type": [[2, 2]]} - - # this tests ternary with an empty argument - assert ( - df.filter(pl.col("id") == 42) - .groupby(["id"]) - .agg(pl.col("type")) - .with_columns( - pl.when(pl.col("type").list.lengths() == 0) - .then(pl.lit(None)) - .otherwise(pl.col("type")) - .keep_name() - ) - ).to_dict(False) == {"id": [], "type": []} - - def test_edge_cast_string_duplicates_4259() -> None: # carefully constructed data. 
# note that row 2, 3 concatenated are the same string ('5461214484') diff --git a/py-polars/tests/unit/test_selectors.py b/py-polars/tests/unit/test_selectors.py index d40201c0bf8fa..3bcc20fbed55f 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -427,3 +427,21 @@ def test_selector_expr_dispatch() -> None: pl.when(nan_or_inf).then(0.0).otherwise(cs.float()).keep_name() ).fill_null(0), ) + + +def test_regex_expansion_groupby_9947() -> None: + df = pl.DataFrame({"g": [3], "abc": [1], "abcd": [3]}) + assert df.groupby("g").agg(pl.col("^ab.*$")).columns == ["g", "abc", "abcd"] + + +def test_regex_expansion_exclude_10002() -> None: + df = pl.DataFrame({"col_1": [1, 2, 3], "col_2": [2, 4, 3]}) + expected = {"col_1": [10, 20, 30], "col_2": [0.2, 0.4, 0.3]} + + assert ( + df.select( + pl.col("^col_.*$").exclude("col_2").mul(10), + pl.col("^col_.*$").exclude("col_1") / 10, + ).to_dict(as_series=False) + == expected + ) diff --git a/py-polars/tests/unit/test_sql.py b/py-polars/tests/unit/test_sql.py index 5f328390ac1b0..830e735f30d0f 100644 --- a/py-polars/tests/unit/test_sql.py +++ b/py-polars/tests/unit/test_sql.py @@ -1,5 +1,6 @@ from __future__ import annotations +import math import warnings from pathlib import Path @@ -166,6 +167,33 @@ def test_sql_equal_not_equal() -> None: } +def test_sql_arctan2() -> None: + twoRootTwo = math.sqrt(2) / 2.0 + df = pl.DataFrame( + { + "y": [twoRootTwo, -twoRootTwo, twoRootTwo, -twoRootTwo], + "x": [twoRootTwo, twoRootTwo, -twoRootTwo, -twoRootTwo], + } + ) + + sql = pl.SQLContext(df=df) + res = sql.execute( + """ + SELECT + ATAN2D(y,x) as "atan2d", + ATAN2(y,x) as "atan2" + FROM df + """, + eager=True, + ) + + df_result = pl.DataFrame({"atan2d": [45.0, -45.0, 135.0, -135.0]}) + df_result = df_result.with_columns(pl.col("atan2d").cast(pl.Float64)) + df_result = df_result.with_columns(pl.col("atan2d").radians().alias("atan2")) + + assert_frame_equal(df_result, res) + + def test_sql_trig() 
-> None: df = pl.DataFrame( { diff --git a/py-polars/tests/unit/utils/test_deprecation.py b/py-polars/tests/unit/utils/test_deprecation.py new file mode 100644 index 0000000000000..9d905d9596d52 --- /dev/null +++ b/py-polars/tests/unit/utils/test_deprecation.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import inspect +import warnings + +import pytest + +from polars.utils.deprecation import ( + deprecate_nonkeyword_arguments, + deprecated, + deprecated_name, + issue_deprecation_warning, + redirect, +) + + +def test_issue_deprecation_warning() -> None: + with pytest.deprecated_call(): + issue_deprecation_warning("deprecated", version="0.1.2") + + +def test_deprecated_decorator() -> None: + @deprecated("This is deprecated.", version="3.2.1") + def hello() -> None: + ... + + with pytest.deprecated_call(): + hello() + + +def test_deprecated_name_decorator() -> None: + @deprecated_name("new_hello", version="3.2.1") + def hello() -> None: + ... + + with pytest.deprecated_call(match="new_hello"): + hello() + + +def test_redirect() -> None: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + + # one-to-one redirection + @redirect({"foo": "bar"}) + class DemoClass1: + def bar(self, upper: bool = False) -> str: + return "BAZ" if upper else "baz" + + assert DemoClass1().foo() == "baz" # type: ignore[attr-defined] + + # redirection with **kwargs + @redirect({"foo": ("bar", {"upper": True})}) + class DemoClass2: + def bar(self, upper: bool = False) -> str: + return "BAZ" if upper else "baz" + + assert DemoClass2().foo() == "BAZ" # type: ignore[attr-defined] + + +class Foo: # noqa: D101 + @deprecate_nonkeyword_arguments(allowed_args=["self", "baz"], version="0.1.2") + def bar( # noqa: D102 + self, baz: str, ham: str | None = None, foobar: str | None = None + ) -> None: + ... 
+ + +def test_deprecate_nonkeyword_arguments_method_signature() -> None: + # Note the added star indicating keyword-only arguments after 'baz' + expected = "(self, baz: 'str', *, ham: 'str | None' = None, foobar: 'str | None' = None) -> 'None'" + assert str(inspect.signature(Foo.bar)) == expected + + +def test_deprecate_nonkeyword_arguments_method_warning() -> None: + msg = ( + r"All arguments of Foo\.bar except for \'baz\' will be keyword-only in the next breaking release." + r" Use keyword arguments to silence this warning." + ) + with pytest.deprecated_call(match=msg): + Foo().bar("qux", "quox") diff --git a/py-polars/tests/unit/utils/test_utils.py b/py-polars/tests/unit/utils/test_utils.py index 7119dab5cfedc..39cf6bec40661 100644 --- a/py-polars/tests/unit/utils/test_utils.py +++ b/py-polars/tests/unit/utils/test_utils.py @@ -1,7 +1,5 @@ from __future__ import annotations -import inspect -import warnings from datetime import date, datetime, time, timedelta from typing import TYPE_CHECKING, Any @@ -15,7 +13,6 @@ _timedelta_to_pl_duration, _timedelta_to_pl_timedelta, ) -from polars.utils.decorators import deprecate_nonkeyword_arguments, redirect from polars.utils.meta import get_idx_type from polars.utils.various import _in_notebook, parse_version @@ -119,48 +116,6 @@ def test_parse_version(v1: Any, v2: Any) -> None: assert parse_version(v2) < parse_version(v1) -class Foo: - @deprecate_nonkeyword_arguments(allowed_args=["self", "baz"]) - def bar(self, baz: str, ham: str | None = None, foobar: str | None = None) -> None: - ... 
- - -def test_deprecate_nonkeyword_arguments_method_signature() -> None: - # Note the added star indicating keyword-only arguments after 'baz' - expected = "(self, baz: 'str', *, ham: 'str | None' = None, foobar: 'str | None' = None) -> 'None'" - assert str(inspect.signature(Foo.bar)) == expected - - -def test_deprecate_nonkeyword_arguments_method_warning() -> None: - msg = ( - r"All arguments of Foo\.bar except for \'baz\' will be keyword-only in the next breaking release." - r" Use keyword arguments to silence this warning." - ) - with pytest.deprecated_call(match=msg): - Foo().bar("qux", "quox") - - -def test_redirect() -> None: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - - # one-to-one redirection - @redirect({"foo": "bar"}) - class DemoClass1: - def bar(self, upper: bool = False) -> str: - return "BAZ" if upper else "baz" - - assert DemoClass1().foo() == "baz" # type: ignore[attr-defined] - - # redirection with **kwargs - @redirect({"foo": ("bar", {"upper": True})}) - class DemoClass2: - def bar(self, upper: bool = False) -> str: - return "BAZ" if upper else "baz" - - assert DemoClass2().foo() == "BAZ" # type: ignore[attr-defined] - - def test_get_idx_type_deprecation() -> None: with pytest.deprecated_call(): get_idx_type() diff --git a/rust-toolchain.toml b/rust-toolchain.toml index bc8b526821c30..8eed5f0be0ad0 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "nightly-2023-06-23" +channel = "nightly-2023-07-27"