diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4a23b4091..b7a84a592 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,8 +5,8 @@ updates: - package-ecosystem: "github-actions" directory: "/" schedule: - # Check for updates to GitHub Actions every week - interval: "weekly" + # Check for updates to GitHub Actions every month + interval: "monthly" commit-message: prefix: "skip changelog" # So this PR will not be added to release-drafter include: "scope" # List of the updated dependencies in the commit will be added \ No newline at end of file diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml index 4f9cbe06d..bb69ccd94 100644 --- a/.github/workflows/downstream_tests.yml +++ b/.github/workflows/downstream_tests.yml @@ -55,7 +55,7 @@ jobs: matrix: python-version: ["3.12"] os: [ubuntu-latest] - dependencies: ["core", "core,optional"] + dependencies: ["core,optional"] runs-on: ${{ matrix.os }} steps: @@ -73,19 +73,27 @@ jobs: run: | git clone https://github.com/marimo-team/marimo.git --depth=1 cd marimo + uv venv -p 3.12 git log - name: install-basics run: uv pip install --upgrade tox virtualenv setuptools hatch --system - name: install-marimo-dev run: | cd marimo - uv pip install -e ".[dev]" --system + . .venv/bin/activate + uv pip install -e ".[dev]" + which python - name: install-narwhals-dev run: | - uv pip uninstall narwhals --system - uv pip install -e . --system + cd marimo + . .venv/bin/activate + uv pip uninstall narwhals + uv pip install -e ./.. - name: show-deps - run: uv pip freeze + run: | + cd marimo + . .venv/bin/activate + uv pip freeze - name: Create assets directory, copy over index.html continue-on-error: true run: | @@ -96,12 +104,13 @@ jobs: if: ${{ matrix.dependencies == 'core,optional' }} run: | cd marimo - hatch run +py=${{ matrix.python-version }} test-optional:test-narwhals + . 
.venv/bin/activate + # make sure that we use the .venv when running tests, so that + # the local narwhals install is picked up + sed -i '/^\[tool.hatch.envs.default\]/a path = ".venv"' pyproject.toml + hatch run python -c "import narwhals; print(narwhals.__file__)" + hatch run test-optional:test-narwhals timeout-minutes: 15 - - name: Run typechecks - run: | - cd marimo - hatch run typecheck:check scikit-lego: strategy: @@ -181,3 +190,43 @@ jobs: run: | cd py-shiny make narwhals-test-integration + + tubular: + strategy: + matrix: + python-version: ["3.12"] + os: [ubuntu-latest] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: "true" + cache-suffix: ${{ matrix.python-version }} + cache-dependency-glob: "**requirements*.txt" + - name: clone-tubular + run: | + git clone https://github.com/lvgig/tubular --depth=1 + cd tubular + git log + - name: install-basics + run: uv pip install --upgrade tox virtualenv setuptools pytest-env --system + - name: install-tubular-dev + run: | + cd tubular + uv pip install -e .[dev] --system + - name: install-narwhals-dev + run: | + uv pip uninstall narwhals --system + uv pip install -e . 
--system + - name: show-deps + run: uv pip freeze + - name: Run pytest + run: | + cd tubular + pytest tests --config-file=pyproject.toml diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index 3f02f965f..fd6a7cfb2 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -90,7 +90,7 @@ jobs: nightlies: strategy: matrix: - python-version: ["3.12"] + python-version: ["3.13"] os: [ubuntu-latest] if: github.event.pull_request.head.repo.full_name == github.repository runs-on: ${{ matrix.os }} diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index ee88911ea..7847939b9 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -34,7 +34,7 @@ jobs: pytest-windows: strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.12"] os: [windows-latest] runs-on: ${{ matrix.os }} @@ -61,7 +61,7 @@ jobs: pytest-coverage: strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.11", "3.13"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} diff --git a/.gitignore b/.gitignore index 8b9adeb8f..774f09637 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ coverage.xml # Documentation site/ todo.md +docs/this.md docs/api-completeness/*.md !docs/api-completeness/index.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4d416e237..141e9d3c4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,9 @@ +ci: + autoupdate_schedule: monthly repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.6.9' + rev: 'v0.7.1' hooks: # Run the formatter. 
- id: ruff-format @@ -9,7 +11,7 @@ repos: - id: ruff args: [--fix] - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.11.2' + rev: 'v1.13.0' hooks: - id: mypy additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2'] @@ -40,7 +42,7 @@ repos: hooks: - id: nbstripout - repo: https://github.com/adamchainz/blacken-docs - rev: "1.19.0" # replace with latest tag on GitHub + rev: "1.19.1" # replace with latest tag on GitHub hooks: - id: blacken-docs args: [--skip-errors] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c7d7c44a0..b8f333f1e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -109,6 +109,10 @@ nox Notice that nox will also require to have all the python versions that are defined in the `noxfile.py` installed in your system. +#### Testing cuDF + +We can't currently test in CI against cuDF, but you can test it manually in Kaggle using GPUs. Please follow this [Kaggle notebook](https://www.kaggle.com/code/marcogorelli/testing-cudf-in-narwhals) to run the tests. + ### 7. Building docs To build the docs, run `mkdocs serve`, and then open the link provided in a browser. diff --git a/README.md b/README.md index 44fc31e56..b3acb17ba 100644 --- a/README.md +++ b/README.md @@ -43,10 +43,13 @@ Join the party! - [Altair](https://github.com/vega/altair/) - [Hamilton](https://github.com/DAGWorks-Inc/hamilton/tree/main/examples/narwhals) +- [marimo](https://github.com/marimo-team/marimo) +- [pymarginaleffects](https://github.com/vincentarelbundock/pymarginaleffects) - [scikit-lego](https://github.com/koaning/scikit-lego) - [scikit-playtime](https://github.com/koaning/scikit-playtime) - [timebasedcv](https://github.com/FBruzzesi/timebasedcv) -- [marimo](https://github.com/marimo-team/marimo) +- [tubular](https://github.com/lvgig/tubular) +- [wimsey](https://github.com/benrutter/wimsey) Feel free to add your project to the list if it's missing, and/or [chat with us on Discord](https://discord.gg/V3PqtB4VA4) if you'd like any support. 
diff --git a/docs/api-reference/dependencies.md b/docs/api-reference/dependencies.md index 959e8ee0c..f8995e36a 100644 --- a/docs/api-reference/dependencies.md +++ b/docs/api-reference/dependencies.md @@ -11,14 +11,20 @@ - get_polars - get_pyarrow - is_cudf_dataframe + - is_cudf_index - is_cudf_series - is_dask_dataframe - is_ibis_table + - is_into_dataframe + - is_into_series - is_modin_dataframe + - is_modin_index - is_modin_series - is_numpy_array - is_pandas_dataframe + - is_pandas_index - is_pandas_like_dataframe + - is_pandas_like_index - is_pandas_like_series - is_pandas_series - is_polars_dataframe diff --git a/docs/api-reference/expr_dt.md b/docs/api-reference/expr_dt.md index 5c9ab41f3..604ac4abf 100644 --- a/docs/api-reference/expr_dt.md +++ b/docs/api-reference/expr_dt.md @@ -6,22 +6,23 @@ members: - convert_time_zone - date - - year - - month - day - - ordinal_day - hour - - minute - - second - - millisecond - microsecond + - millisecond + - minute + - month - nanosecond + - ordinal_day - replace_time_zone - - total_minutes - - total_seconds - - total_milliseconds + - second + - timestamp - total_microseconds + - total_milliseconds + - total_minutes - total_nanoseconds + - total_seconds - to_string + - year show_source: false show_bases: false diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index c4b04a2f4..2b5be6e8c 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -15,6 +15,7 @@ Here are the top-level functions available in Narwhals. 
- from_dict - from_native - from_arrow + - generate_temporary_column_name - get_level - get_native_namespace - is_ordered_categorical diff --git a/docs/api-reference/series_dt.md b/docs/api-reference/series_dt.md index c92592411..23d4817cb 100644 --- a/docs/api-reference/series_dt.md +++ b/docs/api-reference/series_dt.md @@ -6,22 +6,23 @@ members: - convert_time_zone - date - - year - - month - day - - ordinal_day - hour - - minute - - second - - millisecond - microsecond + - millisecond + - minute + - month - nanosecond + - ordinal_day - replace_time_zone - - total_minutes - - total_seconds - - total_milliseconds + - second + - timestamp - total_microseconds + - total_milliseconds + - total_minutes - total_nanoseconds + - total_seconds - to_string + - year show_source: false show_bases: false diff --git a/docs/basics/dataframe_conversion.md b/docs/basics/dataframe_conversion.md new file mode 100644 index 000000000..690f5d093 --- /dev/null +++ b/docs/basics/dataframe_conversion.md @@ -0,0 +1,76 @@ +# Conversion between libraries + +Some library maintainers must apply complex dataframe operations, using methods and functions that may not (yet) be implemented in Narwhals. In such cases, Narwhals can still be highly beneficial, by allowing easy dataframe conversion. + +## Dataframe X in, pandas out + +Imagine that you maintain a library with a function that operates on pandas dataframes to produce automated reports. You want to allow users to supply a dataframe in any format to that function (pandas, Polars, DuckDB, cuDF, Modin, etc.) without adding all those dependencies to your own project and without special-casing each input library's variation of `to_pandas` / `toPandas` / `to_pandas_df` / `df` ... + +One solution is to use Narwhals as a thin Dataframe ingestion layer, to convert the user-supplied dataframe to the format that your library uses internally. 
Since Narwhals is zero-dependency, this is a much more lightweight solution than including all the dataframe libraries as dependencies, +and easier to write than special casing each input library's `to_pandas` method (if it even exists!). + +To illustrate, we create dataframes in various formats: + +```python exec="1" source="above" session="conversion" +import narwhals as nw +from narwhals.typing import IntoDataFrame + +import duckdb +import polars as pl +import pandas as pd + +df_polars = pl.DataFrame( + { + "A": [1, 2, 3, 4, 5], + "fruits": ["banana", "banana", "apple", "apple", "banana"], + "B": [5, 4, 3, 2, 1], + "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + } +) +df_pandas = df_polars.to_pandas() +df_duckdb = duckdb.sql("SELECT * FROM df_polars") +``` + +Now, we define a function that can ingest any dataframe type supported by Narwhals, and convert it to a pandas DataFrame for internal use: + +```python exec="1" source="above" session="conversion" result="python" +def df_to_pandas(df: IntoDataFrame) -> pd.DataFrame: + return nw.from_native(df).to_pandas() + + +print(df_to_pandas(df_polars)) +``` + +## Dataframe X in, Polars out + +### Via PyCapsule Interface + +Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe to Polars format using Narwhals. + +```python exec="1" source="above" session="conversion" result="python" +def df_to_polars(df: IntoDataFrame) -> pl.DataFrame: + return nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native() + + +print(df_to_polars(df_duckdb)) # You can only execute this line of code once. +``` + +It works to pass Polars to `native_namespace` here because Polars supports the [PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) for import. 
+ +Note that the PyCapsule Interface makes no guarantee that you can call it repeatedly, so the approach above only works if you +only expect to perform the conversion a single time on each input object. + +### Via PyArrow + +If you need to ingest the same dataframe multiple times, then you may want to go via PyArrow instead. +This may be less efficient than the PyCapsule approach above (and always requires PyArrow!), but is more forgiving: + +```python exec="1" source="above" session="conversion" result="python" +def df_to_polars(df: IntoDataFrame) -> pl.DataFrame: + return pl.DataFrame(nw.from_native(df).to_arrow()) + + +df_duckdb = duckdb.sql("SELECT * FROM df_polars") +print(df_to_polars(df_duckdb)) # We can execute this... +print(df_to_polars(df_duckdb)) # ...as many times as we like! +``` diff --git a/docs/extending.md b/docs/extending.md index 22d85f701..865a93b08 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -37,6 +37,7 @@ def func(df: FrameT) -> FrameT: b_std=nw.col("b").std(), ) ``` + will work for any of pandas, Polars, cuDF, Modin, and PyArrow. However, sometimes you don't need to do complex operations on dataframes - all you need @@ -57,9 +58,22 @@ def func(df: Any) -> Schema: df = nw.from_native(df, eager_or_interchange_only=True) return df.schema ``` + is also supported, meaning that, in addition to the libraries mentioned above, you can also pass Ibis, DuckDB, Vaex, and any library which implements the protocol. +#### Interchange-only support + +While libraries for which we have full support can benefit from the whole Narwhals API, +libraries which have interchange only support can access the following methods after +converting to Narwhals DataFrame: + +- `.schema`, hence column names via `.schema.names()` and column types via `.schema.dtypes()` +- `.columns` +- `.to_pandas()` and `.to_arrow()`, for converting to Pandas and Arrow, respectively. +- `.select(names)` (Ibis and DuckDB), where `names` is a list of (string) column names. 
This is useful for + selecting columns before converting to another library. + ### Extending Narwhals If you want your own library to be recognised too, you're welcome open a PR (with tests)!. diff --git a/docs/index.md b/docs/index.md index f18d9af85..e9fe02170 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,7 +6,7 @@ Extremely lightweight and extensible compatibility layer between dataframe libra - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow - **Lazy-only support**: Dask -- **Interchange-level support**: Ibis, Vaex, anything else which implements the DataFrame Interchange Protocol +- **Interchange-level support**: Ibis, DuckDB, Vaex, anything else which implements the DataFrame Interchange Protocol Seamlessly support all, without depending on any! diff --git a/docs/installation.md b/docs/installation.md index 1695a7eec..9f57a05df 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -29,7 +29,7 @@ To verify the installation, start the Python REPL and execute: ```python >>> import narwhals >>> narwhals.__version__ -'1.9.4' +'1.12.1' ``` If you see the version number, then the installation was successful! 
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 929f35790..beec6070b 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,4 +1,5 @@ jinja2 +duckdb markdown-exec[ansi] mkdocs mkdocs-autorefs diff --git a/mkdocs.yml b/mkdocs.yml index 3793d898a..46cb5335f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,6 +10,7 @@ nav: - basics/dataframe.md - basics/series.md - basics/complete_example.md + - basics/dataframe_conversion.md - Pandas-like concepts: - other/pandas_index.md - other/user_warning.md @@ -45,6 +46,7 @@ nav: - api-reference/dtypes.md - api-reference/selectors.md - api-reference/typing.md + - This: this.md theme: name: material font: false @@ -76,9 +78,7 @@ theme: toggle: icon: material/brightness-4 name: Switch to system preference -extra_css: - - https://unpkg.com/katex@0/dist/katex.min.css - - css/mkdocstrings.css + plugins: - search @@ -89,9 +89,12 @@ plugins: - https://installer.readthedocs.io/en/stable/objects.inv rendering: show_signature_annotations: true + options: + members_order: alphabetical hooks: - utils/generate_backend_completeness.py +- utils/generate_zen_content.py markdown_extensions: diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 8dd76d081..2214d1cf7 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -59,6 +59,7 @@ from narwhals.translate import narwhalify from narwhals.translate import to_native from narwhals.translate import to_py_scalar +from narwhals.utils import generate_temporary_column_name from narwhals.utils import is_ordered_categorical from narwhals.utils import maybe_align_index from narwhals.utils import maybe_convert_dtypes @@ -66,7 +67,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.9.4" +__version__ = "1.12.1" __all__ = [ "dependencies", @@ -74,6 +75,7 @@ "concat", "from_dict", "from_arrow", + "generate_temporary_column_name", "get_level", "new_series", "to_native", diff --git 
a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 6b87f1d8d..ac845853a 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -17,7 +17,7 @@ from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation from narwhals.utils import flatten -from narwhals.utils import generate_unique_token +from narwhals.utils import generate_temporary_column_name from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop @@ -172,7 +172,7 @@ def __getitem__( ), ) -> ArrowSeries | ArrowDataFrame: if isinstance(item, tuple): - item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) + item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) # type: ignore[assignment] if isinstance(item, str): from narwhals._arrow.series import ArrowSeries @@ -335,10 +335,10 @@ def with_columns( df = self._native_frame.__class__.from_arrays(to_concat, names=output_names) return self._from_native_frame(df) - def group_by(self, *keys: str) -> ArrowGroupBy: + def group_by(self, *keys: str, drop_null_keys: bool) -> ArrowGroupBy: from narwhals._arrow.group_by import ArrowGroupBy - return ArrowGroupBy(self, list(keys)) + return ArrowGroupBy(self, list(keys), drop_null_keys=drop_null_keys) def join( self, @@ -358,7 +358,7 @@ def join( if how == "cross": plx = self.__narwhals_namespace__() - key_token = generate_unique_token( + key_token = generate_temporary_column_name( n_bytes=8, columns=[*self.columns, *other.columns] ) @@ -579,7 +579,7 @@ def is_duplicated(self: Self) -> ArrowSeries: df = self._native_frame columns = self.columns - col_token = generate_unique_token(n_bytes=8, columns=columns) + col_token = generate_temporary_column_name(n_bytes=8, columns=columns) row_count = ( df.append_column(col_token, pa.array(np.arange(len(self)))) .group_by(columns) @@ -638,7 +638,7 @@ def unique( agg_func_map = {"any": "min", "first": "min", "last": "max"} agg_func = 
agg_func_map[keep] - col_token = generate_unique_token(n_bytes=8, columns=self.columns) + col_token = generate_temporary_column_name(n_bytes=8, columns=self.columns) keep_idx = ( df.append_column(col_token, pa.array(np.arange(len(self)))) .group_by(subset) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 55c529d30..35e936d72 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -353,7 +353,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: "`nw.col('a', 'b')`\n" ) raise ValueError(msg) - tmp = df.group_by(*keys).agg(self) + tmp = df.group_by(*keys, drop_null_keys=False).agg(self) tmp = df.select(*keys).join( tmp, how="left", left_on=keys, right_on=keys, suffix="_right" ) @@ -420,6 +420,11 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr: self._expr, "dt", "convert_time_zone", time_zone ) + def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowExpr: + return reuse_series_namespace_implementation( + self._expr, "dt", "timestamp", time_unit + ) + def date(self: Self) -> ArrowExpr: return reuse_series_namespace_implementation(self._expr, "dt", "date") diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 6c7b20485..991a96a51 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -37,10 +37,15 @@ def get_function_name_option(function_name: str) -> Any | None: class ArrowGroupBy: - def __init__(self, df: ArrowDataFrame, keys: list[str]) -> None: + def __init__( + self, df: ArrowDataFrame, keys: list[str], *, drop_null_keys: bool + ) -> None: import pyarrow as pa # ignore-banned-import() - self._df = df + if drop_null_keys: + self._df = df.drop_nulls(keys) + else: + self._df = df self._keys = list(keys) self._grouped = pa.TableGroupBy(self._df._native_frame, list(self._keys)) @@ -74,11 +79,7 @@ def agg( ) def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]: - key_values = ( - self._df.select(*self._keys) - .unique(subset=self._keys, 
keep="first") - .iter_rows() - ) + key_values = self._df.select(*self._keys).unique(subset=self._keys, keep="first") nw_namespace = self._df.__narwhals_namespace__() yield from ( ( @@ -87,7 +88,7 @@ def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]: *[nw_namespace.col(k) == v for k, v in zip(self._keys, key_value)] ), ) - for key_value in key_values + for key_value in key_values.iter_rows() ) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 2eb738291..70009df43 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -12,9 +12,10 @@ from narwhals._arrow.utils import floordiv_compat from narwhals._arrow.utils import narwhals_to_native_dtype from narwhals._arrow.utils import native_to_narwhals_dtype +from narwhals._arrow.utils import parse_datetime_format from narwhals._arrow.utils import validate_column_comparand from narwhals.utils import Implementation -from narwhals.utils import generate_unique_token +from narwhals.utils import generate_temporary_column_name if TYPE_CHECKING: from types import ModuleType @@ -604,7 +605,7 @@ def is_first_distinct(self: Self) -> Self: import pyarrow.compute as pc # ignore-banned-import() row_number = pa.array(np.arange(len(self))) - col_token = generate_unique_token(n_bytes=8, columns=[self.name]) + col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name]) first_distinct_index = ( pa.Table.from_arrays([self._native_series], names=[self.name]) .append_column(col_token, row_number) @@ -621,7 +622,7 @@ def is_last_distinct(self: Self) -> Self: import pyarrow.compute as pc # ignore-banned-import() row_number = pa.array(np.arange(len(self))) - col_token = generate_unique_token(n_bytes=8, columns=[self.name]) + col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name]) last_distinct_index = ( pa.Table.from_arrays([self._native_series], names=[self.name]) .append_column(col_token, row_number) @@ -715,7 +716,7 @@ def to_arrow(self: Self) -> pa.Array: def 
mode(self: Self) -> ArrowSeries: plx = self.__narwhals_namespace__() - col_token = generate_unique_token(n_bytes=8, columns=[self.name]) + col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name]) return self.value_counts(name=col_token, normalize=False).filter( plx.col(col_token) == plx.col(col_token).max() )[self.name] @@ -780,6 +781,59 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries: return self._arrow_series._from_native_series(result) + def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowSeries: + import pyarrow as pa # ignore-banned-import + import pyarrow.compute as pc # ignore-banned-import + + s = self._arrow_series._native_series + dtype = self._arrow_series.dtype + if dtype == self._arrow_series._dtypes.Datetime: + unit = dtype.time_unit # type: ignore[attr-defined] + s_cast = s.cast(pa.int64()) + if unit == "ns": + if time_unit == "ns": + result = s_cast + elif time_unit == "us": + result = floordiv_compat(s_cast, 1_000) + else: + result = floordiv_compat(s_cast, 1_000_000) + elif unit == "us": + if time_unit == "ns": + result = pc.multiply(s_cast, 1_000) + elif time_unit == "us": + result = s_cast + else: + result = floordiv_compat(s_cast, 1_000) + elif unit == "ms": + if time_unit == "ns": + result = pc.multiply(s_cast, 1_000_000) + elif time_unit == "us": + result = pc.multiply(s_cast, 1_000) + else: + result = s_cast + elif unit == "s": + if time_unit == "ns": + result = pc.multiply(s_cast, 1_000_000_000) + elif time_unit == "us": + result = pc.multiply(s_cast, 1_000_000) + else: + result = pc.multiply(s_cast, 1_000) + else: # pragma: no cover + msg = f"unexpected time unit {unit}, please report an issue at https://github.com/narwhals-dev/narwhals" + raise AssertionError(msg) + elif dtype == self._arrow_series._dtypes.Date: + time_s = pc.multiply(s.cast(pa.int32()), 86400) + if time_unit == "ns": + result = pc.multiply(time_s, 1_000_000_000) + elif time_unit == "us": + result = 
pc.multiply(time_s, 1_000_000) + else: + result = pc.multiply(time_s, 1_000) + else: + msg = "Input should be either of Date or Datetime type" + raise TypeError(msg) + return self._arrow_series._from_native_series(result) + def date(self: Self) -> ArrowSeries: import pyarrow as pa # ignore-banned-import() @@ -1062,8 +1116,7 @@ def to_datetime(self: Self, format: str | None) -> ArrowSeries: # noqa: A002 import pyarrow.compute as pc # ignore-banned-import() if format is None: - msg = "`format` is required for pyarrow backend." - raise ValueError(msg) + format = parse_datetime_format(self._arrow_series._native_series) return self._arrow_series._from_native_series( pc.strptime(self._arrow_series._native_series, format=format, unit="us") diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 7f6fa6558..6f74294d5 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -335,3 +335,97 @@ def convert_str_slice_to_int_slice( stop = columns.index(str_slice.stop) + 1 if str_slice.stop is not None else None step = str_slice.step return (start, stop, step) + + +# Regex for date, time, separator and timezone components +DATE_RE = r"(?P\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4})" +SEP_RE = r"(?P\s|T)" +TIME_RE = r"(?P