diff --git a/polars/polars-core/src/frame/row/transpose.rs b/polars/polars-core/src/frame/row/transpose.rs index 153456819dc5..79d7a146c8fd 100644 --- a/polars/polars-core/src/frame/row/transpose.rs +++ b/polars/polars-core/src/frame/row/transpose.rs @@ -1,25 +1,44 @@ +use std::borrow::Cow; + +use either::Either; + use super::*; impl DataFrame { - pub(crate) fn transpose_from_dtype(&self, dtype: &DataType) -> PolarsResult { + pub(crate) fn transpose_from_dtype( + &self, + dtype: &DataType, + keep_names_as: Option<&str>, + names_out: &[String], + ) -> PolarsResult { let new_width = self.height(); let new_height = self.width(); + // Allocate space for the transposed columns, putting the "row names" first if needed + let mut cols_t = match keep_names_as { + None => Vec::::with_capacity(new_width), + Some(name) => { + let mut tmp = Vec::::with_capacity(new_width + 1); + tmp.push(Utf8Chunked::new(name, self.get_column_names()).into()); + tmp + } + }; + let cols = &self.columns; match dtype { #[cfg(feature = "dtype-i8")] - DataType::Int8 => numeric_transpose::(&self.columns), + DataType::Int8 => numeric_transpose::(cols, names_out, &mut cols_t), #[cfg(feature = "dtype-i16")] - DataType::Int16 => numeric_transpose::(&self.columns), - DataType::Int32 => numeric_transpose::(&self.columns), - DataType::Int64 => numeric_transpose::(&self.columns), + DataType::Int16 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::Int32 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::Int64 => numeric_transpose::(cols, names_out, &mut cols_t), #[cfg(feature = "dtype-u8")] - DataType::UInt8 => numeric_transpose::(&self.columns), + DataType::UInt8 => numeric_transpose::(cols, names_out, &mut cols_t), #[cfg(feature = "dtype-u16")] - DataType::UInt16 => numeric_transpose::(&self.columns), - DataType::UInt32 => numeric_transpose::(&self.columns), - DataType::UInt64 => numeric_transpose::(&self.columns), - DataType::Float32 => numeric_transpose::(&self.columns), - DataType::Float64 => numeric_transpose::(&self.columns), + DataType::UInt16 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::UInt32 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::UInt64 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::Float32 => numeric_transpose::(cols, names_out, &mut cols_t), + DataType::Float64 => numeric_transpose::(cols, names_out, &mut cols_t), #[cfg(feature = "object")] DataType::Object(_) => { // this requires to support `Object` in Series::iter which we don't yet @@ -52,27 +71,51 @@ impl DataFrame { } }); } - let cols = buffers - .into_iter() - .enumerate() - .map(|(i, buf)| { - let mut s = buf.into_series().cast(dtype).unwrap(); - s.rename(&format!("column_{i}")); - s - }) - .collect::>(); - Ok(DataFrame::new_no_checks(cols)) + cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| { + let mut s = buf.into_series().cast(dtype).unwrap(); + s.rename(name); + s + })); } - } + }; + Ok(DataFrame::new_no_checks(cols_t)) } /// Transpose a DataFrame. This is a very expensive operation. - pub fn transpose(&self) -> PolarsResult { + pub fn transpose( + &self, + keep_names_as: Option<&str>, + new_col_names: Option>>, + ) -> PolarsResult { + let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column + let names_out = match new_col_names { + None => (0..self.height()).map(|i| format!("column_{i}")).collect(), + Some(cn) => match cn { + Either::Left(name) => { + let new_names = self.column(&name).and_then(|x| x.utf8())?; + polars_ensure!(!new_names.has_validity(), ComputeError: "Column with new names can't have null values"); + df = Cow::Owned(self.drop(&name)?); + new_names + .into_no_null_iter() + .map(|s| s.to_owned()) + .collect() + } + Either::Right(names) => { + polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count"); + names + } + }, + }; + if let Some(cn) = keep_names_as { + // Check that the column name we're using for the original column names is unique before + // wasting time transposing + polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn) + } polars_ensure!( - self.height() != 0 && self.width() != 0, + df.height() != 0 && df.width() != 0, NoData: "unable to transpose an empty dataframe" ); - let dtype = self.get_supertype().unwrap()?; + let dtype = df.get_supertype().unwrap()?; match dtype { #[cfg(feature = "dtype-categorical")] DataType::Categorical(_) => { @@ -97,7 +140,7 @@ impl DataFrame { } _ => {} } - self.transpose_from_dtype(&dtype) + df.transpose_from_dtype(&dtype, keep_names_as, &names_out) } } @@ -113,9 +156,12 @@ unsafe fn add_value( *el_ptr.add(row_idx) = value; } -pub(super) fn numeric_transpose(cols: &[Series]) -> PolarsResult +// This just fills a pre-allocated mutable series vector, which may have a name column. +// Nothing is returned and the actual DataFrame is constructed above. +pub(super) fn numeric_transpose(cols: &[Series], names_out: &[String], cols_t: &mut Vec) where T: PolarsNumericType, + //S: AsRef, ChunkedArray: IntoSeries, { let new_width = cols[0].len(); @@ -177,12 +223,12 @@ where }) }); - let series = POOL.install(|| { + cols_t.par_extend(POOL.install(|| { values_buf .into_par_iter() .zip(validity_buf) - .enumerate() - .map(|(i, (mut values, validity))| { + .zip(names_out) + .map(|((mut values, validity), name)| { // Safety: // all values are written we can now set len unsafe { @@ -205,16 +251,12 @@ where values.into(), validity, ); - let name = format!("column_{i}"); unsafe { - ChunkedArray::::from_chunks(&name, vec![Box::new(arr) as ArrayRef]) + ChunkedArray::::from_chunks(name, vec![Box::new(arr) as ArrayRef]) .into_series() } }) - .collect() - }); - - Ok(DataFrame::new_no_checks(series)) + })); } #[cfg(test)] @@ -228,7 +270,7 @@ mod test { "b" => [10, 20, 30], ]?; - let out = df.transpose()?; + let out = df.transpose(None, None)?; let expected = df![ "column_0" => [1, 10], "column_1" => [2, 20], @@ -241,7 +283,7 @@ mod test { "a" => [Some(1), None, Some(3)], "b" => [Some(10), Some(20), None], ]?; - let out = df.transpose()?; + let out = df.transpose(None, None)?; let expected = df![ "column_0" => [1, 10], "column_1" => [None, Some(20)], @@ -254,7 +296,7 @@ mod test { "a" => ["a", "b", "c"], "b" => [Some(10), Some(20), None], ]?; - let out = df.transpose()?; + let out = df.transpose(None, None)?; let expected = df![ "column_0" => ["a", "10"], "column_1" => ["b", "20"], diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 5e13dd688731..e10d2fdbe6d3 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -1724,6 +1724,7 @@ dependencies = [ "ahash", "built", "ciborium", + "either", "jemallocator", "lexical-core", "libc", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 1b4f6161061a..5eaf86dc0b47 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -16,6 +16,7 @@ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] } [dependencies] ahash = "0.8" ciborium = "0.2.0" +either = "1.8" lexical-core = "0.8" # todo: unfix when compilation problem is solved libc = "0.2" diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 01322edc25f9..9e914a180040 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3531,23 +3531,10 @@ def transpose( │ col2 ┆ 3 ┆ 4 ┆ 6 │ └────────┴─────┴─────┴─────┘ """ - pydf = self._df - if isinstance(column_names, str): - pydf = self.drop(column_names)._df - column_names = self._df.column(column_names).to_list() - df = self._from_pydf(pydf.transpose(include_header, header_name)) - if column_names is not None: - names = [] - n = df.width - if include_header: - names.append(header_name) - n -= 1 - - column_names = iter(column_names) - for _ in range(n): - names.append(next(column_names)) - df.columns = names - return df + keep_names_as = header_name if include_header else None + if isinstance(column_names, Generator): + column_names = [next(column_names) for _ in range(self.height)] + return self._from_pydf(self._df.transpose(keep_names_as, column_names)) def reverse(self) -> DataFrame: """ diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 11aeb760cb60..ccc6c3fb639f 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -1,6 +1,7 @@ use std::io::BufWriter; use std::ops::Deref; +use either::Either; use numpy::IntoPyArray; use polars::frame::row::{rows_to_schema_supertypes, Row}; #[cfg(feature = "avro")] @@ -1356,17 +1357,20 @@ impl PyDataFrame { Ok(hash.into_series().into()) } - pub fn transpose(&self, include_header: bool, names: &str) -> PyResult { - let mut df = self.df.transpose().map_err(PyPolarsErr::from)?; - if include_header { - let s = Utf8Chunked::from_iter_values( - names, - self.df.get_columns().iter().map(|s| s.name()), - ) - .into_series(); - df.insert_at_idx(0, s).unwrap(); - } - Ok(df.into()) + #[pyo3(signature = (keep_names_as, column_names))] + pub fn transpose(&self, keep_names_as: Option<&str>, column_names: &PyAny) -> PyResult { + let new_col_names = if let Ok(name) = column_names.extract::>() { + Some(Either::Right(name)) + } else if let Ok(name) = column_names.extract::() { + Some(Either::Left(name)) + } else { + None + }; + Ok(self + .df + .transpose(keep_names_as, new_col_names) + .map_err(PyPolarsErr::from)? + .into()) } pub fn upsample( &self,