feat(rust): Move transpose naming to Rust (#10009)

Co-authored-by: Ritchie Vink <[email protected]>
pola-rs · Jul 26, 2023 · c52e70c · c52e70c
1 parent e181a3e
commit c52e70c
Show file tree

Hide file tree

Showing 5 changed files with 102 additions and 67 deletions.
diff --git a/polars/polars-core/src/frame/row/transpose.rs b/polars/polars-core/src/frame/row/transpose.rs
@@ -1,25 +1,44 @@
+use std::borrow::Cow;
+
+use either::Either;
+
 use super::*;
 
 impl DataFrame {
-    pub(crate) fn transpose_from_dtype(&self, dtype: &DataType) -> PolarsResult<DataFrame> {
+    pub(crate) fn transpose_from_dtype(
+        &self,
+        dtype: &DataType,
+        keep_names_as: Option<&str>,
+        names_out: &[String],
+    ) -> PolarsResult<DataFrame> {
         let new_width = self.height();
         let new_height = self.width();
+        // Allocate space for the transposed columns, putting the "row names" first if needed
+        let mut cols_t = match keep_names_as {
+            None => Vec::<Series>::with_capacity(new_width),
+            Some(name) => {
+                let mut tmp = Vec::<Series>::with_capacity(new_width + 1);
+                tmp.push(Utf8Chunked::new(name, self.get_column_names()).into());
+                tmp
+            }
+        };
 
+        let cols = &self.columns;
         match dtype {
             #[cfg(feature = "dtype-i8")]
-            DataType::Int8 => numeric_transpose::<Int8Type>(&self.columns),
+            DataType::Int8 => numeric_transpose::<Int8Type>(cols, names_out, &mut cols_t),
             #[cfg(feature = "dtype-i16")]
-            DataType::Int16 => numeric_transpose::<Int16Type>(&self.columns),
-            DataType::Int32 => numeric_transpose::<Int32Type>(&self.columns),
-            DataType::Int64 => numeric_transpose::<Int64Type>(&self.columns),
+            DataType::Int16 => numeric_transpose::<Int16Type>(cols, names_out, &mut cols_t),
+            DataType::Int32 => numeric_transpose::<Int32Type>(cols, names_out, &mut cols_t),
+            DataType::Int64 => numeric_transpose::<Int64Type>(cols, names_out, &mut cols_t),
             #[cfg(feature = "dtype-u8")]
-            DataType::UInt8 => numeric_transpose::<UInt8Type>(&self.columns),
+            DataType::UInt8 => numeric_transpose::<UInt8Type>(cols, names_out, &mut cols_t),
             #[cfg(feature = "dtype-u16")]
-            DataType::UInt16 => numeric_transpose::<UInt16Type>(&self.columns),
-            DataType::UInt32 => numeric_transpose::<UInt32Type>(&self.columns),
-            DataType::UInt64 => numeric_transpose::<UInt64Type>(&self.columns),
-            DataType::Float32 => numeric_transpose::<Float32Type>(&self.columns),
-            DataType::Float64 => numeric_transpose::<Float64Type>(&self.columns),
+            DataType::UInt16 => numeric_transpose::<UInt16Type>(cols, names_out, &mut cols_t),
+            DataType::UInt32 => numeric_transpose::<UInt32Type>(cols, names_out, &mut cols_t),
+            DataType::UInt64 => numeric_transpose::<UInt64Type>(cols, names_out, &mut cols_t),
+            DataType::Float32 => numeric_transpose::<Float32Type>(cols, names_out, &mut cols_t),
+            DataType::Float64 => numeric_transpose::<Float64Type>(cols, names_out, &mut cols_t),
             #[cfg(feature = "object")]
             DataType::Object(_) => {
                 // this requires to support `Object` in Series::iter which we don't yet
@@ -52,27 +71,51 @@ impl DataFrame {
                         }
                     });
                 }
-                let cols = buffers
-                    .into_iter()
-                    .enumerate()
-                    .map(|(i, buf)| {
-                        let mut s = buf.into_series().cast(dtype).unwrap();
-                        s.rename(&format!("column_{i}"));
-                        s
-                    })
-                    .collect::<Vec<_>>();
-                Ok(DataFrame::new_no_checks(cols))
+                cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| {
+                    let mut s = buf.into_series().cast(dtype).unwrap();
+                    s.rename(name);
+                    s
+                }));
             }
-        }
+        };
+        Ok(DataFrame::new_no_checks(cols_t))
     }
 
     /// Transpose a DataFrame. This is a very expensive operation.
-    pub fn transpose(&self) -> PolarsResult<DataFrame> {
+    pub fn transpose(
+        &self,
+        keep_names_as: Option<&str>,
+        new_col_names: Option<Either<String, Vec<String>>>,
+    ) -> PolarsResult<DataFrame> {
+        let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column
+        let names_out = match new_col_names {
+            None => (0..self.height()).map(|i| format!("column_{i}")).collect(),
+            Some(cn) => match cn {
+                Either::Left(name) => {
+                    let new_names = self.column(&name).and_then(|x| x.utf8())?;
+                    polars_ensure!(!new_names.has_validity(), ComputeError: "Column with new names can't have null values");
+                    df = Cow::Owned(self.drop(&name)?);
+                    new_names
+                        .into_no_null_iter()
+                        .map(|s| s.to_owned())
+                        .collect()
+                }
+                Either::Right(names) => {
+                    polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count");
+                    names
+                }
+            },
+        };
+        if let Some(cn) = keep_names_as {
+            // Check that the column name we're using for the original column names is unique before
+            // wasting time transposing
+            polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn)
+        }
         polars_ensure!(
-            self.height() != 0 && self.width() != 0,
+            df.height() != 0 && df.width() != 0,
             NoData: "unable to transpose an empty dataframe"
         );
-        let dtype = self.get_supertype().unwrap()?;
+        let dtype = df.get_supertype().unwrap()?;
         match dtype {
             #[cfg(feature = "dtype-categorical")]
             DataType::Categorical(_) => {
@@ -97,7 +140,7 @@ impl DataFrame {
             }
             _ => {}
         }
-        self.transpose_from_dtype(&dtype)
+        df.transpose_from_dtype(&dtype, keep_names_as, &names_out)
     }
 }
 
@@ -113,9 +156,12 @@ unsafe fn add_value<T: NumericNative>(
     *el_ptr.add(row_idx) = value;
 }
 
-pub(super) fn numeric_transpose<T>(cols: &[Series]) -> PolarsResult<DataFrame>
+// Appends the transposed columns to `cols_t`, a pre-allocated vector that may already hold the column of original names.
+// Nothing is returned; the caller (`transpose_from_dtype`) builds the actual DataFrame from `cols_t`.
+pub(super) fn numeric_transpose<T>(cols: &[Series], names_out: &[String], cols_t: &mut Vec<Series>)
 where
     T: PolarsNumericType,
+    //S: AsRef<str>,
     ChunkedArray<T>: IntoSeries,
 {
     let new_width = cols[0].len();
@@ -177,12 +223,12 @@ where
         })
     });
 
-    let series = POOL.install(|| {
+    cols_t.par_extend(POOL.install(|| {
         values_buf
             .into_par_iter()
             .zip(validity_buf)
-            .enumerate()
-            .map(|(i, (mut values, validity))| {
+            .zip(names_out)
+            .map(|((mut values, validity), name)| {
                 // Safety:
                 // all values are written we can now set len
                 unsafe {
@@ -205,16 +251,12 @@ where
                     values.into(),
                     validity,
                 );
-                let name = format!("column_{i}");
                 unsafe {
-                    ChunkedArray::<T>::from_chunks(&name, vec![Box::new(arr) as ArrayRef])
+                    ChunkedArray::<T>::from_chunks(name, vec![Box::new(arr) as ArrayRef])
                         .into_series()
                 }
             })
-            .collect()
-    });
-
-    Ok(DataFrame::new_no_checks(series))
+    }));
 }
 
 #[cfg(test)]
@@ -228,7 +270,7 @@ mod test {
             "b" => [10, 20, 30],
         ]?;
 
-        let out = df.transpose()?;
+        let out = df.transpose(None, None)?;
         let expected = df![
             "column_0" => [1, 10],
             "column_1" => [2, 20],
@@ -241,7 +283,7 @@ mod test {
             "a" => [Some(1), None, Some(3)],
             "b" => [Some(10), Some(20), None],
         ]?;
-        let out = df.transpose()?;
+        let out = df.transpose(None, None)?;
         let expected = df![
             "column_0" => [1, 10],
             "column_1" => [None, Some(20)],
@@ -254,7 +296,7 @@ mod test {
             "a" => ["a", "b", "c"],
             "b" => [Some(10), Some(20), None],
         ]?;
-        let out = df.transpose()?;
+        let out = df.transpose(None, None)?;
         let expected = df![
             "column_0" => ["a", "10"],
             "column_1" => ["b", "20"],

diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock
diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml
@@ -16,6 +16,7 @@ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
 [dependencies]
 ahash = "0.8"
 ciborium = "0.2.0"
+either = "1.8"
 lexical-core = "0.8"
 # todo: unfix when compilation problem is solved
 libc = "0.2"

diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
@@ -3531,23 +3531,10 @@ def transpose(
         │ col2   ┆ 3   ┆ 4   ┆ 6   │
         └────────┴─────┴─────┴─────┘
         """
-        pydf = self._df
-        if isinstance(column_names, str):
-            pydf = self.drop(column_names)._df
-            column_names = self._df.column(column_names).to_list()
-        df = self._from_pydf(pydf.transpose(include_header, header_name))
-        if column_names is not None:
-            names = []
-            n = df.width
-            if include_header:
-                names.append(header_name)
-                n -= 1
-
-            column_names = iter(column_names)
-            for _ in range(n):
-                names.append(next(column_names))
-            df.columns = names
-        return df
+        keep_names_as = header_name if include_header else None
+        if isinstance(column_names, Generator):
+            column_names = [next(column_names) for _ in range(self.height)]
+        return self._from_pydf(self._df.transpose(keep_names_as, column_names))
 
     def reverse(self) -> DataFrame:
         """

diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs
@@ -1,6 +1,7 @@
 use std::io::BufWriter;
 use std::ops::Deref;
 
+use either::Either;
 use numpy::IntoPyArray;
 use polars::frame::row::{rows_to_schema_supertypes, Row};
 #[cfg(feature = "avro")]
@@ -1356,17 +1357,20 @@ impl PyDataFrame {
         Ok(hash.into_series().into())
     }
 
-    pub fn transpose(&self, include_header: bool, names: &str) -> PyResult<Self> {
-        let mut df = self.df.transpose().map_err(PyPolarsErr::from)?;
-        if include_header {
-            let s = Utf8Chunked::from_iter_values(
-                names,
-                self.df.get_columns().iter().map(|s| s.name()),
-            )
-            .into_series();
-            df.insert_at_idx(0, s).unwrap();
-        }
-        Ok(df.into())
+    #[pyo3(signature = (keep_names_as, column_names))]
+    pub fn transpose(&self, keep_names_as: Option<&str>, column_names: &PyAny) -> PyResult<Self> {
+        let new_col_names = if let Ok(name) = column_names.extract::<Vec<String>>() {
+            Some(Either::Right(name))
+        } else if let Ok(name) = column_names.extract::<String>() {
+            Some(Either::Left(name))
+        } else {
+            None
+        };
+        Ok(self
+            .df
+            .transpose(keep_names_as, new_col_names)
+            .map_err(PyPolarsErr::from)?
+            .into())
     }
     pub fn upsample(
         &self,