Skip to content

Commit

Permalink
feat(rust): Move transpose naming to Rust (#10009)
Browse files Browse the repository at this point in the history
Co-authored-by: Ritchie Vink <[email protected]>
  • Loading branch information
magarick and ritchie46 authored Jul 26, 2023
1 parent e181a3e commit c52e70c
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 67 deletions.
120 changes: 81 additions & 39 deletions polars/polars-core/src/frame/row/transpose.rs
Original file line number Diff line number Diff line change
@@ -1,25 +1,44 @@
use std::borrow::Cow;

use either::Either;

use super::*;

impl DataFrame {
pub(crate) fn transpose_from_dtype(&self, dtype: &DataType) -> PolarsResult<DataFrame> {
pub(crate) fn transpose_from_dtype(
&self,
dtype: &DataType,
keep_names_as: Option<&str>,
names_out: &[String],
) -> PolarsResult<DataFrame> {
let new_width = self.height();
let new_height = self.width();
// Allocate space for the transposed columns, putting the "row names" first if needed
let mut cols_t = match keep_names_as {
None => Vec::<Series>::with_capacity(new_width),
Some(name) => {
let mut tmp = Vec::<Series>::with_capacity(new_width + 1);
tmp.push(Utf8Chunked::new(name, self.get_column_names()).into());
tmp
}
};

let cols = &self.columns;
match dtype {
#[cfg(feature = "dtype-i8")]
DataType::Int8 => numeric_transpose::<Int8Type>(&self.columns),
DataType::Int8 => numeric_transpose::<Int8Type>(cols, names_out, &mut cols_t),
#[cfg(feature = "dtype-i16")]
DataType::Int16 => numeric_transpose::<Int16Type>(&self.columns),
DataType::Int32 => numeric_transpose::<Int32Type>(&self.columns),
DataType::Int64 => numeric_transpose::<Int64Type>(&self.columns),
DataType::Int16 => numeric_transpose::<Int16Type>(cols, names_out, &mut cols_t),
DataType::Int32 => numeric_transpose::<Int32Type>(cols, names_out, &mut cols_t),
DataType::Int64 => numeric_transpose::<Int64Type>(cols, names_out, &mut cols_t),
#[cfg(feature = "dtype-u8")]
DataType::UInt8 => numeric_transpose::<UInt8Type>(&self.columns),
DataType::UInt8 => numeric_transpose::<UInt8Type>(cols, names_out, &mut cols_t),
#[cfg(feature = "dtype-u16")]
DataType::UInt16 => numeric_transpose::<UInt16Type>(&self.columns),
DataType::UInt32 => numeric_transpose::<UInt32Type>(&self.columns),
DataType::UInt64 => numeric_transpose::<UInt64Type>(&self.columns),
DataType::Float32 => numeric_transpose::<Float32Type>(&self.columns),
DataType::Float64 => numeric_transpose::<Float64Type>(&self.columns),
DataType::UInt16 => numeric_transpose::<UInt16Type>(cols, names_out, &mut cols_t),
DataType::UInt32 => numeric_transpose::<UInt32Type>(cols, names_out, &mut cols_t),
DataType::UInt64 => numeric_transpose::<UInt64Type>(cols, names_out, &mut cols_t),
DataType::Float32 => numeric_transpose::<Float32Type>(cols, names_out, &mut cols_t),
DataType::Float64 => numeric_transpose::<Float64Type>(cols, names_out, &mut cols_t),
#[cfg(feature = "object")]
DataType::Object(_) => {
// this requires `Object` support in `Series::iter`, which we don't have yet
Expand Down Expand Up @@ -52,27 +71,51 @@ impl DataFrame {
}
});
}
let cols = buffers
.into_iter()
.enumerate()
.map(|(i, buf)| {
let mut s = buf.into_series().cast(dtype).unwrap();
s.rename(&format!("column_{i}"));
s
})
.collect::<Vec<_>>();
Ok(DataFrame::new_no_checks(cols))
cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| {
let mut s = buf.into_series().cast(dtype).unwrap();
s.rename(name);
s
}));
}
}
};
Ok(DataFrame::new_no_checks(cols_t))
}

/// Transpose a DataFrame. This is a very expensive operation.
pub fn transpose(&self) -> PolarsResult<DataFrame> {
pub fn transpose(
&self,
keep_names_as: Option<&str>,
new_col_names: Option<Either<String, Vec<String>>>,
) -> PolarsResult<DataFrame> {
let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column
let names_out = match new_col_names {
None => (0..self.height()).map(|i| format!("column_{i}")).collect(),
Some(cn) => match cn {
Either::Left(name) => {
let new_names = self.column(&name).and_then(|x| x.utf8())?;
polars_ensure!(!new_names.has_validity(), ComputeError: "Column with new names can't have null values");
df = Cow::Owned(self.drop(&name)?);
new_names
.into_no_null_iter()
.map(|s| s.to_owned())
.collect()
}
Either::Right(names) => {
polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count");
names
}
},
};
if let Some(cn) = keep_names_as {
// Check that the column name we're using for the original column names is unique before
// wasting time transposing
polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn)
}
polars_ensure!(
self.height() != 0 && self.width() != 0,
df.height() != 0 && df.width() != 0,
NoData: "unable to transpose an empty dataframe"
);
let dtype = self.get_supertype().unwrap()?;
let dtype = df.get_supertype().unwrap()?;
match dtype {
#[cfg(feature = "dtype-categorical")]
DataType::Categorical(_) => {
Expand All @@ -97,7 +140,7 @@ impl DataFrame {
}
_ => {}
}
self.transpose_from_dtype(&dtype)
df.transpose_from_dtype(&dtype, keep_names_as, &names_out)
}
}

Expand All @@ -113,9 +156,12 @@ unsafe fn add_value<T: NumericNative>(
*el_ptr.add(row_idx) = value;
}

pub(super) fn numeric_transpose<T>(cols: &[Series]) -> PolarsResult<DataFrame>
// This just fills a pre-allocated mutable series vector, which may have a name column.
// Nothing is returned and the actual DataFrame is constructed above.
pub(super) fn numeric_transpose<T>(cols: &[Series], names_out: &[String], cols_t: &mut Vec<Series>)
where
T: PolarsNumericType,
//S: AsRef<str>,
ChunkedArray<T>: IntoSeries,
{
let new_width = cols[0].len();
Expand Down Expand Up @@ -177,12 +223,12 @@ where
})
});

let series = POOL.install(|| {
cols_t.par_extend(POOL.install(|| {
values_buf
.into_par_iter()
.zip(validity_buf)
.enumerate()
.map(|(i, (mut values, validity))| {
.zip(names_out)
.map(|((mut values, validity), name)| {
// Safety:
// all values are written we can now set len
unsafe {
Expand All @@ -205,16 +251,12 @@ where
values.into(),
validity,
);
let name = format!("column_{i}");
unsafe {
ChunkedArray::<T>::from_chunks(&name, vec![Box::new(arr) as ArrayRef])
ChunkedArray::<T>::from_chunks(name, vec![Box::new(arr) as ArrayRef])
.into_series()
}
})
.collect()
});

Ok(DataFrame::new_no_checks(series))
}));
}

#[cfg(test)]
Expand All @@ -228,7 +270,7 @@ mod test {
"b" => [10, 20, 30],
]?;

let out = df.transpose()?;
let out = df.transpose(None, None)?;
let expected = df![
"column_0" => [1, 10],
"column_1" => [2, 20],
Expand All @@ -241,7 +283,7 @@ mod test {
"a" => [Some(1), None, Some(3)],
"b" => [Some(10), Some(20), None],
]?;
let out = df.transpose()?;
let out = df.transpose(None, None)?;
let expected = df![
"column_0" => [1, 10],
"column_1" => [None, Some(20)],
Expand All @@ -254,7 +296,7 @@ mod test {
"a" => ["a", "b", "c"],
"b" => [Some(10), Some(20), None],
]?;
let out = df.transpose()?;
let out = df.transpose(None, None)?;
let expected = df![
"column_0" => ["a", "10"],
"column_1" => ["b", "20"],
Expand Down
1 change: 1 addition & 0 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
[dependencies]
ahash = "0.8"
ciborium = "0.2.0"
either = "1.8"
lexical-core = "0.8"
# todo: unfix when compilation problem is solved
libc = "0.2"
Expand Down
21 changes: 4 additions & 17 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3531,23 +3531,10 @@ def transpose(
│ col2 ┆ 3 ┆ 4 ┆ 6 │
└────────┴─────┴─────┴─────┘
"""
pydf = self._df
if isinstance(column_names, str):
pydf = self.drop(column_names)._df
column_names = self._df.column(column_names).to_list()
df = self._from_pydf(pydf.transpose(include_header, header_name))
if column_names is not None:
names = []
n = df.width
if include_header:
names.append(header_name)
n -= 1

column_names = iter(column_names)
for _ in range(n):
names.append(next(column_names))
df.columns = names
return df
keep_names_as = header_name if include_header else None
if isinstance(column_names, Generator):
column_names = [next(column_names) for _ in range(self.height)]
return self._from_pydf(self._df.transpose(keep_names_as, column_names))

def reverse(self) -> DataFrame:
"""
Expand Down
26 changes: 15 additions & 11 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::io::BufWriter;
use std::ops::Deref;

use either::Either;
use numpy::IntoPyArray;
use polars::frame::row::{rows_to_schema_supertypes, Row};
#[cfg(feature = "avro")]
Expand Down Expand Up @@ -1356,17 +1357,20 @@ impl PyDataFrame {
Ok(hash.into_series().into())
}

pub fn transpose(&self, include_header: bool, names: &str) -> PyResult<Self> {
let mut df = self.df.transpose().map_err(PyPolarsErr::from)?;
if include_header {
let s = Utf8Chunked::from_iter_values(
names,
self.df.get_columns().iter().map(|s| s.name()),
)
.into_series();
df.insert_at_idx(0, s).unwrap();
}
Ok(df.into())
/// Transpose the wrapped DataFrame.
///
/// * `keep_names_as` - when `Some`, the original column names are kept as an
///   extra leading column with this name; when `None` they are discarded.
/// * `column_names` - a Python object selecting the output column names:
///   a `str` is forwarded as `Either::Left` (a single name), a sequence of
///   `str` is forwarded as `Either::Right` (an explicit list of names), and
///   anything else (including `None`) is forwarded as `None`.
///
/// Errors raised by the underlying `DataFrame::transpose` are converted
/// through `PyPolarsErr`.
#[pyo3(signature = (keep_names_as, column_names))]
pub fn transpose(&self, keep_names_as: Option<&str>, column_names: &PyAny) -> PyResult<Self> {
    // Probe for a plain string *before* probing for a sequence. A Python
    // `str` is itself a sequence of one-character strings, so checking
    // `Vec<String>` first only classifies it correctly as long as the
    // binding layer refuses `str` -> `Vec` extraction. Checking `String`
    // first gives the same result without depending on that refusal; a
    // list of strings never extracts as `String`, so it still reaches the
    // second branch.
    let new_col_names = if let Ok(name) = column_names.extract::<String>() {
        Some(Either::Left(name))
    } else if let Ok(names) = column_names.extract::<Vec<String>>() {
        Some(Either::Right(names))
    } else {
        // NOTE(review): unsupported argument types are silently treated the
        // same as `None` here rather than rejected - confirm this is intended.
        None
    };

    let transposed = self
        .df
        .transpose(keep_names_as, new_col_names)
        .map_err(PyPolarsErr::from)?;
    Ok(transposed.into())
}
pub fn upsample(
&self,
Expand Down

0 comments on commit c52e70c

Please sign in to comment.