feat(rust): Move transpose naming to Rust #10009

Merged · merged 9 commits · Jul 26, 2023
Changes from 8 commits
120 changes: 81 additions & 39 deletions polars/polars-core/src/frame/row/transpose.rs
@@ -1,25 +1,44 @@
use std::borrow::Cow;

use either::Either;

use super::*;

impl DataFrame {
pub(crate) fn transpose_from_dtype(&self, dtype: &DataType) -> PolarsResult<DataFrame> {
pub(crate) fn transpose_from_dtype(
&self,
dtype: &DataType,
keep_names_as: Option<&str>,
names_out: &[String],
) -> PolarsResult<DataFrame> {
let new_width = self.height();
let new_height = self.width();
// Allocate space for the transposed columns, putting the "row names" first if needed
let mut cols_t = match keep_names_as {
None => Vec::<Series>::with_capacity(new_width),
Some(name) => {
let mut tmp = Vec::<Series>::with_capacity(new_width + 1);
tmp.push(Utf8Chunked::new(name, self.get_column_names()).into());
tmp
}
};

let cols = &self.columns;
match dtype {
#[cfg(feature = "dtype-i8")]
DataType::Int8 => numeric_transpose::<Int8Type>(&self.columns),
DataType::Int8 => numeric_transpose::<Int8Type>(cols, names_out, &mut cols_t),
#[cfg(feature = "dtype-i16")]
DataType::Int16 => numeric_transpose::<Int16Type>(&self.columns),
DataType::Int32 => numeric_transpose::<Int32Type>(&self.columns),
DataType::Int64 => numeric_transpose::<Int64Type>(&self.columns),
DataType::Int16 => numeric_transpose::<Int16Type>(cols, names_out, &mut cols_t),
DataType::Int32 => numeric_transpose::<Int32Type>(cols, names_out, &mut cols_t),
DataType::Int64 => numeric_transpose::<Int64Type>(cols, names_out, &mut cols_t),
#[cfg(feature = "dtype-u8")]
DataType::UInt8 => numeric_transpose::<UInt8Type>(&self.columns),
DataType::UInt8 => numeric_transpose::<UInt8Type>(cols, names_out, &mut cols_t),
#[cfg(feature = "dtype-u16")]
DataType::UInt16 => numeric_transpose::<UInt16Type>(&self.columns),
DataType::UInt32 => numeric_transpose::<UInt32Type>(&self.columns),
DataType::UInt64 => numeric_transpose::<UInt64Type>(&self.columns),
DataType::Float32 => numeric_transpose::<Float32Type>(&self.columns),
DataType::Float64 => numeric_transpose::<Float64Type>(&self.columns),
DataType::UInt16 => numeric_transpose::<UInt16Type>(cols, names_out, &mut cols_t),
DataType::UInt32 => numeric_transpose::<UInt32Type>(cols, names_out, &mut cols_t),
DataType::UInt64 => numeric_transpose::<UInt64Type>(cols, names_out, &mut cols_t),
DataType::Float32 => numeric_transpose::<Float32Type>(cols, names_out, &mut cols_t),
DataType::Float64 => numeric_transpose::<Float64Type>(cols, names_out, &mut cols_t),
#[cfg(feature = "object")]
DataType::Object(_) => {
// this requires to support `Object` in Series::iter which we don't yet
@@ -52,27 +71,51 @@ impl DataFrame {
}
});
}
let cols = buffers
.into_iter()
.enumerate()
.map(|(i, buf)| {
let mut s = buf.into_series().cast(dtype).unwrap();
s.rename(&format!("column_{i}"));
s
})
.collect::<Vec<_>>();
Ok(DataFrame::new_no_checks(cols))
cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| {
let mut s = buf.into_series().cast(dtype).unwrap();
s.rename(name);
s
}));
}
}
};
Ok(DataFrame::new_no_checks(cols_t))
}

/// Transpose a DataFrame. This is a very expensive operation.
pub fn transpose(&self) -> PolarsResult<DataFrame> {
pub fn transpose(
&self,
keep_names_as: Option<&str>,
new_col_names: Option<Either<String, Vec<String>>>,
) -> PolarsResult<DataFrame> {
let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column
let names_out = match new_col_names {
None => (0..self.height()).map(|i| format!("column_{i}")).collect(),
Some(cn) => match cn {
Either::Left(name) => {
let new_names = self.column(&name).and_then(|x| x.utf8())?;
polars_ensure!(!new_names.has_validity(), ComputeError: "Column with new names can't have null values");
df = Cow::Owned(self.drop(&name)?);
new_names
Member:
If we have new_names scoped above, we can collect into a Vec<&str> and we don't need to heap allocate the strings.

Contributor Author:
How do I make that work with the other branch that generates names by formatting? I couldn't figure out how to end up with &str instead of String because of that.

Member:
Ah, right. Yeah, from_dtypes accepts &[String]. Let's leave that one for now.

Contributor Author:
What I had initially tried to do was return an iterator in these blocks that could be any AsRef<str> so we wouldn't have to allocate when either generating column names or taking them from an existing column. But I wasn't able to get that to work. If you have any ideas that would be helpful. I assumed that this operation is so expensive you'd generally be allocating a few thousand strings at most, but who knows.
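
Below is a minimal, standalone sketch of the `AsRef<str>` idea discussed in this thread; it is not code from this PR, and `rename_all` and the sample data are made up for illustration. A function generic over `S: AsRef<str>` can take either freshly formatted `String` names or borrowed `&str` names from an existing column without converting between them:

```rust
// Hypothetical helper (not from this PR), generic over S: AsRef<str>, so callers
// can pass either owned Strings (e.g. generated "column_{i}" names) or borrowed
// &str values read from an existing column, without converting between them.
fn rename_all<S: AsRef<str>>(names: &[S]) {
    for name in names {
        // as_ref() yields a &str whether S is String or &str.
        println!("renaming to {}", name.as_ref());
    }
}

fn main() {
    let generated: Vec<String> = (0..3).map(|i| format!("column_{i}")).collect();
    let borrowed: Vec<&str> = vec!["a", "b", "c"];
    rename_all(&generated); // owned names
    rename_all(&borrowed); // borrowed names, no extra String allocation
}
```

The sticking point in the PR is that both branches have to produce the same concrete collection to hand to `transpose_from_dtype`, which currently takes `&[String]`.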

.into_no_null_iter()
.map(|s| s.to_owned())
.collect()
}
Either::Right(names) => {
polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count");
names
}
},
};
if let Some(cn) = keep_names_as {
// Check that the column name we're using for the original column names is unique before
// wasting time transposing
polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn)
}
polars_ensure!(
self.height() != 0 && self.width() != 0,
df.height() != 0 && df.width() != 0,
NoData: "unable to transpose an empty dataframe"
);
let dtype = self.get_supertype().unwrap()?;
let dtype = df.get_supertype().unwrap()?;
match dtype {
#[cfg(feature = "dtype-categorical")]
DataType::Categorical(_) => {
@@ -97,7 +140,7 @@
}
_ => {}
}
self.transpose_from_dtype(&dtype)
df.transpose_from_dtype(&dtype, keep_names_as, &names_out)
}
}

@@ -113,9 +156,12 @@ unsafe fn add_value<T: NumericNative>(
*el_ptr.add(row_idx) = value;
}

pub(super) fn numeric_transpose<T>(cols: &[Series]) -> PolarsResult<DataFrame>
// This just fills a pre-allocated mutable series vector, which may have a name column.
// Nothing is returned and the actual DataFrame is constructed above.
pub(super) fn numeric_transpose<T>(cols: &[Series], names_out: &[String], cols_t: &mut Vec<Series>)
where
T: PolarsNumericType,
//S: AsRef<str>,
ChunkedArray<T>: IntoSeries,
{
let new_width = cols[0].len();
@@ -177,12 +223,12 @@ where
})
});

let series = POOL.install(|| {
cols_t.par_extend(POOL.install(|| {
values_buf
.into_par_iter()
.zip(validity_buf)
.enumerate()
.map(|(i, (mut values, validity))| {
.zip(names_out)
.map(|((mut values, validity), name)| {
// Safety:
// all values are written we can now set len
unsafe {
@@ -205,16 +251,12 @@
values.into(),
validity,
);
let name = format!("column_{i}");
unsafe {
ChunkedArray::<T>::from_chunks(&name, vec![Box::new(arr) as ArrayRef])
ChunkedArray::<T>::from_chunks(name, vec![Box::new(arr) as ArrayRef])
.into_series()
}
})
.collect()
});

Ok(DataFrame::new_no_checks(series))
}));
}

#[cfg(test)]
@@ -228,7 +270,7 @@ mod test {
"b" => [10, 20, 30],
]?;

let out = df.transpose()?;
let out = df.transpose(None, None)?;
let expected = df![
"column_0" => [1, 10],
"column_1" => [2, 20],
@@ -241,7 +283,7 @@
"a" => [Some(1), None, Some(3)],
"b" => [Some(10), Some(20), None],
]?;
let out = df.transpose()?;
let out = df.transpose(None, None)?;
let expected = df![
"column_0" => [1, 10],
"column_1" => [None, Some(20)],
@@ -254,7 +296,7 @@
"a" => ["a", "b", "c"],
"b" => [Some(10), Some(20), None],
]?;
let out = df.transpose()?;
let out = df.transpose(None, None)?;
let expected = df![
"column_0" => ["a", "10"],
"column_1" => ["b", "20"],
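As a usage note, here is a hedged sketch of calling the new Rust signature with non-default arguments (the tests above only pass `None, None`). The imports, column names, and data are assumptions for illustration and are not part of this PR:

```rust
use either::Either;
use polars_core::df;
use polars_core::prelude::*;

fn transpose_with_names() -> PolarsResult<DataFrame> {
    // Illustrative frame: "id" holds the desired names for the transposed columns.
    let df = df![
        "id" => ["x", "y"],
        "a" => [1, 2],
        "b" => [10, 20],
    ]?;
    // Keep the original column names ("a", "b") under a new "header" column and
    // take the new column names from the "id" column, which is dropped first.
    df.transpose(Some("header"), Some(Either::Left("id".into())))
}
```

With this input, the output should have a `header` column holding `["a", "b"]` followed by data columns named `x` and `y`.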
1 change: 1 addition & 0 deletions py-polars/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions py-polars/Cargo.toml
@@ -16,6 +16,7 @@ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
[dependencies]
ahash = "0.8"
ciborium = "0.2.0"
either = "1.8"
lexical-core = "0.8"
# todo: unfix when compilation problem is solved
libc = "0.2"
21 changes: 4 additions & 17 deletions py-polars/polars/dataframe/frame.py
@@ -3547,23 +3547,10 @@ def transpose(
│ col2 ┆ 3 ┆ 4 ┆ 6 │
└────────┴─────┴─────┴─────┘
"""
pydf = self._df
if isinstance(column_names, str):
pydf = self.drop(column_names)._df
column_names = self._df.column(column_names).to_list()
df = self._from_pydf(pydf.transpose(include_header, header_name))
if column_names is not None:
names = []
n = df.width
if include_header:
names.append(header_name)
n -= 1

column_names = iter(column_names)
for _ in range(n):
names.append(next(column_names))
df.columns = names
return df
keep_names_as = header_name if include_header else None
if isinstance(column_names, Generator):
column_names = [next(column_names) for _ in range(self.height)]
return self._from_pydf(self._df.transpose(keep_names_as, column_names))

def reverse(self) -> DataFrame:
"""
27 changes: 16 additions & 11 deletions py-polars/src/dataframe.rs
@@ -1,6 +1,7 @@
use std::io::BufWriter;
use std::ops::Deref;

use either::Either;
use numpy::IntoPyArray;
use polars::frame::row::{rows_to_schema_supertypes, Row};
#[cfg(feature = "avro")]
@@ -1356,17 +1357,21 @@
Ok(hash.into_series().into())
}

pub fn transpose(&self, include_header: bool, names: &str) -> PyResult<Self> {
let mut df = self.df.transpose().map_err(PyPolarsErr::from)?;
if include_header {
let s = Utf8Chunked::from_iter_values(
names,
self.df.get_columns().iter().map(|s| s.name()),
)
.into_series();
df.insert_at_idx(0, s).unwrap();
}
Ok(df.into())
#[pyo3(signature = (keep_names_as, column_names))]
pub fn transpose(&self, keep_names_as: Option<&str>, column_names: &PyAny) -> PyResult<Self> {
// It doesn't automatically translate Python types to Either :-(
let new_col_names = if let Ok(name) = column_names.extract::<Vec<String>>() {
Some(Either::Right(name))
} else if let Ok(name) = column_names.extract::<String>() {
Some(Either::Left(name))
} else {
None
};
Ok(self
.df
.transpose(keep_names_as, new_col_names)
.map_err(PyPolarsErr::from)?
.into())
}
pub fn upsample(
&self,