Skip to content

Commit

Permalink
fix(rust): non-compliant Parquet list element name
Browse files Browse the repository at this point in the history
Fixes #17100.
  • Loading branch information
coastalwhite committed Jul 23, 2024
1 parent 6f8b478 commit 398f781
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 60 deletions.
53 changes: 29 additions & 24 deletions crates/polars-parquet/src/arrow/write/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,24 @@ fn convert_field(field: Field) -> Field {
}

fn convert_data_type(data_type: ArrowDataType) -> ArrowDataType {
use ArrowDataType::*;
use ArrowDataType as D;
match data_type {
LargeList(field) => LargeList(Box::new(convert_field(*field))),
Struct(mut fields) => {
D::LargeList(field) => D::LargeList(Box::new(convert_field(*field))),
D::Struct(mut fields) => {
for field in &mut fields {
*field = convert_field(std::mem::take(field))
}
Struct(fields)
D::Struct(fields)
},
BinaryView => LargeBinary,
Utf8View => LargeUtf8,
Dictionary(it, data_type, sorted) => {
D::BinaryView => D::LargeBinary,
D::Utf8View => D::LargeUtf8,
D::Dictionary(it, data_type, sorted) => {
let dtype = convert_data_type(*data_type);
Dictionary(it, Box::new(dtype), sorted)
D::Dictionary(it, Box::new(dtype), sorted)
},
Extension(name, data_type, metadata) => {
D::Extension(name, data_type, metadata) => {
let data_type = convert_data_type(*data_type);
Extension(name, Box::new(data_type), metadata)
D::Extension(name, Box::new(data_type), metadata)
},
dt => dt,
}
Expand Down Expand Up @@ -390,21 +390,26 @@ pub fn to_parquet_type(field: &Field) -> PolarsResult<ParquetType> {
)?),
ArrowDataType::List(f)
| ArrowDataType::FixedSizeList(f, _)
| ArrowDataType::LargeList(f) => Ok(ParquetType::from_group(
name,
repetition,
Some(GroupConvertedType::List),
Some(GroupLogicalType::List),
vec![ParquetType::from_group(
"list".to_string(),
Repetition::Repeated,
None,
None,
vec![to_parquet_type(f)?],
| ArrowDataType::LargeList(f) => {
let mut f = f.clone();
f.name = "element".to_string();

Ok(ParquetType::from_group(
name,
repetition,
Some(GroupConvertedType::List),
Some(GroupLogicalType::List),
vec![ParquetType::from_group(
"list".to_string(),
Repetition::Repeated,
None,
None,
vec![to_parquet_type(&f)?],
None,
)],
None,
)],
None,
)),
))
},
ArrowDataType::Map(f, _) => Ok(ParquetType::from_group(
name,
repetition,
Expand Down
65 changes: 29 additions & 36 deletions crates/polars-parquet/src/parquet/schema/types/converted_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,15 @@ pub enum PrimitiveConvertedType {
Enum,
/// A decimal value.
///
/// This may be used to annotate binary or fixed primitive types. The
/// underlying byte array stores the unscaled value encoded as two's
/// complement using big-endian byte order (the most significant byte is the
/// zeroth element). The value of the decimal is the value * 10^{-scale}.
/// This may be used to annotate binary or fixed primitive types. The underlying byte array
/// stores the unscaled value encoded as two's complement using big-endian byte order (the most
/// significant byte is the zeroth element). The value of the decimal is the value *
/// 10^{-scale}.
///
/// This must be accompanied by a (maximum) precision and a scale in the
/// SchemaElement. The precision specifies the number of digits in the decimal
/// and the scale stores the location of the decimal point. For example 1.23
/// would have precision 3 (3 total digits) and scale 2 (the decimal point is
/// 2 digits over).
/// This must be accompanied by a (maximum) precision and a scale in the SchemaElement. The
/// precision specifies the number of digits in the decimal and the scale stores the location
/// of the decimal point. For example 1.23 would have precision 3 (3 total digits) and scale 2
/// (the decimal point is 2 digits over).
// (precision, scale)
Decimal(usize, usize),
/// A Date
Expand All @@ -31,41 +30,38 @@ pub enum PrimitiveConvertedType {
Date,
/// A time
///
/// The total number of milliseconds since midnight. The value is stored
/// as an INT32 physical type.
/// The total number of milliseconds since midnight. The value is stored as an INT32 physical
/// type.
TimeMillis,
/// A time.
///
/// The total number of microseconds since midnight. The value is stored as
/// an INT64 physical type.
/// The total number of microseconds since midnight. The value is stored as an INT64 physical
/// type.
TimeMicros,
/// A date/time combination
///
/// Date and time recorded as milliseconds since the Unix epoch. Recorded as
/// a physical type of INT64.
/// Date and time recorded as milliseconds since the Unix epoch. Recorded as a physical type
/// of INT64.
TimestampMillis,
/// A date/time combination
///
/// Date and time recorded as microseconds since the Unix epoch. The value is
/// stored as an INT64 physical type.
/// Date and time recorded as microseconds since the Unix epoch. The value is stored as an
/// INT64 physical type.
TimestampMicros,
/// An unsigned integer value.
///
/// The number describes the maximum number of meainful data bits in
/// the stored value. 8, 16 and 32 bit values are stored using the
/// INT32 physical type. 64 bit values are stored using the INT64
/// physical type.
///
/// The number describes the maximum number of meaningful data bits in the stored value. 8, 16
/// and 32 bit values are stored using the INT32 physical type. 64 bit values are stored using
/// the INT64 physical type.
Uint8,
Uint16,
Uint32,
Uint64,
/// A signed integer value.
///
/// The number describes the maximum number of meainful data bits in
/// the stored value. 8, 16 and 32 bit values are stored using the
/// INT32 physical type. 64 bit values are stored using the INT64
/// physical type.
/// The number describes the maximum number of meaningful data bits in the stored value. 8, 16
/// and 32 bit values are stored using the INT32 physical type. 64 bit values are stored using
/// the INT64 physical type.
///
Int8,
Int16,
Expand All @@ -81,14 +77,12 @@ pub enum PrimitiveConvertedType {
Bson,
/// An interval of time
///
/// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12
/// This data is composed of three separate little endian unsigned
/// integers. Each stores a component of a duration of time. The first
/// integer identifies the number of months associated with the duration,
/// the second identifies the number of days associated with the duration
/// and the third identifies the number of milliseconds associated with
/// the provided duration. This duration of time is independent of any
/// particular timezone or date.
/// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12. This data is
/// composed of three separate little endian unsigned integers. Each stores a component of a
/// duration of time. The first integer identifies the number of months associated with the
/// duration, the second identifies the number of days associated with the duration and the
/// third identifies the number of milliseconds associated with the provided duration. This
/// duration of time is independent of any particular timezone or date.
Interval,
}

Expand All @@ -99,8 +93,7 @@ pub enum GroupConvertedType {
Map,
/// a key/value pair is converted into a group of two fields
MapKeyValue,
/// a list is converted into an optional field containing a repeated field for its
/// values
/// a list is converted into an optional field containing a repeated field for its values
List,
}

Expand Down

0 comments on commit 398f781

Please sign in to comment.