From c368419679898b700946a56bf5f390db4ddb33a4 Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Tue, 23 Jul 2024 12:07:14 +0200 Subject: [PATCH] fix(rust): Non-compliant Parquet list element name (#17803) --- .../polars-parquet/src/arrow/write/schema.rs | 53 ++++++++------- .../parquet/schema/types/converted_type.rs | 65 +++++++++---------- py-polars/tests/unit/io/test_parquet.py | 18 +++++ 3 files changed, 76 insertions(+), 60 deletions(-) diff --git a/crates/polars-parquet/src/arrow/write/schema.rs b/crates/polars-parquet/src/arrow/write/schema.rs index a26b96ae91de..047291770180 100644 --- a/crates/polars-parquet/src/arrow/write/schema.rs +++ b/crates/polars-parquet/src/arrow/write/schema.rs @@ -23,24 +23,24 @@ fn convert_field(field: Field) -> Field { } fn convert_data_type(data_type: ArrowDataType) -> ArrowDataType { - use ArrowDataType::*; + use ArrowDataType as D; match data_type { - LargeList(field) => LargeList(Box::new(convert_field(*field))), - Struct(mut fields) => { + D::LargeList(field) => D::LargeList(Box::new(convert_field(*field))), + D::Struct(mut fields) => { for field in &mut fields { *field = convert_field(std::mem::take(field)) } - Struct(fields) + D::Struct(fields) }, - BinaryView => LargeBinary, - Utf8View => LargeUtf8, - Dictionary(it, data_type, sorted) => { + D::BinaryView => D::LargeBinary, + D::Utf8View => D::LargeUtf8, + D::Dictionary(it, data_type, sorted) => { let dtype = convert_data_type(*data_type); - Dictionary(it, Box::new(dtype), sorted) + D::Dictionary(it, Box::new(dtype), sorted) }, - Extension(name, data_type, metadata) => { + D::Extension(name, data_type, metadata) => { let data_type = convert_data_type(*data_type); - Extension(name, Box::new(data_type), metadata) + D::Extension(name, Box::new(data_type), metadata) }, dt => dt, } @@ -390,21 +390,26 @@ pub fn to_parquet_type(field: &Field) -> PolarsResult { )?), ArrowDataType::List(f) | ArrowDataType::FixedSizeList(f, _) - | ArrowDataType::LargeList(f) => 
Ok(ParquetType::from_group( - name, - repetition, - Some(GroupConvertedType::List), - Some(GroupLogicalType::List), - vec![ParquetType::from_group( - "list".to_string(), - Repetition::Repeated, - None, - None, - vec![to_parquet_type(f)?], + | ArrowDataType::LargeList(f) => { + let mut f = f.clone(); + f.name = "element".to_string(); + + Ok(ParquetType::from_group( + name, + repetition, + Some(GroupConvertedType::List), + Some(GroupLogicalType::List), + vec![ParquetType::from_group( + "list".to_string(), + Repetition::Repeated, + None, + None, + vec![to_parquet_type(&f)?], + None, + )], None, - )], - None, - )), + )) + }, ArrowDataType::Map(f, _) => Ok(ParquetType::from_group( name, repetition, diff --git a/crates/polars-parquet/src/parquet/schema/types/converted_type.rs b/crates/polars-parquet/src/parquet/schema/types/converted_type.rs index 946d6b8e1d39..8432167fcd3b 100644 --- a/crates/polars-parquet/src/parquet/schema/types/converted_type.rs +++ b/crates/polars-parquet/src/parquet/schema/types/converted_type.rs @@ -12,16 +12,15 @@ pub enum PrimitiveConvertedType { Enum, /// A decimal value. /// - /// This may be used to annotate binary or fixed primitive types. The - /// underlying byte array stores the unscaled value encoded as two's - /// complement using big-endian byte order (the most significant byte is the - /// zeroth element). The value of the decimal is the value * 10^{-scale}. + /// This may be used to annotate binary or fixed primitive types. The underlying byte array + /// stores the unscaled value encoded as two's complement using big-endian byte order (the most + /// significant byte is the zeroth element). The value of the decimal is the value * + /// 10^{-scale}. /// - /// This must be accompanied by a (maximum) precision and a scale in the - /// SchemaElement. The precision specifies the number of digits in the decimal - /// and the scale stores the location of the decimal point. 
For example 1.23 - /// would have precision 3 (3 total digits) and scale 2 (the decimal point is - /// 2 digits over). + /// This must be accompanied by a (maximum) precision and a scale in the SchemaElement. The + /// precision specifies the number of digits in the decimal and the scale stores the location + /// of the decimal point. For example 1.23 would have precision 3 (3 total digits) and scale 2 + /// (the decimal point is 2 digits over). // (precision, scale) Decimal(usize, usize), /// A Date @@ -31,41 +30,38 @@ pub enum PrimitiveConvertedType { Date, /// A time /// - /// The total number of milliseconds since midnight. The value is stored - /// as an INT32 physical type. + /// The total number of milliseconds since midnight. The value is stored as an INT32 physical + /// type. TimeMillis, /// A time. /// - /// The total number of microseconds since midnight. The value is stored as - /// an INT64 physical type. + /// The total number of microseconds since midnight. The value is stored as an INT64 physical + /// type. TimeMicros, /// A date/time combination /// - /// Date and time recorded as milliseconds since the Unix epoch. Recorded as - /// a physical type of INT64. + /// Date and time recorded as milliseconds since the Unix epoch. Recorded as a physical type + /// of INT64. TimestampMillis, /// A date/time combination /// - /// Date and time recorded as microseconds since the Unix epoch. The value is - /// stored as an INT64 physical type. + /// Date and time recorded as microseconds since the Unix epoch. The value is stored as an + /// INT64 physical type. TimestampMicros, /// An unsigned integer value. /// - /// The number describes the maximum number of meainful data bits in - /// the stored value. 8, 16 and 32 bit values are stored using the - /// INT32 physical type. 64 bit values are stored using the INT64 - /// physical type. - /// + /// The number describes the maximum number of meaningful data bits in the stored value. 
8, 16 + /// and 32 bit values are stored using the INT32 physical type. 64 bit values are stored using + /// the INT64 physical type. Uint8, Uint16, Uint32, Uint64, /// A signed integer value. /// - /// The number describes the maximum number of meainful data bits in - /// the stored value. 8, 16 and 32 bit values are stored using the - /// INT32 physical type. 64 bit values are stored using the INT64 - /// physical type. + /// The number describes the maximum number of meaningful data bits in the stored value. 8, 16 + /// and 32 bit values are stored using the INT32 physical type. 64 bit values are stored using + /// the INT64 physical type. /// Int8, Int16, @@ -81,14 +77,12 @@ pub enum PrimitiveConvertedType { Bson, /// An interval of time /// - /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12 - /// This data is composed of three separate little endian unsigned - /// integers. Each stores a component of a duration of time. The first - /// integer identifies the number of months associated with the duration, - /// the second identifies the number of days associated with the duration - /// and the third identifies the number of milliseconds associated with - /// the provided duration. This duration of time is independent of any - /// particular timezone or date. + /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12. This data is + /// composed of three separate little endian unsigned integers. Each stores a component of a + /// duration of time. The first integer identifies the number of months associated with the + /// duration, the second identifies the number of days associated with the duration and the + /// third identifies the number of milliseconds associated with the provided duration. This + /// duration of time is independent of any particular timezone or date. 
Interval, } @@ -99,8 +93,7 @@ pub enum GroupConvertedType { Map, /// a key/value pair is converted into a group of two fields MapKeyValue, - /// a list is converted into an optional field containing a repeated field for its - /// values + /// a list is converted into an optional field containing a repeated field for its values List, } diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 34c7782b2b0a..4ec443919b83 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -1244,3 +1244,21 @@ def test_parquet_record_batches_pyarrow_fixed_size_list_16614(tmp_path: Path) -> assert b["x"].shape[0] == n assert_frame_equal(b, x) + + +@pytest.mark.write_disk() +def test_parquet_list_element_field_name(tmp_path: Path) -> None: + filename = tmp_path / "list.parquet" + + ( + pl.DataFrame( + { + "a": [[1, 2], [1, 1, 1]], + }, + schema={"a": pl.List(pl.Int64)}, + ).write_parquet(filename, use_pyarrow=False) + ) + + schema_str = str(pq.read_schema(filename)) + assert "" in schema_str + assert "child 0, element: int64" in schema_str