Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove use of deprecated dict_id in datafusion-proto (#14173) #14227

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions datafusion/proto-common/proto/datafusion_common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,7 @@ message Field {
// for complex data types like structs, unions
repeated Field children = 4;
map<string, string> metadata = 5;
int64 dict_id = 6;
bool dict_ordered = 7;
bool dict_ordered = 6;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is dict_ordered still used anywhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure, but we can open a PR and handle it. I'm ready to work on it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome -- thanks. Let's get this PR ready to go and then we can work on removing that as a follow on

}

message Timestamp{
Expand Down
55 changes: 12 additions & 43 deletions datafusion/proto-common/src/from_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,21 +320,8 @@ impl TryFrom<&protobuf::Field> for Field {
type Error = Error;
fn try_from(field: &protobuf::Field) -> Result<Self, Self::Error> {
let datatype = field.arrow_type.as_deref().required("arrow_type")?;
let field = if field.dict_id != 0 {
// https://github.com/apache/datafusion/issues/14173
#[allow(deprecated)]
Self::new_dict(
field.name.as_str(),
datatype,
field.nullable,
field.dict_id,
field.dict_ordered,
)
.with_metadata(field.metadata.clone())
} else {
Self::new(field.name.as_str(), datatype, field.nullable)
.with_metadata(field.metadata.clone())
};
let field = Self::new(field.name.as_str(), datatype, field.nullable)
.with_metadata(field.metadata.clone());
Ok(field)
}
}
Expand Down Expand Up @@ -436,36 +423,18 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {

let id = dict_batch.id();

let fields_using_this_dictionary = {
// See https://github.com/apache/datafusion/issues/14173
#[allow(deprecated)]
schema.fields_with_dict_id(id)
};
let record_batch = read_record_batch(
&buffer,
dict_batch.data().unwrap(),
Arc::new(schema.clone()),
&Default::default(),
None,
&message.version(),
)?;

let first_field = fields_using_this_dictionary.first().ok_or_else(|| {
Error::General("dictionary id not found in schema while deserializing ScalarValue::List".to_string())
})?;
let values: ArrayRef = Arc::clone(record_batch.column(0));

let values: ArrayRef = match first_field.data_type() {
DataType::Dictionary(_, ref value_type) => {
// Make a fake schema for the dictionary batch.
let value = value_type.as_ref().clone();
let schema = Schema::new(vec![Field::new("", value, true)]);
// Read a single column
let record_batch = read_record_batch(
&buffer,
dict_batch.data().unwrap(),
Arc::new(schema),
&Default::default(),
None,
&message.version(),
)?;
Ok(Arc::clone(record_batch.column(0)))
}
_ => Err(Error::General("dictionary id not found in schema while deserializing ScalarValue::List".to_string())),
}?;

Ok((id,values))
Ok((id, values))
}).collect::<datafusion_common::Result<HashMap<_, _>>>()?;

let record_batch = read_record_batch(
Expand Down
23 changes: 0 additions & 23 deletions datafusion/proto-common/src/generated/pbjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3107,9 +3107,6 @@ impl serde::Serialize for Field {
if !self.metadata.is_empty() {
len += 1;
}
if self.dict_id != 0 {
len += 1;
}
if self.dict_ordered {
len += 1;
}
Expand All @@ -3129,19 +3126,13 @@ impl serde::Serialize for Field {
if !self.metadata.is_empty() {
struct_ser.serialize_field("metadata", &self.metadata)?;
}
if self.dict_id != 0 {
#[allow(clippy::needless_borrow)]
#[allow(clippy::needless_borrows_for_generic_args)]
struct_ser.serialize_field("dictId", ToString::to_string(&self.dict_id).as_str())?;
}
if self.dict_ordered {
struct_ser.serialize_field("dictOrdered", &self.dict_ordered)?;
}
struct_ser.end()
}
}
impl<'de> serde::Deserialize<'de> for Field {
#[allow(deprecated)]
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
Expand All @@ -3153,8 +3144,6 @@ impl<'de> serde::Deserialize<'de> for Field {
"nullable",
"children",
"metadata",
"dict_id",
"dictId",
"dict_ordered",
"dictOrdered",
];
Expand All @@ -3166,7 +3155,6 @@ impl<'de> serde::Deserialize<'de> for Field {
Nullable,
Children,
Metadata,
DictId,
DictOrdered,
}
impl<'de> serde::Deserialize<'de> for GeneratedField {
Expand Down Expand Up @@ -3194,7 +3182,6 @@ impl<'de> serde::Deserialize<'de> for Field {
"nullable" => Ok(GeneratedField::Nullable),
"children" => Ok(GeneratedField::Children),
"metadata" => Ok(GeneratedField::Metadata),
"dictId" | "dict_id" => Ok(GeneratedField::DictId),
"dictOrdered" | "dict_ordered" => Ok(GeneratedField::DictOrdered),
_ => Err(serde::de::Error::unknown_field(value, FIELDS)),
}
Expand All @@ -3220,7 +3207,6 @@ impl<'de> serde::Deserialize<'de> for Field {
let mut nullable__ = None;
let mut children__ = None;
let mut metadata__ = None;
let mut dict_id__ = None;
let mut dict_ordered__ = None;
while let Some(k) = map_.next_key()? {
match k {
Expand Down Expand Up @@ -3256,14 +3242,6 @@ impl<'de> serde::Deserialize<'de> for Field {
map_.next_value::<std::collections::HashMap<_, _>>()?
);
}
GeneratedField::DictId => {
if dict_id__.is_some() {
return Err(serde::de::Error::duplicate_field("dictId"));
}
dict_id__ =
Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
;
}
GeneratedField::DictOrdered => {
if dict_ordered__.is_some() {
return Err(serde::de::Error::duplicate_field("dictOrdered"));
Expand All @@ -3278,7 +3256,6 @@ impl<'de> serde::Deserialize<'de> for Field {
nullable: nullable__.unwrap_or_default(),
children: children__.unwrap_or_default(),
metadata: metadata__.unwrap_or_default(),
dict_id: dict_id__.unwrap_or_default(),
dict_ordered: dict_ordered__.unwrap_or_default(),
})
}
Expand Down
4 changes: 1 addition & 3 deletions datafusion/proto-common/src/generated/prost.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,7 @@ pub struct Field {
::prost::alloc::string::String,
::prost::alloc::string::String,
>,
#[prost(int64, tag = "6")]
pub dict_id: i64,
#[prost(bool, tag = "7")]
#[prost(bool, tag = "6")]
pub dict_ordered: bool,
}
#[derive(Clone, PartialEq, ::prost::Message)]
Expand Down
3 changes: 0 additions & 3 deletions datafusion/proto-common/src/to_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,6 @@ impl TryFrom<&Field> for protobuf::Field {
nullable: field.is_nullable(),
children: Vec::new(),
metadata: field.metadata().clone(),
#[allow(deprecated)]
// See https://github.com/apache/datafusion/issues/14173 to remove deprecated dict_id
dict_id: field.dict_id().unwrap_or(0),
dict_ordered: field.dict_is_ordered().unwrap_or(false),
})
}
Expand Down
4 changes: 1 addition & 3 deletions datafusion/proto/src/generated/datafusion_proto_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,7 @@ pub struct Field {
::prost::alloc::string::String,
::prost::alloc::string::String,
>,
#[prost(int64, tag = "6")]
pub dict_id: i64,
#[prost(bool, tag = "7")]
#[prost(bool, tag = "6")]
pub dict_ordered: bool,
}
#[derive(Clone, PartialEq, ::prost::Message)]
Expand Down
72 changes: 0 additions & 72 deletions datafusion/proto/tests/cases/roundtrip_logical_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1494,20 +1494,6 @@ fn round_trip_scalar_values_and_data_types() {
Field::new("b", DataType::Boolean, false),
ScalarValue::from(false),
)
.with_scalar(
Field::new(
"c",
DataType::Dictionary(
Box::new(DataType::UInt16),
Box::new(DataType::Utf8),
),
false,
),
ScalarValue::Dictionary(
Box::new(DataType::UInt16),
Box::new("value".into()),
),
)
.build()
.unwrap(),
ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
Expand All @@ -1518,25 +1504,6 @@ fn round_trip_scalar_values_and_data_types() {
ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int32, true),
Field::new("b", DataType::Boolean, false),
Field::new(
"c",
DataType::Dictionary(
Box::new(DataType::UInt16),
Box::new(DataType::Binary),
),
false,
),
Field::new(
"d",
DataType::new_list(
DataType::Dictionary(
Box::new(DataType::UInt16),
Box::new(DataType::Binary),
),
false,
),
false,
),
])))
.unwrap(),
ScalarValue::try_from(&DataType::Map(
Expand Down Expand Up @@ -1815,45 +1782,6 @@ fn round_trip_datatype() {
}
}

// See https://github.com/apache/datafusion/issues/14173 to remove deprecated dict_id
#[allow(deprecated)]
#[test]
fn roundtrip_dict_id() -> Result<()> {
let dict_id = 42;
let field = Field::new(
"keys",
DataType::List(Arc::new(Field::new_dict(
"item",
DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)),
true,
dict_id,
false,
))),
false,
);
let schema = Arc::new(Schema::new(vec![field]));

// encode
let mut buf: Vec<u8> = vec![];
let schema_proto: protobuf::Schema = schema.try_into().unwrap();
schema_proto.encode(&mut buf).unwrap();

// decode
let schema_proto = protobuf::Schema::decode(buf.as_slice()).unwrap();
let decoded: Schema = (&schema_proto).try_into()?;

// assert
let keys = decoded.fields().iter().last().unwrap();
match keys.data_type() {
DataType::List(field) => {
assert_eq!(field.dict_id(), Some(dict_id), "dict_id should be retained");
}
_ => panic!("Invalid type"),
}

Ok(())
}

#[test]
fn roundtrip_null_scalar_values() {
let test_types = vec![
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/copy.slt
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,6 @@ select * from validate_arrow_file_dict;
c foo
d bar


# Copy from table to folder of json
query I
COPY source_table to 'test_files/scratch/copy/table_arrow' STORED AS ARROW;
Expand Down Expand Up @@ -632,3 +631,4 @@ COPY source_table to '/tmp/table.parquet' (row_group_size 55 + 102);
# Copy using execution.keep_partition_by_columns with an invalid value
query error DataFusion error: Invalid or Unsupported Configuration: provided value for 'execution.keep_partition_by_columns' was not recognized: "invalid_value"
COPY source_table to '/tmp/table.parquet' OPTIONS (execution.keep_partition_by_columns invalid_value);

4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/regexp.slt
Original file line number Diff line number Diff line change
Expand Up @@ -431,8 +431,8 @@ SELECT 'foo\nbar\nbaz' ~ 'bar';
true

statement error
Error during planning: Cannot infer common argument type for regex operation List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata
: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
Error during planning: Cannot infer common argument type for regex operation List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered: false, metadata
: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered: false, metadata: {} })
select [1,2] ~ [3];

query B
Expand Down