Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(stdlib): add unflatten vrl function #993

Merged
merged 14 commits into from
Aug 27, 2024
Merged
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ iana-time-zone = { version = "0.1", optional = true }
idna = { version = "0.5", optional = true }
indexmap = { version = "~2.4.0", default-features = false, features = ["std"], optional = true}
indoc = {version = "2", optional = true }
itertools = { version = "0.13", default-features = false, optional = true }
itertools = { version = "0.13", default-features = false, features=["use_std"], optional = true }
jorgehermo9 marked this conversation as resolved.
Show resolved Hide resolved
lalrpop-util = { version = "0.20", optional = true }
mlua = { version = "0.9", default-features = false, features = ["lua54", "send", "vendored"], optional = true}
nom = { version = "7", default-features = false, features = ["std"], optional = true }
Expand Down
26 changes: 26 additions & 0 deletions benches/stdlib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ criterion_group!(
to_syslog_severity,
to_unix_timestamp,
truncate,
unflatten,
unique,
// TODO: Cannot pass a Path to bench_function
//unnest
Expand Down Expand Up @@ -2718,6 +2719,31 @@ bench_function! {
}
}

// Benchmarks `unflatten` with the default separator (no `separator` argument is passed).
// `nested_map` covers a single level of dotted keys next to a plain key;
// `map_and_array` covers two levels of dotted keys whose leaf values include nested arrays.
bench_function! {
unflatten => vrl::stdlib::Unflatten;

nested_map {
args: func_args![value: value!({"parent.child1": 1, "parent.child2": 2, key: "val"})],
want: Ok(value!({parent: {child1: 1, child2: 2}, key: "val"})),
}

map_and_array {
args: func_args![value: value!({
"parent.child1": [1, [2, 3]],
"parent.child2.grandchild1": 1,
"parent.child2.grandchild2": [1, [2, 3], 4],
"key": "val",
})],
want: Ok(value!({
"parent": {
"child1": [1, [2, 3]],
"child2": {"grandchild1": 1, "grandchild2": [1, [2, 3], 4]},
},
"key": "val",
})),
}
}

bench_function! {
unique => vrl::stdlib::Unique;

Expand Down
2 changes: 2 additions & 0 deletions src/stdlib/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ cfg_if::cfg_if! {
mod to_unix_timestamp;
mod community_id;
mod truncate;
mod unflatten;
mod type_def;
mod unique;
mod unnest;
Expand Down Expand Up @@ -379,6 +380,7 @@ cfg_if::cfg_if! {
pub use to_unix_timestamp::ToUnixTimestamp;
pub use truncate::Truncate;
pub use type_def::TypeDef;
pub use unflatten::Unflatten;
pub use unique::Unique;
pub use unnest::Unnest;
pub use upcase::Upcase;
Expand Down
279 changes: 279 additions & 0 deletions src/stdlib/unflatten.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
use itertools::Itertools;

use crate::compiler::prelude::*;

static DEFAULT_SEPARATOR: &str = ".";

/// Rebuilds nested objects from the separator-delimited keys of `value`.
///
/// `separator` is converted to (lossy) UTF-8 text and `value` must be an
/// object; a failure of either conversion is returned as the error of the
/// call, with the separator checked first.
///
/// An empty separator cannot delimit any key, so the object is returned
/// unchanged in that case. Without this guard `str::split_once("")` would
/// match at index 0 of every key, yielding an empty head and the full key as
/// the remainder, and the recursive unflattening would never terminate.
fn unflatten(value: Value, separator: Value) -> Resolved {
    let separator = separator.try_bytes_utf8_lossy()?.into_owned();
    let map = value.try_object()?;

    // Nothing can be split on an empty separator: hand the object back as-is
    // instead of recursing forever on keys that never get shorter.
    if separator.is_empty() {
        return Ok(map.into());
    }

    Ok(do_unflatten(map.into(), &separator))
}

/// Unflattens `value` when it is an object and returns it untouched otherwise.
///
/// Only objects are descended into: any other value (arrays included) is
/// passed through as-is, so objects stored inside arrays keep their keys.
fn do_unflatten(value: Value, separator: &str) -> Value {
    match value {
        Value::Object(entries) => Value::Object(do_unflatten_entries(entries, separator)),
        // Scalars and arrays (together with any objects they contain) are left as they are.
        other => other,
    }
}

// this should return the key to insert?
jorgehermo9 marked this conversation as resolved.
Show resolved Hide resolved
/// Builds one level of the nested object from `entries` and recurses for the levels below.
///
/// Each key is split at the first occurrence of `separator` into a head and an
/// optional remainder. Entries are grouped by head, and every group becomes one
/// key of the returned map:
/// - a group with a single entry and no remainder keeps its value, which is
///   itself passed through `do_unflatten`;
/// - a group with a single entry and a remainder becomes a nested object built
///   from `(remainder, value)`;
/// - a group with several entries becomes a nested object built from the
///   entries that have a remainder; an entry without one is dropped.
///
/// NOTE(review): `str::split_once` with an empty pattern matches at index 0, so
/// an empty `separator` would yield an empty head and the whole key as the
/// remainder, and the single-entry branch would then recurse on the same key
/// again. Callers should not pass an empty separator - confirm this is guarded.
fn do_unflatten_entries<I>(entries: I, separator: &str) -> ObjectMap
where
I: IntoIterator<Item = (KeyString, Value)>,
{
// Split every key into (head, optional remainder) and group the entries by head.
let grouped = entries
.into_iter()
.map(|(key, value)| {
let (head, rest) = match key.split_once(separator) {
Some((key, rest)) => (key.to_string().into(), Some(rest.to_string())),
None => (key.clone(), None),
};
(head, rest, value)
})
.into_group_map_by(|(head, _, _)| head.clone());

// Turn each group into a single (key, value) pair of the resulting map.
grouped
.into_iter()
.map(|(key, mut values)| {
// Single entry for this head: `pop` cannot return `None` because the length was just checked.
if values.len() == 1 {
match values.pop().unwrap() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if values.len() == 1 {
match values.pop().unwrap() {
if values.len() == 1 {
match values.pop().expect("exactly one element") {

We typically expect over unwrap to provide more context about why the panic should never be hit.

I think one potential way to refactor this that might avoid this unwrap/expect and avoid the early returns is something like:

match values {
  [value] => { ... what you currently have for handling if there is only one value ... }
  values => { ... handle multiple values ... }
}

Copy link
Contributor Author

@jorgehermo9 jorgehermo9 Aug 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to do that, but the problem is values is a Vec and therefore, we should convert it to a slice in order to pattern match against it, using values[..], but then, at the first branch we would have the first element borrowed and we need it owned. The same with the second branch, we will have a borrowed slice of values, but not the owned values and we need owned values for the do_unflatten_entries function.

Does this makes sense?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mmm, yeah, that makes sense. I wish there was a better way to model this that doesn't require the expect but I'm not seeing it.

(_, None, value) => return (key, do_unflatten(value, separator)),
(_, Some(rest), value) => {
let result = do_unflatten_entries([(rest.into(), value)], separator);
return (key, result.into());
}
}
}

// Several entries share this head: keep only those with a remainder and nest them one level down.
let new_entries = values
.into_iter()
.filter_map(|(_, rest, value)| {
// In this case, there is more than one value with the same key
// and then there must be nested values, we can't set a single top-level value
// so we filter it out.
// Example input of this case:
// {
// "a.b": 1,
// "a": 2
// }
// Here, we will have two items grouped by "a",
// one will have "b" as rest and the other will have None.
// We have to filter the second, as we can't set the second value
// as the value of "a" (considered the top-level key at this level)
rest.map(|rest| (rest.into(), value))
})
.collect::<Vec<_>>();
let result = do_unflatten_entries(new_entries, separator);
(key, result.into())
})
.collect()
}

/// The `unflatten` stdlib function: its `Function` implementation declares the
/// `value` and `separator` parameters and compiles calls into an `UnflattenFn` expression.
#[derive(Clone, Copy, Debug)]
pub struct Unflatten;

impl Function for Unflatten {
    /// Name under which the function is called from a program.
    fn identifier(&self) -> &'static str {
        "unflatten"
    }

    /// Declares the accepted arguments:
    /// - `value` (object, required): the object whose keys are unflattened;
    /// - `separator` (bytes, optional): the delimiter between key segments;
    ///   `compile` falls back to `DEFAULT_SEPARATOR` when it is omitted.
    fn parameters(&self) -> &'static [Parameter] {
        &[
            Parameter {
                keyword: "value",
                kind: kind::OBJECT,
                required: true,
            },
            Parameter {
                keyword: "separator",
                kind: kind::BYTES,
                required: false,
            },
        ]
    }

    /// Usage examples: `source` is the program calling `unflatten`, `result`
    /// is the nested object that the call evaluates to.
    fn examples(&self) -> &'static [Example] {
        &[
            Example {
                title: "default separator",
                source: r#"unflatten({ "foo.bar": true })"#,
                result: Ok(r#"{ "foo": { "bar": true } }"#),
            },
            Example {
                title: "custom separator",
                source: r#"unflatten({ "foo_bar": true }, "_")"#,
                result: Ok(r#"{ "foo": { "bar": true } }"#),
            },
        ]
    }

    /// Builds the `UnflattenFn` expression from the call arguments, using
    /// `DEFAULT_SEPARATOR` when no `separator` argument was supplied.
    fn compile(
        &self,
        _state: &state::TypeState,
        _ctx: &mut FunctionCompileContext,
        arguments: ArgumentList,
    ) -> Compiled {
        let separator = arguments
            .optional("separator")
            .unwrap_or_else(|| expr!(DEFAULT_SEPARATOR));
        let value = arguments.required("value");
        Ok(UnflattenFn { value, separator }.as_expr())
    }
}

/// Compiled form of an `unflatten` call, as produced by `Unflatten::compile`.
#[derive(Debug, Clone)]
struct UnflattenFn {
// Expression for the `value` argument (declared as an object parameter).
value: Box<dyn Expression>,
// Expression for the `separator` argument, or the default-separator expression when it was omitted.
separator: Box<dyn Expression>,
}

impl FunctionExpression for UnflattenFn {
    /// Resolves the `value` expression, then the `separator` expression, and
    /// unflattens the former using the latter. An error from either
    /// resolution is returned immediately.
    fn resolve(&self, ctx: &mut Context) -> Resolved {
        // Arguments are evaluated left to right, so `value` is still resolved first.
        unflatten(self.value.resolve(ctx)?, self.separator.resolve(ctx)?)
    }

    /// The result is always an object; nothing is claimed about its fields.
    fn type_def(&self, _state: &TypeState) -> TypeDef {
        let fields = Collection::any();
        TypeDef::object(fields)
    }
}

// Table-driven tests for `unflatten`. Each case gives the call arguments (`args`),
// the expected resolved value (`want`) and the expected type definition (`tdef`).
#[cfg(test)]
mod test {
use super::*;
use crate::value;

test_function![
unflatten => Unflatten;

// A key without the separator is left untouched
map {
args: func_args![value: value!({parent: "child"})],
want: Ok(value!({parent: "child"})),
tdef: TypeDef::object(Collection::any()),
}

// Keys sharing a prefix are merged under one nested object
nested_map {
args: func_args![value: value!({"parent.child1": 1, "parent.child2": 2, key: "val"})],
want: Ok(value!({parent: {child1: 1, child2: 2}, key: "val"})),
tdef: TypeDef::object(Collection::any()),
}

// Same as above, with an explicit "_" separator
nested_map_with_separator {
args: func_args![value: value!({"parent_child1": 1, "parent_child2": 2, key: "val"}), separator: "_"],
want: Ok(value!({parent: {child1: 1, child2: 2}, key: "val"})),
tdef: TypeDef::object(Collection::any()),
}

// Keys with two separators produce two levels of nesting
double_nested_map {
args: func_args![value: value!({
"parent.child1": 1,
"parent.child2.grandchild1": 1,
"parent.child2.grandchild2": 2,
key: "val",
})],
want: Ok(value!({
parent: {
child1: 1,
child2: { grandchild1: 1, grandchild2: 2 },
},
key: "val",
})),
tdef: TypeDef::object(Collection::any()),
}

// Not only keys at first level are unflattened
double_inner_nested_map {
args: func_args![value: value!({
"parent": {"child1":1, "child2.grandchild1": 1, "child2.grandchild2": 2 },
key: "val",
})],
want: Ok(value!({
parent: {
child1: 1,
child2: { grandchild1: 1, grandchild2: 2 },
},
key: "val",
})),
tdef: TypeDef::object(Collection::any()),
}

// Array values are carried over unchanged as leaves
map_and_array {
args: func_args![value: value!({
"parent.child1": [1, [2, 3]],
"parent.child2.grandchild1": 1,
"parent.child2.grandchild2": [1, [2, 3], 4],
key: "val",
})],
want: Ok(value!({
parent: {
child1: [1, [2, 3]],
child2: {grandchild1: 1, grandchild2: [1, [2, 3], 4]},
},
key: "val",
})),
tdef: TypeDef::object(Collection::any()),
}

// Same as above, with an explicit "_" separator
map_and_array_with_separator {
args: func_args![value: value!({
"parent_child1": [1, [2, 3]],
"parent_child2_grandchild1": 1,
"parent_child2_grandchild2": [1, [2, 3], 4],
key: "val",
}), separator: "_"],
want: Ok(value!({
parent: {
child1: [1, [2, 3]],
child2: {grandchild1: 1, grandchild2: [1, [2, 3], 4]},
},
key: "val",
})),
tdef: TypeDef::object(Collection::any()),
}

// Objects inside arrays are not unflattened
objects_inside_arrays {
args: func_args![value: value!({
"parent": [{"child1":1},{"child2.grandchild1": 1, "child2.grandchild2": 2 }],
key: "val",
})],
want: Ok(value!({
"parent": [{"child1":1},{"child2.grandchild1": 1, "child2.grandchild2": 2 }],
key: "val",
})),
tdef: TypeDef::object(Collection::any()),
}

// A branch with a single leaf (child1) sits next to a branch with several leaves (child2)
triple_nested_map {
args: func_args![value: value!({
"parent1.child1.grandchild1": 1,
"parent1.child2.grandchild2": 2,
"parent1.child2.grandchild3": 3,
parent2: 4,
})],
want: Ok(value!({
parent1: {
child1: { grandchild1: 1 },
child2: { grandchild2: 2, grandchild3: 3 },
},
parent2: 4,
})),
tdef: TypeDef::object(Collection::any()),
}

// When "a" is both a plain key and a prefix, the plain value is dropped in favour of the nested one
filter_out_top_level_value_when_multiple_values {
args: func_args![value: value!({
"a.b": 1,
"a": 2,
})],
want: Ok(value!({
a: { b: 1 },
})),
tdef: TypeDef::object(Collection::any()),
}
];
}