Skip to content

Commit

Permalink
feat: add lossy option to parse_json function (#988)
Browse files Browse the repository at this point in the history
* feat: Add lossy argument to parse_json function

* docs: Add changelog

* Update 269.feature.md

Co-authored-by: Jesse Szwedko <[email protected]>

* Update 269.feature.md

---------

Co-authored-by: Jesse Szwedko <[email protected]>
  • Loading branch information
jorgehermo9 and jszwedko authored Aug 15, 2024
1 parent 4dc1f26 commit 8289523
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 8 deletions.
5 changes: 5 additions & 0 deletions changelog.d/269.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
The `parse_json` function now accepts an optional `lossy` parameter (which defaults to `true`).

This new parameter controls whether UTF-8 decoding is lossy. If set to `true`, invalid UTF-8
sequences are replaced with the Unicode replacement character (U+FFFD); if set to `false`, an error
is raised when an invalid UTF-8 sequence is found.
72 changes: 64 additions & 8 deletions src/stdlib/parse_json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,28 @@ use serde_json::{

use crate::compiler::prelude::*;

fn parse_json(value: Value) -> Resolved {
let bytes = value.try_bytes()?;
/// Parses `value` as a JSON document and returns the decoded [`Value`].
///
/// `lossy` selects how the input bytes are obtained before parsing:
/// - `None` or `true`: the input is read through `try_bytes_utf8_lossy`, so invalid UTF-8
///   sequences are replaced rather than rejected.
/// - `false`: the raw bytes from `try_bytes` are handed to the JSON parser unchanged.
///
/// # Errors
///
/// Propagates the error from `try_boolean` when `lossy` is present but cannot be converted,
/// the error from the byte accessors when `value` cannot be read as bytes, and an
/// `unable to parse json: …` message when `serde_json` rejects the input.
fn parse_json(value: Value, lossy: Option<Value>) -> Resolved {
    // An omitted `lossy` argument means lossy decoding.
    let replace_invalid_utf8 = match lossy {
        Some(flag) => flag.try_boolean()?,
        None => true,
    };

    let input = if replace_invalid_utf8 {
        let text = value.try_bytes_utf8_lossy()?;
        text.into_owned().into()
    } else {
        value.try_bytes()?
    };

    serde_json::from_slice::<'_, Value>(&input)
        .map_err(|err| format!("unable to parse json: {err}").into())
}

// `parse_json_with_depth` recursively traverses the value and returns raw JSON-formatted bytes
// once the provided depth has been reached.
fn parse_json_with_depth(value: Value, max_depth: Value) -> Resolved {
let bytes = value.try_bytes()?;
fn parse_json_with_depth(value: Value, max_depth: Value, lossy: Option<Value>) -> Resolved {
let parsed_depth = validate_depth(max_depth)?;
let lossy = lossy.map(Value::try_boolean).transpose()?.unwrap_or(true);
let bytes = if lossy {
value.try_bytes_utf8_lossy()?.into_owned().into()
} else {
value.try_bytes()?
};

let raw_value = serde_json::from_slice::<'_, &RawValue>(&bytes)
.map_err(|e| format!("unable to read json: {e}"))?;
Expand Down Expand Up @@ -121,6 +131,11 @@ impl Function for ParseJson {
kind: kind::INTEGER,
required: false,
},
Parameter {
keyword: "lossy",
kind: kind::BOOLEAN,
required: false,
},
]
}

Expand Down Expand Up @@ -179,23 +194,35 @@ impl Function for ParseJson {
) -> Compiled {
let value = arguments.required("value");
let max_depth = arguments.optional("max_depth");
let lossy = arguments.optional("lossy");

match max_depth {
Some(max_depth) => Ok(ParseJsonMaxDepthFn { value, max_depth }.as_expr()),
None => Ok(ParseJsonFn { value }.as_expr()),
Some(max_depth) => Ok(ParseJsonMaxDepthFn {
value,
max_depth,
lossy,
}
.as_expr()),
None => Ok(ParseJsonFn { value, lossy }.as_expr()),
}
}
}

/// Expression built for `parse_json` calls that do not supply a `max_depth` argument.
#[derive(Debug, Clone)]
struct ParseJsonFn {
// Expression for the required `value` argument (the input to parse).
value: Box<dyn Expression>,
// Expression for the optional `lossy` argument; `None` when the caller omitted it.
lossy: Option<Box<dyn Expression>>,
}

impl FunctionExpression for ParseJsonFn {
fn resolve(&self, ctx: &mut Context) -> Resolved {
let value = self.value.resolve(ctx)?;
parse_json(value)
let lossy = self
.lossy
.as_ref()
.map(|expr| expr.resolve(ctx))
.transpose()?;
parse_json(value, lossy)
}

fn type_def(&self, _: &state::TypeState) -> TypeDef {
Expand All @@ -207,13 +234,19 @@ impl FunctionExpression for ParseJsonFn {
/// Expression built for `parse_json` calls that supply a `max_depth` argument.
struct ParseJsonMaxDepthFn {
// Expression for the required `value` argument (the input to parse).
value: Box<dyn Expression>,
// Expression for the `max_depth` argument; always present for this variant.
max_depth: Box<dyn Expression>,
// Expression for the optional `lossy` argument; `None` when the caller omitted it.
lossy: Option<Box<dyn Expression>>,
}

impl FunctionExpression for ParseJsonMaxDepthFn {
fn resolve(&self, ctx: &mut Context) -> Resolved {
let value = self.value.resolve(ctx)?;
let max_depth = self.max_depth.resolve(ctx)?;
parse_json_with_depth(value, max_depth)
let lossy = self
.lossy
.as_ref()
.map(|expr| expr.resolve(ctx))
.transpose()?;
parse_json_with_depth(value, max_depth, lossy)
}

fn type_def(&self, _: &state::TypeState) -> TypeDef {
Expand Down Expand Up @@ -322,6 +355,29 @@ mod tests {
want: Ok(value!({"num": 9.223_372_036_854_776e18})),
tdef: type_def(),
}

// Checks that the parsing uses the default lossy argument value
parse_invalid_utf8_default_lossy_arg {
// 0x22 is a quote character
// 0xf5 is out of the range of valid UTF-8 bytes
args: func_args![ value: Bytes::from_static(&[0x22,0xf5,0x22])],
want: Ok(value!(std::char::REPLACEMENT_CHARACTER.to_string())),
tdef: type_def(),
}

parse_invalid_utf8_lossy_arg_true {
// 0xf5 is out of the range of valid UTF-8 bytes
args: func_args![ value: Bytes::from_static(&[0x22,0xf5,0x22]), lossy: true],
// U+FFFD is the replacement character for invalid UTF-8
want: Ok(value!(std::char::REPLACEMENT_CHARACTER.to_string())),
tdef: type_def(),
}

invalid_utf8_json_lossy_arg_false {
args: func_args![ value: Bytes::from_static(&[0x22,0xf5,0x22]), lossy: false],
want: Err("unable to parse json: invalid unicode code point at line 1 column 3"),
tdef: type_def(),
}
];

#[cfg(not(feature = "float_roundtrip"))]
Expand Down

0 comments on commit 8289523

Please sign in to comment.