From d3c30cef1ea9bfb30687bf55fa35edf4f77b5a74 Mon Sep 17 00:00:00 2001 From: chielP Date: Thu, 24 Aug 2023 11:15:05 +0200 Subject: [PATCH 1/6] add check for non-numeric characters --- crates/polars-arrow/src/compute/decimal.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index cb265d43d75e..2af5037a2a86 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -15,6 +15,12 @@ fn split_decimal_bytes(bytes: &[u8]) -> (Option<&[u8]>, Option<&[u8]>) { (lhs, rhs) } +fn is_numeric(bytes: &[u8]) -> bool { + bytes + .iter() + .all(|b| matches!(*b, b'0'..=b'9' | b'.' | b'+' | b'-')) +} + pub fn infer_scale(bytes: &[u8]) -> Option { let (_lhs, rhs) = split_decimal_bytes(bytes); rhs.map(significant_digits) @@ -26,6 +32,9 @@ pub fn infer_scale(bytes: &[u8]) -> Option { pub(super) fn deserialize_decimal(bytes: &[u8], precision: Option, scale: u8) -> Option { let (lhs, rhs) = split_decimal_bytes(bytes); let precision = precision.unwrap_or(u8::MAX); + if !is_numeric(bytes) { + return None; + } match (lhs, rhs) { (Some(lhs), Some(rhs)) => atoi::(lhs).and_then(|x| { atoi::(rhs) @@ -127,5 +136,14 @@ mod test { deserialize_decimal(val.as_bytes(), precision, scale), Some(1000000000000000000) ); + let scale = 5; + let val = "12ABC.34"; + assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), None); + + let val = "1ABC2.34"; + assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), None); + + let val = "12.3ABC4"; + assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), None); } } From 51675b1fb69456aa694ea0d6c0b9cc2e52199172 Mon Sep 17 00:00:00 2001 From: chielP Date: Thu, 24 Aug 2023 12:12:17 +0200 Subject: [PATCH 2/6] better way of checking numeric --- crates/polars-arrow/src/compute/decimal.rs | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index 2af5037a2a86..05d91b5eea16 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -1,4 +1,4 @@ -use atoi::atoi; +use atoi::FromRadix10SignedChecked; fn significant_digits(bytes: &[u8]) -> u8 { (bytes.len() as u8) - leading_zeros(bytes) @@ -15,10 +15,12 @@ fn split_decimal_bytes(bytes: &[u8]) -> (Option<&[u8]>, Option<&[u8]>) { (lhs, rhs) } -fn is_numeric(bytes: &[u8]) -> bool { - bytes - .iter() - .all(|b| matches!(*b, b'0'..=b'9' | b'.' | b'+' | b'-')) +fn parse_integer_checked(bytes: &[u8]) -> Option{ + let (n,len) = i128::from_radix_10_signed_checked(bytes); + match n { + Some(i) if len == bytes.len() => Some(i), + _ => None + } } pub fn infer_scale(bytes: &[u8]) -> Option { @@ -32,12 +34,10 @@ pub fn infer_scale(bytes: &[u8]) -> Option { pub(super) fn deserialize_decimal(bytes: &[u8], precision: Option, scale: u8) -> Option { let (lhs, rhs) = split_decimal_bytes(bytes); let precision = precision.unwrap_or(u8::MAX); - if !is_numeric(bytes) { - return None; - } + match (lhs, rhs) { - (Some(lhs), Some(rhs)) => atoi::(lhs).and_then(|x| { - atoi::(rhs) + (Some(lhs), Some(rhs)) => parse_integer_checked(lhs).and_then(|x| { + parse_integer_checked(rhs) .map(|y| (x, lhs, y, rhs)) .and_then(|(lhs, lhs_b, rhs, rhs_b)| { let lhs_s = significant_digits(lhs_b); @@ -86,13 +86,13 @@ pub(super) fn deserialize_decimal(bytes: &[u8], precision: Option, scale: u8 if rhs.len() > precision as usize || rhs.len() != scale as usize { return None; } - atoi::(rhs) + parse_integer_checked(rhs) }, (Some(lhs), None) => { if lhs.len() > precision as usize || scale != 0 { return None; } - atoi::(lhs) + parse_integer_checked(lhs) }, (None, None) => None, } From 11ff7c22672eff2846099989828b61dd840e382e Mon Sep 17 00:00:00 2001 From: chielP Date: Thu, 24 Aug 2023 13:00:32 +0200 Subject: [PATCH 3/6] fix option handling --- crates/polars-arrow/src/compute/decimal.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index 05d91b5eea16..d74b833ec248 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -15,12 +15,9 @@ fn split_decimal_bytes(bytes: &[u8]) -> (Option<&[u8]>, Option<&[u8]>) { (lhs, rhs) } -fn parse_integer_checked(bytes: &[u8]) -> Option{ - let (n,len) = i128::from_radix_10_signed_checked(bytes); - match n { - Some(i) if len == bytes.len() => Some(i), - _ => None - } +fn parse_integer_checked(bytes: &[u8]) -> Option { + let (n, len) = i128::from_radix_10_signed_checked(bytes); + n.filter(|_| len == bytes.len()) } pub fn infer_scale(bytes: &[u8]) -> Option { From ba7a332e1061594579df8b4f16d0a81174237808 Mon Sep 17 00:00:00 2001 From: chielP Date: Thu, 24 Aug 2023 13:24:25 +0200 Subject: [PATCH 4/6] fix failing case --- crates/polars-arrow/src/compute/decimal.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index d74b833ec248..bb18372cb433 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -9,7 +9,7 @@ fn leading_zeros(bytes: &[u8]) -> u8 { } fn split_decimal_bytes(bytes: &[u8]) -> (Option<&[u8]>, Option<&[u8]>) { - let mut a = bytes.split(|x| *x == b'.'); + let mut a = bytes.splitn(2,|x| *x == b'.'); let lhs = a.next(); let rhs = a.next(); (lhs, rhs) @@ -142,5 +142,18 @@ mod test { let val = "12.3ABC4"; assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), None); + + let val = "12.3.ABC4"; + assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), None); + + let val = ""; + assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), None); + + let val = "5."; + assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), Some(500000i128)); + + let val = ".5"; + assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), Some(50000i128)); + } } From 984519fdd0ee800c4171f1139291df3b81fac68b Mon Sep 17 00:00:00 2001 From: ritchie Date: Tue, 29 Aug 2023 10:10:25 +0200 Subject: [PATCH 5/6] fmt --- crates/polars-arrow/src/compute/decimal.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index bb18372cb433..a5a45fa239a7 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -9,7 +9,7 @@ fn leading_zeros(bytes: &[u8]) -> u8 { } fn split_decimal_bytes(bytes: &[u8]) -> (Option<&[u8]>, Option<&[u8]>) { - let mut a = bytes.splitn(2,|x| *x == b'.'); + let mut a = bytes.splitn(2, |x| *x == b'.'); let lhs = a.next(); let rhs = a.next(); (lhs, rhs) @@ -150,10 +150,15 @@ mod test { assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), None); let val = "5."; - assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), Some(500000i128)); + assert_eq!( + deserialize_decimal(val.as_bytes(), precision, scale), + Some(500000i128) + ); let val = ".5"; - assert_eq!(deserialize_decimal(val.as_bytes(), precision, scale), Some(50000i128)); - + assert_eq!( + deserialize_decimal(val.as_bytes(), precision, scale), + Some(50000i128) + ); } } From 530ad833da25bbb4804f98343a44bbefc601d2fe Mon Sep 17 00:00:00 2001 From: chielP Date: Tue, 29 Aug 2023 11:36:23 +0200 Subject: [PATCH 6/6] simplify match --- crates/polars-arrow/src/compute/decimal.rs | 110 ++++++++++----------- 1 file changed, 53 insertions(+), 57 deletions(-) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index a5a45fa239a7..04066f1d9629 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -32,67 +32,63 @@ pub(super) fn deserialize_decimal(bytes: &[u8], precision: Option, scale: u8 let (lhs, rhs) = split_decimal_bytes(bytes); let precision = precision.unwrap_or(u8::MAX); - match (lhs, rhs) { - (Some(lhs), Some(rhs)) => parse_integer_checked(lhs).and_then(|x| { - parse_integer_checked(rhs) - .map(|y| (x, lhs, y, rhs)) - .and_then(|(lhs, lhs_b, rhs, rhs_b)| { - let lhs_s = significant_digits(lhs_b); - let leading_zeros_rhs = leading_zeros(rhs_b); - let rhs_s = rhs_b.len() as u8 - leading_zeros_rhs; - - // parameters don't match bytes - if lhs_s + rhs_s > precision || rhs_s > scale { - None - } - // significant digits don't fit scale - else if rhs_s < scale { - // scale: 2 - // number: x.09 - // significant digits: 1 - // leading_zeros: 1 - // parsed: 9 - // so this is correct - if leading_zeros_rhs + rhs_s == scale { - Some((lhs, rhs)) + let lhs_b = lhs?; + parse_integer_checked(lhs_b).and_then(|x| { + match rhs { + Some(rhs) => { + parse_integer_checked(rhs) + .map(|y| (x, lhs_b, y, rhs)) + .and_then(|(lhs, lhs_b, rhs, rhs_b)| { + let lhs_s = significant_digits(lhs_b); + let leading_zeros_rhs = leading_zeros(rhs_b); + let rhs_s = rhs_b.len() as u8 - leading_zeros_rhs; + + // parameters don't match bytes + if lhs_s + rhs_s > precision || rhs_s > scale { + None + } + // significant digits don't fit scale + else if rhs_s < scale { + // scale: 2 + // number: x.09 + // significant digits: 1 + // leading_zeros: 1 + // parsed: 9 + // so this is correct + if leading_zeros_rhs + rhs_s == scale { + Some((lhs, rhs)) + } + // scale: 2 + // number: x.9 + // significant digits: 1 + // parsed: 9 + // so we must multiply by 10 to get 90 + else { + let diff = scale as u32 - (rhs_s + leading_zeros_rhs) as u32; + Some((lhs, rhs * 10i128.pow(diff))) + } } // scale: 2 - // number: x.9 - // significant digits: 1 - // parsed: 9 - // so we must multiply by 10 to get 90 + // number: x.90 + // significant digits: 2 + // parsed: 90 + // so this is correct else { - let diff = scale as u32 - (rhs_s + leading_zeros_rhs) as u32; - Some((lhs, rhs * 10i128.pow(diff))) + Some((lhs, rhs)) } - } - // scale: 2 - // number: x.90 - // significant digits: 2 - // parsed: 90 - // so this is correct - else { - Some((lhs, rhs)) - } - }) - .map(|(lhs, rhs)| { - lhs * 10i128.pow(scale as u32) + (if lhs < 0 { -rhs } else { rhs }) - }) - }), - (None, Some(rhs)) => { - if rhs.len() > precision as usize || rhs.len() != scale as usize { - return None; - } - parse_integer_checked(rhs) - }, - (Some(lhs), None) => { - if lhs.len() > precision as usize || scale != 0 { - return None; - } - parse_integer_checked(lhs) - }, - (None, None) => None, - } + }) + .map(|(lhs, rhs)| { + lhs * 10i128.pow(scale as u32) + (if lhs < 0 { -rhs } else { rhs }) + }) + }, + None => { + if lhs_b.len() > precision as usize || scale != 0 { + return None; + } + parse_integer_checked(lhs_b) + }, + } + }) } #[cfg(test)]