Skip to content

Commit

Permalink
Fix the case of "centi-meter" and "100-kilometer" (#4418)
Browse files Browse the repository at this point in the history
  • Loading branch information
younies authored Jan 5, 2024
1 parent f36d948 commit e804871
Show file tree
Hide file tree
Showing 10 changed files with 357 additions and 312 deletions.
2 changes: 2 additions & 0 deletions experimental/unitsconversion/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ pub mod provider;
pub mod si_prefix;

/// Represents the possible errors that can occur during the measurement unit operations.
#[derive(Debug)]
pub enum ConversionError {
/// The unit is not valid.
/// This can happen if the unit id is not following the CLDR specification.
Expand Down
137 changes: 74 additions & 63 deletions experimental/unitsconversion/src/measureunit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,75 +3,76 @@
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use smallvec::SmallVec;
use zerotrie::ZeroTrie;
use zerotrie::ZeroTrieSimpleAscii;
use zerovec::ZeroVec;

use crate::{
power::get_power,
provider::{Base, MeasureUnitItem, SiPrefix},
si_prefix::{get_si_prefix_base_ten, get_si_prefix_base_two},
si_prefix::get_si_prefix,
ConversionError,
};

// TODO: add test cases for this parser after adding UnitsTest.txt to the test data.
/// A parser for the CLDR unit identifier (e.g. `meter-per-square-second`).
///
/// The parser only borrows its lookup data for the lifetime `'data`; it does not own it.
pub struct MeasureUnitParser<'data> {
/// Borrowed trie payload. `get_unit_id` walks it byte by byte and uses the values stored in it
/// as unit ids.
payload: &'data ZeroTrie<ZeroVec<'data, u8>>,
payload: &'data ZeroTrieSimpleAscii<ZeroVec<'data, u8>>,
}

impl<'data> MeasureUnitParser<'data> {
// TODO: revisit the public nature of the API. Maybe we should make it private and add a function to create it from a ConverterFactory.
/// Creates a new `MeasureUnitParser` that borrows the given trie payload.
///
/// The reference is stored as-is for the lifetime `'data`; nothing is copied.
/// This constructor is only compiled when the `datagen` feature is enabled.
#[cfg(feature = "datagen")]
pub fn from_payload(payload: &'data ZeroTrie<ZeroVec<'data, u8>>) -> Self {
pub fn from_payload(payload: &'data ZeroTrieSimpleAscii<ZeroVec<u8>>) -> Self {
Self { payload }
}

// TODO: complete all the cases for the prefixes.
// TODO: consider using a trie for the prefixes.
/// Extracts the SI prefix.
/// Get the unit id.
/// NOTE:
/// if the prefix is found, the function will return (SiPrefix, part without the prefix string).
/// if the prefix is not found, the function will return (SiPrefix { power: 0, base: Base::Decimal }, part).
fn get_si_prefix(part: &str) -> (SiPrefix, &str) {
let (si_prefix_base_10, part) = get_si_prefix_base_ten(part);
if si_prefix_base_10 != 0 {
return (
SiPrefix {
power: si_prefix_base_10,
base: Base::Decimal,
},
part,
);
/// if the unit id is found, the function will return (unit id, part without the unit id and without `-` at the beginning of the remaining part if it exists).
/// if the unit id is not found, the function will return an error.
fn get_unit_id<'a>(&'a self, part: &'a str) -> Result<(u16, &str), ConversionError> {
let mut cursor = self.payload.cursor();
let mut longest_match = Err(ConversionError::InvalidUnit);

for (i, byte) in part.bytes().enumerate() {
cursor.step(byte);
if cursor.is_empty() {
break;
}
if let Some(value) = cursor.take_value() {
longest_match = Ok((value as u16, &part[i + 1..]));
}
}
longest_match
}

let (si_prefix_base_2, part) = get_si_prefix_base_two(part);
if si_prefix_base_2 != 0 {
return (
SiPrefix {
power: si_prefix_base_2,
base: Base::Binary,
},
part,
);
/// Extracts an optional leading power (e.g. `square`, `pow3`) from `part`.
///
/// Delegates to the free function `get_power`. When the returned remainder is as long as `part`
/// (no power text was consumed), `(power, part)` is returned unchanged. Otherwise the remainder
/// must start with `-`; that dash is stripped and the text after it is returned.
///
/// # Errors
/// Returns `ConversionError::InvalidUnit` when power text was consumed but the remainder does
/// not start with `-`.
fn get_power<'a>(&'a self, part: &'a str) -> Result<(u8, &str), ConversionError> {
let (power, part_without_power) = get_power(part);

// If the power is not found, return the part as it is.
if part_without_power.len() == part.len() {
return Ok((power, part));
}

// NOTE(review): the tuple expression below looks like a removed line of the old
// `get_si_prefix` carried over by the diff view, not part of this function — confirm.
(
SiPrefix {
power: 0,
base: Base::Decimal,
},
part,
)
// If the power is found, the text remaining after it must start with the `-` sign.
match part_without_power.strip_prefix('-') {
Some(part_without_power) => Ok((power, part_without_power)),
None => Err(ConversionError::InvalidUnit),
}
}

/// Get the unit id.
/// NOTE:
/// if the unit id is found, the function will return (unit id, part without the unit id and without `-` at the beginning of the remaining part if it exists).
/// if the unit id is not found, the function will return None.
fn get_unit_id(&self, part: &'data str) -> Option<usize> {
self.payload.get(part.as_bytes())
/// Splits an optional SI prefix off the front of `part`.
///
/// Delegates to the free function `get_si_prefix`. If the remainder it reports is as long as
/// `part` (nothing was consumed), `part` is handed back unchanged. Otherwise a single leading
/// `-` directly after the prefix is dropped when present; a missing dash is not an error.
fn get_si_prefix<'a>(&'a self, part: &'a str) -> (SiPrefix, &str) {
    let (prefix, rest) = get_si_prefix(part);
    let remainder = if rest.len() == part.len() {
        // Nothing was consumed: do not touch the input, not even a leading dash.
        part
    } else {
        rest.strip_prefix('-').unwrap_or(rest)
    };
    (prefix, remainder)
}

/// Process a part of an identifier.
Expand All @@ -83,31 +84,41 @@ impl<'data> MeasureUnitParser<'data> {
sign: i8,
result: &mut Vec<MeasureUnitItem>,
) -> Result<(), ConversionError> {
if identifier_part.is_empty() {
return Ok(());
}
let mut identifier_split = identifier_part.split('-');
while let Some(mut part) = identifier_split.next() {
let power = match get_power(part) {
Some(power) => {
part = identifier_split
.next()
.ok_or(ConversionError::InvalidUnit)?;
power
}
None => 1,
};

let (si_prefix, identifier_after_si) = Self::get_si_prefix(part);
let unit_id = self
.get_unit_id(identifier_after_si)
.ok_or(ConversionError::InvalidUnit)?;
let mut identifier_part = identifier_part;
while !identifier_part.is_empty() {
let (power, identifier_part_without_power) = self.get_power(identifier_part)?;
let (si_prefix, unit_id, identifier_part_without_unit_id) =
match self.get_unit_id(identifier_part_without_power) {
Ok((unit_id, identifier_part_without_unit_id)) => (
SiPrefix {
power: 0,
base: Base::Decimal,
},
unit_id,
identifier_part_without_unit_id,
),
Err(_) => {
let (si_prefix, identifier_part_without_si_prefix) =
self.get_si_prefix(identifier_part_without_power);
let (unit_id, identifier_part_without_unit_id) =
self.get_unit_id(identifier_part_without_si_prefix)?;
(si_prefix, unit_id, identifier_part_without_unit_id)
}
};

result.push(MeasureUnitItem {
power: sign * power,
power: sign * power as i8,
si_prefix,
unit_id: unit_id as u16,
unit_id,
});

identifier_part = match identifier_part_without_unit_id.len() {
0 => identifier_part_without_unit_id,
_ if identifier_part_without_unit_id.starts_with('-') => {
&identifier_part_without_unit_id[1..]
}
_ => return Err(ConversionError::InvalidUnit),
};
}

Ok(())
Expand All @@ -119,7 +130,7 @@ impl<'data> MeasureUnitParser<'data> {
&self,
identifier: &'data str,
) -> Result<Vec<MeasureUnitItem>, ConversionError> {
if identifier.starts_with('-') {
if identifier.starts_with('-') || identifier.ends_with('-') {
return Err(ConversionError::InvalidUnit);
}

Expand Down
57 changes: 38 additions & 19 deletions experimental/unitsconversion/src/power.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,46 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use zerotrie::ZeroTrieSimpleAscii;

/// A trie that maps power keywords to their numeric exponent.
///
/// `square` and `pow2` both map to 2, `cubic` and `pow3` both map to 3, and `pow1` through
/// `pow15` map to 1 through 15.
///
/// NOTE(review): the entries are listed in ascending byte order (`pow10` before `pow2`), which
/// the constructor name `from_sorted_str_tuples` suggests is required — keep that order when
/// adding entries. The `[u8; 64]` store size presumably has to match the built trie — confirm.
const POWERS_TRIE: ZeroTrieSimpleAscii<[u8; 64]> = ZeroTrieSimpleAscii::from_sorted_str_tuples(&[
("cubic", 3),
("pow1", 1),
("pow10", 10),
("pow11", 11),
("pow12", 12),
("pow13", 13),
("pow14", 14),
("pow15", 15),
("pow2", 2),
("pow3", 3),
("pow4", 4),
("pow5", 5),
("pow6", 6),
("pow7", 7),
("pow8", 8),
("pow9", 9),
("square", 2),
]);

// TODO: consider returning Option<(u8, &str)> instead of (1, part) for the case when the power is not found.
// TODO: complete all the cases for the powers.
// TODO: consider using a trie for the powers.
/// Converts a power string to a power.
pub fn get_power(part: &str) -> Option<i8> {
match part {
"pow1" => Some(1),
"square" | "pow2" => Some(2),
"cubic" | "pow3" => Some(3),
"pow4" => Some(4),
"pow5" => Some(5),
"pow6" => Some(6),
"pow7" => Some(7),
"pow8" => Some(8),
"pow9" => Some(9),
"pow10" => Some(10),
"pow11" => Some(11),
"pow12" => Some(12),
"pow13" => Some(13),
"pow14" => Some(14),
"pow15" => Some(15),
_ => None,
/// Splits a leading power keyword off the given CLDR ID part.
/// - When no power keyword starts `part`, the result is `(1, part)`.
/// - When one does, the result is `(power, text after the keyword)`. The longest matching
///   keyword is used, so `pow10` is preferred over `pow1`.
pub fn get_power(part: &str) -> (u8, &str) {
    let mut walker = POWERS_TRIE.cursor();
    // Best match so far: the exponent and how many bytes of `part` its keyword covers.
    let mut power: u8 = 1;
    let mut consumed = 0;
    for (offset, byte) in part.bytes().enumerate() {
        walker.step(byte);
        if walker.is_empty() {
            break;
        }
        if let Some(found) = walker.take_value() {
            power = found as u8;
            consumed = offset + 1;
        }
    }
    (power, &part[consumed..])
}
Loading

0 comments on commit e804871

Please sign in to comment.