From 0b32e80146065d0d8783d769aec21d674c32d551 Mon Sep 17 00:00:00 2001 From: hgrsd Date: Tue, 2 Jul 2024 19:26:29 +0100 Subject: [PATCH] perf: string inference runs fewer expensive tests --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/infer_string.rs | 54 ++++++++++++++++++++++++++++----------------- 3 files changed, 36 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6026fed..bc19874 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -177,7 +177,7 @@ dependencies = [ [[package]] name = "drivel" -version = "0.2.1" +version = "0.2.2" dependencies = [ "chrono", "clap", diff --git a/Cargo.toml b/Cargo.toml index 9ebc9bc..cf5460a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ name = "drivel" description = "Infer a schema from JSON input, and generate synthetic data based on the inferred schema." license = "MIT" authors = ["Daniël Hogers "] -version = "0.2.1" +version = "0.2.2" edition = "2021" repository = "https://github.com/hgrsd/drivel" diff --git a/src/infer_string.rs b/src/infer_string.rs index 65e26a0..e75d538 100644 --- a/src/infer_string.rs +++ b/src/infer_string.rs @@ -12,26 +12,40 @@ lazy_static! 
{ } pub(crate) fn infer_string_type(s: &str) -> StringType { - if ISO_DATE_REGEX.is_match(s) { - StringType::IsoDate - } else if chrono::DateTime::parse_from_rfc2822(s).is_ok() { - StringType::DateTimeISO8601 - } else if chrono::DateTime::parse_from_rfc3339(s).is_ok() { - StringType::DateTimeISO8601 - } else if UUIDREGEX.is_match(s) { - StringType::UUID - } else if EMAIL_REGEX.is_match(s) { - StringType::Email - } else if url::Url::parse(s).is_ok() { - StringType::Url - } else if HOSTNAME_REGEX.is_match(s) { - StringType::Hostname - } else { - StringType::Unknown { - strings_seen: vec![s.to_owned()], - chars_seen: s.chars().collect(), - min_length: Some(s.len()), - max_length: Some(s.len()), + if s.len() == 36 && UUIDREGEX.is_match(s) { + return StringType::UUID; + } + + if s.contains('@') && EMAIL_REGEX.is_match(s) { + return StringType::Email; + } + + if s.contains('.') { + if url::Url::parse(s).is_ok() { + return StringType::Url; + } + if HOSTNAME_REGEX.is_match(s) { + return StringType::Hostname; + } + } + + if s.chars().take(1).all(|char| char.is_numeric()) { + if ISO_DATE_REGEX.is_match(s) { + return StringType::IsoDate; + } + if chrono::DateTime::parse_from_rfc3339(s).is_ok() { + return StringType::DateTimeISO8601; } } + + if chrono::DateTime::parse_from_rfc2822(s).is_ok() { + return StringType::DateTimeISO8601; + } + + return StringType::Unknown { + strings_seen: vec![s.to_owned()], + chars_seen: s.chars().collect(), + min_length: Some(s.len()), + max_length: Some(s.len()), + }; }