From 6a55364bab1cfb9431a24cda3484fbc2bedd7596 Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Mon, 23 Dec 2024 16:20:39 +0100 Subject: [PATCH] Remove obsolete dependency on once_cell --- Cargo.lock | 1 - Cargo.toml | 1 - src/alphabet.rs | 40 ++++++++++++++++++------------------- src/bin/accuracy_reports.rs | 20 +++++++++---------- src/constant.rs | 21 +++++++++---------- src/detector.rs | 37 +++++++++++++++++----------------- 6 files changed, 59 insertions(+), 61 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d17afe0f..43bc106f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -719,7 +719,6 @@ dependencies = [ "lingua-yoruba-language-model", "lingua-zulu-language-model", "maplit", - "once_cell", "pyo3", "rayon", "regex 1.11.1", diff --git a/Cargo.toml b/Cargo.toml index 18fc6ebb..a02ecbe1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,7 +58,6 @@ fraction = "0.15.3" include_dir = "0.7.4" itertools = "0.13.0" maplit = "1.0.2" -once_cell = "1.20.2" regex = "1.11.1" serde = { version = "1.0.216", features = ["derive"] } serde_json = "1.0.133" diff --git a/src/alphabet.rs b/src/alphabet.rs index 9afdcaa2..1dba5f4b 100644 --- a/src/alphabet.rs +++ b/src/alphabet.rs @@ -15,9 +15,9 @@ */ use std::collections::HashMap; +use std::sync::LazyLock; use ahash::AHashSet; -use once_cell::sync::Lazy; use strum::IntoEnumIterator; use strum_macros::EnumIter; @@ -75,7 +75,7 @@ impl Alphabet { languages } - fn char_set(&self) -> &Lazy { + fn char_set(&self) -> &LazyLock { match self { Alphabet::Arabic => &ARABIC, Alphabet::Armenian => &ARMENIAN, @@ -137,21 +137,21 @@ impl CharSet { } } -static ARABIC: Lazy = Lazy::new(|| CharSet::from_char_class("Arabic")); -static ARMENIAN: Lazy = Lazy::new(|| CharSet::from_char_class("Armenian")); -static BENGALI: Lazy = Lazy::new(|| CharSet::from_char_class("Bengali")); -static CYRILLIC: Lazy = Lazy::new(|| CharSet::from_char_class("Cyrillic")); -static DEVANAGARI: Lazy = Lazy::new(|| CharSet::from_char_class("Devanagari")); -static GEORGIAN: Lazy = Lazy::new(|| CharSet::from_char_class("Georgian")); -static GREEK: Lazy = Lazy::new(|| CharSet::from_char_class("Greek")); -static GUJARATI: Lazy = Lazy::new(|| CharSet::from_char_class("Gujarati")); -static GURMUKHI: Lazy = Lazy::new(|| CharSet::from_char_class("Gurmukhi")); -static HAN: Lazy = Lazy::new(|| CharSet::from_char_class("Han")); -static HANGUL: Lazy = Lazy::new(|| CharSet::from_char_class("Hangul")); -static HEBREW: Lazy = Lazy::new(|| CharSet::from_char_class("Hebrew")); -static HIRAGANA: Lazy = Lazy::new(|| CharSet::from_char_class("Hiragana")); -static KATAKANA: Lazy = Lazy::new(|| CharSet::from_char_class("Katakana")); -static LATIN: Lazy = Lazy::new(|| CharSet::from_char_class("Latin")); -static TAMIL: Lazy = Lazy::new(|| CharSet::from_char_class("Tamil")); -static TELUGU: Lazy = Lazy::new(|| CharSet::from_char_class("Telugu")); -static THAI: Lazy = Lazy::new(|| CharSet::from_char_class("Thai")); +static ARABIC: LazyLock = LazyLock::new(|| CharSet::from_char_class("Arabic")); +static ARMENIAN: LazyLock = LazyLock::new(|| CharSet::from_char_class("Armenian")); +static BENGALI: LazyLock = LazyLock::new(|| CharSet::from_char_class("Bengali")); +static CYRILLIC: LazyLock = LazyLock::new(|| CharSet::from_char_class("Cyrillic")); +static DEVANAGARI: LazyLock = LazyLock::new(|| CharSet::from_char_class("Devanagari")); +static GEORGIAN: LazyLock = LazyLock::new(|| CharSet::from_char_class("Georgian")); +static GREEK: LazyLock = LazyLock::new(|| CharSet::from_char_class("Greek")); +static GUJARATI: LazyLock = LazyLock::new(|| CharSet::from_char_class("Gujarati")); +static GURMUKHI: LazyLock = LazyLock::new(|| CharSet::from_char_class("Gurmukhi")); +static HAN: LazyLock = LazyLock::new(|| CharSet::from_char_class("Han")); +static HANGUL: LazyLock = LazyLock::new(|| CharSet::from_char_class("Hangul")); +static HEBREW: LazyLock = LazyLock::new(|| CharSet::from_char_class("Hebrew")); +static HIRAGANA: LazyLock = LazyLock::new(|| CharSet::from_char_class("Hiragana")); +static KATAKANA: LazyLock = LazyLock::new(|| CharSet::from_char_class("Katakana")); +static LATIN: LazyLock = LazyLock::new(|| CharSet::from_char_class("Latin")); +static TAMIL: LazyLock = LazyLock::new(|| CharSet::from_char_class("Tamil")); +static TELUGU: LazyLock = LazyLock::new(|| CharSet::from_char_class("Telugu")); +static THAI: LazyLock = LazyLock::new(|| CharSet::from_char_class("Thai")); diff --git a/src/bin/accuracy_reports.rs b/src/bin/accuracy_reports.rs index 87284713..23cc7bac 100644 --- a/src/bin/accuracy_reports.rs +++ b/src/bin/accuracy_reports.rs @@ -18,6 +18,7 @@ use std::collections::HashMap; use std::fs; use std::io::Write; use std::path::{Path, PathBuf}; +use std::sync::LazyLock; use std::time::Instant; use cld2::{detect_language as cld2_detect_language, Format, Lang as CLD2Language}; @@ -25,7 +26,6 @@ use fraction::{Decimal, Zero}; use include_dir::Dir; use indoc::formatdoc; use itertools::Itertools; -use once_cell::sync::Lazy; use strum::IntoEnumIterator; use titlecase::titlecase; use whatlang::{Detector, Lang as WhatlangLanguage}; @@ -329,16 +329,16 @@ impl Statistic { } } -static WHATLANG_DETECTOR: Lazy = Lazy::new(Detector::new); +static WHATLANG_DETECTOR: LazyLock = LazyLock::new(Detector::new); -static LINGUA_DETECTOR_WITH_LOW_ACCURACY: Lazy = Lazy::new(|| { +static LINGUA_DETECTOR_WITH_LOW_ACCURACY: LazyLock = LazyLock::new(|| { LanguageDetectorBuilder::from_all_languages() .with_low_accuracy_mode() .with_preloaded_language_models() .build() }); -static LINGUA_DETECTOR_WITH_HIGH_ACCURACY: Lazy = Lazy::new(|| { +static LINGUA_DETECTOR_WITH_HIGH_ACCURACY: LazyLock = LazyLock::new(|| { LanguageDetectorBuilder::from_all_languages() .with_preloaded_language_models() .build() @@ -390,14 +390,14 @@ fn get_file_content(file_name: &str) -> HashMap> { .collect() } -static SINGLE_WORDS: Lazy>> = - Lazy::new(|| get_file_content("single-words.txt")); +static SINGLE_WORDS: LazyLock>> = + LazyLock::new(|| get_file_content("single-words.txt")); -static WORD_PAIRS: Lazy>> = - Lazy::new(|| get_file_content("word-pairs.txt")); +static WORD_PAIRS: LazyLock>> = + LazyLock::new(|| get_file_content("word-pairs.txt")); -static SENTENCES: Lazy>> = - Lazy::new(|| get_file_content("sentences.txt")); +static SENTENCES: LazyLock>> = + LazyLock::new(|| get_file_content("sentences.txt")); fn collect_statistics( detector_name: &str, diff --git a/src/constant.rs b/src/constant.rs index 33092fcd..28c998dc 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -16,32 +16,33 @@ use std::collections::{HashMap, HashSet}; use std::str::FromStr; +use std::sync::LazyLock; -use once_cell::sync::Lazy; use regex::Regex; use crate::alphabet::CharSet; use crate::language::Language; -pub(crate) static JAPANESE_CHARACTER_SET: Lazy = - Lazy::new(|| CharSet::from_char_classes(&["Hiragana", "Katakana", "Han"])); -pub(crate) static MULTIPLE_WHITESPACE: Lazy = Lazy::new(|| Regex::new("\\s+").unwrap()); -pub(crate) static NUMBERS: Lazy = Lazy::new(|| Regex::new("\\p{N}").unwrap()); -pub(crate) static PUNCTUATION: Lazy = Lazy::new(|| Regex::new("\\p{P}").unwrap()); -pub(crate) static TOKENS_WITHOUT_WHITESPACE: Lazy = Lazy::new(|| { +pub(crate) static JAPANESE_CHARACTER_SET: LazyLock = + LazyLock::new(|| CharSet::from_char_classes(&["Hiragana", "Katakana", "Han"])); +pub(crate) static MULTIPLE_WHITESPACE: LazyLock = + LazyLock::new(|| Regex::new("\\s+").unwrap()); +pub(crate) static NUMBERS: LazyLock = LazyLock::new(|| Regex::new("\\p{N}").unwrap()); +pub(crate) static PUNCTUATION: LazyLock = LazyLock::new(|| Regex::new("\\p{P}").unwrap()); +pub(crate) static TOKENS_WITHOUT_WHITESPACE: LazyLock = LazyLock::new(|| { Regex::new( "\\p{Bengali}+|\\p{Devanagari}+|\\p{Gujarati}+|\\p{Gurmukhi}+|\\p{Han}|\\p{Hangul}+|\\p{Hiragana}|\\p{Katakana}|\\p{Tamil}+|\\p{Telugu}+|\\p{Thai}+|\\p{L}+", ) .unwrap() }); -pub(crate) static TOKENS_WITH_OPTIONAL_WHITESPACE: Lazy = Lazy::new(|| { +pub(crate) static TOKENS_WITH_OPTIONAL_WHITESPACE: LazyLock = LazyLock::new(|| { Regex::new( "\\s*(?:\\p{Bengali}+|\\p{Devanagari}+|\\p{Gujarati}+|\\p{Gurmukhi}+|\\p{Han}|\\p{Hangul}+|\\p{Hiragana}|\\p{Katakana}|\\p{Tamil}+|\\p{Telugu}+|\\p{Thai}+|[\\p{L}'-]+)[\\p{N}\\p{P}]*\\s*", ) .unwrap() }); -pub(crate) static CHARS_TO_LANGUAGES_MAPPING: Lazy>> = - Lazy::new(|| { +pub(crate) static CHARS_TO_LANGUAGES_MAPPING: LazyLock>> = + LazyLock::new(|| { let mut mapping = hashmap!(); if cfg!(feature = "portuguese") || cfg!(feature = "vietnamese") { diff --git a/src/detector.rs b/src/detector.rs index fc1ca4f8..1fca7568 100644 --- a/src/detector.rs +++ b/src/detector.rs @@ -18,13 +18,12 @@ use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; use std::hash::Hash; use std::str::FromStr; -use std::sync::RwLock; +use std::sync::{LazyLock, RwLock}; use ahash::AHashMap; use compact_str::CompactString; use fraction::Zero; use itertools::Itertools; -use once_cell::sync::Lazy; #[cfg(not(target_family = "wasm"))] use rayon::prelude::*; use strum::IntoEnumIterator; @@ -39,15 +38,15 @@ use crate::language::Language; use crate::model::{TestDataLanguageModel, TrainingDataLanguageModel}; use crate::result::DetectionResult; -type LazyLanguageModelMap = Lazy>>>; +type LazyLanguageModelMap = LazyLock>>>; type StaticLanguageModelMap = &'static RwLock>>; type LanguageModelArray<'a> = [Option<&'a HashMap>>; 5]; -static UNIGRAM_MODELS: LazyLanguageModelMap = Lazy::new(|| RwLock::new(HashMap::new())); -static BIGRAM_MODELS: LazyLanguageModelMap = Lazy::new(|| RwLock::new(HashMap::new())); -static TRIGRAM_MODELS: LazyLanguageModelMap = Lazy::new(|| RwLock::new(HashMap::new())); -static QUADRIGRAM_MODELS: LazyLanguageModelMap = Lazy::new(|| RwLock::new(HashMap::new())); -static FIVEGRAM_MODELS: LazyLanguageModelMap = Lazy::new(|| RwLock::new(HashMap::new())); +static UNIGRAM_MODELS: LazyLanguageModelMap = LazyLock::new(|| RwLock::new(HashMap::new())); +static BIGRAM_MODELS: LazyLanguageModelMap = LazyLock::new(|| RwLock::new(HashMap::new())); +static TRIGRAM_MODELS: LazyLanguageModelMap = LazyLock::new(|| RwLock::new(HashMap::new())); +static QUADRIGRAM_MODELS: LazyLanguageModelMap = LazyLock::new(|| RwLock::new(HashMap::new())); +static FIVEGRAM_MODELS: LazyLanguageModelMap = LazyLock::new(|| RwLock::new(HashMap::new())); /// This struct detects the language of given input text. #[cfg_attr(feature = "python", pyo3::prelude::pyclass(module = "lingua"))] @@ -1343,8 +1342,8 @@ fn merge_adjacent_results( #[cfg(test)] mod tests { use float_cmp::approx_eq; - use once_cell::sync::OnceCell; use rstest::*; + use std::sync::OnceLock; use crate::builder::LanguageDetectorBuilder; use crate::language::Language::*; @@ -1499,9 +1498,9 @@ mod tests { unigram_language_model_for_english: AHashMap, unigram_language_model_for_german: AHashMap, ) -> StaticLanguageModelMap { - static UNIGRAM_MODELS_FIXTURE: OnceCell< + static UNIGRAM_MODELS_FIXTURE: OnceLock< RwLock>>, - > = OnceCell::new(); + > = OnceLock::new(); UNIGRAM_MODELS_FIXTURE.get_or_init(|| { RwLock::new(hashmap!( English => unigram_language_model_for_english, @@ -1515,9 +1514,9 @@ mod tests { bigram_language_model_for_english: AHashMap, bigram_language_model_for_german: AHashMap, ) -> StaticLanguageModelMap { - static BIGRAM_MODELS_FIXTURE: OnceCell< + static BIGRAM_MODELS_FIXTURE: OnceLock< RwLock>>, - > = OnceCell::new(); + > = OnceLock::new(); BIGRAM_MODELS_FIXTURE.get_or_init(|| { RwLock::new(hashmap!( English => bigram_language_model_for_english, @@ -1531,9 +1530,9 @@ mod tests { trigram_language_model_for_english: AHashMap, trigram_language_model_for_german: AHashMap, ) -> StaticLanguageModelMap { - static TRIGRAM_MODELS_FIXTURE: OnceCell< + static TRIGRAM_MODELS_FIXTURE: OnceLock< RwLock>>, - > = OnceCell::new(); + > = OnceLock::new(); TRIGRAM_MODELS_FIXTURE.get_or_init(|| { RwLock::new(hashmap!( English => trigram_language_model_for_english, @@ -1547,9 +1546,9 @@ mod tests { quadrigram_language_model_for_english: AHashMap, quadrigram_language_model_for_german: AHashMap, ) -> StaticLanguageModelMap { - static QUADRIGRAM_MODELS_FIXTURE: OnceCell< + static QUADRIGRAM_MODELS_FIXTURE: OnceLock< RwLock>>, - > = OnceCell::new(); + > = OnceLock::new(); QUADRIGRAM_MODELS_FIXTURE.get_or_init(|| { RwLock::new(hashmap!( English => quadrigram_language_model_for_english, @@ -1563,9 +1562,9 @@ mod tests { fivegram_language_model_for_english: AHashMap, fivegram_language_model_for_german: AHashMap, ) -> StaticLanguageModelMap { - static FIVEGRAM_MODELS_FIXTURE: OnceCell< + static FIVEGRAM_MODELS_FIXTURE: OnceLock< RwLock>>, - > = OnceCell::new(); + > = OnceLock::new(); FIVEGRAM_MODELS_FIXTURE.get_or_init(|| { RwLock::new(hashmap!( English => fivegram_language_model_for_english,