Change content_locale to be &LanguageIdentifier (#5566)

unicode-org · Oct 8, 2024 · 0bf22da · 0bf22da
1 parent d70e5cd
commit 0bf22da
Show file tree

Hide file tree

Showing 9 changed files with 76 additions and 76 deletions.
diff --git a/components/segmenter/src/line.rs b/components/segmenter/src/line.rs
@@ -11,6 +11,7 @@ use alloc::vec::Vec;
 use core::char;
 use core::str::CharIndices;
 use icu_locale_core::subtags::language;
+use icu_locale_core::LanguageIdentifier;
 use icu_provider::prelude::*;
 use utf8_iter::Utf8CharIndices;
 
@@ -184,8 +185,8 @@ pub enum LineBreakWordOption {
 
 /// Options to tailor line-breaking behavior.
 #[non_exhaustive]
-#[derive(Clone, PartialEq, Eq, Debug)]
-pub struct LineBreakOptions {
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub struct LineBreakOptions<'a> {
     /// Strictness of line-breaking rules. See [`LineBreakStrictness`].
     pub strictness: LineBreakStrictness,
 
@@ -198,10 +199,10 @@ pub struct LineBreakOptions {
     /// `Normal` or `Loose`. See
     /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
     /// This option has no effect in Latin-1 mode.
-    pub content_locale: Option<DataLocale>,
+    pub content_locale: Option<&'a LanguageIdentifier>,
 }
 
-impl Default for LineBreakOptions {
+impl Default for LineBreakOptions<'_> {
     fn default() -> Self {
         Self {
             strictness: LineBreakStrictness::Strict,
@@ -211,6 +212,28 @@ impl Default for LineBreakOptions {
     }
 }
 
+#[derive(Debug)]
+struct ResolvedLineBreakOptions {
+    strictness: LineBreakStrictness,
+    word_option: LineBreakWordOption,
+    ja_zh: bool,
+}
+
+impl From<LineBreakOptions<'_>> for ResolvedLineBreakOptions {
+    fn from(options: LineBreakOptions<'_>) -> Self {
+        let ja_zh = if let Some(content_locale) = options.content_locale.as_ref() {
+            content_locale.language == language!("ja") || content_locale.language == language!("zh")
+        } else {
+            false
+        };
+        Self {
+            strictness: options.strictness,
+            word_option: options.word_option,
+            ja_zh,
+        }
+    }
+}
+
 /// Line break iterator for an `str` (a UTF-8 string).
 ///
 /// For examples of use, see [`LineSegmenter`].
@@ -353,7 +376,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp
 /// ```
 #[derive(Debug)]
 pub struct LineSegmenter {
-    options: LineBreakOptions,
+    options: ResolvedLineBreakOptions,
     payload: DataPayload<LineBreakDataV2Marker>,
     complex: ComplexPayloads,
 }
@@ -536,7 +559,7 @@ impl LineSegmenter {
     #[cfg(feature = "compiled_data")]
     pub fn new_lstm_with_options(options: LineBreakOptions) -> Self {
         Self {
-            options,
+            options: options.into(),
             payload: DataPayload::from_static_ref(
                 crate::provider::Baked::SINGLETON_LINE_BREAK_DATA_V2_MARKER,
             ),
@@ -569,7 +592,7 @@ impl LineSegmenter {
             + ?Sized,
     {
         Ok(Self {
-            options,
+            options: options.into(),
             payload: provider.load(Default::default())?.payload,
             complex: ComplexPayloads::try_new_lstm(provider)?,
         })
@@ -589,7 +612,7 @@ impl LineSegmenter {
     #[cfg(feature = "compiled_data")]
     pub fn new_dictionary_with_options(options: LineBreakOptions) -> Self {
         Self {
-            options,
+            options: options.into(),
             payload: DataPayload::from_static_ref(
                 crate::provider::Baked::SINGLETON_LINE_BREAK_DATA_V2_MARKER,
             ),
@@ -626,7 +649,7 @@ impl LineSegmenter {
             + ?Sized,
     {
         Ok(Self {
-            options,
+            options: options.into(),
             payload: provider.load(Default::default())?.payload,
             // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
             // characters [1]. Southeast Asian languages however require complex context analysis
@@ -642,11 +665,6 @@ impl LineSegmenter {
     ///
     /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
     pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
-        let ja_zh = if let Some(content_locale) = &self.options.content_locale {
-            content_locale.language == language!("ja") || content_locale.language == language!("zh")
-        } else {
-            false
-        };
         LineBreakIterator {
             iter: input.char_indices(),
             len: input.len(),
@@ -655,7 +673,6 @@ impl LineSegmenter {
             data: self.payload.get(),
             options: &self.options,
             complex: &self.complex,
-            ja_zh,
         }
     }
     /// Creates a line break iterator for a potentially ill-formed UTF8 string
@@ -667,11 +684,6 @@ impl LineSegmenter {
         &'l self,
         input: &'s [u8],
     ) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
-        let ja_zh = if let Some(content_locale) = &self.options.content_locale {
-            content_locale.language == language!("ja") || content_locale.language == language!("zh")
-        } else {
-            false
-        };
         LineBreakIterator {
             iter: Utf8CharIndices::new(input),
             len: input.len(),
@@ -680,7 +692,6 @@ impl LineSegmenter {
             data: self.payload.get(),
             options: &self.options,
             complex: &self.complex,
-            ja_zh,
         }
     }
     /// Creates a line break iterator for a Latin-1 (8-bit) string.
@@ -695,19 +706,13 @@ impl LineSegmenter {
             data: self.payload.get(),
             options: &self.options,
             complex: &self.complex,
-            ja_zh: false,
         }
     }
 
     /// Creates a line break iterator for a UTF-16 string.
     ///
     /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
     pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
-        let ja_zh = if let Some(content_locale) = &self.options.content_locale {
-            content_locale.language == language!("ja") || content_locale.language == language!("zh")
-        } else {
-            false
-        };
         LineBreakIterator {
             iter: Utf16Indices::new(input),
             len: input.len(),
@@ -716,7 +721,6 @@ impl LineSegmenter {
             data: self.payload.get(),
             options: &self.options,
             complex: &self.complex,
-            ja_zh,
         }
     }
 }
@@ -871,9 +875,8 @@ pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
     current_pos_data: Option<(usize, Y::CharType)>,
     result_cache: Vec<usize>,
     data: &'l RuleBreakDataV2<'l>,
-    options: &'l LineBreakOptions,
+    options: &'l ResolvedLineBreakOptions,
     complex: &'l ComplexPayloads,
-    ja_zh: bool,
 }
 
 impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
@@ -969,7 +972,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
                         right_codepoint.into(),
                         left_prop,
                         right_prop,
-                        self.ja_zh,
+                        self.options.ja_zh,
                     ) {
                         if breakable && !after_zwj {
                             return self.get_current_position();
@@ -1172,7 +1175,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {
 
     fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
         match codepoint.into() {
-            0x301C | 0x30A0 => self.ja_zh,
+            0x301C | 0x30A0 => self.options.ja_zh,
             _ => false,
         }
     }

diff --git a/components/segmenter/src/sentence.rs b/components/segmenter/src/sentence.rs
@@ -3,6 +3,7 @@
 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
 
 use alloc::vec::Vec;
+use icu_locale_core::LanguageIdentifier;
 use icu_provider::prelude::*;
 
 use crate::indices::{Latin1Indices, Utf16Indices};
@@ -13,10 +14,10 @@ use utf8_iter::Utf8CharIndices;
 
 /// Options to tailor sentence breaking behavior.
 #[non_exhaustive]
-#[derive(Clone, PartialEq, Eq, Debug, Default)]
-pub struct SentenceBreakOptions {
+#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
+pub struct SentenceBreakOptions<'a> {
     /// Content locale for sentence segmenter.
-    pub content_locale: Option<DataLocale>,
+    pub content_locale: Option<&'a LanguageIdentifier>,
 }
 
 /// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
@@ -184,6 +185,7 @@ impl SentenceSegmenter {
     {
         let payload = provider.load(Default::default())?.payload;
         let payload_locale_override = if let Some(locale) = options.content_locale {
+            let locale = DataLocale::from(locale);
             let req = DataRequest {
                 id: DataIdentifierBorrowed::for_locale(&locale),
                 metadata: {

diff --git a/components/segmenter/src/word.rs b/components/segmenter/src/word.rs
@@ -11,15 +11,16 @@ use alloc::string::String;
 use alloc::vec;
 use alloc::vec::Vec;
 use core::str::CharIndices;
+use icu_locale_core::LanguageIdentifier;
 use icu_provider::prelude::*;
 use utf8_iter::Utf8CharIndices;
 
 /// Options to tailor word breaking behavior.
 #[non_exhaustive]
-#[derive(Clone, PartialEq, Eq, Debug, Default)]
-pub struct WordBreakOptions {
+#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
+pub struct WordBreakOptions<'a> {
     /// Content locale for word segmenter
-    pub content_locale: Option<DataLocale>,
+    pub content_locale: Option<&'a LanguageIdentifier>,
 }
 
 /// Implements the [`Iterator`] trait over the word boundaries of the given string.
@@ -280,6 +281,7 @@ impl WordSegmenter {
             payload: provider.load(Default::default())?.payload,
             complex: ComplexPayloads::try_new_auto(provider)?,
             payload_locale_override: if let Some(locale) = options.content_locale {
+                let locale = DataLocale::from(locale);
                 let req = DataRequest {
                     id: DataIdentifierBorrowed::for_locale(&locale),
                     metadata: {
@@ -405,6 +407,7 @@ impl WordSegmenter {
             payload: provider.load(Default::default())?.payload,
             complex: ComplexPayloads::try_new_lstm(provider)?,
             payload_locale_override: if let Some(locale) = options.content_locale {
+                let locale = DataLocale::from(locale);
                 let req = DataRequest {
                     id: DataIdentifierBorrowed::for_locale(&locale),
                     metadata: {
@@ -522,6 +525,7 @@ impl WordSegmenter {
             payload: provider.load(Default::default())?.payload,
             complex: ComplexPayloads::try_new_dict(provider)?,
             payload_locale_override: if let Some(locale) = options.content_locale {
+                let locale = DataLocale::from(locale);
                 let req = DataRequest {
                     id: DataIdentifierBorrowed::for_locale(&locale),
                     metadata: {

diff --git a/components/segmenter/tests/css_line_break.rs b/components/segmenter/tests/css_line_break.rs
@@ -2,7 +2,7 @@
 // called LICENSE at the top level of the ICU4X source tree
 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
 
-use icu::locale::locale;
+use icu_locale_core::{langid, LanguageIdentifier};
 use icu_segmenter::LineBreakOptions;
 use icu_segmenter::LineBreakStrictness;
 use icu_segmenter::LineBreakWordOption;
@@ -28,51 +28,37 @@ fn check_with_options(
     assert_eq!(expect_utf16, result, "{s}");
 }
 
+static JA: LanguageIdentifier = langid!("ja");
+
 fn strict(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
     let mut options = LineBreakOptions::default();
     options.strictness = LineBreakStrictness::Strict;
     options.word_option = LineBreakWordOption::Normal;
-    options.content_locale = if ja_zh {
-        Some(locale!("ja").into())
-    } else {
-        None
-    };
+    options.content_locale = ja_zh.then_some(&JA);
     check_with_options(s, expect_utf8, expect_utf16, options);
 }
 
 fn normal(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
     let mut options = LineBreakOptions::default();
     options.strictness = LineBreakStrictness::Normal;
     options.word_option = LineBreakWordOption::Normal;
-    options.content_locale = if ja_zh {
-        Some(locale!("ja").into())
-    } else {
-        None
-    };
+    options.content_locale = ja_zh.then_some(&JA);
     check_with_options(s, expect_utf8, expect_utf16, options);
 }
 
 fn loose(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
     let mut options = LineBreakOptions::default();
     options.strictness = LineBreakStrictness::Loose;
     options.word_option = LineBreakWordOption::Normal;
-    options.content_locale = if ja_zh {
-        Some(locale!("ja").into())
-    } else {
-        None
-    };
+    options.content_locale = ja_zh.then_some(&JA);
     check_with_options(s, expect_utf8, expect_utf16, options);
 }
 
 fn anywhere(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
     let mut options = LineBreakOptions::default();
     options.strictness = LineBreakStrictness::Anywhere;
     options.word_option = LineBreakWordOption::Normal;
-    options.content_locale = if ja_zh {
-        Some(locale!("ja").into())
-    } else {
-        None
-    };
+    options.content_locale = ja_zh.then_some(&JA);
     check_with_options(s, expect_utf8, expect_utf16, options);
 }
 

diff --git a/components/segmenter/tests/locale.rs b/components/segmenter/tests/locale.rs
@@ -2,7 +2,7 @@
 // called LICENSE at the top level of the ICU4X source tree
 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
 
-use icu_locale_core::locale;
+use icu_locale_core::langid;
 use icu_segmenter::{SentenceBreakOptions, SentenceSegmenter, WordBreakOptions, WordSegmenter};
 
 // Additional segmenter tests with locale.
@@ -12,7 +12,8 @@ fn word_break_with_locale() {
     // MidLetter is different because U+0x3A isn't MidLetter on Swedish.
     let s = "hello:world";
     let mut options_sv = WordBreakOptions::default();
-    options_sv.content_locale = Some(locale!("sv").into());
+    let langid = langid!("sv");
+    options_sv.content_locale = Some(&langid);
     let segmenter =
         WordSegmenter::try_new_auto_with_options(options_sv).expect("Loading should succeed!");
     let iter = segmenter.segment_str(s);
@@ -23,7 +24,8 @@ fn word_break_with_locale() {
     );
 
     let mut options_en = WordBreakOptions::default();
-    options_en.content_locale = Some(locale!("en").into());
+    let langid = langid!("en");
+    options_en.content_locale = Some(&langid);
     let segmenter =
         WordSegmenter::try_new_auto_with_options(options_en).expect("Loading should succeed!");
     let iter = segmenter.segment_str(s);
@@ -39,7 +41,8 @@ fn sentence_break_with_locale() {
     // SB11 is different because U+0x3B is STerm on Greek.
     let s = "hello; world";
     let mut options_el = SentenceBreakOptions::default();
-    options_el.content_locale = Some(locale!("el").into());
+    let langid = langid!("el");
+    options_el.content_locale = Some(&langid);
     let segmenter =
         SentenceSegmenter::try_new_with_options(options_el).expect("Loading should succeed!");
     let iter = segmenter.segment_str(s);
@@ -50,7 +53,8 @@ fn sentence_break_with_locale() {
     );
 
     let mut options_en = SentenceBreakOptions::default();
-    options_en.content_locale = Some(locale!("en").into());
+    let langid = langid!("en");
+    options_en.content_locale = Some(&langid);
     let segmenter =
         SentenceSegmenter::try_new_with_options(options_en).expect("Loading should succeed!");
     let iter = segmenter.segment_str(s);