Skip to content

Commit

Permalink
Change content_locale to be &LanguageIdentifier (#5566)
Browse files Browse the repository at this point in the history
  • Loading branch information
sffc authored Oct 8, 2024
1 parent d70e5cd commit 0bf22da
Show file tree
Hide file tree
Showing 9 changed files with 76 additions and 76 deletions.
67 changes: 35 additions & 32 deletions components/segmenter/src/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use alloc::vec::Vec;
use core::char;
use core::str::CharIndices;
use icu_locale_core::subtags::language;
use icu_locale_core::LanguageIdentifier;
use icu_provider::prelude::*;
use utf8_iter::Utf8CharIndices;

Expand Down Expand Up @@ -184,8 +185,8 @@ pub enum LineBreakWordOption {

/// Options to tailor line-breaking behavior.
#[non_exhaustive]
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct LineBreakOptions {
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub struct LineBreakOptions<'a> {
/// Strictness of line-breaking rules. See [`LineBreakStrictness`].
pub strictness: LineBreakStrictness,

Expand All @@ -198,10 +199,10 @@ pub struct LineBreakOptions {
/// `Normal` or `Loose`. See
/// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
/// This option has no effect in Latin-1 mode.
pub content_locale: Option<DataLocale>,
pub content_locale: Option<&'a LanguageIdentifier>,
}

impl Default for LineBreakOptions {
impl Default for LineBreakOptions<'_> {
fn default() -> Self {
Self {
strictness: LineBreakStrictness::Strict,
Expand All @@ -211,6 +212,28 @@ impl Default for LineBreakOptions {
}
}

/// Internal form of [`LineBreakOptions`] stored by the segmenter.
///
/// It is built through the `From<LineBreakOptions<'_>>` conversion, which
/// replaces the borrowed `content_locale` with the precomputed `ja_zh` flag,
/// so this type carries no lifetime parameter.
#[derive(Debug)]
struct ResolvedLineBreakOptions {
/// Copied unchanged from [`LineBreakOptions::strictness`].
strictness: LineBreakStrictness,
/// Copied unchanged from [`LineBreakOptions::word_option`].
word_option: LineBreakWordOption,
/// `true` when a content locale was supplied and its language subtag is
/// `ja` or `zh`; `false` otherwise, including when no locale was given.
ja_zh: bool,
}

impl From<LineBreakOptions<'_>> for ResolvedLineBreakOptions {
    /// Resolves the public options into their stored form.
    ///
    /// The optional content locale is reduced to a single flag recording
    /// whether its language subtag is `ja` or `zh`; a missing locale yields
    /// `false`. The strictness and word option are carried over as-is.
    fn from(options: LineBreakOptions<'_>) -> Self {
        let ja_zh = match options.content_locale {
            Some(locale) => {
                locale.language == language!("ja") || locale.language == language!("zh")
            }
            None => false,
        };

        Self {
            ja_zh,
            strictness: options.strictness,
            word_option: options.word_option,
        }
    }
}

/// Line break iterator for an `str` (a UTF-8 string).
///
/// For examples of use, see [`LineSegmenter`].
Expand Down Expand Up @@ -353,7 +376,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp
/// ```
#[derive(Debug)]
pub struct LineSegmenter {
options: LineBreakOptions,
options: ResolvedLineBreakOptions,
payload: DataPayload<LineBreakDataV2Marker>,
complex: ComplexPayloads,
}
Expand Down Expand Up @@ -536,7 +559,7 @@ impl LineSegmenter {
#[cfg(feature = "compiled_data")]
pub fn new_lstm_with_options(options: LineBreakOptions) -> Self {
Self {
options,
options: options.into(),
payload: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_LINE_BREAK_DATA_V2_MARKER,
),
Expand Down Expand Up @@ -569,7 +592,7 @@ impl LineSegmenter {
+ ?Sized,
{
Ok(Self {
options,
options: options.into(),
payload: provider.load(Default::default())?.payload,
complex: ComplexPayloads::try_new_lstm(provider)?,
})
Expand All @@ -589,7 +612,7 @@ impl LineSegmenter {
#[cfg(feature = "compiled_data")]
pub fn new_dictionary_with_options(options: LineBreakOptions) -> Self {
Self {
options,
options: options.into(),
payload: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_LINE_BREAK_DATA_V2_MARKER,
),
Expand Down Expand Up @@ -626,7 +649,7 @@ impl LineSegmenter {
+ ?Sized,
{
Ok(Self {
options,
options: options.into(),
payload: provider.load(Default::default())?.payload,
// Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handle CJK
// characters [1]. Southeast Asian languages however require complex context analysis
Expand All @@ -642,11 +665,6 @@ impl LineSegmenter {
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
LineBreakIterator {
iter: input.char_indices(),
len: input.len(),
Expand All @@ -655,7 +673,6 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh,
}
}
/// Creates a line break iterator for a potentially ill-formed UTF8 string
Expand All @@ -667,11 +684,6 @@ impl LineSegmenter {
&'l self,
input: &'s [u8],
) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
LineBreakIterator {
iter: Utf8CharIndices::new(input),
len: input.len(),
Expand All @@ -680,7 +692,6 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh,
}
}
/// Creates a line break iterator for a Latin-1 (8-bit) string.
Expand All @@ -695,19 +706,13 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh: false,
}
}

/// Creates a line break iterator for a UTF-16 string.
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
LineBreakIterator {
iter: Utf16Indices::new(input),
len: input.len(),
Expand All @@ -716,7 +721,6 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh,
}
}
}
Expand Down Expand Up @@ -871,9 +875,8 @@ pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
current_pos_data: Option<(usize, Y::CharType)>,
result_cache: Vec<usize>,
data: &'l RuleBreakDataV2<'l>,
options: &'l LineBreakOptions,
options: &'l ResolvedLineBreakOptions,
complex: &'l ComplexPayloads,
ja_zh: bool,
}

impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
Expand Down Expand Up @@ -969,7 +972,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
right_codepoint.into(),
left_prop,
right_prop,
self.ja_zh,
self.options.ja_zh,
) {
if breakable && !after_zwj {
return self.get_current_position();
Expand Down Expand Up @@ -1172,7 +1175,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {

fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
match codepoint.into() {
0x301C | 0x30A0 => self.ja_zh,
0x301C | 0x30A0 => self.options.ja_zh,
_ => false,
}
}
Expand Down
8 changes: 5 additions & 3 deletions components/segmenter/src/sentence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use alloc::vec::Vec;
use icu_locale_core::LanguageIdentifier;
use icu_provider::prelude::*;

use crate::indices::{Latin1Indices, Utf16Indices};
Expand All @@ -13,10 +14,10 @@ use utf8_iter::Utf8CharIndices;

/// Options to tailor sentence breaking behavior.
#[non_exhaustive]
#[derive(Clone, PartialEq, Eq, Debug, Default)]
pub struct SentenceBreakOptions {
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct SentenceBreakOptions<'a> {
/// Content locale for sentence segmenter.
pub content_locale: Option<DataLocale>,
pub content_locale: Option<&'a LanguageIdentifier>,
}

/// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
Expand Down Expand Up @@ -184,6 +185,7 @@ impl SentenceSegmenter {
{
let payload = provider.load(Default::default())?.payload;
let payload_locale_override = if let Some(locale) = options.content_locale {
let locale = DataLocale::from(locale);
let req = DataRequest {
id: DataIdentifierBorrowed::for_locale(&locale),
metadata: {
Expand Down
10 changes: 7 additions & 3 deletions components/segmenter/src/word.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,16 @@ use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;
use core::str::CharIndices;
use icu_locale_core::LanguageIdentifier;
use icu_provider::prelude::*;
use utf8_iter::Utf8CharIndices;

/// Options to tailor word breaking behavior.
#[non_exhaustive]
#[derive(Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakOptions {
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakOptions<'a> {
/// Content locale for word segmenter
pub content_locale: Option<DataLocale>,
pub content_locale: Option<&'a LanguageIdentifier>,
}

/// Implements the [`Iterator`] trait over the word boundaries of the given string.
Expand Down Expand Up @@ -280,6 +281,7 @@ impl WordSegmenter {
payload: provider.load(Default::default())?.payload,
complex: ComplexPayloads::try_new_auto(provider)?,
payload_locale_override: if let Some(locale) = options.content_locale {
let locale = DataLocale::from(locale);
let req = DataRequest {
id: DataIdentifierBorrowed::for_locale(&locale),
metadata: {
Expand Down Expand Up @@ -405,6 +407,7 @@ impl WordSegmenter {
payload: provider.load(Default::default())?.payload,
complex: ComplexPayloads::try_new_lstm(provider)?,
payload_locale_override: if let Some(locale) = options.content_locale {
let locale = DataLocale::from(locale);
let req = DataRequest {
id: DataIdentifierBorrowed::for_locale(&locale),
metadata: {
Expand Down Expand Up @@ -522,6 +525,7 @@ impl WordSegmenter {
payload: provider.load(Default::default())?.payload,
complex: ComplexPayloads::try_new_dict(provider)?,
payload_locale_override: if let Some(locale) = options.content_locale {
let locale = DataLocale::from(locale);
let req = DataRequest {
id: DataIdentifierBorrowed::for_locale(&locale),
metadata: {
Expand Down
28 changes: 7 additions & 21 deletions components/segmenter/tests/css_line_break.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use icu::locale::locale;
use icu_locale_core::{langid, LanguageIdentifier};
use icu_segmenter::LineBreakOptions;
use icu_segmenter::LineBreakStrictness;
use icu_segmenter::LineBreakWordOption;
Expand All @@ -28,51 +28,37 @@ fn check_with_options(
assert_eq!(expect_utf16, result, "{s}");
}

static JA: LanguageIdentifier = langid!("ja");

fn strict(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Strict;
options.word_option = LineBreakWordOption::Normal;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
options.content_locale = ja_zh.then_some(&JA);
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn normal(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Normal;
options.word_option = LineBreakWordOption::Normal;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
options.content_locale = ja_zh.then_some(&JA);
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn loose(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Loose;
options.word_option = LineBreakWordOption::Normal;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
options.content_locale = ja_zh.then_some(&JA);
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn anywhere(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Anywhere;
options.word_option = LineBreakWordOption::Normal;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
options.content_locale = ja_zh.then_some(&JA);
check_with_options(s, expect_utf8, expect_utf16, options);
}

Expand Down
14 changes: 9 additions & 5 deletions components/segmenter/tests/locale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use icu_locale_core::locale;
use icu_locale_core::langid;
use icu_segmenter::{SentenceBreakOptions, SentenceSegmenter, WordBreakOptions, WordSegmenter};

// Additional segmenter tests with locale.
Expand All @@ -12,7 +12,8 @@ fn word_break_with_locale() {
// MidLetter is different because U+003A isn't MidLetter in Swedish.
let s = "hello:world";
let mut options_sv = WordBreakOptions::default();
options_sv.content_locale = Some(locale!("sv").into());
let langid = langid!("sv");
options_sv.content_locale = Some(&langid);
let segmenter =
WordSegmenter::try_new_auto_with_options(options_sv).expect("Loading should succeed!");
let iter = segmenter.segment_str(s);
Expand All @@ -23,7 +24,8 @@ fn word_break_with_locale() {
);

let mut options_en = WordBreakOptions::default();
options_en.content_locale = Some(locale!("en").into());
let langid = langid!("en");
options_en.content_locale = Some(&langid);
let segmenter =
WordSegmenter::try_new_auto_with_options(options_en).expect("Loading should succeed!");
let iter = segmenter.segment_str(s);
Expand All @@ -39,7 +41,8 @@ fn sentence_break_with_locale() {
// SB11 is different because U+003B is STerm in Greek.
let s = "hello; world";
let mut options_el = SentenceBreakOptions::default();
options_el.content_locale = Some(locale!("el").into());
let langid = langid!("el");
options_el.content_locale = Some(&langid);
let segmenter =
SentenceSegmenter::try_new_with_options(options_el).expect("Loading should succeed!");
let iter = segmenter.segment_str(s);
Expand All @@ -50,7 +53,8 @@ fn sentence_break_with_locale() {
);

let mut options_en = SentenceBreakOptions::default();
options_en.content_locale = Some(locale!("en").into());
let langid = langid!("en");
options_en.content_locale = Some(&langid);
let segmenter =
SentenceSegmenter::try_new_with_options(options_en).expect("Loading should succeed!");
let iter = segmenter.segment_str(s);
Expand Down
Loading

0 comments on commit 0bf22da

Please sign in to comment.