Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian committed Oct 1, 2024
1 parent 0339a3d commit a33a90a
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 168 deletions.
1 change: 1 addition & 0 deletions components/collator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ all-features = true
displaydoc = { workspace = true }
icu_collections = { workspace = true }
icu_normalizer = { workspace = true }
icu_locale_core = { workspace = true }
icu_properties = { workspace = true }
icu_provider = { workspace = true, features = ["macros"] }
utf8_iter = { workspace = true }
Expand Down
24 changes: 11 additions & 13 deletions components/collator/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@ use criterion::{black_box, criterion_group, criterion_main, BatchSize, Benchmark

use icu::collator::*;
use icu::locale::Locale;
use icu_provider::DataLocale;

fn to_data_locale(locale_str: &str) -> DataLocale {
fn to_locale(locale_str: &str) -> Locale {
locale_str
.parse::<Locale>()
.expect("Failed to parse locale")
.into()
}

pub fn collator_with_locale(criterion: &mut Criterion) {
Expand Down Expand Up @@ -99,36 +97,36 @@ pub fn collator_with_locale(criterion: &mut Criterion) {
Strength::Identical,
];
let performance_parameters = [
(to_data_locale("en_US"), vec![&content_latin], &all_strength),
(to_data_locale("da_DK"), vec![&content_latin], &all_strength),
(to_data_locale("fr_CA"), vec![&content_latin], &all_strength),
(to_locale("en_US"), vec![&content_latin], &all_strength),
(to_locale("da_DK"), vec![&content_latin], &all_strength),
(to_locale("fr_CA"), vec![&content_latin], &all_strength),
(
to_data_locale("ja_JP"),
to_locale("ja_JP"),
vec![&content_latin, &content_jp_h, &content_jp_k, &content_asian],
&all_strength,
),
(
to_data_locale("zh-u-co-pinyin"),
to_locale("zh-u-co-pinyin"),
vec![&content_latin, &content_chinese],
&all_strength,
), // zh_CN
(
to_data_locale("zh-u-co-stroke"),
to_locale("zh-u-co-stroke"),
vec![&content_latin, &content_chinese],
&all_strength,
), // zh_TW
(
to_data_locale("ru_RU"),
to_locale("ru_RU"),
vec![&content_latin, &content_russian],
&all_strength,
),
(
to_data_locale("th"),
to_locale("th"),
vec![&content_latin, &content_thai],
&all_strength,
),
(
to_data_locale("ko_KR"),
to_locale("ko_KR"),
vec![&content_latin, &content_korean],
&all_strength,
),
Expand Down Expand Up @@ -156,7 +154,7 @@ pub fn collator_with_locale(criterion: &mut Criterion) {
for (index, strength) in benched_strength.iter().enumerate() {
let mut options = CollatorOptions::new();
options.strength = Some(*strength);
let collator = Collator::try_new(&locale_under_bench, options).unwrap();
let collator = Collator::try_new(CollatorPreferences::from(locale_under_bench.clone()), options).unwrap();
// ICU4X collator performance, sort is locale-aware
group.bench_function(
BenchmarkId::new(
Expand Down
44 changes: 26 additions & 18 deletions components/collator/src/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use crate::provider::CollationRootV1Marker;
use crate::provider::CollationSpecialPrimariesV1;
use crate::provider::CollationSpecialPrimariesV1Marker;
use crate::provider::CollationTailoringV1Marker;
use crate::CollatorPreferences;
use crate::{AlternateHandling, CollatorOptions, MaxVariable, ResolvedCollatorOptions, Strength};
use core::cmp::Ordering;
use core::convert::TryFrom;
Expand All @@ -34,6 +35,7 @@ use icu_normalizer::provider::CanonicalDecompositionTablesV1Marker;
use icu_normalizer::provider::DecompositionDataV1;
use icu_normalizer::provider::DecompositionTablesV1;
use icu_normalizer::Decomposition;
use icu_preferences::extensions::unicode::keywords::CollationType;
use icu_provider::prelude::*;
use smallvec::SmallVec;
use utf16_iter::Utf16CharsEx;
Expand Down Expand Up @@ -74,7 +76,7 @@ impl LocaleSpecificDataHolder {
/// The constructor code reused between owned and borrowed cases.
fn try_new_unstable_internal<D>(
provider: &D,
locale: &DataLocale,
preferences: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError>
where
Expand All @@ -84,24 +86,26 @@ impl LocaleSpecificDataHolder {
+ DataProvider<CollationReorderingV1Marker>
+ ?Sized,
{
let id = DataIdentifierBorrowed::for_marker_attributes_and_locale(
DataMarkerAttributes::from_str_or_panic(
locale.get_single_unicode_ext("co").unwrap_or_default(),
),
locale,
);
let preferences = preferences.resolve();

let id = DataIdentifierCow::from(&preferences);

let req = DataRequest {
id,
id: id.as_borrowed(),
metadata: {
let mut metadata = DataRequestMetadata::default();
metadata.silent = true;
metadata
},
};

let mut fallback_preferences = preferences.clone();
fallback_preferences.collation_type = CollationType::Standard;

let fallback_id = DataIdentifierCow::from(&fallback_preferences);

let fallback_req = DataRequest {
id: DataIdentifierBorrowed::for_locale(locale),
id: fallback_id.as_borrowed(),
..Default::default()
};

Expand Down Expand Up @@ -228,14 +232,14 @@ impl Collator {
/// Creates `CollatorBorrowed` for the given preferences and options from compiled data.
#[cfg(feature = "compiled_data")]
pub fn try_new(
locale: &DataLocale,
preferences: CollatorPreferences,
options: CollatorOptions,
) -> Result<CollatorBorrowed<'static>, DataError> {
CollatorBorrowed::try_new(locale, options)
CollatorBorrowed::try_new(preferences, options)
}

icu_provider::gen_any_buffer_data_constructors!(
(locale, options: CollatorOptions) -> error: DataError,
(preferences: CollatorPreferences, options: CollatorOptions) -> error: DataError,
functions: [
try_new: skip,
try_new_with_any_provider,
Expand All @@ -248,7 +252,7 @@ impl Collator {
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
pub fn try_new_unstable<D>(
provider: &D,
locale: &DataLocale,
preferences: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError>
where
Expand All @@ -270,7 +274,7 @@ impl Collator {
provider.load(Default::default())?.payload,
provider.load(Default::default())?.payload,
|| provider.load(Default::default()).map(|r| r.payload),
locale,
preferences,
options,
)
}
Expand All @@ -286,7 +290,7 @@ impl Collator {
DataPayload<CollationSpecialPrimariesV1Marker>,
DataError,
>,
locale: &DataLocale,
preferences: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError>
where
Expand All @@ -298,7 +302,7 @@ impl Collator {
+ ?Sized,
{
let locale_dependent =
LocaleSpecificDataHolder::try_new_unstable_internal(provider, locale, options)?;
LocaleSpecificDataHolder::try_new_unstable_internal(provider, preferences, options)?;

// TODO: redesign Korean search collation handling
if jamo.get().ce32s.len() != JAMO_COUNT {
Expand Down Expand Up @@ -355,9 +359,13 @@ pub struct CollatorBorrowed<'a> {
impl CollatorBorrowed<'static> {
/// Creates a collator for the given preferences and options from compiled data.
#[cfg(feature = "compiled_data")]
pub fn try_new(locale: &DataLocale, options: CollatorOptions) -> Result<Self, DataError> {
pub fn try_new(
preferences: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError> {
// These are assigned to locals in order to keep the code after these assignments
// copypaste-compatible with `Collator::try_new_unstable_internal`.

let provider = &crate::provider::Baked;
let decompositions =
icu_normalizer::provider::Baked::SINGLETON_CANONICAL_DECOMPOSITION_DATA_V1_MARKER;
Expand All @@ -367,7 +375,7 @@ impl CollatorBorrowed<'static> {
let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1_MARKER;

let locale_dependent =
LocaleSpecificDataHolder::try_new_unstable_internal(provider, locale, options)?;
LocaleSpecificDataHolder::try_new_unstable_internal(provider, preferences, options)?;

// TODO: redesign Korean search collation handling
if jamo.ce32s.len() != JAMO_COUNT {
Expand Down
61 changes: 61 additions & 0 deletions components/collator/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,64 @@ pub use options::MaxVariable;
pub use options::Numeric;
pub use options::ResolvedCollatorOptions;
pub use options::Strength;

// Declares the collator's preference types through the `icu_preferences::preferences!`
// macro: `CollatorPreferences` (the caller-facing input, whose `collation_type` is an
// `Option`) and `ResolvedCollatorPreferences` (the result of
// `CollatorPreferences::resolve`, whose `collation_type` always holds a value).
// Both types also carry a `lid` language identifier field, as used by the impls in
// this file.
// NOTE(review): the full set of items and trait impls the macro generates is not
// visible from here — confirm against the `icu_preferences` macro definition.
icu_preferences::preferences!(
CollatorPreferences,
ResolvedCollatorPreferences,
{
collation_type => icu_preferences::extensions::unicode::keywords::CollationType
}
);

impl From<&icu_provider::prelude::icu_locale_core::LanguageIdentifier> for CollatorPreferences {
    /// Builds collator preferences from a language identifier alone.
    ///
    /// The identifier is cloned into `lid`. No collation type is selected
    /// (`collation_type` is `None`), so [`CollatorPreferences::resolve`] will
    /// substitute its default.
    // The parameter is deliberately named `value` rather than `_value`: a leading
    // underscore marks a binding as unused, but this one is read below.
    fn from(value: &icu_provider::prelude::icu_locale_core::LanguageIdentifier) -> Self {
        Self {
            lid: value.clone(),
            collation_type: None,
        }
    }
}

impl CollatorPreferences {
/// TODO
pub fn resolve(self) -> ResolvedCollatorPreferences {
ResolvedCollatorPreferences {
lid: self.lid,
collation_type: self.collation_type.unwrap_or(icu_preferences::extensions::unicode::keywords::CollationType::Standard),
}
}
}

use icu_preferences::extensions::unicode::keywords::CollationType;
use icu_provider::DataIdentifierCow;
use icu_provider::DataMarkerAttributes;

impl<'a> From<&'a ResolvedCollatorPreferences> for DataIdentifierCow<'a> {
fn from(value: &'a ResolvedCollatorPreferences) -> Self {
Self::from_borrowed_and_owned(
match value.collation_type {
CollationType::Dict => DataMarkerAttributes::from_str_or_panic("dict"),
CollationType::Big5han => DataMarkerAttributes::from_str_or_panic("big5han"),
CollationType::Compat => DataMarkerAttributes::from_str_or_panic("compat"),
CollationType::Direct => DataMarkerAttributes::from_str_or_panic("direct"),
CollationType::Ducet => DataMarkerAttributes::from_str_or_panic("ducet"),
CollationType::Emoji => DataMarkerAttributes::from_str_or_panic("emoji"),
CollationType::Eor => DataMarkerAttributes::from_str_or_panic("eor"),
CollationType::Gb2312 => DataMarkerAttributes::from_str_or_panic("gb2312"),
CollationType::Phonebk => DataMarkerAttributes::from_str_or_panic("phonebk"),
CollationType::Phonetic => DataMarkerAttributes::from_str_or_panic("phonetic"),
CollationType::Pinyin => DataMarkerAttributes::from_str_or_panic("pinyin"),
CollationType::Reformed => DataMarkerAttributes::from_str_or_panic("reformed"),
CollationType::Search => DataMarkerAttributes::from_str_or_panic("search"),
CollationType::Searchjl => DataMarkerAttributes::from_str_or_panic("searchjl"),
CollationType::Standard => DataMarkerAttributes::from_str_or_panic("standard"),
CollationType::Stroke => DataMarkerAttributes::from_str_or_panic("stroke"),
CollationType::Trad => DataMarkerAttributes::from_str_or_panic("trad"),
CollationType::Unihan => DataMarkerAttributes::from_str_or_panic("unihan"),
CollationType::Zhuyin => DataMarkerAttributes::from_str_or_panic("zhuyin"),
_ => DataMarkerAttributes::empty(),
},
(&value.lid).into(),
)
}
}
Loading

0 comments on commit a33a90a

Please sign in to comment.