Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve locale ordering examples and rename writeable::cmp_bytes to writeable::cmp_utf8 #5828

Merged
merged 2 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions components/locale/src/canonicalizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ where
let mut source_variants = source.variants.iter();
'outer: for raw_variant in raw_variants {
for source_variant in source_variants.by_ref() {
match source_variant.strict_cmp(raw_variant.as_bytes()) {
match source_variant.as_str().cmp(raw_variant) {
Ordering::Equal => {
// The source_variant is equal, move to next raw_variant
continue 'outer;
Expand Down Expand Up @@ -122,13 +122,11 @@ fn uts35_replacement<'a, I>(
loop {
match (sources.peek(), skips.peek(), replacements.peek()) {
(Some(&source), Some(skip), _)
if source.strict_cmp(skip.as_bytes()) == Ordering::Greater =>
if source.as_str().cmp(skip) == Ordering::Greater =>
{
skips.next();
}
(Some(&source), Some(skip), _)
if source.strict_cmp(skip.as_bytes()) == Ordering::Equal =>
{
(Some(&source), Some(skip), _) if source.as_str().cmp(skip) == Ordering::Equal => {
skips.next();
sources.next();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ impl Keywords {
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
writeable::cmp_bytes(self, other)
writeable::cmp_utf8(self, other)
}

pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
Expand Down
2 changes: 1 addition & 1 deletion components/locale_core/src/extensions/unicode/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ impl FromStr for Value {

impl PartialEq<&str> for Value {
fn eq(&self, other: &&str) -> bool {
writeable::cmp_bytes(self, other.as_bytes()).is_eq()
writeable::cmp_utf8(self, other.as_bytes()).is_eq()
}
}

Expand Down
101 changes: 73 additions & 28 deletions components/locale_core/src/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@ use alloc::borrow::Cow;

/// A core struct representing a [`Unicode BCP47 Language Identifier`].
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings. Depending on your use case, two orderings are available:
///
/// 1. A string ordering, suitable for stable serialization: [`LanguageIdentifier::strict_cmp`]
/// 2. A struct ordering, suitable for use with a BTreeSet: [`LanguageIdentifier::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for any language identifier:
Expand All @@ -30,18 +40,6 @@ use alloc::borrow::Cow;
/// This operation normalizes syntax to be well-formed. No legacy subtag replacements is performed.
/// For validation and canonicalization, see `LocaleCanonicalizer`.
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings, and the team did not want to favor one over any other.
///
/// Instead, there are functions available that return these different orderings:
///
/// - [`LanguageIdentifier::strict_cmp`]
/// - [`LanguageIdentifier::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Examples
///
/// Simple example:
Expand Down Expand Up @@ -214,31 +212,44 @@ impl LanguageIdentifier {
///
/// # Examples
///
/// Sorting a list of langids with this method requires converting one of them to a string:
///
/// ```
/// use icu::locale::LanguageIdentifier;
/// use std::cmp::Ordering;
/// use writeable::Writeable;
///
/// // Random input order:
/// let bcp47_strings: &[&str] = &[
/// "pl-Latn-PL",
/// "und",
/// "und-Adlm",
/// "und-GB",
/// "und-ZA",
/// "ar-Latn",
/// "zh-Hant-TW",
/// "zh-TW",
/// "und-fonipa",
/// "zh",
/// "zh-Hant",
/// "ar-SA",
/// ];
///
/// for ab in bcp47_strings.windows(2) {
/// let a = ab[0];
/// let b = ab[1];
/// assert!(a.cmp(b) == Ordering::Less);
/// let a_langid = a.parse::<LanguageIdentifier>().unwrap();
/// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal);
/// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less);
/// }
/// let mut langids = bcp47_strings.iter().map(|s| s.parse().unwrap()).collect::<Vec<LanguageIdentifier>>();
/// langids.sort_by(|a, b| {
/// let b = b.write_to_string();
/// a.strict_cmp(b.as_bytes())
/// });
/// let strict_cmp_strings = langids.iter().map(|l| l.to_string()).collect::<Vec<String>>();
///
/// // Output ordering, sorted alphabetically
/// let expected_ordering: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
///
/// assert_eq!(expected_ordering, strict_cmp_strings);
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
writeable::cmp_bytes(self, other)
writeable::cmp_utf8(self, other)
}

pub(crate) fn as_tuple(
Expand All @@ -260,7 +271,41 @@ impl LanguageIdentifier {
///
/// # Examples
///
/// Using a wrapper to add one of these to a [`BTreeSet`]:
/// This method returns a nonsensical ordering derived from the fields of the struct:
///
/// ```
/// use icu::locale::LanguageIdentifier;
/// use std::cmp::Ordering;
///
/// // Input strings, sorted alphabetically
/// let bcp47_strings: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
/// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
///
/// let mut langids = bcp47_strings.iter().map(|s| s.parse().unwrap()).collect::<Vec<LanguageIdentifier>>();
/// langids.sort_by(LanguageIdentifier::total_cmp);
/// let total_cmp_strings = langids.iter().map(|l| l.to_string()).collect::<Vec<String>>();
///
/// // Output ordering, sorted arbitrarily
/// let expected_ordering: &[&str] = &[
/// "ar-SA",
/// "ar-Latn",
/// "und-fonipa",
/// "zh-TW",
/// "zh-Hant",
/// "zh-Hant-TW",
/// ];
///
/// assert_eq!(expected_ordering, total_cmp_strings);
/// ```
///
/// Use a wrapper to add a [`LanguageIdentifier`] to a [`BTreeSet`]:
///
/// ```no_run
/// use icu::locale::LanguageIdentifier;
Expand Down
103 changes: 76 additions & 27 deletions components/locale_core/src/locale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,16 @@ use core::str::FromStr;
/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
/// on top of that is able to parse, manipulate and serialize unicode extension fields.
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings. Depending on your use case, two orderings are available:
///
/// 1. A string ordering, suitable for stable serialization: [`Locale::strict_cmp`]
/// 2. A struct ordering, suitable for use with a BTreeSet: [`Locale::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for a locale:
Expand All @@ -37,18 +47,6 @@ use core::str::FromStr;
/// This operation normalizes syntax to be well-formed. No legacy subtag replacements is performed.
/// For validation and canonicalization, see `LocaleCanonicalizer`.
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings, and the team did not want to favor one over any other.
///
/// Instead, there are functions available that return these different orderings:
///
/// - [`Locale::strict_cmp`]
/// - [`Locale::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Examples
///
/// Simple example:
Expand Down Expand Up @@ -198,31 +196,46 @@ impl Locale {
///
/// # Examples
///
/// Sorting a list of locales with this method requires converting one of them to a string:
///
/// ```
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
/// use writeable::Writeable;
///
/// // Random input order:
/// let bcp47_strings: &[&str] = &[
/// "pl-Latn-PL",
/// "und",
/// "und-u-ca-hebrew",
/// "ar-Latn",
/// "zh-Hant-TW",
/// "zh-TW",
/// "und-fonipa",
/// "zh-Hant",
/// "ar-SA",
/// ];
///
/// let mut locales = bcp47_strings.iter().map(|s| s.parse().unwrap()).collect::<Vec<Locale>>();
/// locales.sort_by(|a, b| {
/// let b = b.write_to_string();
/// a.strict_cmp(b.as_bytes())
/// });
/// let strict_cmp_strings = locales.iter().map(|l| l.to_string()).collect::<Vec<String>>();
///
/// // Output ordering, sorted alphabetically
/// let expected_ordering: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "und-t-m0-true",
/// "und-u-ca-hebrew",
/// "und-u-ca-japanese",
/// "zh",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
///
/// for ab in bcp47_strings.windows(2) {
/// let a = ab[0];
/// let b = ab[1];
/// assert!(a.cmp(b) == Ordering::Less);
/// let a_loc = a.parse::<Locale>().unwrap();
/// assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal);
/// assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less);
/// }
/// assert_eq!(expected_ordering, strict_cmp_strings);
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
writeable::cmp_bytes(self, other)
writeable::cmp_utf8(self, other)
}

#[allow(clippy::type_complexity)]
Expand Down Expand Up @@ -263,7 +276,43 @@ impl Locale {
///
/// # Examples
///
/// Using a wrapper to add one of these to a [`BTreeSet`]:
/// This method returns a nonsensical ordering derived from the fields of the struct:
///
/// ```
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
///
/// // Input strings, sorted alphabetically
/// let bcp47_strings: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "und-u-ca-hebrew",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
/// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
///
/// let mut locales = bcp47_strings.iter().map(|s| s.parse().unwrap()).collect::<Vec<Locale>>();
/// locales.sort_by(Locale::total_cmp);
/// let total_cmp_strings = locales.iter().map(|l| l.to_string()).collect::<Vec<String>>();
///
/// // Output ordering, sorted arbitrarily
/// let expected_ordering: &[&str] = &[
/// "ar-SA",
/// "ar-Latn",
/// "und-u-ca-hebrew",
/// "und-fonipa",
/// "zh-TW",
/// "zh-Hant",
/// "zh-Hant-TW",
/// ];
///
/// assert_eq!(expected_ordering, total_cmp_strings);
/// ```
///
/// Use a wrapper to add a [`Locale`] to a [`BTreeSet`]:
///
/// ```no_run
/// use icu::locale::Locale;
Expand Down
2 changes: 1 addition & 1 deletion provider/core/src/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ impl DataLocale {
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
writeable::cmp_bytes(self, other)
writeable::cmp_utf8(self, other)
}

/// Returns whether this [`DataLocale`] is `und` in the locale and extensions portion.
Expand Down
6 changes: 3 additions & 3 deletions provider/source/src/time_zones/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ impl DataProvider<MetazoneGenericNamesLongV1Marker> for SourceDataProvider {
let Some(location) = locations.get(tz) else {
return true;
};
writeable::cmp_bytes(
writeable::cmp_utf8(
&time_zone_names_resource
.region_format
.interpolate([location]),
Expand Down Expand Up @@ -627,12 +627,12 @@ impl DataProvider<MetazoneSpecificNamesLongV1Marker> for SourceDataProvider {
return true;
};
if zv == ZoneVariant::Daylight {
writeable::cmp_bytes(
writeable::cmp_utf8(
&locations.get().pattern_daylight.interpolate([location]),
v.as_bytes(),
) != Ordering::Equal
} else if zv == ZoneVariant::Standard {
writeable::cmp_bytes(
writeable::cmp_utf8(
&locations.get().pattern_standard.interpolate([location]),
v.as_bytes(),
) != Ordering::Equal
Expand Down
8 changes: 4 additions & 4 deletions utils/writeable/benches/writeable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ fn writeable_benches(c: &mut Criterion) {
.into_owned()
});
});
c.bench_function("writeable/cmp_bytes", |b| {
c.bench_function("writeable/cmp_str", |b| {
b.iter(|| {
let short = black_box(SHORT_STR);
let medium = black_box(MEDIUM_STR);
Expand All @@ -143,7 +143,7 @@ fn writeable_benches(c: &mut Criterion) {
[short, medium, long, long_overlap].map(|s1| {
[short, medium, long, long_overlap].map(|s2| {
let message = WriteableMessage { message: s1 };
writeable::cmp_bytes(&message, s2.as_bytes())
writeable::cmp_str(&message, s2)
})
})
});
Expand Down Expand Up @@ -238,10 +238,10 @@ fn complex_benches(c: &mut Criterion) {
MEDIUM_STR,
LONG_STR,
];
c.bench_function("complex/cmp_bytes", |b| {
c.bench_function("complex/cmp_str", |b| {
b.iter(|| {
black_box(REFERENCE_STRS)
.map(|s| writeable::cmp_bytes(black_box(&COMPLEX_WRITEABLE_MEDIUM), s.as_bytes()))
.map(|s| writeable::cmp_str(black_box(&COMPLEX_WRITEABLE_MEDIUM), s))
});
});
}
Expand Down
Loading