unicode.rs - mozsearch

mozilla-central/third_party/rust/regex-syntax/src/unicode.rs

Enable keyboard shortcuts

Source code

File a bug in Firefox Build System :: General

Revision control

Copy as Markdown

Other Tools

use alloc::{

    string::{String, ToString},

    vec::Vec,

};

use crate::hir;

/// A slice of inclusive `(start, end)` codepoint ranges taken from a generated
/// file (hence the static lifetime).
type Range = &'static [(char, char)];

/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property is unknown or its data is not available.
    PropertyNotFound,
    /// The requested property value is unknown or its data is not available.
    PropertyValueNotFound,
    /// The data for a Perl class (`\w`, `\s` or `\d`) is not available.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}

/// An error that occurs when Unicode-aware simple case folding fails.
///
/// This error can occur when the case mapping tables necessary for Unicode
/// aware case folding are unavailable. This only occurs when the
/// `unicode-case` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

#[cfg(feature = "std")]
impl std::error::Error for CaseFoldError {}

impl core::fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of why case folding is
    /// unavailable.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(
            f,
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)"
        )
    }
}
/// An error that occurs when the Unicode-aware `\w` class is unavailable.
///
/// This error can occur when the data tables necessary for the Unicode aware
/// Perl character class `\w` are unavailable. This only occurs when the
/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordError {}

impl core::fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of why `\w` is unavailable.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(
            f,
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)"
        )
    }
}
/// A state oriented traverser of the simple case folding table.
///
/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will
/// return an error if the underlying case folding table is unavailable.
///
/// After construction, it is expected that callers will use
/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly
/// increasing order. For example, calling it on `b` and then on `a` is illegal
/// and will result in a panic.
///
/// The main idea of this type is that it tries hard to make mapping lookups
/// fast by exploiting the structure of the underlying table, and the ordering
/// assumption enables this.
#[derive(Debug)]
pub struct SimpleCaseFolder {
    /// The simple case fold table. It's a sorted association list, where the
    /// keys are Unicode scalar values and the values are the corresponding
    /// equivalence class (not including the key) of the "simple" case folded
    /// Unicode scalar values.
    table: &'static [(char, &'static [char])],
    /// The last codepoint that was used for a lookup.
    last: Option<char>,
    /// The index to the entry in `table` corresponding to the smallest key `k`
    /// such that `k > k0`, where `k0` is the most recent key lookup. Note that
    /// in particular, `k0` may not be in the table!
    next: usize,
}

impl SimpleCaseFolder {

    /// Create a new simple case folder, returning an error if the underlying

    /// case folding table is unavailable.

    pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> {

        #[cfg(not(feature = "unicode-case"))]

            Err(CaseFoldError(()))

        #[cfg(feature = "unicode-case")]

            Ok(SimpleCaseFolder {

                table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE,

                last: None,

                next: 0,

})

    /// Return the equivalence class of case folded codepoints for the given

    /// codepoint. The equivalence class returned never includes the codepoint

    /// given. If the given codepoint has no case folded codepoints (i.e.,

    /// no entry in the underlying case folding table), then this returns an

    /// empty slice.

///

    /// # Panics

///

    /// This panics when called with a `c` that is less than or equal to the

    /// previous call. In other words, callers need to use this method with

    /// strictly increasing values of `c`.

    pub fn mapping(&mut self, c: char) -> &'static [char] {

        if let Some(last) = self.last {

            assert!(

                last < c,

                "got codepoint U+{:X} which occurs before \

                 last codepoint U+{:X}",

                u32::from(c),

                u32::from(last),

);

        self.last = Some(c);

        if self.next >= self.table.len() {

            return &[];

        let (k, v) = self.table[self.next];

        if k == c {

            self.next += 1;

            return v;

        match self.get(c) {

            Err(i) => {

                self.next = i;

&[]

            Ok(i) => {

                // Since we require lookups to proceed

                // in order, anything we find should be

                // after whatever we thought might be

                // next. Otherwise, the caller is either

                // going out of order or we would have

                // found our next key at 'self.next'.

                assert!(i > self.next);

                self.next = i + 1;

                self.table[i].1

    /// Returns true if and only if the given range overlaps with any region

    /// of the underlying case folding table. That is, when true, there exists

    /// at least one codepoint in the inclusive range `[start, end]` that has

    /// a non-trivial equivalence class of case folded codepoints. Conversely,

    /// when this returns false, all codepoints in the range `[start, end]`

    /// correspond to the trivial equivalence class of case folded codepoints,

    /// i.e., itself.

///

    /// This is useful to call before iterating over the codepoints in the

    /// range and looking up the mapping for each. If you know none of the

    /// mappings will return anything, then you might be able to skip doing it

    /// altogether.

///

    /// # Panics

///

    /// This panics when `end < start`.

    pub fn overlaps(&self, start: char, end: char) -> bool {

        use core::cmp::Ordering;

        assert!(start <= end);

        self.table

            .binary_search_by(|&(c, _)| {

                if start <= c && c <= end {

                    Ordering::Equal

                } else if c > end {

                    Ordering::Greater

                } else {

                    Ordering::Less

})

            .is_ok()

    /// Returns the index at which `c` occurs in the simple case fold table. If

    /// `c` does not occur, then this returns an `i` such that `table[i-1].0 <

    /// c` and `table[i].0 > c`.

    fn get(&self, c: char) -> Result<usize, usize> {

        self.table.binary_search_by_key(&c, |&(c1, _)| c1)

/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class named by a single letter. The letter is resolved in
    /// exactly the same way as a `Binary` name, so it may refer to a binary
    /// property, a general category or a script.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}

impl<'a> ClassQuery<'a> {

    fn canonicalize(&self) -> Result<CanonicalClassQuery, Error> {

        match *self {

            ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),

            ClassQuery::Binary(name) => self.canonical_binary(name),

            ClassQuery::ByValue { property_name, property_value } => {

                let property_name = symbolic_name_normalize(property_name);

                let property_value = symbolic_name_normalize(property_value);

                let canon_name = match canonical_prop(&property_name)? {

                    None => return Err(Error::PropertyNotFound),

                    Some(canon_name) => canon_name,

};

                Ok(match canon_name {

                    "General_Category" => {

                        let canon = match canonical_gencat(&property_value)? {

                            None => return Err(Error::PropertyValueNotFound),

                            Some(canon) => canon,

};

                        CanonicalClassQuery::GeneralCategory(canon)

                    "Script" => {

                        let canon = match canonical_script(&property_value)? {

                            None => return Err(Error::PropertyValueNotFound),

                            Some(canon) => canon,

};

                        CanonicalClassQuery::Script(canon)

                    _ => {

                        let vals = match property_values(canon_name)? {

                            None => return Err(Error::PropertyValueNotFound),

                            Some(vals) => vals,

};

                        let canon_val =

                            match canonical_value(vals, &property_value) {

                                None => {

                                    return Err(Error::PropertyValueNotFound)

                                Some(canon_val) => canon_val,

};

                        CanonicalClassQuery::ByValue {

                            property_name: canon_name,

                            property_value: canon_val,

})

    fn canonical_binary(

        &self,

        name: &str,

    ) -> Result<CanonicalClassQuery, Error> {

        let norm = symbolic_name_normalize(name);

        // This is a special case where 'cf' refers to the 'Format' general

        // category, but where the 'cf' abbreviation is also an abbreviation

        // for the 'Case_Folding' property. But we want to treat it as

        // a general category. (Currently, we don't even support the

        // 'Case_Folding' property. But if we do in the future, users will be

        // required to spell it out.)

//

        // Also 'sc' refers to the 'Currency_Symbol' general category, but is

        // also the abbreviation for the 'Script' property. So we avoid calling

        // 'canonical_prop' for it too, which would erroneously normalize it

        // to 'Script'.

//

        // Another case: 'lc' is an abbreviation for the 'Cased_Letter'

        // general category, but is also an abbreviation for the 'Lowercase_Mapping'

        // property. We don't currently support the latter, so as with 'cf'

        // above, we treat 'lc' as 'Cased_Letter'.

        if norm != "cf" && norm != "sc" && norm != "lc" {

            if let Some(canon) = canonical_prop(&norm)? {

                return Ok(CanonicalClassQuery::Binary(canon));

        if let Some(canon) = canonical_gencat(&norm)? {

            return Ok(CanonicalClassQuery::GeneralCategory(canon));

        if let Some(canon) = canonical_script(&norm)? {

            return Ok(CanonicalClassQuery::Script(canon));

        Err(Error::PropertyNotFound)

/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}

/// Looks up a Unicode class given a query. If one doesn't exist, then

/// `None` is returned.

pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode, Error> {

    use self::CanonicalClassQuery::*;

    match query.canonicalize()? {

        Binary(name) => bool_property(name),

        GeneralCategory(name) => gencat(name),

        Script(name) => script(name),

        ByValue { property_name: "Age", property_value } => {

            let mut class = hir::ClassUnicode::empty();

            for set in ages(property_value)? {

                class.union(&hir_class(set));

            Ok(class)

        ByValue { property_name: "Script_Extensions", property_value } => {

            script_extension(property_value)

        ByValue {

            property_name: "Grapheme_Cluster_Break",

            property_value,

        } => gcb(property_value),

        ByValue { property_name: "Sentence_Break", property_value } => {

            sb(property_value)

        ByValue { property_name: "Word_Break", property_value } => {

            wb(property_value)

        _ => {

            // What else should we support?

            Err(Error::PropertyNotFound)

/// Returns a Unicode aware class for \w.

///

/// This returns an error if the data is not available for \w.

pub fn perl_word() -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(feature = "unicode-perl"))]

    fn imp() -> Result<hir::ClassUnicode, Error> {

        Err(Error::PerlClassNotFound)

    #[cfg(feature = "unicode-perl")]

    fn imp() -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::perl_word::PERL_WORD;

        Ok(hir_class(PERL_WORD))

    imp()

/// Returns a Unicode aware class for \s.

///

/// This returns an error if the data is not available for \s.

pub fn perl_space() -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]

    fn imp() -> Result<hir::ClassUnicode, Error> {

        Err(Error::PerlClassNotFound)

    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]

    fn imp() -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::perl_space::WHITE_SPACE;

        Ok(hir_class(WHITE_SPACE))

    #[cfg(feature = "unicode-bool")]

    fn imp() -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::property_bool::WHITE_SPACE;

        Ok(hir_class(WHITE_SPACE))

    imp()

/// Returns a Unicode aware class for \d.

///

/// This returns an error if the data is not available for \d.

pub fn perl_digit() -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]

    fn imp() -> Result<hir::ClassUnicode, Error> {

        Err(Error::PerlClassNotFound)

    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]

    fn imp() -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;

        Ok(hir_class(DECIMAL_NUMBER))

    #[cfg(feature = "unicode-gencat")]

    fn imp() -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::general_category::DECIMAL_NUMBER;

        Ok(hir_class(DECIMAL_NUMBER))

    imp()

/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.

pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {

    let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges

        .iter()

        .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))

        .collect();

    hir::ClassUnicode::new(hir_ranges)

/// Returns true only if the given codepoint is in the `\w` character class.

///

/// If the `unicode-perl` feature is not enabled, then this returns an error.

pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> {

    #[cfg(not(feature = "unicode-perl"))]

    fn imp(_: char) -> Result<bool, UnicodeWordError> {

        Err(UnicodeWordError(()))

    #[cfg(feature = "unicode-perl")]

    fn imp(c: char) -> Result<bool, UnicodeWordError> {

        use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD};

        if u8::try_from(c).map_or(false, is_word_byte) {

            return Ok(true);

        Ok(PERL_WORD

            .binary_search_by(|&(start, end)| {

                use core::cmp::Ordering;

                if start <= c && c <= end {

                    Ordering::Equal

                } else if start > c {

                    Ordering::Greater

                } else {

                    Ordering::Less

})

            .is_ok())

    imp(c)

/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value. Lookups in this file binary search on the first element.
type PropertyValues = &'static [(&'static str, &'static str)];

fn canonical_gencat(

    normalized_value: &str,

) -> Result<Option<&'static str>, Error> {

    Ok(match normalized_value {

        "any" => Some("Any"),

        "assigned" => Some("Assigned"),

        "ascii" => Some("ASCII"),

        _ => {

            let gencats = property_values("General_Category")?.unwrap();

            canonical_value(gencats, normalized_value)

})

fn canonical_script(

    normalized_value: &str,

) -> Result<Option<&'static str>, Error> {

    let scripts = property_values("Script")?.unwrap();

    Ok(canonical_value(scripts, normalized_value))

/// Find the canonical property name for the given normalized property name.

///

/// If no such property exists, then `None` is returned.

///

/// The normalized property name must have been normalized according to

/// UAX44 LM3, which can be done using `symbolic_name_normalize`.

///

/// If the property names data is not available, then an error is returned.

fn canonical_prop(

    normalized_name: &str,

) -> Result<Option<&'static str>, Error> {

    #[cfg(not(any(

        feature = "unicode-age",

        feature = "unicode-bool",

        feature = "unicode-gencat",

        feature = "unicode-perl",

        feature = "unicode-script",

        feature = "unicode-segment",

    )))]

    fn imp(_: &str) -> Result<Option<&'static str>, Error> {

        Err(Error::PropertyNotFound)

    #[cfg(any(

        feature = "unicode-age",

        feature = "unicode-bool",

        feature = "unicode-gencat",

        feature = "unicode-perl",

        feature = "unicode-script",

        feature = "unicode-segment",

))]

    fn imp(name: &str) -> Result<Option<&'static str>, Error> {

        use crate::unicode_tables::property_names::PROPERTY_NAMES;

        Ok(PROPERTY_NAMES

            .binary_search_by_key(&name, |&(n, _)| n)

            .ok()

            .map(|i| PROPERTY_NAMES[i].1))

    imp(normalized_name)

/// Find the canonical property value for the given normalized property

/// value.

///

/// The given property values should correspond to the values for the property

/// under question, which can be found using `property_values`.

///

/// If no such property value exists, then `None` is returned.

///

/// The normalized property value must have been normalized according to

/// UAX44 LM3, which can be done using `symbolic_name_normalize`.

fn canonical_value(

    vals: PropertyValues,

    normalized_value: &str,

) -> Option<&'static str> {

    vals.binary_search_by_key(&normalized_value, |&(n, _)| n)

        .ok()

        .map(|i| vals[i].1)

/// Return the table of property values for the given property name.

///

/// If the property values data is not available, then an error is returned.

fn property_values(

    canonical_property_name: &'static str,

) -> Result<Option<PropertyValues>, Error> {

    #[cfg(not(any(

        feature = "unicode-age",

        feature = "unicode-bool",

        feature = "unicode-gencat",

        feature = "unicode-perl",

        feature = "unicode-script",

        feature = "unicode-segment",

    )))]

    fn imp(_: &'static str) -> Result<Option<PropertyValues>, Error> {

        Err(Error::PropertyValueNotFound)

    #[cfg(any(

        feature = "unicode-age",

        feature = "unicode-bool",

        feature = "unicode-gencat",

        feature = "unicode-perl",

        feature = "unicode-script",

        feature = "unicode-segment",

))]

    fn imp(name: &'static str) -> Result<Option<PropertyValues>, Error> {

        use crate::unicode_tables::property_values::PROPERTY_VALUES;

        Ok(PROPERTY_VALUES

            .binary_search_by_key(&name, |&(n, _)| n)

            .ok()

            .map(|i| PROPERTY_VALUES[i].1))

    imp(canonical_property_name)

// This is only used in some cases, but small enough to just let it be dead

// instead of figuring out (and maintaining) the right set of features.

#[allow(dead_code)]

fn property_set(

    name_map: &'static [(&'static str, Range)],

    canonical: &'static str,

) -> Option<Range> {

    name_map

        .binary_search_by_key(&canonical, |x| x.0)

        .ok()

        .map(|i| name_map[i].1)

/// Returns an iterator over Unicode Age sets. Each item corresponds to a set

/// of codepoints that were added in a particular revision of Unicode. The

/// iterator yields items in chronological order.

///

/// If the given age value isn't valid or if the data isn't available, then an

/// error is returned instead.

fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {

    #[cfg(not(feature = "unicode-age"))]

    fn imp(_: &str) -> Result<impl Iterator<Item = Range>, Error> {

        use core::option::IntoIter;

        Err::<IntoIter<Range>, _>(Error::PropertyNotFound)

    #[cfg(feature = "unicode-age")]

    fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {

        use crate::unicode_tables::age;

        const AGES: &[(&str, Range)] = &[

            ("V1_1", age::V1_1),

            ("V2_0", age::V2_0),

            ("V2_1", age::V2_1),

            ("V3_0", age::V3_0),

            ("V3_1", age::V3_1),

            ("V3_2", age::V3_2),

            ("V4_0", age::V4_0),

            ("V4_1", age::V4_1),

            ("V5_0", age::V5_0),

            ("V5_1", age::V5_1),

            ("V5_2", age::V5_2),

            ("V6_0", age::V6_0),

            ("V6_1", age::V6_1),

            ("V6_2", age::V6_2),

            ("V6_3", age::V6_3),

            ("V7_0", age::V7_0),

            ("V8_0", age::V8_0),

            ("V9_0", age::V9_0),

            ("V10_0", age::V10_0),

            ("V11_0", age::V11_0),

            ("V12_0", age::V12_0),

            ("V12_1", age::V12_1),

            ("V13_0", age::V13_0),

            ("V14_0", age::V14_0),

            ("V15_0", age::V15_0),

];

        assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");

        let pos = AGES.iter().position(|&(age, _)| canonical_age == age);

        match pos {

            None => Err(Error::PropertyValueNotFound),

            Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)),

    imp(canonical_age)

/// Returns the Unicode HIR class corresponding to the given general category.

///

/// Name canonicalization is assumed to be performed by the caller.

///

/// If the given general category could not be found, or if the general

/// category data is not available, then an error is returned.

fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(feature = "unicode-gencat"))]

    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {

        Err(Error::PropertyNotFound)

    #[cfg(feature = "unicode-gencat")]

    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::general_category::BY_NAME;

        match name {

            "ASCII" => Ok(hir_class(&[('\0', '\x7F')])),

            "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),

            "Assigned" => {

                let mut cls = gencat("Unassigned")?;

                cls.negate();

                Ok(cls)

            name => property_set(BY_NAME, name)

                .map(hir_class)

                .ok_or(Error::PropertyValueNotFound),

    match canonical_name {

        "Decimal_Number" => perl_digit(),

        name => imp(name),

/// Returns the Unicode HIR class corresponding to the given script.

///

/// Name canonicalization is assumed to be performed by the caller.

///

/// If the given script could not be found, or if the script data is not

/// available, then an error is returned.

fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(feature = "unicode-script"))]

    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {

        Err(Error::PropertyNotFound)

    #[cfg(feature = "unicode-script")]

    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::script::BY_NAME;

        property_set(BY_NAME, name)

            .map(hir_class)

            .ok_or(Error::PropertyValueNotFound)

    imp(canonical_name)

/// Returns the Unicode HIR class corresponding to the given script extension.

///

/// Name canonicalization is assumed to be performed by the caller.

///

/// If the given script extension could not be found, or if the script data is

/// not available, then an error is returned.

fn script_extension(

    canonical_name: &'static str,

) -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(feature = "unicode-script"))]

    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {

        Err(Error::PropertyNotFound)

    #[cfg(feature = "unicode-script")]

    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::script_extension::BY_NAME;

        property_set(BY_NAME, name)

            .map(hir_class)

            .ok_or(Error::PropertyValueNotFound)

    imp(canonical_name)

/// Returns the Unicode HIR class corresponding to the given Unicode boolean

/// property.

///

/// Name canonicalization is assumed to be performed by the caller.

///

/// If the given boolean property could not be found, or if the boolean

/// property data is not available, then an error is returned.

fn bool_property(

    canonical_name: &'static str,

) -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(feature = "unicode-bool"))]

    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {

        Err(Error::PropertyNotFound)

    #[cfg(feature = "unicode-bool")]

    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::property_bool::BY_NAME;

        property_set(BY_NAME, name)

            .map(hir_class)

            .ok_or(Error::PropertyNotFound)

    match canonical_name {

        "Decimal_Number" => perl_digit(),

        "White_Space" => perl_space(),

        name => imp(name),

/// Returns the Unicode HIR class corresponding to the given grapheme cluster

/// break property.

///

/// Name canonicalization is assumed to be performed by the caller.

///

/// If the given property could not be found, or if the corresponding data is

/// not available, then an error is returned.

fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(feature = "unicode-segment"))]

    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {

        Err(Error::PropertyNotFound)

    #[cfg(feature = "unicode-segment")]

    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::grapheme_cluster_break::BY_NAME;

        property_set(BY_NAME, name)

            .map(hir_class)

            .ok_or(Error::PropertyValueNotFound)

    imp(canonical_name)

/// Returns the Unicode HIR class corresponding to the given word break

/// property.

///

/// Name canonicalization is assumed to be performed by the caller.

///

/// If the given property could not be found, or if the corresponding data is

/// not available, then an error is returned.

fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(feature = "unicode-segment"))]

    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {

        Err(Error::PropertyNotFound)

    #[cfg(feature = "unicode-segment")]

    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::word_break::BY_NAME;

        property_set(BY_NAME, name)

            .map(hir_class)

            .ok_or(Error::PropertyValueNotFound)

    imp(canonical_name)

/// Returns the Unicode HIR class corresponding to the given sentence

/// break property.

///

/// Name canonicalization is assumed to be performed by the caller.

///

/// If the given property could not be found, or if the corresponding data is

/// not available, then an error is returned.

fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {

    #[cfg(not(feature = "unicode-segment"))]

    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {

        Err(Error::PropertyNotFound)

    #[cfg(feature = "unicode-segment")]

    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {

        use crate::unicode_tables::sentence_break::BY_NAME;

        property_set(BY_NAME, name)

            .map(hir_class)

            .ok_or(Error::PropertyValueNotFound)

    imp(canonical_name)

/// Like symbolic_name_normalize_bytes, but operates on a string.

fn symbolic_name_normalize(x: &str) -> String {

    let mut tmp = x.as_bytes().to_vec();

    let len = symbolic_name_normalize_bytes(&mut tmp).len();

    tmp.truncate(len);

    // This should always succeed because `symbolic_name_normalize_bytes`

    // guarantees that `&tmp[..len]` is always valid UTF-8.

//

    // N.B. We could avoid the additional UTF-8 check here, but it's unlikely

    // to be worth skipping the additional safety check. A benchmark must

    // justify it first.

    String::from_utf8(tmp).unwrap()

/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases. Note, though, that it should not be applied to property
/// string values.
///
/// Concretely: a leading `is` (in any letter case) is skipped, spaces,
/// underscores and hyphens are removed, ASCII uppercase letters are
/// lowercased and non-ASCII bytes are dropped. The normalized name is written
/// to the front of `slice` and the returned slice is that prefix.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.
    let mut start = 0;
    let mut starts_with_is = false;
    if slice.len() >= 2 {
        // Ignore any "is" prefix.
        starts_with_is = slice[0..2] == b"is"[..]
            || slice[0..2] == b"IS"[..]
            || slice[0..2] == b"iS"[..]
            || slice[0..2] == b"Is"[..];
        if starts_with_is {
            start = 2;
        }
    }
    let mut next_write = 0;
    for i in start..slice.len() {
        // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        let b = slice[i];
        if b == b' ' || b == b'_' || b == b'-' {
            continue;
        } else if b'A' <= b && b <= b'Z' {
            slice[next_write] = b + (b'a' - b'A');
            next_write += 1;
        } else if b <= 0x7F {
            slice[next_write] = b;
            next_write += 1;
        }
    }
    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
    // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
    // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
    // is actually an alias for the 'Other' general category.
    //
    // The writes below cannot go out of bounds: `starts_with_is` implies two
    // prefix bytes, and `next_write == 1` implies at least one more byte.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[0] = b'i';
        slice[1] = b's';
        slice[2] = b'c';
        next_write = 3;
    }
    &mut slice[..next_write]
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Iterates over the characters that `SimpleCaseFolder::mapping` reports
    /// for `c`. Panics if the case folder cannot be constructed.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        SimpleCaseFolder::new().unwrap().mapping(c).iter().copied()
    }

    /// Reports the result of `SimpleCaseFolder::overlaps` for the inclusive
    /// range `start..=end`. Panics if the case folder cannot be constructed.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        SimpleCaseFolder::new().unwrap().overlaps(start, end)
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A KELVIN SIGN renders identically to ASCII 'K', so it is
        // spelled as an escape to keep the three cases distinguishable.
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, alloc::vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, alloc::vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
        assert_eq!(xs, alloc::vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, alloc::vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, alloc::vec!['a']);
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(SimpleCaseFolder::new().is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The lone 0xFF byte is not valid UTF-8 and must be dropped.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}