Source code

Revision control

Copy as Markdown

Other Tools

// Copyright 2013-2014 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! [*Unicode IDNA Compatibility Processing*
//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
use self::Mapping::*;
use crate::punycode;
use alloc::string::String;
use core::fmt;
use unicode_bidi::{bidi_class, BidiClass};
use unicode_normalization::char::is_combining_mark;
use unicode_normalization::{is_nfc, UnicodeNormalization};
include!("uts46_mapping_table.rs");
const PUNYCODE_PREFIX: &str = "xn--";
/// Location of a replacement string inside `STRING_TABLE`.
///
/// `decode_slice` reassembles the 16-bit start offset from the two byte-sized
/// halves and takes `byte_len` bytes from there.
#[derive(Debug)]
struct StringTableSlice {
// Store these as separate fields so the structure will have an
// alignment of 1 and thus pack better into the Mapping enum, below.
// Low 8 bits of the start byte offset.
byte_start_lo: u8,
// High 8 bits of the start byte offset.
byte_start_hi: u8,
// Length of the replacement string in bytes.
byte_len: u8,
}
/// Resolves a [`StringTableSlice`] to the replacement text it designates in
/// `STRING_TABLE`.
fn decode_slice(slice: &StringTableSlice) -> &'static str {
    // The start offset is stored as two bytes, least significant first.
    let begin = usize::from(u16::from_le_bytes([
        slice.byte_start_lo,
        slice.byte_start_hi,
    ]));
    let end = begin + usize::from(slice.byte_len);
    &STRING_TABLE[begin..end]
}
/// Status of a code point in the UTS #46 mapping table, as returned by `find_char`.
///
/// The per-variant notes describe what `Mapper::next` does with each status.
#[repr(u8)]
#[derive(Debug)]
enum Mapping {
// Emitted unchanged.
Valid,
// Dropped from the output.
Ignored,
// Replaced by the referenced string-table text.
Mapped(StringTableSlice),
// Replaced by the referenced text only under transitional processing;
// otherwise emitted unchanged.
Deviation(StringTableSlice),
// Emitted unchanged, and the `disallowed_character` error is recorded.
Disallowed,
// Emitted unchanged; an error is recorded when STD3 ASCII rules are enabled.
DisallowedStd3Valid,
// Replaced by the referenced text; an error is recorded when STD3 ASCII rules are enabled.
DisallowedStd3Mapped(StringTableSlice),
// Emitted unchanged; an error is recorded when IDNA 2008 rules are enabled.
DisallowedIdna2008,
}
/// Looks up the [`Mapping`] entry that applies to `codepoint`.
///
/// `TABLE` is searched by its first tuple field; when there is no exact hit the
/// entry just before the insertion point is used. The second tuple field packs a
/// flag in its top bit and an index into `MAPPING_TABLE` in the remaining bits:
/// with the flag set the index is used as-is, otherwise the distance of
/// `codepoint` from the entry's first field is added to it.
fn find_char(codepoint: char) -> &'static Mapping {
    const SINGLE_MARKER: u16 = 1 << 15;

    let row = TABLE
        .binary_search_by_key(&codepoint, |&val| val.0)
        .unwrap_or_else(|insert_at| insert_at - 1);
    let (base, x) = TABLE[row];

    let offset = x & !SINGLE_MARKER;
    let mapping_index = if (x & SINGLE_MARKER) != 0 {
        offset
    } else {
        offset + (codepoint as u16 - base as u16)
    };
    &MAPPING_TABLE[mapping_index as usize]
}
/// Iterator adapter that applies the mapping table to a stream of characters.
///
/// Errors found while mapping are recorded in `errors`; `slice` holds the
/// not-yet-emitted remainder of a multi-character replacement.
struct Mapper<'a> {
    chars: core::str::Chars<'a>,
    config: Config,
    errors: &'a mut Errors,
    slice: Option<core::str::Chars<'static>>,
}

impl<'a> Iterator for Mapper<'a> {
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Finish emitting a pending replacement before reading more input.
            if let Some(pending) = self.slice.as_mut() {
                if let Some(c) = pending.next() {
                    return Some(c);
                }
                self.slice = None;
            }

            let codepoint = self.chars.next()?;

            // Fast path: these characters are passed through without a table lookup.
            if matches!(codepoint, '.' | '-' | 'a'..='z' | '0'..='9') {
                return Some(codepoint);
            }

            // Every arm either yields the code point itself, skips it, or
            // selects the replacement text to emit on the next iterations.
            let replacement = match find_char(codepoint) {
                Mapping::Valid => return Some(codepoint),
                Mapping::Ignored => continue,
                Mapping::Mapped(slice) => slice,
                Mapping::Deviation(slice) => {
                    if !self.config.transitional_processing {
                        return Some(codepoint);
                    }
                    slice
                }
                Mapping::Disallowed => {
                    self.errors.disallowed_character = true;
                    return Some(codepoint);
                }
                Mapping::DisallowedStd3Valid => {
                    if self.config.use_std3_ascii_rules {
                        self.errors.disallowed_by_std3_ascii_rules = true;
                    }
                    return Some(codepoint);
                }
                Mapping::DisallowedStd3Mapped(slice) => {
                    if self.config.use_std3_ascii_rules {
                        self.errors.disallowed_mapped_in_std3 = true;
                    }
                    slice
                }
                Mapping::DisallowedIdna2008 => {
                    if self.config.use_idna_2008_rules {
                        self.errors.disallowed_in_idna_2008 = true;
                    }
                    return Some(codepoint);
                }
            };
            self.slice = Some(decode_slice(replacement).chars());
        }
    }
}
fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
// Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
// is RTL if it contains at least one character of bidi class R, AL or AN.
if !is_bidi_domain {
return true;
}
let mut chars = label.chars();
let first_char_class = match chars.next() {
Some(c) => bidi_class(c),
None => return true, // empty string
};
match first_char_class {
// LTR label
BidiClass::L => {
// Rule 5
for c in chars.by_ref() {
if !matches!(
bidi_class(c),
BidiClass::L
| BidiClass::EN
| BidiClass::ES
| BidiClass::CS
| BidiClass::ET
| BidiClass::ON
| BidiClass::BN
| BidiClass::NSM
) {
return false;
}
}
// Rule 6
// must end in L or EN followed by 0 or more NSM
let mut rev_chars = label.chars().rev();
let mut last_non_nsm = rev_chars.next();
loop {
match last_non_nsm {
Some(c) if bidi_class(c) == BidiClass::NSM => {
last_non_nsm = rev_chars.next();
continue;
}
_ => {
break;
}
}
}
match last_non_nsm {
Some(c) if bidi_class(c) == BidiClass::L || bidi_class(c) == BidiClass::EN => {}
Some(_) => {
return false;
}
_ => {}
}
}
// RTL label
BidiClass::R | BidiClass::AL => {
let mut found_en = false;
let mut found_an = false;
// Rule 2
for c in chars {
let char_class = bidi_class(c);
if char_class == BidiClass::EN {
found_en = true;
} else if char_class == BidiClass::AN {
found_an = true;
}
if !matches!(
char_class,
BidiClass::R
| BidiClass::AL
| BidiClass::AN
| BidiClass::EN
| BidiClass::ES
| BidiClass::CS
| BidiClass::ET
| BidiClass::ON
| BidiClass::BN
| BidiClass::NSM
) {
return false;
}
}
// Rule 3
let mut rev_chars = label.chars().rev();
let mut last = rev_chars.next();
loop {
// must end in L or EN followed by 0 or more NSM
match last {
Some(c) if bidi_class(c) == BidiClass::NSM => {
last = rev_chars.next();
continue;
}
_ => {
break;
}
}
}
match last {
Some(c)
if matches!(
bidi_class(c),
BidiClass::R | BidiClass::AL | BidiClass::EN | BidiClass::AN
) => {}
_ => {
return false;
}
}
// Rule 4
if found_an && found_en {
return false;
}
}
// Rule 1: Should start with L or R/AL
_ => {
return false;
}
}
true
}
/// Check the validity criteria for the given label, recording failures in `errors`.
///
/// V1 (NFC) and V8 (Bidi) are checked inside `processing()` to prevent doing duplicate work.
/// Checking stops at the first of V3 / V5 that fails.
fn check_validity(label: &str, config: Config, errors: &mut Errors) {
    let first_char = match label.chars().next() {
        Some(c) => c,
        // Empty string, pass
        None => return,
    };

    // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
    //
    // NOTE: This criterion is deliberately not enforced here.

    // V3: neither begin nor end with a U+002D HYPHEN-MINUS
    if config.check_hyphens {
        let hyphen_at_edge = label.starts_with('-') || label.ends_with('-');
        if hyphen_at_edge {
            errors.check_hyphens = true;
            return;
        }
    }

    // V4: not contain a U+002E FULL STOP
    //
    // Here, label can't contain '.' since the input is from .split('.')

    // V5: not begin with a GC=Mark
    if is_combining_mark(first_char) {
        errors.start_combining_mark = true;
        return;
    }

    // V6: Check against Mapping Table
    let violates_mapping = |c: char| match find_char(c) {
        Mapping::Valid | Mapping::DisallowedIdna2008 => false,
        Mapping::Deviation(_) => config.transitional_processing,
        Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
        _ => true,
    };
    for c in label.chars() {
        if violates_mapping(c) {
            errors.invalid_mapping = true;
            break;
        }
    }

    // V7: ContextJ rules
    //
    // TODO: Implement rules and add *CheckJoiners* flag.

    // V8: Bidi rules are checked inside `processing()`
}
/// Detects domains that need no processing at all and can be copied verbatim.
///
/// A domain is simple when it is non-empty and every dot-separated label
/// consists only of lowercase ASCII letters, ASCII digits and hyphens, does not
/// start with the Punycode prefix `xn--`, and neither starts nor ends with a
/// hyphen. Empty labels (as in `"a..b"` or a trailing dot) are accepted here.
///
/// The earlier character-by-character version rejected every `-` in its final
/// ASCII check, so its hyphen and `xn--` handling never ran and any hyphenated
/// domain was sent down the slow path. Interior hyphens are now accepted.
fn is_simple(domain: &str) -> bool {
    !domain.is_empty() && domain.split('.').all(is_simple_label)
}

/// Returns whether a single label satisfies the fast-path conditions of `is_simple`.
fn is_simple_label(label: &str) -> bool {
    let bytes = label.as_bytes();
    // Punycode labels must be decoded and validated by the slow path.
    if bytes.starts_with(b"xn--") {
        return false;
    }
    // Leading/trailing hyphens may be an error depending on configuration.
    if bytes.first() == Some(&b'-') || bytes.last() == Some(&b'-') {
        return false;
    }
    bytes
        .iter()
        .all(|&b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'-')
}
/// Core processing step shared by `to_ascii` and `to_unicode`.
///
/// Maps and NFC-normalizes `domain` into the scratch buffer `normalized`, then
/// walks its dot-separated labels: labels carrying the Punycode prefix are
/// decoded, every label is validated, and the resulting labels are appended to
/// `output` (joined by `.`). Finally the Bidi rules are checked on the appended
/// part of `output`. All problems found are returned as flags in `Errors`;
/// `output` is written even when errors are reported.
fn processing(
domain: &str,
config: Config,
normalized: &mut String,
output: &mut String,
) -> Errors {
normalized.clear();
let mut errors = Errors::default();
// Remember where this call's output begins; the Bidi pass below only looks
// at what is appended from here on.
let offset = output.len();
let iter = Mapper {
chars: domain.chars(),
config,
errors: &mut errors,
slice: None,
};
// Map, then normalize to NFC.
normalized.extend(iter.nfc());
let mut decoder = punycode::Decoder::default();
// Decoded Punycode labels are validated with transitional processing off.
let non_transitional = config.transitional_processing(false);
let (mut first, mut has_bidi_labels) = (true, false);
for label in normalized.split('.') {
if !first {
output.push('.');
}
first = false;
if let Some(remainder) = label.strip_prefix(PUNYCODE_PREFIX) {
match decoder.decode(remainder) {
Ok(decode) => {
// Decode straight into `output` and validate the decoded text in place.
let start = output.len();
output.extend(decode);
let decoded_label = &output[start..];
if !has_bidi_labels {
has_bidi_labels |= is_bidi_domain(decoded_label);
}
// Skip validation of this label once any error has been recorded.
if !errors.is_err() {
if !is_nfc(decoded_label) {
errors.nfc = true;
} else {
check_validity(decoded_label, non_transitional, &mut errors);
}
}
}
Err(()) => {
// Nothing is appended for a label that fails to decode; the
// domain is treated as Bidi from here on.
has_bidi_labels = true;
errors.punycode = true;
}
}
} else {
if !has_bidi_labels {
has_bidi_labels |= is_bidi_domain(label);
}
// `normalized` is already `NFC` so we can skip that check
check_validity(label, config, &mut errors);
output.push_str(label)
}
}
for label in output[offset..].split('.') {
// V8: Bidi rules
//
// TODO: Add *CheckBidi* flag
if !passes_bidi(label, has_bidi_labels) {
errors.check_bidi = true;
break;
}
}
errors
}
/// Reusable UTS #46 converter.
///
/// Holds the [`Config`] to apply plus two scratch strings that are reused
/// between calls.
#[derive(Default)]
pub struct Idna {
    config: Config,
    normalized: String,
    output: String,
}

impl Idna {
    /// Creates a converter that applies `config`.
    pub fn new(config: Config) -> Self {
        Self {
            config,
            ..Self::default()
        }
    }

    /// Converts `domain` to its ASCII form in `out` and returns the error flags
    /// collected on the way. DNS length verification is not performed here.
    pub fn to_ascii_inner(&mut self, domain: &str, out: &mut String) -> Errors {
        if is_simple(domain) {
            out.push_str(domain);
            return Errors::default();
        }

        let mut errors = processing(domain, self.config, &mut self.normalized, out);

        // Move the processed text into the scratch buffer and rebuild `out`
        // label by label, Punycode-encoding the non-ASCII ones.
        let fresh = String::with_capacity(out.len());
        self.output = core::mem::replace(out, fresh);

        for (index, label) in self.output.split('.').enumerate() {
            if index > 0 {
                out.push('.');
            }
            if label.is_ascii() {
                out.push_str(label);
                continue;
            }
            let rollback = out.len();
            out.push_str(PUNYCODE_PREFIX);
            if punycode::encode_into(label.chars(), out).is_err() {
                errors.punycode = true;
                // Drop the prefix and any partial encoding of this label.
                out.truncate(rollback);
            }
        }
        errors
    }

    /// Converts `domain` to its ASCII form in `out`, additionally verifying DNS
    /// length limits when the configuration asks for it.
    #[allow(clippy::wrong_self_convention)]
    pub fn to_ascii(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
        let mut errors = self.to_ascii_inner(domain, out);

        if self.config.verify_dns_length {
            // A single trailing dot is not counted.
            let trimmed = out.strip_suffix('.').unwrap_or(out.as_str());

            let has_empty_label = trimmed.split('.').any(str::is_empty);
            if trimmed.is_empty() || has_empty_label {
                errors.too_short_for_dns = true;
            }

            let has_long_label = trimmed.split('.').any(|label| label.len() > 63);
            if trimmed.len() > 253 || has_long_label {
                errors.too_long_for_dns = true;
            }
        }

        Result::from(errors)
    }

    /// Converts `domain` to its Unicode form in `out`.
    #[allow(clippy::wrong_self_convention)]
    pub fn to_unicode(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
        let errors = if is_simple(domain) {
            out.push_str(domain);
            Errors::default()
        } else {
            processing(domain, self.config, &mut self.normalized, out)
        };
        Result::from(errors)
    }
}
/// Options controlling UTS #46 processing; built with the chained setter methods.
#[derive(Clone, Copy)]
#[must_use]
pub struct Config {
    use_std3_ascii_rules: bool,
    transitional_processing: bool,
    verify_dns_length: bool,
    check_hyphens: bool,
    use_idna_2008_rules: bool,
}

/// The defaults are that of https://url.spec.whatwg.org/#idna
impl Default for Config {
    fn default() -> Self {
        Self {
            use_std3_ascii_rules: false,
            transitional_processing: false,
            // Only use for to_ascii, not to_unicode
            verify_dns_length: false,
            check_hyphens: false,
            use_idna_2008_rules: false,
            // check_bidi: true,
            // check_joiners: true,
        }
    }
}

impl Config {
    /// Returns a copy with the `use_std3_ascii_rules` option set to `value`.
    #[inline]
    pub fn use_std3_ascii_rules(self, value: bool) -> Self {
        Self {
            use_std3_ascii_rules: value,
            ..self
        }
    }

    /// Returns a copy with the `transitional_processing` option set to `value`.
    #[inline]
    pub fn transitional_processing(self, value: bool) -> Self {
        Self {
            transitional_processing: value,
            ..self
        }
    }

    /// Returns a copy with the `verify_dns_length` option set to `value`.
    #[inline]
    pub fn verify_dns_length(self, value: bool) -> Self {
        Self {
            verify_dns_length: value,
            ..self
        }
    }

    /// Returns a copy with the `check_hyphens` option set to `value`.
    #[inline]
    pub fn check_hyphens(self, value: bool) -> Self {
        Self {
            check_hyphens: value,
            ..self
        }
    }

    /// Returns a copy with the `use_idna_2008_rules` option set to `value`.
    #[inline]
    pub fn use_idna_2008_rules(self, value: bool) -> Self {
        Self {
            use_idna_2008_rules: value,
            ..self
        }
    }

    /// Converts `domain` to ASCII with this configuration, returning the
    /// converted string or the errors encountered.
    pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
        let mut ascii = String::with_capacity(domain.len());
        Idna::new(self).to_ascii(domain, &mut ascii)?;
        Ok(ascii)
    }

    /// Converts `domain` to Unicode with this configuration. The converted
    /// string is returned even when errors were encountered.
    pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
        let mut unicode = String::with_capacity(domain.len());
        let outcome = Idna::new(self).to_unicode(domain, &mut unicode);
        (unicode, outcome)
    }
}
/// Returns whether `s` contains at least one character of bidi class R, AL or AN.
///
/// ASCII graphic characters are skipped without consulting the bidi tables.
fn is_bidi_domain(s: &str) -> bool {
    s.chars()
        .filter(|c| !c.is_ascii_graphic())
        .any(|c| {
            matches!(
                bidi_class(c),
                BidiClass::R | BidiClass::AL | BidiClass::AN
            )
        })
}
/// Errors recorded during UTS #46 processing.
///
/// This is opaque for now, indicating what types of errors have been encountered at least once.
/// More details may be exposed in the future.
#[derive(Default)]
pub struct Errors {
    punycode: bool,
    check_hyphens: bool,
    check_bidi: bool,
    start_combining_mark: bool,
    invalid_mapping: bool,
    nfc: bool,
    disallowed_by_std3_ascii_rules: bool,
    disallowed_mapped_in_std3: bool,
    disallowed_character: bool,
    too_long_for_dns: bool,
    too_short_for_dns: bool,
    disallowed_in_idna_2008: bool,
}

impl Errors {
    /// Every flag paired with its name, in declaration order.
    ///
    /// The exhaustive destructuring makes adding a field without listing it
    /// here a compile error.
    fn flags(&self) -> [(&'static str, bool); 12] {
        let Errors {
            punycode,
            check_hyphens,
            check_bidi,
            start_combining_mark,
            invalid_mapping,
            nfc,
            disallowed_by_std3_ascii_rules,
            disallowed_mapped_in_std3,
            disallowed_character,
            too_long_for_dns,
            too_short_for_dns,
            disallowed_in_idna_2008,
        } = *self;
        [
            ("punycode", punycode),
            ("check_hyphens", check_hyphens),
            ("check_bidi", check_bidi),
            ("start_combining_mark", start_combining_mark),
            ("invalid_mapping", invalid_mapping),
            ("nfc", nfc),
            (
                "disallowed_by_std3_ascii_rules",
                disallowed_by_std3_ascii_rules,
            ),
            ("disallowed_mapped_in_std3", disallowed_mapped_in_std3),
            ("disallowed_character", disallowed_character),
            ("too_long_for_dns", too_long_for_dns),
            ("too_short_for_dns", too_short_for_dns),
            ("disallowed_in_idna_2008", disallowed_in_idna_2008),
        ]
    }

    /// Returns `true` when at least one error flag is set.
    fn is_err(&self) -> bool {
        self.flags().iter().any(|&(_, set)| set)
    }
}

/// Prints the names of the flags that are set, e.g. `Errors { punycode, nfc }`;
/// with no flag set the output is `Errors { }`.
impl fmt::Debug for Errors {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let flags = self.flags();
        let mut wrote_any = false;

        f.write_str("Errors { ")?;
        for &(name, _) in flags.iter().filter(|&&(_, set)| set) {
            if wrote_any {
                f.write_str(", ")?;
            }
            f.write_str(name)?;
            wrote_any = true;
        }
        f.write_str(if wrote_any { " }" } else { "}" })
    }
}

/// `Ok(())` when no flag is set, otherwise `Err` carrying the flags.
impl From<Errors> for Result<(), Errors> {
    fn from(e: Errors) -> Result<(), Errors> {
        if e.is_err() {
            Err(e)
        } else {
            Ok(())
        }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for Errors {}

/// Same text as the `Debug` representation.
impl fmt::Display for Errors {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        <Self as fmt::Debug>::fmt(self, f)
    }
}
#[cfg(test)]
mod tests {
    use super::{find_char, Mapping};

    /// The characters the mapper passes through without a table lookup must
    /// all be `Valid` in the mapping table.
    #[test]
    fn mapping_fast_path() {
        let fast_path_chars = "-.".chars().chain('0'..='9').chain('a'..='z');
        for c in fast_path_chars {
            assert_matches!(find_char(c), &Mapping::Valid);
        }
    }
}