/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use self::Token::*;
use crate::cow_rc_str::CowRcStr;
use crate::parser::ParserState;
use std::char;
use std::ops::Range;
#[cfg(not(feature = "dummy_match_byte"))]
use cssparser_macros::match_byte;
#[cfg(feature = "dummy_match_byte")]
macro_rules! match_byte {
($value:expr, $($rest:tt)* ) => {
match $value {
$(
$rest
)+
}
};
}
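// Note: when the `dummy_match_byte` feature is off, `match_byte!` comes from
// `cssparser_macros` and is expected to compile each byte dispatch into
// table-driven code; the fallback macro above expands to an ordinary `match`
// with the same semantics.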
/// One of the pieces the CSS input is broken into.
///
/// Some components use `CowRcStr` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),
    /// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),
/// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
///
/// The value does not include the `#` marker.
Hash(CowRcStr<'a>),
/// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
///
/// The value does not include the `#` marker.
IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.
    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),
    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers. Note that `url( <string-token> )` is
    /// represented by a `Function` token.
    UnquotedUrl(CowRcStr<'a>),
/// A `<delim-token>`
Delim(char),
    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the `<An+B>` micro syntax. (See the `parse_nth` function.)
has_sign: bool,
/// The value as a float
value: f32,
        /// If the original source did not include a fractional part, the value as an integer.
int_value: Option<i32>,
},
    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
/// Whether the number had a `+` or `-` sign.
has_sign: bool,
/// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
unit_value: f32,
        /// If the original source did not include a fractional part, the value as an integer.
/// It is **not** divided by 100.
int_value: Option<i32>,
},
    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the `<An+B>` micro syntax. (See the `parse_nth` function.)
has_sign: bool,
/// The value as a float
value: f32,
        /// If the original source did not include a fractional part, the value as an integer.
int_value: Option<i32>,
/// The unit, e.g. "px" in `12px`
unit: CowRcStr<'a>,
},
    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),
/// A comment.
///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed `&str` makes it cheap).
///
/// The value does not include the `/*` `*/` markers.
Comment(&'a str),
/// A `:` `<colon-token>`
Colon, // :
/// A `;` `<semicolon-token>`
Semicolon, // ;
/// A `,` `<comma-token>`
Comma, // ,
    /// A `<include-match-token>` `~=`
    IncludeMatch,
    /// A `<dash-match-token>` `|=`
    DashMatch,
    /// A `<prefix-match-token>` `^=`
    PrefixMatch,
    /// A `<suffix-match-token>` `$=`
    SuffixMatch,
    /// A `<substring-match-token>` `*=`
    SubstringMatch,
    /// A `<CDO-token>` `<!--`
    CDO,
    /// A `<CDC-token>` `-->`
    CDC,
    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),
/// A `<(-token>`
ParenthesisBlock,
/// A `<[-token>`
SquareBracketBlock,
/// A `<{-token>`
CurlyBracketBlock,
/// A `<bad-url-token>`
///
/// This token always indicates a parse error.
BadUrl(CowRcStr<'a>),
/// A `<bad-string-token>`
///
/// This token always indicates a parse error.
BadString(CowRcStr<'a>),
/// A `<)-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
CloseParenthesis,
/// A `<]-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
CloseSquareBracket,
/// A `<}-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
CloseCurlyBracket,
}
impl<'a> Token<'a> {
/// Return whether this token represents a parse error.
///
/// `BadUrl` and `BadString` are tokenizer-level parse errors.
///
/// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
/// and therefore parse errors when returned by one of the `Parser::next*` methods.
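    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `Token` is re-exported at the crate root:
    ///
    /// ```
    /// # use cssparser::Token;
    /// assert!(Token::CloseParenthesis.is_parse_error());
    /// assert!(!Token::Comma.is_parse_error());
    /// ```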
pub fn is_parse_error(&self) -> bool {
matches!(
*self,
BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
)
}
}
#[derive(Clone)]
pub struct Tokenizer<'a> {
input: &'a str,
/// Counted in bytes, not code points. From 0.
position: usize,
    /// The position at the start of the current line, adjusted so that
    /// computing the column gives the result in units of UTF-16 code units.
current_line_start_position: usize,
current_line_number: u32,
var_or_env_functions: SeenStatus,
source_map_url: Option<&'a str>,
source_url: Option<&'a str>,
}
#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
DontCare,
LookingForThem,
SeenAtLeastOne,
}
impl<'a> Tokenizer<'a> {
#[inline]
pub fn new(input: &str) -> Tokenizer {
Tokenizer {
input,
position: 0,
current_line_start_position: 0,
current_line_number: 0,
var_or_env_functions: SeenStatus::DontCare,
source_map_url: None,
source_url: None,
}
}
#[inline]
pub fn look_for_var_or_env_functions(&mut self) {
self.var_or_env_functions = SeenStatus::LookingForThem;
}
#[inline]
pub fn seen_var_or_env_functions(&mut self) -> bool {
let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
self.var_or_env_functions = SeenStatus::DontCare;
seen
}
#[inline]
pub fn see_function(&mut self, name: &str) {
if self.var_or_env_functions == SeenStatus::LookingForThem
&& (name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env"))
{
self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
}
}
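    /// Return the next token, or `Err(())` at end of input.
    ///
    /// A usage sketch (marked `ignore`: whether `Tokenizer` itself is publicly
    /// reachable depends on the crate's re-exports):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new("a { color: red }");
    /// while let Ok(token) = tokenizer.next() {
    ///     dbg!(token);
    /// }
    /// ```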
#[inline]
pub fn next(&mut self) -> Result<Token<'a>, ()> {
next_token(self)
}
#[inline]
pub fn position(&self) -> SourcePosition {
debug_assert!(self.input.is_char_boundary(self.position));
SourcePosition(self.position)
}
#[inline]
pub fn current_source_location(&self) -> SourceLocation {
SourceLocation {
line: self.current_line_number,
column: (self.position - self.current_line_start_position + 1) as u32,
}
}
#[inline]
pub fn current_source_map_url(&self) -> Option<&'a str> {
self.source_map_url
}
#[inline]
pub fn current_source_url(&self) -> Option<&'a str> {
self.source_url
}
#[inline]
pub fn state(&self) -> ParserState {
ParserState {
position: self.position,
current_line_start_position: self.current_line_start_position,
current_line_number: self.current_line_number,
at_start_of: None,
}
}
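    /// Rewind the tokenizer to a previously saved state.
    ///
    /// A save/rewind sketch (`ignore`d for the same reason as the example on `next`):
    ///
    /// ```ignore
    /// let saved = tokenizer.state();
    /// let _ = tokenizer.next();   // speculatively consume one token
    /// tokenizer.reset(&saved);    // rewind as if nothing had been consumed
    /// ```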
#[inline]
pub fn reset(&mut self, state: &ParserState) {
self.position = state.position;
self.current_line_start_position = state.current_line_start_position;
self.current_line_number = state.current_line_number;
}
#[inline]
pub(crate) fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
self.slice(start_pos..self.position())
}
#[inline]
pub(crate) fn slice(&self, range: Range<SourcePosition>) -> &'a str {
debug_assert!(self.input.is_char_boundary(range.start.0));
debug_assert!(self.input.is_char_boundary(range.end.0));
unsafe { self.input.get_unchecked(range.start.0..range.end.0) }
}
pub fn current_source_line(&self) -> &'a str {
let current = self.position();
let start = self
.slice(SourcePosition(0)..current)
.rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
.map_or(0, |start| start + 1);
let end = self
.slice(current..SourcePosition(self.input.len()))
.find(|c| matches!(c, '\r' | '\n' | '\x0C'))
.map_or(self.input.len(), |end| current.0 + end);
self.slice(SourcePosition(start)..SourcePosition(end))
}
#[inline]
pub fn next_byte(&self) -> Option<u8> {
if self.is_eof() {
None
} else {
Some(self.input.as_bytes()[self.position])
}
}
    // If this returns false, `tokenizer.next_char()` will not panic.
#[inline]
fn is_eof(&self) -> bool {
!self.has_at_least(0)
}
    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.byte_at(n)` will not panic.
#[inline]
fn has_at_least(&self, n: usize) -> bool {
self.position + n < self.input.len()
}
// Advance over N bytes in the input. This function can advance
// over ASCII bytes (excluding newlines), or UTF-8 sequence
// leaders (excluding leaders for 4-byte sequences).
#[inline]
pub fn advance(&mut self, n: usize) {
if cfg!(debug_assertions) {
// Each byte must either be an ASCII byte or a sequence
// leader, but not a 4-byte leader; also newlines are
// rejected.
for i in 0..n {
let b = self.byte_at(i);
debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
}
}
self.position += n
}
// Assumes non-EOF
#[inline]
fn next_byte_unchecked(&self) -> u8 {
self.byte_at(0)
}
#[inline]
fn byte_at(&self, offset: usize) -> u8 {
self.input.as_bytes()[self.position + offset]
}
// Advance over a single byte; the byte must be a UTF-8 sequence
// leader for a 4-byte sequence.
#[inline]
fn consume_4byte_intro(&mut self) {
debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
self.position += 1;
}
// Advance over a single byte; the byte must be a UTF-8
// continuation byte.
#[inline]
fn consume_continuation_byte(&mut self) {
debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
// Continuation bytes contribute to column overcount. Note
// that due to the special case for the 4-byte sequence intro,
// we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
self.position += 1;
}
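    // Worked example of the column bookkeeping above: the column is computed
    // as `position - current_line_start_position + 1`, in UTF-16 units. A
    // 3-byte UTF-8 sequence (one UTF-16 unit) advances `position` by 3, so
    // its two continuation bytes each add 1 to `current_line_start_position`,
    // cancelling the overcount. A 4-byte sequence (two UTF-16 units, a
    // surrogate pair) advances `position` by 4; its intro byte subtracts 1
    // and its three continuation bytes add 3, the required net adjustment of +2.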
// Advance over any kind of byte, excluding newlines.
#[inline(never)]
fn consume_known_byte(&mut self, byte: u8) {
debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
self.position += 1;
// Continuation bytes contribute to column overcount.
if byte & 0xF0 == 0xF0 {
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
} else if byte & 0xC0 == 0x80 {
// Note that due to the special case for the 4-byte
// sequence intro, we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
}
}
#[inline]
fn next_char(&self) -> char {
unsafe { self.input.get_unchecked(self.position().0..) }
.chars()
.next()
.unwrap()
}
// Given that a newline has been seen, advance over the newline
// and update the state.
#[inline]
fn consume_newline(&mut self) {
let byte = self.next_byte_unchecked();
debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
self.position += 1;
if byte == b'\r' && self.next_byte() == Some(b'\n') {
self.position += 1;
}
self.current_line_start_position = self.position;
self.current_line_number += 1;
}
#[inline]
fn has_newline_at(&self, offset: usize) -> bool {
self.position + offset < self.input.len()
&& matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
}
#[inline]
fn consume_char(&mut self) -> char {
let c = self.next_char();
let len_utf8 = c.len_utf8();
self.position += len_utf8;
// Note that due to the special case for the 4-byte sequence
// intro, we must use wrapping add here.
self.current_line_start_position = self
.current_line_start_position
.wrapping_add(len_utf8 - c.len_utf16());
c
}
#[inline]
fn starts_with(&self, needle: &[u8]) -> bool {
self.input.as_bytes()[self.position..].starts_with(needle)
}
pub fn skip_whitespace(&mut self) {
while !self.is_eof() {
match_byte! { self.next_byte_unchecked(),
b' ' | b'\t' => {
self.advance(1)
},
b'\n' | b'\x0C' | b'\r' => {
self.consume_newline();
},
b'/' => {
if self.starts_with(b"/*") {
consume_comment(self);
} else {
return
}
}
_ => return,
}
}
}
pub fn skip_cdc_and_cdo(&mut self) {
while !self.is_eof() {
match_byte! { self.next_byte_unchecked(),
b' ' | b'\t' => {
self.advance(1)
},
b'\n' | b'\x0C' | b'\r' => {
self.consume_newline();
},
b'/' => {
if self.starts_with(b"/*") {
consume_comment(self);
} else {
return
}
}
b'<' => {
if self.starts_with(b"<!--") {
self.advance(4)
} else {
return
}
}
b'-' => {
if self.starts_with(b"-->") {
self.advance(3)
} else {
return
}
}
_ => {
return
}
}
}
}
}
/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);
impl SourcePosition {
/// Returns the current byte index in the original input.
#[inline]
pub fn byte_index(&self) -> usize {
self.0
}
}
/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct SourceLocation {
/// The line number, starting at 0 for the first line.
pub line: u32,
    /// The column number within a line, starting at 1 for the first character of the line.
/// Column numbers are counted in UTF-16 code units.
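    ///
    /// For example, in the line `a🌍b` the `a` is at column 1, the `🌍`
    /// (one code point, but a surrogate pair in UTF-16) occupies columns 2
    /// and 3, and the `b` is at column 4.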
pub column: u32,
}
fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
if tokenizer.is_eof() {
return Err(());
}
let b = tokenizer.next_byte_unchecked();
let token = match_byte! { b,
b' ' | b'\t' => {
consume_whitespace(tokenizer, false)
},
b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
b'"' => consume_string(tokenizer, false),
b'#' => {
tokenizer.advance(1);
if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
else if !tokenizer.is_eof() &&
matches!(tokenizer.next_byte_unchecked(), b'0'..=b'9' | b'-') {
// Any other valid case here already resulted in IDHash.
Hash(consume_name(tokenizer))
}
else { Delim('#') }
},
b'$' => {
if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
else { tokenizer.advance(1); Delim('$') }
},
b'\'' => consume_string(tokenizer, true),
b'(' => { tokenizer.advance(1); ParenthesisBlock },
b')' => { tokenizer.advance(1); CloseParenthesis },
b'*' => {
if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
else { tokenizer.advance(1); Delim('*') }
},
b'+' => {
if (
tokenizer.has_at_least(1)
&& tokenizer.byte_at(1).is_ascii_digit()
) || (
tokenizer.has_at_least(2)
&& tokenizer.byte_at(1) == b'.'
&& tokenizer.byte_at(2).is_ascii_digit()
) {
consume_numeric(tokenizer)
} else {
tokenizer.advance(1);
Delim('+')
}
},
b',' => { tokenizer.advance(1); Comma },
b'-' => {
if (
tokenizer.has_at_least(1)
&& tokenizer.byte_at(1).is_ascii_digit()
) || (
tokenizer.has_at_least(2)
&& tokenizer.byte_at(1) == b'.'
&& tokenizer.byte_at(2).is_ascii_digit()
) {
consume_numeric(tokenizer)
} else if tokenizer.starts_with(b"-->") {
tokenizer.advance(3);
CDC
} else if is_ident_start(tokenizer) {
consume_ident_like(tokenizer)
} else {
tokenizer.advance(1);
Delim('-')
}
},
b'.' => {
if tokenizer.has_at_least(1)
&& tokenizer.byte_at(1).is_ascii_digit() {
consume_numeric(tokenizer)
} else {
tokenizer.advance(1);
Delim('.')
}
}
b'/' => {
if tokenizer.starts_with(b"/*") {
Comment(consume_comment(tokenizer))
} else {
tokenizer.advance(1);
Delim('/')
}
}
b'0'..=b'9' => consume_numeric(tokenizer),
b':' => { tokenizer.advance(1); Colon },
b';' => { tokenizer.advance(1); Semicolon },
b'<' => {
if tokenizer.starts_with(b"<!--") {
tokenizer.advance(4);
CDO
} else {
tokenizer.advance(1);
Delim('<')
}
},
b'@' => {
tokenizer.advance(1);
if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
else { Delim('@') }
},
b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => consume_ident_like(tokenizer),
b'[' => { tokenizer.advance(1); SquareBracketBlock },
b'\\' => {
if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
else { tokenizer.advance(1); Delim('\\') }
},
b']' => { tokenizer.advance(1); CloseSquareBracket },
b'^' => {
if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
else { tokenizer.advance(1); Delim('^') }
},
b'{' => { tokenizer.advance(1); CurlyBracketBlock },
b'|' => {
if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
else { tokenizer.advance(1); Delim('|') }
},
b'}' => { tokenizer.advance(1); CloseCurlyBracket },
b'~' => {
if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
else { tokenizer.advance(1); Delim('~') }
},
_ => {
if !b.is_ascii() {
consume_ident_like(tokenizer)
} else {
tokenizer.advance(1);
Delim(b as char)
}
},
};
Ok(token)
}
fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
let start_position = tokenizer.position();
if newline {
tokenizer.consume_newline();
} else {
tokenizer.advance(1);
}
while !tokenizer.is_eof() {
let b = tokenizer.next_byte_unchecked();
match_byte! { b,
b' ' | b'\t' => {
tokenizer.advance(1);
}
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
_ => {
break
}
}
}
WhiteSpace(tokenizer.slice_from(start_position))
}
// Check for sourceMappingURL or sourceURL comments and update the
// tokenizer appropriately.
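// For example, a stylesheet ending in the comment
//     /*# sourceMappingURL=style.css.map */
// makes `current_source_map_url()` return `Some("style.css.map")`. (The `#`
// and legacy `@` prefixes have the same length, so slicing by
// `directive.len()` below is valid for both spellings.)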
fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
let directive = "# sourceMappingURL=";
let directive_old = "@ sourceMappingURL=";
// If there is a source map directive, extract the URL.
if contents.starts_with(directive) || contents.starts_with(directive_old) {
let contents = &contents[directive.len()..];
tokenizer.source_map_url = contents
.split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
.next()
}
let directive = "# sourceURL=";
let directive_old = "@ sourceURL=";
    // If there is a source URL directive, extract the URL.
if contents.starts_with(directive) || contents.starts_with(directive_old) {
let contents = &contents[directive.len()..];
tokenizer.source_url = contents
.split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
.next()
}
}
fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
tokenizer.advance(2); // consume "/*"
let start_position = tokenizer.position();
while !tokenizer.is_eof() {
match_byte! { tokenizer.next_byte_unchecked(),
b'*' => {
let end_position = tokenizer.position();
tokenizer.advance(1);
if tokenizer.next_byte() == Some(b'/') {
tokenizer.advance(1);
let contents = tokenizer.slice(start_position..end_position);
check_for_source_map(tokenizer, contents);
return contents
}
}
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_ => {
// ASCII or other leading byte.
tokenizer.advance(1);
}
}
}
let contents = tokenizer.slice_from(start_position);
check_for_source_map(tokenizer, contents);
contents
}
fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
match consume_quoted_string(tokenizer, single_quote) {
Ok(value) => QuotedString(value),
Err(value) => BadString(value),
}
}
/// Return `Err(..)` with the string parsed so far on syntax error (i.e. an unescaped newline).
fn consume_quoted_string<'a>(
tokenizer: &mut Tokenizer<'a>,
single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
tokenizer.advance(1); // Skip the initial quote
// start_pos is at code point boundary, after " or '
let start_pos = tokenizer.position();
let mut string_bytes;
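    // First pass: scan without copying. If the string contains no escape and
    // no NUL byte, the returned value borrows directly from the input;
    // otherwise we switch to the owned `string_bytes` buffer and continue in
    // the second loop below.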
loop {
if tokenizer.is_eof() {
return Ok(tokenizer.slice_from(start_pos).into());
}
match_byte! { tokenizer.next_byte_unchecked(),
b'"' => {
if !single_quote {
let value = tokenizer.slice_from(start_pos);
tokenizer.advance(1);
return Ok(value.into())
}
tokenizer.advance(1);
}
b'\'' => {
if single_quote {
let value = tokenizer.slice_from(start_pos);
tokenizer.advance(1);
return Ok(value.into())
}
tokenizer.advance(1);
}
b'\\' | b'\0' => {
// * The tokenizer’s input is UTF-8 since it’s `&str`.
// * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
//
// So `string_bytes` is well-formed UTF-8.
string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
break
}
b'\n' | b'\r' | b'\x0C' => {
return Err(tokenizer.slice_from(start_pos).into())
},
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_ => {
// ASCII or other leading byte.
tokenizer.advance(1);
}
}
}
while !tokenizer.is_eof() {
let b = tokenizer.next_byte_unchecked();
match_byte! { b,
b'\n' | b'\r' | b'\x0C' => {
return Err(
// string_bytes is well-formed UTF-8, see other comments.
unsafe {
from_utf8_release_unchecked(string_bytes)
}.into()
);
}
b'"' => {
tokenizer.advance(1);
if !single_quote {
break;
}
}
b'\'' => {
tokenizer.advance(1);
if single_quote {
break;
}
}
b'\\' => {
tokenizer.advance(1);
if !tokenizer.is_eof() {
match tokenizer.next_byte_unchecked() {
// Escaped newline
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
// This pushes one well-formed code point
_ => consume_escape_and_write(tokenizer, &mut string_bytes)
}
}
// else: escaped EOF, do nothing.
continue;
}
b'\0' => {
tokenizer.advance(1);
string_bytes.extend("\u{FFFD}".as_bytes());
continue;
}
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_ => {
// ASCII or other leading byte.
tokenizer.advance(1);
},
}
// If this byte is part of a multi-byte code point,
// we’ll end up copying the whole code point before this loop does something else.
string_bytes.push(b);
}
Ok(
// string_bytes is well-formed UTF-8, see other comments.
unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
)
}
#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
!tokenizer.is_eof()
&& match_byte! { tokenizer.next_byte_unchecked(),
b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => true,
b'-' => {
tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
true
}
b'\\' => !tokenizer.has_newline_at(1),
b => !b.is_ascii(),
}
},
b'\\' => !tokenizer.has_newline_at(1),
b => !b.is_ascii(),
}
}
fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
let value = consume_name(tokenizer);
if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
tokenizer.advance(1);
if value.eq_ignore_ascii_case("url") {
consume_unquoted_url(tokenizer).unwrap_or(Function(value))
} else {
tokenizer.see_function(&value);
Function(value)
}
} else {
Ident(value)
}
}
fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
// start_pos is the end of the previous token, therefore at a code point boundary
let start_pos = tokenizer.position();
let mut value_bytes;
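    // Same two-pass strategy as `consume_quoted_string`: borrow from the
    // input until an escape or NUL forces a copy into `value_bytes`.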
loop {
if tokenizer.is_eof() {
return tokenizer.slice_from(start_pos).into();
}
match_byte! { tokenizer.next_byte_unchecked(),
b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => tokenizer.advance(1),
b'\\' | b'\0' => {
// * The tokenizer’s input is UTF-8 since it’s `&str`.
// * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
//
// So `value_bytes` is well-formed UTF-8.
value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
break
}
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_b => {
return tokenizer.slice_from(start_pos).into();
}
}
}
while !tokenizer.is_eof() {
let b = tokenizer.next_byte_unchecked();
match_byte! { b,
b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => {
tokenizer.advance(1);
value_bytes.push(b) // ASCII
}
b'\\' => {
if tokenizer.has_newline_at(1) { break }
tokenizer.advance(1);
// This pushes one well-formed code point
consume_escape_and_write(tokenizer, &mut value_bytes)
}
b'\0' => {
tokenizer.advance(1);
value_bytes.extend("\u{FFFD}".as_bytes());
},
b'\x80'..=b'\xBF' => {
// This byte *is* part of a multi-byte code point,
// we’ll end up copying the whole code point before this loop does something else.
tokenizer.consume_continuation_byte();
value_bytes.push(b)
}
b'\xC0'..=b'\xEF' => {
// This byte *is* part of a multi-byte code point,
// we’ll end up copying the whole code point before this loop does something else.
tokenizer.advance(1);
value_bytes.push(b)
}
b'\xF0'..=b'\xFF' => {
tokenizer.consume_4byte_intro();
value_bytes.push(b)
}
_ => {
// ASCII
break;
}
}
}
// string_bytes is well-formed UTF-8, see other comments.
unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}
fn byte_to_hex_digit(b: u8) -> Option<u32> {
Some(match_byte! { b,
b'0' ..= b'9' => b - b'0',
b'a' ..= b'f' => b - b'a' + 10,
b'A' ..= b'F' => b - b'A' + 10,
_ => {
return None
}
} as u32)
}
fn byte_to_decimal_digit(b: u8) -> Option<u32> {
if b.is_ascii_digit() {
Some((b - b'0') as u32)
} else {
None
}
}
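// A sketch of the tokens this produces (illustrative, not a doctest):
//   "12px"   -> Dimension { value: 12.0, int_value: Some(12), has_sign: false, unit: "px" }
//   "+1.5e2" -> Number { value: 150.0, int_value: None, has_sign: true }
//   "50%"    -> Percentage { unit_value: 0.5, int_value: Some(50), has_sign: false }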
fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
    // This is only ever called when there is at least one digit in \d*(\.\d+)?.
// Do all the math in f64 so that large numbers overflow to +/-inf
// and i32::{MIN, MAX} are within range.
let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
b'-' => (true, -1.),
b'+' => (true, 1.),
_ => (false, 1.),
};
if has_sign {
tokenizer.advance(1);
}
let mut integral_part: f64 = 0.;
while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
integral_part = integral_part * 10. + digit as f64;
tokenizer.advance(1);
if tokenizer.is_eof() {
break;
}
}
let mut is_integer = true;
let mut fractional_part: f64 = 0.;
if tokenizer.has_at_least(1)
&& tokenizer.next_byte_unchecked() == b'.'
&& tokenizer.byte_at(1).is_ascii_digit()
{
is_integer = false;
tokenizer.advance(1); // Consume '.'
let mut factor = 0.1;
while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
fractional_part += digit as f64 * factor;
factor *= 0.1;
tokenizer.advance(1);
if tokenizer.is_eof() {
break;
}
}
}
let mut value = sign * (integral_part + fractional_part);
if tokenizer.has_at_least(1)
&& matches!(tokenizer.next_byte_unchecked(), b'e' | b'E')
&& (tokenizer.byte_at(1).is_ascii_digit()
|| (tokenizer.has_at_least(2)
&& matches!(tokenizer.byte_at(1), b'+' | b'-')
&& tokenizer.byte_at(2).is_ascii_digit()))
{
is_integer = false;
tokenizer.advance(1);
let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
b'-' => (true, -1.),
b'+' => (true, 1.),
_ => (false, 1.),
};
if has_sign {
tokenizer.advance(1);
}
let mut exponent: f64 = 0.;
while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
exponent = exponent * 10. + digit as f64;
tokenizer.advance(1);
if tokenizer.is_eof() {
break;
}
}
value *= f64::powf(10., sign * exponent);
}
let int_value = if is_integer {
Some(if value >= i32::MAX as f64 {
i32::MAX
} else if value <= i32::MIN as f64 {
i32::MIN
} else {
value as i32
})
} else {
None
};
if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
tokenizer.advance(1);
return Percentage {
unit_value: (value / 100.) as f32,
int_value,
has_sign,
};
}
let value = value as f32;
if is_ident_start(tokenizer) {
let unit = consume_name(tokenizer);
Dimension {
value,
int_value,
has_sign,
unit,
}
} else {
Number {
value,
int_value,
has_sign,
}
}
}
#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
if cfg!(debug_assertions) {
String::from_utf8(string_bytes).unwrap()
} else {
String::from_utf8_unchecked(string_bytes)
}
}
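// A sketch of the outcomes for unquoted URLs (illustrative):
//   "url( foo.png )" -> Ok(UnquotedUrl("foo.png"))
//   "url('x.png')"   -> Err(()), so the caller falls back to Function("url")
//   "url(b(ad)"      -> Ok(BadUrl("b(ad"))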
fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
// This is only called after "url(", so the current position is a code point boundary.
let start_position = tokenizer.position;
let from_start = &tokenizer.input[tokenizer.position..];
let mut newlines = 0;
let mut last_newline = 0;
let mut found_printable_char = false;
let mut iter = from_start.bytes().enumerate();
loop {
let (offset, b) = match iter.next() {
Some(item) => item,
None => {
tokenizer.position = tokenizer.input.len();
break;
}
};
match_byte! { b,
b' ' | b'\t' => {},
b'\n' | b'\x0C' => {
newlines += 1;
last_newline = offset;
}
b'\r' => {
if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
newlines += 1;
last_newline = offset;
}
}
b'"' | b'\'' => return Err(()), // Do not advance
b')' => {
// Don't use advance, because we may be skipping
// newlines here, and we want to avoid the assert.
tokenizer.position += offset + 1;
break
}
_ => {
// Don't use advance, because we may be skipping
// newlines here, and we want to avoid the assert.
tokenizer.position += offset;
found_printable_char = true;
break
}
}
}
if newlines > 0 {
tokenizer.current_line_number += newlines;
// No need for wrapping_add here, because there's no possible
// way to wrap.
tokenizer.current_line_start_position = start_position + last_newline + 1;
}
if found_printable_char {
// This function only consumed ASCII (whitespace) bytes,
// so the current position is a code point boundary.
return Ok(consume_unquoted_url_internal(tokenizer));
} else {
return Ok(UnquotedUrl("".into()));
}
fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
// This function is only called with start_pos at a code point boundary.
let start_pos = tokenizer.position();
let mut string_bytes: Vec<u8>;
loop {
if tokenizer.is_eof() {
return UnquotedUrl(tokenizer.slice_from(start_pos).into());
}
match_byte! { tokenizer.next_byte_unchecked(),
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
let value = tokenizer.slice_from(start_pos);
return consume_url_end(tokenizer, start_pos, value.into())
}
b')' => {
let value = tokenizer.slice_from(start_pos);
tokenizer.advance(1);
return UnquotedUrl(value.into())
}
b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
| b'"' | b'\'' | b'(' => {
tokenizer.advance(1);
return consume_bad_url(tokenizer, start_pos)
},
b'\\' | b'\0' => {
// * The tokenizer’s input is UTF-8 since it’s `&str`.
// * start_pos is at a code point boundary
                    // * so is the current position (which is before '\\' or '\0')
//
// So `string_bytes` is well-formed UTF-8.
string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
break
}
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_ => {
// ASCII or other leading byte.
tokenizer.advance(1);
}
}
}
while !tokenizer.is_eof() {
let b = tokenizer.next_byte_unchecked();
match_byte! { b,
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
// string_bytes is well-formed UTF-8, see other comments.
let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
return consume_url_end(tokenizer, start_pos, string)
}
b')' => {
tokenizer.advance(1);
break;
}
b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
| b'"' | b'\'' | b'(' => {
tokenizer.advance(1);
return consume_bad_url(tokenizer, start_pos);
}
b'\\' => {
tokenizer.advance(1);
if tokenizer.has_newline_at(0) {
return consume_bad_url(tokenizer, start_pos)
}
// This pushes one well-formed code point to string_bytes
consume_escape_and_write(tokenizer, &mut string_bytes)
},
b'\0' => {
tokenizer.advance(1);
string_bytes.extend("\u{FFFD}".as_bytes());
}
b'\x80'..=b'\xBF' => {
// We’ll end up copying the whole code point
// before this loop does something else.
tokenizer.consume_continuation_byte();
string_bytes.push(b);
}
b'\xF0'..=b'\xFF' => {
// We’ll end up copying the whole code point
// before this loop does something else.
tokenizer.consume_4byte_intro();
string_bytes.push(b);
}
// If this byte is part of a multi-byte code point,
// we’ll end up copying the whole code point before this loop does something else.
b => {
// ASCII or other leading byte.
tokenizer.advance(1);
string_bytes.push(b)
}
}
}
UnquotedUrl(
// string_bytes is well-formed UTF-8, see other comments.
unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
)
}
fn consume_url_end<'a>(
tokenizer: &mut Tokenizer<'a>,
start_pos: SourcePosition,
string: CowRcStr<'a>,
) -> Token<'a> {
while !tokenizer.is_eof() {
match_byte! { tokenizer.next_byte_unchecked(),
b')' => {
tokenizer.advance(1);
break
}
b' ' | b'\t' => { tokenizer.advance(1); }
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
b => {
tokenizer.consume_known_byte(b);
return consume_bad_url(tokenizer, start_pos);
}
}
}
UnquotedUrl(string)
}
fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
// Consume up to the closing )
while !tokenizer.is_eof() {
match_byte! { tokenizer.next_byte_unchecked(),
b')' => {
let contents = tokenizer.slice_from(start_pos).into();
tokenizer.advance(1);
return BadUrl(contents)
}
b'\\' => {
tokenizer.advance(1);
if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
tokenizer.advance(1); // Skip an escaped ')' or '\'
}
}
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
b => {
tokenizer.consume_known_byte(b);
}
}
}
BadUrl(tokenizer.slice_from(start_pos).into())
}
}
// (value, number of digits up to 6)
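// For example, on "26 " this returns (0x26, 2) and leaves the tokenizer at
// the space; on "1234567" it stops after six digits with (0x123456, 6).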
fn consume_hex_digits(tokenizer: &mut Tokenizer<'_>) -> (u32, u32) {
let mut value = 0;
let mut digits = 0;
while digits < 6 && !tokenizer.is_eof() {
match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
Some(digit) => {
value = value * 16 + digit;
digits += 1;
tokenizer.advance(1);
}
None => break,
}
}
(value, digits)
}
// Same constraints as consume_escape, except that it writes the result into
// `bytes` instead of returning it.
fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
bytes.extend(
consume_escape(tokenizer)
.encode_utf8(&mut [0; 4])
.as_bytes(),
)
}
// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
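// For example, given the input remaining after the backslash:
//   "41 "    -> 'A'    (hex escape; the single trailing whitespace is consumed)
//   "0"      -> U+FFFD (U+0000 is not allowed here)
//   "110000" -> U+FFFD (beyond the Unicode range)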
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
if tokenizer.is_eof() {
return '\u{FFFD}';
} // Escaped EOF
match_byte! { tokenizer.next_byte_unchecked(),
b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
let (c, _) = consume_hex_digits(tokenizer);
if !tokenizer.is_eof() {
match_byte! { tokenizer.next_byte_unchecked(),
b' ' | b'\t' => {
tokenizer.advance(1)
}
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
_ => {}
}
}
static REPLACEMENT_CHAR: char = '\u{FFFD}';
if c != 0 {
let c = char::from_u32(c);
c.unwrap_or(REPLACEMENT_CHAR)
} else {
REPLACEMENT_CHAR
}
},
b'\0' => {
tokenizer.advance(1);
'\u{FFFD}'
}
_ => tokenizer.consume_char(),
}
}