| /* This Source Code Form is subject to the terms of the Mozilla Public |
| * License, v. 2.0. If a copy of the MPL was not distributed with this |
| * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
| |
| // https://drafts.csswg.org/css-syntax/#tokenization |
| |
| use self::Token::*; |
| use crate::cow_rc_str::CowRcStr; |
| use crate::parser::ParserState; |
| use std::char; |
| use std::ops::Range; |
| |
| #[cfg(not(feature = "dummy_match_byte"))] |
| use cssparser_macros::match_byte; |
| |
// Fallback used when the `cssparser_macros` procedural macro is disabled:
// expands to a plain `match` on the byte value, with no jump-table
// optimization. The arms are passed through verbatim.
#[cfg(feature = "dummy_match_byte")]
macro_rules! match_byte {
    ($value:expr, $($rest:tt)* ) => {
        match $value {
            $(
                $rest
            )+
        }
    };
}
| |
/// One of the pieces the CSS input is broken into.
///
/// Some components use `Cow` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers. Note that `url( <string-token> )` is represented by a
    /// `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed &str makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}
| |
| impl Token<'_> { |
| /// Return whether this token represents a parse error. |
| /// |
| /// `BadUrl` and `BadString` are tokenizer-level parse errors. |
| /// |
| /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched* |
| /// and therefore parse errors when returned by one of the `Parser::next*` methods. |
| pub fn is_parse_error(&self) -> bool { |
| matches!( |
| *self, |
| BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket |
| ) |
| } |
| } |
| |
/// Low-level tokenizer over a CSS source string.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The full input being tokenized.
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line; but adjusted to
    /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
    current_line_start_position: usize,
    /// Zero-based number of the line containing `position`.
    current_line_number: u32,
    /// Whether `var()` / `env()` functions should be, or have been, observed.
    var_or_env_functions: SeenStatus,
    /// URL extracted from a `sourceMappingURL` comment, if any was seen.
    source_map_url: Option<&'a str>,
    /// URL extracted from a `sourceURL` comment, if any was seen.
    source_url: Option<&'a str>,
}

/// Tracking state for `var()` / `env()` detection (see `see_function`).
#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
    // Not watching; function sightings are ignored.
    DontCare,
    // Actively watching for `var` / `env` function names.
    LookingForThem,
    // At least one `var()` / `env()` has been seen while watching.
    SeenAtLeastOne,
}
| |
impl<'a> Tokenizer<'a> {
    /// Create a tokenizer positioned at the start of `input`.
    #[inline]
    pub fn new(input: &str) -> Tokenizer {
        Tokenizer {
            input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: 0,
            var_or_env_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    /// Start watching for `var()` / `env()` function tokens.
    #[inline]
    pub fn look_for_var_or_env_functions(&mut self) {
        self.var_or_env_functions = SeenStatus::LookingForThem;
    }

    /// Return whether a `var()` / `env()` function was seen since
    /// `look_for_var_or_env_functions`, and stop watching.
    #[inline]
    pub fn seen_var_or_env_functions(&mut self) -> bool {
        let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
        self.var_or_env_functions = SeenStatus::DontCare;
        seen
    }

    /// Record a function-name sighting. Only `var` / `env` (ASCII
    /// case-insensitive), and only while watching, flip the state.
    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if self.var_or_env_functions == SeenStatus::LookingForThem
            && (name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env"))
        {
            self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
        }
    }

    /// Consume and return the next token, or `Err(())` at end of input.
    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    /// The current byte offset as an opaque `SourcePosition`.
    #[inline]
    pub fn position(&self) -> SourcePosition {
        debug_assert!(self.input.is_char_boundary(self.position));
        SourcePosition(self.position)
    }

    /// The current line/column location. The column is derived from
    /// `current_line_start_position`, which is adjusted as multi-byte
    /// code points are consumed so the result is in UTF-16 units.
    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    /// URL from a `sourceMappingURL` comment, if one has been tokenized.
    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    /// URL from a `sourceURL` comment, if one has been tokenized.
    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    /// Snapshot the tokenizer state so it can be restored with `reset`.
    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

    /// Restore a state previously captured by `state`.
    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    /// Slice of the input from `start_pos` up to the current position.
    #[inline]
    pub(crate) fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        self.slice(start_pos..self.position())
    }

    /// Slice of the input between two positions. Both bounds must lie on
    /// code point boundaries (checked in debug builds only).
    #[inline]
    pub(crate) fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        debug_assert!(self.input.is_char_boundary(range.start.0));
        debug_assert!(self.input.is_char_boundary(range.end.0));
        unsafe { self.input.get_unchecked(range.start.0..range.end.0) }
    }

    /// The full text of the line containing the current position,
    /// found by scanning to the nearest newline on either side.
    pub fn current_source_line(&self) -> &'a str {
        let current = self.position();
        let start = self
            .slice(SourcePosition(0)..current)
            .rfind(['\r', '\n', '\x0C'])
            .map_or(0, |start| start + 1);
        let end = self
            .slice(current..SourcePosition(self.input.len()))
            .find(['\r', '\n', '\x0C'])
            .map_or(self.input.len(), |end| current.0 + end);
        self.slice(SourcePosition(start)..SourcePosition(end))
    }

    /// The byte at the current position, or `None` at EOF.
    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    // If false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.char_at(n)` will not panic.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    // Advance over N bytes in the input. This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected.
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Assumes non-EOF
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    // The byte `offset` bytes past the current position; panics past EOF.
    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount. Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    // Advance over any kind of byte, excluding newlines.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        // Continuation bytes contribute to column overcount.
        if byte & 0xF0 == 0xF0 {
            // This takes two UTF-16 characters to represent, so we
            // actually have an undercount.
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            // Note that due to the special case for the 4-byte
            // sequence intro, we must use wrapping add here.
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }

    // The code point at the current position; assumes non-EOF.
    #[inline]
    fn next_char(&self) -> char {
        unsafe { self.input.get_unchecked(self.position().0..) }
            .chars()
            .next()
            .unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state. A "\r\n" pair is consumed as one newline.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    // Whether the byte `offset` bytes ahead exists and is a newline.
    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    // Advance over (and return) the code point at the current position,
    // keeping the UTF-16 column adjustment up to date.
    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    // Whether the remaining input starts with the given bytes.
    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    /// Skip whitespace (including newlines) and comments without
    /// producing tokens.
    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => return,
            }
        }
    }

    /// Like `skip_whitespace`, but additionally skips over `<!--` and
    /// `-->` (CDO/CDC) markers.
    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}
| |
/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourcePosition);

impl SourcePosition {
    /// Returns the current byte index in the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        self.0
    }
}

/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy, Default)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourceLocation);
| |
/// Consume and return the next token per the CSS Syntax tokenization
/// algorithm (https://drafts.csswg.org/css-syntax/#tokenization).
/// Returns `Err(())` at end of input. Dispatches on the first byte;
/// multi-byte UTF-8 sequences are handled by the helper consumers.
fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
        b'"' => consume_string(tokenizer, false),
        b'#' => {
            tokenizer.advance(1);
            // `#` + ident start => id hash; `#` + digit or `-` => plain
            // hash; anything else is a lone delimiter.
            if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
            else if !tokenizer.is_eof() &&
                matches!(tokenizer.next_byte_unchecked(), b'0'..=b'9' | b'-') {
                // Any other valid case here already resulted in IDHash.
                Hash(consume_name(tokenizer))
            }
            else { Delim('#') }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim('$') }
        },
        b'\'' => consume_string(tokenizer, true),
        b'(' => { tokenizer.advance(1); ParenthesisBlock },
        b')' => { tokenizer.advance(1); CloseParenthesis },
        b'*' => {
            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim('*') }
        },
        b'+' => {
            // `+` starts a number only when followed by a digit, or by
            // `.` and a digit (e.g. `+.5`).
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => { tokenizer.advance(1); Comma },
        b'-' => {
            // `-` may start a number, the CDC marker `-->`, or an ident.
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            // `.` followed by a digit starts a number (e.g. `.5`).
            if tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit() {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => consume_numeric(tokenizer),
        b':' => { tokenizer.advance(1); Colon },
        b';' => { tokenizer.advance(1); Semicolon },
        b'<' => {
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },
        b'@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim('@') }
        },
        // NUL bytes are part of ident tokenization: they become U+FFFD
        // in consume_name.
        b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => consume_ident_like(tokenizer),
        b'[' => { tokenizer.advance(1); SquareBracketBlock },
        b'\\' => {
            // A backslash starts an escape (hence an ident) unless it
            // is immediately followed by a newline.
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim('\\') }
        },
        b']' => { tokenizer.advance(1); CloseSquareBracket },
        b'^' => {
            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim('^') }
        },
        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
        b'|' => {
            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
            else { tokenizer.advance(1); Delim('|') }
        },
        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
        b'~' => {
            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim('~') }
        },
        _ => {
            // Non-ASCII bytes start an ident; remaining ASCII bytes are
            // single-character delimiters.
            if !b.is_ascii() {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(b as char)
            }
        },
    };
    Ok(token)
}
| |
/// Consume a run of whitespace (spaces, tabs, and newlines) starting at
/// the current position and return it as a `WhiteSpace` token borrowing
/// the input. `newline` tells whether the first byte is a newline, so it
/// gets the line-counting treatment of `consume_newline`.
fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
    let start_position = tokenizer.position();
    if newline {
        tokenizer.consume_newline();
    } else {
        tokenizer.advance(1);
    }
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b' ' | b'\t' => {
                tokenizer.advance(1);
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            _ => {
                break
            }
        }
    }
    WhiteSpace(tokenizer.slice_from(start_position))
}
| |
| // Check for sourceMappingURL or sourceURL comments and update the |
| // tokenizer appropriately. |
| fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) { |
| let directive = "# sourceMappingURL="; |
| let directive_old = "@ sourceMappingURL="; |
| |
| // If there is a source map directive, extract the URL. |
| if contents.starts_with(directive) || contents.starts_with(directive_old) { |
| let contents = &contents[directive.len()..]; |
| tokenizer.source_map_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next(); |
| } |
| |
| let directive = "# sourceURL="; |
| let directive_old = "@ sourceURL="; |
| |
| // If there is a source map directive, extract the URL. |
| if contents.starts_with(directive) || contents.starts_with(directive_old) { |
| let contents = &contents[directive.len()..]; |
| tokenizer.source_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next() |
| } |
| } |
| |
/// Consume a `/* ... */` comment (the opening `/*` must be at the
/// current position) and return its contents, excluding the markers.
/// An unterminated comment runs to the end of the input. Contents are
/// also checked for sourceMappingURL / sourceURL directives.
fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
    tokenizer.advance(2); // consume "/*"
    let start_position = tokenizer.position();
    while !tokenizer.is_eof() {
        match_byte! { tokenizer.next_byte_unchecked(),
            b'*' => {
                let end_position = tokenizer.position();
                tokenizer.advance(1);
                if tokenizer.next_byte() == Some(b'/') {
                    tokenizer.advance(1);
                    let contents = tokenizer.slice(start_position..end_position);
                    check_for_source_map(tokenizer, contents);
                    return contents
                }
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            // Multi-byte sequences need the line-start adjustments so
            // column counting stays in UTF-16 units.
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }
    // EOF before "*/": the comment is everything that remained.
    let contents = tokenizer.slice_from(start_position);
    check_for_source_map(tokenizer, contents);
    contents
}
| |
| fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> { |
| match consume_quoted_string(tokenizer, single_quote) { |
| Ok(value) => QuotedString(value), |
| Err(value) => BadString(value), |
| } |
| } |
| |
/// Consume a quoted string body (the opening quote must be at the
/// current position). Returns `Ok(value)` for a terminated (or
/// EOF-terminated) string, and `Err(partial_value)` on syntax error
/// (i.e. an unescaped newline). The fast path borrows the input
/// directly; a copy into `string_bytes` is only made once an escape
/// or NUL byte requires rewriting.
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote
    // start_pos is at code point boundary, after " or '
    let start_pos = tokenizer.position();
    let mut string_bytes;
    loop {
        if tokenizer.is_eof() {
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0'
                //
                // So `string_bytes` is well-formed UTF-8.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                // Unescaped newline: bad string.
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }

    // Slow path: keep building the owned buffer, resolving escapes and
    // replacing NUL bytes with U+FFFD.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    // string_bytes is well-formed UTF-8, see other comments.
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // Escaped newline
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        // This pushes one well-formed code point
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                // else: escaped EOF, do nothing.
                continue;
            }
            b'\0' => {
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            },
        }

        // If this byte is part of a multi-byte code point,
        // we’ll end up copying the whole code point before this loop does something else.
        string_bytes.push(b);
    }

    Ok(
        // string_bytes is well-formed UTF-8, see other comments.
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}
| |
/// Whether the input at the current position would start an ident:
/// an ident-start byte (letter, `_`, NUL, or non-ASCII), a `-` followed
/// by an ident-start byte or valid escape, or a backslash not followed
/// by a newline. Does not advance the tokenizer.
#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => true,
            b'-' => {
                // A '-' needs a second byte that could itself continue
                // an ident (including another '-', as in custom properties).
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    b'\\' => !tokenizer.has_newline_at(1),
                    b => !b.is_ascii(),
                }
            },
            b'\\' => !tokenizer.has_newline_at(1),
            b => !b.is_ascii(),
        }
}
| |
| fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> { |
| let value = consume_name(tokenizer); |
| if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' { |
| tokenizer.advance(1); |
| if value.eq_ignore_ascii_case("url") { |
| consume_unquoted_url(tokenizer).unwrap_or(Function(value)) |
| } else { |
| tokenizer.see_function(&value); |
| Function(value) |
| } |
| } else { |
| Ident(value) |
| } |
| } |
| |
/// Consume an ident/name at the current position and return it.
/// The fast path borrows the input directly; once an escape or NUL
/// byte is hit, the name is copied into `value_bytes` and rewritten
/// (escapes resolved, NUL replaced by U+FFFD).
fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
    // start_pos is the end of the previous token, therefore at a code point boundary
    let start_pos = tokenizer.position();
    let mut value_bytes;
    loop {
        if tokenizer.is_eof() {
            return tokenizer.slice_from(start_pos).into();
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => tokenizer.advance(1),
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0'
                //
                // So `value_bytes` is well-formed UTF-8.
                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _b => {
                // Any other ASCII byte ends the name.
                return tokenizer.slice_from(start_pos).into();
            }
        }
    }

    // Slow path: build the owned, rewritten name.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => {
                tokenizer.advance(1);
                value_bytes.push(b) // ASCII
            }
            b'\\' => {
                if tokenizer.has_newline_at(1) { break }
                tokenizer.advance(1);
                // This pushes one well-formed code point
                consume_escape_and_write(tokenizer, &mut value_bytes)
            }
            b'\0' => {
                tokenizer.advance(1);
                value_bytes.extend("\u{FFFD}".as_bytes());
            },
            b'\x80'..=b'\xBF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.consume_continuation_byte();
                value_bytes.push(b)
            }
            b'\xC0'..=b'\xEF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.advance(1);
                value_bytes.push(b)
            }
            b'\xF0'..=b'\xFF' => {
                tokenizer.consume_4byte_intro();
                value_bytes.push(b)
            }
            _ => {
                // ASCII
                break;
            }
        }
    }
    // value_bytes is well-formed UTF-8, see other comments.
    unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}
| |
/// Map an ASCII hex digit (`0-9`, `a-f`, `A-F`) to its numeric value,
/// or `None` for any other byte.
fn byte_to_hex_digit(b: u8) -> Option<u32> {
    // `char::to_digit(16)` accepts exactly the same byte set as the
    // previous hand-written ranges, with the same values.
    (b as char).to_digit(16)
}
| |
/// Map an ASCII decimal digit (`0-9`) to its numeric value, or `None`
/// for any other byte.
fn byte_to_decimal_digit(b: u8) -> Option<u32> {
    // Equivalent to the explicit `is_ascii_digit` check + subtraction.
    (b as char).to_digit(10)
}
| |
/// Consume a numeric token (`Number`, `Percentage`, or `Dimension`)
/// starting at the current position.
fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
    // But this is always called so that there is at least one digit in \d*(\.\d+)?

    // Do all the math in f64 so that large numbers overflow to +/-inf
    // and i32::{MIN, MAX} are within range.

    let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
        b'-' => (true, -1.),
        b'+' => (true, 1.),
        _ => (false, 1.),
    };
    if has_sign {
        tokenizer.advance(1);
    }

    // Accumulate the digits before the decimal point.
    let mut integral_part: f64 = 0.;
    while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
        integral_part = integral_part * 10. + digit as f64;
        tokenizer.advance(1);
        if tokenizer.is_eof() {
            break;
        }
    }

    let mut is_integer = true;

    // A '.' only belongs to the number when a digit follows it.
    let mut fractional_part: f64 = 0.;
    if tokenizer.has_at_least(1)
        && tokenizer.next_byte_unchecked() == b'.'
        && tokenizer.byte_at(1).is_ascii_digit()
    {
        is_integer = false;
        tokenizer.advance(1); // Consume '.'
        let mut factor = 0.1;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            fractional_part += digit as f64 * factor;
            factor *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    // An 'e'/'E' only starts an exponent when followed by a digit, or a
    // sign and a digit; otherwise it would be a dimension unit.
    if tokenizer.has_at_least(1)
        && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E')
        && (tokenizer.byte_at(1).is_ascii_digit()
            || (tokenizer.has_at_least(2)
                && matches!(tokenizer.byte_at(1), b'+' | b'-')
                && tokenizer.byte_at(2).is_ascii_digit()))
    {
        is_integer = false;
        tokenizer.advance(1);
        let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
            b'-' => (true, -1.),
            b'+' => (true, 1.),
            _ => (false, 1.),
        };
        if has_sign {
            tokenizer.advance(1);
        }
        let mut exponent: f64 = 0.;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            exponent = exponent * 10. + digit as f64;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
        value *= f64::powf(10., sign * exponent);
    }

    // Integer values are saturated to the i32 range.
    let int_value = if is_integer {
        Some(if value >= i32::MAX as f64 {
            i32::MAX
        } else if value <= i32::MIN as f64 {
            i32::MIN
        } else {
            value as i32
        })
    } else {
        None
    };

    // A trailing '%' makes this a percentage token.
    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
        tokenizer.advance(1);
        return Percentage {
            unit_value: (value / 100.) as f32,
            int_value,
            has_sign,
        };
    }
    let value = value as f32;
    // A following ident is the unit of a dimension token.
    if is_ident_start(tokenizer) {
        let unit = consume_name(tokenizer);
        Dimension {
            value,
            int_value,
            has_sign,
            unit,
        }
    } else {
        Number {
            value,
            int_value,
            has_sign,
        }
    }
}
| |
// Convert `string_bytes` to a `String`.
//
// Release builds skip UTF-8 validation entirely; debug builds validate so
// that a broken invariant panics as early as possible.
//
// Safety: the caller must guarantee that `string_bytes` is well-formed UTF-8.
#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    if !cfg!(debug_assertions) {
        // Release build: trust the caller's UTF-8 invariant.
        return String::from_utf8_unchecked(string_bytes);
    }
    // Debug build: pay for validation to catch invariant violations.
    String::from_utf8(string_bytes).unwrap()
}
| |
// Try to consume an unquoted `url(…)` token, assuming `url(` itself has
// already been consumed. Returns `Err(())` (without advancing past the quote)
// when a `"` or `'` follows the whitespace, in which case the caller should
// re-tokenize the contents as a `Function` token with a quoted string instead.
fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    // This is only called after "url(", so the current position is a code point boundary.
    let start_position = tokenizer.position;
    let from_start = &tokenizer.input[tokenizer.position..];
    let mut newlines = 0;
    let mut last_newline = 0;
    let mut found_printable_char = false;
    let mut iter = from_start.bytes().enumerate();
    // First pass: skip leading whitespace (counting newlines so line-number
    // bookkeeping can be fixed up afterwards) until we hit the closing paren,
    // a quote, or the first byte of the URL proper.
    loop {
        let (offset, b) = match iter.next() {
            Some(item) => item,
            None => {
                // EOF inside `url(`: the whole rest of the input is whitespace.
                tokenizer.position = tokenizer.input.len();
                break;
            }
        };
        match_byte! { b,
            b' ' | b'\t' => {},
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            b'\r' => {
                // A "\r\n" pair counts as a single newline; count the '\r'
                // only when it is not followed by '\n'.
                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                    newlines += 1;
                    last_newline = offset;
                }
            }
            b'"' | b'\'' => return Err(()), // Do not advance
            b')' => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset + 1;
                break
            }
            _ => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset;
                found_printable_char = true;
                break
            }
        }
    }

    // Fix up the line counters for any newlines skipped above.
    if newlines > 0 {
        tokenizer.current_line_number += newlines;
        // No need for wrapping_add here, because there's no possible
        // way to wrap.
        tokenizer.current_line_start_position = start_position + last_newline + 1;
    }

    if found_printable_char {
        // This function only consumed ASCII (whitespace) bytes,
        // so the current position is a code point boundary.
        return Ok(consume_unquoted_url_internal(tokenizer));
    } else {
        return Ok(UnquotedUrl("".into()));
    }

    // Consume the body of an unquoted URL, starting at its first printable
    // byte. Borrows a slice of the input when possible; switches to an owned
    // buffer only when an escape or NUL forces the value to differ from the
    // source text.
    fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        // This function is only called with start_pos at a code point boundary.
        let start_pos = tokenizer.position();
        let mut string_bytes: Vec<u8>;
        // Zero-copy fast path: scan until the URL ends or a byte requires
        // building an owned buffer.
        loop {
            if tokenizer.is_eof() {
                return UnquotedUrl(tokenizer.slice_from(start_pos).into());
            }
            match_byte! { tokenizer.next_byte_unchecked(),
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    let value = tokenizer.slice_from(start_pos);
                    return consume_url_end(tokenizer, start_pos, value.into())
                }
                b')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return UnquotedUrl(value.into())
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
                | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos)
                },
                b'\\' | b'\0' => {
                    // * The tokenizer’s input is UTF-8 since it’s `&str`.
                    // * start_pos is at a code point boundary
                    // * so is the current position (which is before '\\' or '\0'
                    //
                    // So `string_bytes` is well-formed UTF-8.
                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                    break
                }
                b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
                b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
                _ => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                }
            }
        }
        // Slow path: keep copying bytes into `string_bytes`, decoding escapes
        // and replacing NUL with U+FFFD as we go.
        while !tokenizer.is_eof() {
            let b = tokenizer.next_byte_unchecked();
            match_byte! { b,
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // string_bytes is well-formed UTF-8, see other comments.
                    let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
                    return consume_url_end(tokenizer, start_pos, string)
                }
                b')' => {
                    tokenizer.advance(1);
                    break;
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
                | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos);
                }
                b'\\' => {
                    tokenizer.advance(1);
                    // A backslash immediately before a newline is invalid here.
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer, start_pos)
                    }

                    // This pushes one well-formed code point to string_bytes
                    consume_escape_and_write(tokenizer, &mut string_bytes)
                },
                b'\0' => {
                    tokenizer.advance(1);
                    string_bytes.extend("\u{FFFD}".as_bytes());
                }
                b'\x80'..=b'\xBF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_continuation_byte();
                    string_bytes.push(b);
                }
                b'\xF0'..=b'\xFF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_4byte_intro();
                    string_bytes.push(b);
                }
                // If this byte is part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                b => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                    string_bytes.push(b)
                }
            }
        }
        UnquotedUrl(
            // string_bytes is well-formed UTF-8, see other comments.
            unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
        )
    }

    // Consume trailing whitespace after a URL value. A closing paren completes
    // the token; any other byte makes the whole construct a BadUrl.
    fn consume_url_end<'a>(
        tokenizer: &mut Tokenizer<'a>,
        start_pos: SourcePosition,
        string: CowRcStr<'a>,
    ) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    tokenizer.advance(1);
                    break
                }
                b' ' | b'\t' => { tokenizer.advance(1); }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                    return consume_bad_url(tokenizer, start_pos);
                }
            }
        }
        UnquotedUrl(string)
    }

    // Error recovery: skip the remains of an invalid url() token, honoring
    // escaped ')' and '\', and return everything seen as a BadUrl token.
    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
        // Consume up to the closing )
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    let contents = tokenizer.slice_from(start_pos).into();
                    tokenizer.advance(1);
                    return BadUrl(contents)
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                        tokenizer.advance(1); // Skip an escaped ')' or '\'
                    }
                }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                }
            }
        }
        // EOF before the closing paren: everything consumed is the bad URL.
        BadUrl(tokenizer.slice_from(start_pos).into())
    }
}
| |
| // (value, number of digits up to 6) |
| fn consume_hex_digits(tokenizer: &mut Tokenizer<'_>) -> (u32, u32) { |
| let mut value = 0; |
| let mut digits = 0; |
| while digits < 6 && !tokenizer.is_eof() { |
| match byte_to_hex_digit(tokenizer.next_byte_unchecked()) { |
| Some(digit) => { |
| value = value * 16 + digit; |
| digits += 1; |
| tokenizer.advance(1); |
| } |
| None => break, |
| } |
| } |
| (value, digits) |
| } |
| |
| // Same constraints as consume_escape except it writes into `bytes` the result |
| // instead of returning it. |
| fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) { |
| bytes.extend( |
| consume_escape(tokenizer) |
| .encode_utf8(&mut [0; 4]) |
| .as_bytes(), |
| ) |
| } |
| |
| // Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed |
| // and that the next input character has already been verified |
| // to not be a newline. |
| fn consume_escape(tokenizer: &mut Tokenizer) -> char { |
| if tokenizer.is_eof() { |
| return '\u{FFFD}'; |
| } // Escaped EOF |
| match_byte! { tokenizer.next_byte_unchecked(), |
| b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => { |
| let (c, _) = consume_hex_digits(tokenizer); |
| if !tokenizer.is_eof() { |
| match_byte! { tokenizer.next_byte_unchecked(), |
| b' ' | b'\t' => { |
| tokenizer.advance(1) |
| } |
| b'\n' | b'\x0C' | b'\r' => { |
| tokenizer.consume_newline(); |
| } |
| _ => {} |
| } |
| } |
| static REPLACEMENT_CHAR: char = '\u{FFFD}'; |
| if c != 0 { |
| let c = char::from_u32(c); |
| c.unwrap_or(REPLACEMENT_CHAR) |
| } else { |
| REPLACEMENT_CHAR |
| } |
| }, |
| b'\0' => { |
| tokenizer.advance(1); |
| '\u{FFFD}' |
| } |
| _ => tokenizer.consume_char(), |
| } |
| } |