Introduce rustc_lexer

The idea here is to make a reusable library out of the existing rust-lexer, by separating out pure lexing and rustc-specific concerns, like spans, error reporting and interning. So, rustc_lexer operates directly on `&str` and produces simple tokens, each of which is a pair of a type-tag and a bit of original text. It does not report errors, instead storing them as flags on the token.
2024-10-14 04:23:37 +00:00 · 2019-05-06 11:53:40 +03:00 · 2019-05-06 11:53:40 +03:00 · 395ee0b79f
parent 95b1fe560d
commit 395ee0b79f
15 changed files with 1337 additions and 1261 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2972,6 +2972,13 @@ dependencies = [
 "tempfile 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

+[[package]]
+name = "rustc_lexer"
+version = "0.1.0"
+dependencies = [
+ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "rustc_lint"
 version = "0.0.0"
@ -3622,6 +3629,7 @@ dependencies = [
 "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
 "rustc_data_structures 0.0.0",
 "rustc_errors 0.0.0",
+ "rustc_lexer 0.1.0",
 "rustc_macros 0.1.0",
 "rustc_target 0.0.0",
 "scoped-tls 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
--- a/src/librustc_lexer/Cargo.toml
+++ b/src/librustc_lexer/Cargo.toml
@ -0,0 +1,9 @@
+[package]
+authors = ["The Rust Project Developers"]
+name = "rustc_lexer"
+version = "0.1.0"
+edition = "2018"
+
+# Note that this crate purposefully does not depend on other rustc crates
+[dependencies]
+unicode-xid = { version = "0.1.0", optional = true }
--- a/src/librustc_lexer/src/cursor.rs
+++ b/src/librustc_lexer/src/cursor.rs
@ -0,0 +1,57 @@
+use std::str::Chars;
+
+pub(crate) struct Cursor<'a> {
+    initial_len: usize,
+    chars: Chars<'a>,
+    #[cfg(debug_assertions)]
+    prev: char,
+}
+
+pub(crate) const EOF_CHAR: char = '\0';
+
+impl<'a> Cursor<'a> {
+    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+        Cursor {
+            initial_len: input.len(),
+            chars: input.chars(),
+            #[cfg(debug_assertions)]
+            prev: EOF_CHAR,
+        }
+    }
+    /// For debug assertions only
+    pub(crate) fn prev(&self) -> char {
+        #[cfg(debug_assertions)]
+        {
+            self.prev
+        }
+
+        #[cfg(not(debug_assertions))]
+        {
+            '\0'
+        }
+    }
+    pub(crate) fn nth_char(&self, n: usize) -> char {
+        self.chars().nth(n).unwrap_or(EOF_CHAR)
+    }
+    pub(crate) fn is_eof(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+    pub(crate) fn len_consumed(&self) -> usize {
+        self.initial_len - self.chars.as_str().len()
+    }
+    /// Returns an iterator over the remaining characters.
+    fn chars(&self) -> Chars<'a> {
+        self.chars.clone()
+    }
+    /// Moves to the next character.
+    pub(crate) fn bump(&mut self) -> Option<char> {
+        let c = self.chars.next()?;
+
+        #[cfg(debug_assertions)]
+        {
+            self.prev = c;
+        }
+
+        Some(c)
+    }
+}
--- a/src/librustc_lexer/src/lib.rs
+++ b/src/librustc_lexer/src/lib.rs
@ -0,0 +1,710 @@
+// We want to be able to build this crate with a stable compiler, so feature
+// flags should be optional.
+#![cfg_attr(not(feature = "unicode-xid"), feature(rustc_private))]
+#![cfg_attr(not(feature = "unicode-xid"), feature(unicode_internals))]
+
+mod cursor;
+
+use crate::cursor::{Cursor, EOF_CHAR};
+
+pub struct Token {
+    pub kind: TokenKind,
+    pub len: usize,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum TokenKind {
+    LineComment,
+    BlockComment { terminated: bool },
+    Whitespace,
+    Ident,
+    RawIdent,
+    Literal { kind: LiteralKind, suffix_start: usize },
+    Lifetime { starts_with_number: bool },
+    Semi,
+    Comma,
+    DotDotDot,
+    DotDotEq,
+    DotDot,
+    Dot,
+    OpenParen,
+    CloseParen,
+    OpenBrace,
+    CloseBrace,
+    OpenBracket,
+    CloseBracket,
+    At,
+    Pound,
+    Tilde,
+    Question,
+    ColonColon,
+    Colon,
+    Dollar,
+    EqEq,
+    Eq,
+    FatArrow,
+    Ne,
+    Not,
+    Le,
+    LArrow,
+    Lt,
+    ShlEq,
+    Shl,
+    Ge,
+    Gt,
+    ShrEq,
+    Shr,
+    RArrow,
+    Minus,
+    MinusEq,
+    And,
+    AndAnd,
+    AndEq,
+    Or,
+    OrOr,
+    OrEq,
+    PlusEq,
+    Plus,
+    StarEq,
+    Star,
+    SlashEq,
+    Slash,
+    CaretEq,
+    Caret,
+    PercentEq,
+    Percent,
+    Unknown,
+}
+use self::TokenKind::*;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LiteralKind {
+    Int { base: Base, empty_int: bool },
+    Float { base: Base, empty_exponent: bool },
+    Char { terminated: bool },
+    Byte { terminated: bool },
+    Str { terminated: bool },
+    ByteStr { terminated: bool },
+    RawStr { n_hashes: usize, started: bool, terminated: bool },
+    RawByteStr { n_hashes: usize, started: bool, terminated: bool },
+}
+use self::LiteralKind::*;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum Base {
+    Binary,
+    Octal,
+    Hexadecimal,
+    Decimal,
+}
+
+impl Token {
+    fn new(kind: TokenKind, len: usize) -> Token {
+        Token { kind, len }
+    }
+}
+
+pub fn strip_shebang(input: &str) -> Option<usize> {
+    debug_assert!(!input.is_empty());
+    if !input.starts_with("#!") || input.starts_with("#![") {
+        return None;
+    }
+    Some(input.find('\n').unwrap_or(input.len()))
+}
+
+pub fn first_token(input: &str) -> Token {
+    debug_assert!(!input.is_empty());
+    Cursor::new(input).advance_token()
+}
+
+pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
+    std::iter::from_fn(move || {
+        if input.is_empty() {
+            return None;
+        }
+        let token = first_token(input);
+        input = &input[token.len..];
+        Some(token)
+    })
+}
+
+impl Cursor<'_> {
+    fn advance_token(&mut self) -> Token {
+        let first_char = self.bump().unwrap();
+        let token_kind = match first_char {
+            '/' => match self.nth_char(0) {
+                '/' => self.line_comment(),
+                '*' => self.block_comment(),
+                _ => {
+                    if self.eat_assign() {
+                        SlashEq
+                    } else {
+                        Slash
+                    }
+                }
+            },
+            c if character_properties::is_whitespace(c) => self.whitespace(),
+            'r' => match (self.nth_char(0), self.nth_char(1)) {
+                ('#', c1) if character_properties::is_id_start(c1) => self.raw_ident(),
+                ('#', _) | ('"', _) => {
+                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let suffix_start = self.len_consumed();
+                    if terminated {
+                        self.eat_literal_suffix();
+                    }
+                    let kind = RawStr { n_hashes, started, terminated };
+                    Literal { kind, suffix_start }
+                }
+                _ => self.ident(),
+            },
+            'b' => match (self.nth_char(0), self.nth_char(1)) {
+                ('\'', _) => {
+                    self.bump();
+                    let terminated = self.single_quoted_string();
+                    let suffix_start = self.len_consumed();
+                    if terminated {
+                        self.eat_literal_suffix();
+                    }
+                    let kind = Byte { terminated };
+                    Literal { kind, suffix_start }
+                }
+                ('"', _) => {
+                    self.bump();
+                    let terminated = self.double_quoted_string();
+                    let suffix_start = self.len_consumed();
+                    if terminated {
+                        self.eat_literal_suffix();
+                    }
+                    let kind = ByteStr { terminated };
+                    Literal { kind, suffix_start }
+                }
+                ('r', '"') | ('r', '#') => {
+                    self.bump();
+                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let suffix_start = self.len_consumed();
+                    if terminated {
+                        self.eat_literal_suffix();
+                    }
+                    let kind = RawByteStr { n_hashes, started, terminated };
+                    Literal { kind, suffix_start }
+                }
+                _ => self.ident(),
+            },
+            c if character_properties::is_id_start(c) => self.ident(),
+            c @ '0'..='9' => {
+                let literal_kind = self.number(c);
+                let suffix_start = self.len_consumed();
+                self.eat_literal_suffix();
+                TokenKind::Literal { kind: literal_kind, suffix_start }
+            }
+            ';' => Semi,
+            ',' => Comma,
+            '.' => {
+                if self.nth_char(0) == '.' {
+                    self.bump();
+                    if self.nth_char(0) == '.' {
+                        self.bump();
+                        DotDotDot
+                    } else if self.nth_char(0) == '=' {
+                        self.bump();
+                        DotDotEq
+                    } else {
+                        DotDot
+                    }
+                } else {
+                    Dot
+                }
+            }
+            '(' => OpenParen,
+            ')' => CloseParen,
+            '{' => OpenBrace,
+            '}' => CloseBrace,
+            '[' => OpenBracket,
+            ']' => CloseBracket,
+            '@' => At,
+            '#' => Pound,
+            '~' => Tilde,
+            '?' => Question,
+            ':' => {
+                if self.nth_char(0) == ':' {
+                    self.bump();
+                    ColonColon
+                } else {
+                    Colon
+                }
+            }
+            '$' => Dollar,
+            '=' => {
+                if self.nth_char(0) == '=' {
+                    self.bump();
+                    EqEq
+                } else if self.nth_char(0) == '>' {
+                    self.bump();
+                    FatArrow
+                } else {
+                    Eq
+                }
+            }
+            '!' => {
+                if self.nth_char(0) == '=' {
+                    self.bump();
+                    Ne
+                } else {
+                    Not
+                }
+            }
+            '<' => match self.nth_char(0) {
+                '=' => {
+                    self.bump();
+                    Le
+                }
+                '<' => {
+                    self.bump();
+                    if self.eat_assign() { ShlEq } else { Shl }
+                }
+                '-' => {
+                    self.bump();
+                    LArrow
+                }
+                _ => Lt,
+            },
+            '>' => match self.nth_char(0) {
+                '=' => {
+                    self.bump();
+                    Ge
+                }
+                '>' => {
+                    self.bump();
+                    if self.eat_assign() { ShrEq } else { Shr }
+                }
+                _ => Gt,
+            },
+            '-' => {
+                if self.nth_char(0) == '>' {
+                    self.bump();
+                    RArrow
+                } else {
+                    if self.eat_assign() { MinusEq } else { Minus }
+                }
+            }
+            '&' => {
+                if self.nth_char(0) == '&' {
+                    self.bump();
+                    AndAnd
+                } else {
+                    if self.eat_assign() { AndEq } else { And }
+                }
+            }
+            '|' => {
+                if self.nth_char(0) == '|' {
+                    self.bump();
+                    OrOr
+                } else {
+                    if self.eat_assign() { OrEq } else { Or }
+                }
+            }
+            '+' => {
+                if self.eat_assign() {
+                    PlusEq
+                } else {
+                    Plus
+                }
+            }
+            '*' => {
+                if self.eat_assign() {
+                    StarEq
+                } else {
+                    Star
+                }
+            }
+            '^' => {
+                if self.eat_assign() {
+                    CaretEq
+                } else {
+                    Caret
+                }
+            }
+            '%' => {
+                if self.eat_assign() {
+                    PercentEq
+                } else {
+                    Percent
+                }
+            }
+            '\'' => self.lifetime_or_char(),
+            '"' => {
+                let terminated = self.double_quoted_string();
+                let suffix_start = self.len_consumed();
+                if terminated {
+                    self.eat_literal_suffix();
+                }
+                let kind = Str { terminated };
+                Literal { kind, suffix_start }
+            }
+            _ => Unknown,
+        };
+        Token::new(token_kind, self.len_consumed())
+    }
+
+    fn line_comment(&mut self) -> TokenKind {
+        debug_assert!(self.prev() == '/' && self.nth_char(0) == '/');
+        self.bump();
+        loop {
+            match self.nth_char(0) {
+                '\n' => break,
+                '\r' if self.nth_char(1) == '\n' => break,
+                EOF_CHAR if self.is_eof() => break,
+                _ => {
+                    self.bump();
+                }
+            }
+        }
+        LineComment
+    }
+
+    fn block_comment(&mut self) -> TokenKind {
+        debug_assert!(self.prev() == '/' && self.nth_char(0) == '*');
+        self.bump();
+        let mut depth = 1usize;
+        while let Some(c) = self.bump() {
+            match c {
+                '/' if self.nth_char(0) == '*' => {
+                    self.bump();
+                    depth += 1;
+                }
+                '*' if self.nth_char(0) == '/' => {
+                    self.bump();
+                    depth -= 1;
+                    if depth == 0 {
+                        break;
+                    }
+                }
+                _ => (),
+            }
+        }
+
+        BlockComment { terminated: depth == 0 }
+    }
+
+    fn whitespace(&mut self) -> TokenKind {
+        debug_assert!(character_properties::is_whitespace(self.prev()));
+        while character_properties::is_whitespace(self.nth_char(0)) {
+            self.bump();
+        }
+        Whitespace
+    }
+
+    fn raw_ident(&mut self) -> TokenKind {
+        debug_assert!(
+            self.prev() == 'r'
+                && self.nth_char(0) == '#'
+                && character_properties::is_id_start(self.nth_char(1))
+        );
+        self.bump();
+        self.bump();
+        while character_properties::is_id_continue(self.nth_char(0)) {
+            self.bump();
+        }
+        RawIdent
+    }
+
+    fn ident(&mut self) -> TokenKind {
+        debug_assert!(character_properties::is_id_start(self.prev()));
+        while character_properties::is_id_continue(self.nth_char(0)) {
+            self.bump();
+        }
+        Ident
+    }
+
+    fn number(&mut self, first_digit: char) -> LiteralKind {
+        debug_assert!('0' <= self.prev() && self.prev() <= '9');
+        let mut base = Base::Decimal;
+        if first_digit == '0' {
+            let has_digits = match self.nth_char(0) {
+                'b' => {
+                    base = Base::Binary;
+                    self.bump();
+                    self.eat_decimal_digits()
+                }
+                'o' => {
+                    base = Base::Octal;
+                    self.bump();
+                    self.eat_decimal_digits()
+                }
+                'x' => {
+                    base = Base::Hexadecimal;
+                    self.bump();
+                    self.eat_hexadecimal_digits()
+                }
+                '0'..='9' | '_' | '.' | 'e' | 'E' => {
+                    self.eat_decimal_digits();
+                    true
+                }
+                // just a 0
+                _ => return Int { base, empty_int: false },
+            };
+            if !has_digits {
+                return Int { base, empty_int: true };
+            }
+        } else {
+            self.eat_decimal_digits();
+        };
+
+        match self.nth_char(0) {
+            // Don't be greedy if this is actually an
+            // integer literal followed by field/method access or a range pattern
+            // (`0..2` and `12.foo()`)
+            '.' if self.nth_char(1) != '.'
+                && !character_properties::is_id_start(self.nth_char(1)) =>
+            {
+                // might have stuff after the ., and if it does, it needs to start
+                // with a number
+                self.bump();
+                let mut empty_exponent = false;
+                if self.nth_char(0).is_digit(10) {
+                    self.eat_decimal_digits();
+                    match self.nth_char(0) {
+                        'e' | 'E' => {
+                            self.bump();
+                            empty_exponent = self.float_exponent().is_err()
+                        }
+                        _ => (),
+                    }
+                }
+                Float { base, empty_exponent }
+            }
+            'e' | 'E' => {
+                self.bump();
+                let empty_exponent = self.float_exponent().is_err();
+                Float { base, empty_exponent }
+            }
+            _ => Int { base, empty_int: false },
+        }
+    }
+
+    fn lifetime_or_char(&mut self) -> TokenKind {
+        debug_assert!(self.prev() == '\'');
+        let mut starts_with_number = false;
+        if (character_properties::is_id_start(self.nth_char(0))
+            || self.nth_char(0).is_digit(10) && {
+                starts_with_number = true;
+                true
+            })
+            && self.nth_char(1) != '\''
+        {
+            self.bump();
+            while character_properties::is_id_continue(self.nth_char(0)) {
+                self.bump();
+            }
+
+            return if self.nth_char(0) == '\'' {
+                self.bump();
+                let kind = Char { terminated: true };
+                Literal { kind, suffix_start: self.len_consumed() }
+            } else {
+                Lifetime { starts_with_number }
+            };
+        }
+        let terminated = self.single_quoted_string();
+        let suffix_start = self.len_consumed();
+        if terminated {
+            self.eat_literal_suffix();
+        }
+        let kind = Char { terminated };
+        return Literal { kind, suffix_start };
+    }
+
+    fn single_quoted_string(&mut self) -> bool {
+        debug_assert!(self.prev() == '\'');
+        // parse `'''` as a single char literal
+        if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' {
+            self.bump();
+        }
+        let mut first = true;
+        loop {
+            match self.nth_char(0) {
+                '/' if !first => break,
+                '\n' if self.nth_char(1) != '\'' => break,
+                '\r' if self.nth_char(1) == '\n' => break,
+                EOF_CHAR if self.is_eof() => break,
+                '\'' => {
+                    self.bump();
+                    return true;
+                }
+                '\\' => {
+                    self.bump();
+                    self.bump();
+                }
+                _ => {
+                    self.bump();
+                }
+            }
+            first = false;
+        }
+        false
+    }
+
+    fn double_quoted_string(&mut self) -> bool {
+        debug_assert!(self.prev() == '"');
+        loop {
+            match self.nth_char(0) {
+                '"' => {
+                    self.bump();
+                    return true;
+                }
+                EOF_CHAR if self.is_eof() => return false,
+                '\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => {
+                    self.bump();
+                }
+                _ => (),
+            }
+            self.bump();
+        }
+    }
+
+    fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
+        debug_assert!(self.prev() == 'r');
+        let n_hashes = {
+            let mut acc: usize = 0;
+            loop {
+                match self.bump() {
+                    Some('#') => acc += 1,
+                    Some('"') => break acc,
+                    None | Some(_) => return (acc, false, false),
+                }
+            }
+        };
+
+        loop {
+            match self.bump() {
+                Some('"') => {
+                    let mut acc = n_hashes;
+                    while self.nth_char(0) == '#' && acc > 0 {
+                        self.bump();
+                        acc -= 1;
+                    }
+                    if acc == 0 {
+                        return (n_hashes, true, true);
+                    }
+                }
+                Some(_) => (),
+                None => return (n_hashes, true, false),
+            }
+        }
+    }
+
+    fn eat_decimal_digits(&mut self) -> bool {
+        let mut has_digits = false;
+        loop {
+            match self.nth_char(0) {
+                '_' => {
+                    self.bump();
+                }
+                '0'..='9' => {
+                    has_digits = true;
+                    self.bump();
+                }
+                _ => break,
+            }
+        }
+        has_digits
+    }
+
+    fn eat_hexadecimal_digits(&mut self) -> bool {
+        let mut has_digits = false;
+        loop {
+            match self.nth_char(0) {
+                '_' => {
+                    self.bump();
+                }
+                '0'..='9' | 'a'..='f' | 'A'..='F' => {
+                    has_digits = true;
+                    self.bump();
+                }
+                _ => break,
+            }
+        }
+        has_digits
+    }
+
+    fn float_exponent(&mut self) -> Result<(), ()> {
+        debug_assert!(self.prev() == 'e' || self.prev() == 'E');
+        if self.nth_char(0) == '-' || self.nth_char(0) == '+' {
+            self.bump();
+        }
+        if self.eat_decimal_digits() { Ok(()) } else { Err(()) }
+    }
+
+    fn eat_literal_suffix(&mut self) {
+        if !character_properties::is_id_start(self.nth_char(0)) {
+            return;
+        }
+        self.bump();
+
+        while character_properties::is_id_continue(self.nth_char(0)) {
+            self.bump();
+        }
+    }
+
+    fn eat_assign(&mut self) -> bool {
+        if self.nth_char(0) == '=' {
+            self.bump();
+            true
+        } else {
+            false
+        }
+    }
+}
+
+pub mod character_properties {
+    // this is Pattern_White_Space
+    #[cfg(feature = "unicode-xid")]
+    pub fn is_whitespace(c: char) -> bool {
+        match c {
+            '\u{0009}' | '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{0020}'
+            | '\u{0085}' | '\u{200E}' | '\u{200F}' | '\u{2028}' | '\u{2029}' => true,
+            _ => false,
+        }
+    }
+
+    #[cfg(not(feature = "unicode-xid"))]
+    pub fn is_whitespace(c: char) -> bool {
+        core::unicode::property::Pattern_White_Space(c)
+    }
+
+    // this is XID_Start OR '_' (which formally is not a XID_Start)
+    #[cfg(feature = "unicode-xid")]
+    pub fn is_id_start(c: char) -> bool {
+        ('a' <= c && c <= 'z')
+            || ('A' <= c && c <= 'Z')
+            || c == '_'
+            || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c))
+    }
+
+    #[cfg(not(feature = "unicode-xid"))]
+    pub fn is_id_start(c: char) -> bool {
+        ('a' <= c && c <= 'z')
+            || ('A' <= c && c <= 'Z')
+            || c == '_'
+            || (c > '\x7f' && c.is_xid_start())
+    }
+
+    // this is XID_Continue
+    #[cfg(feature = "unicode-xid")]
+    pub fn is_id_continue(c: char) -> bool {
+        ('a' <= c && c <= 'z')
+            || ('A' <= c && c <= 'Z')
+            || ('0' <= c && c <= '9')
+            || c == '_'
+            || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
+    }
+
+    #[cfg(not(feature = "unicode-xid"))]
+    pub fn is_id_continue(c: char) -> bool {
+        ('a' <= c && c <= 'z')
+            || ('A' <= c && c <= 'Z')
+            || ('0' <= c && c <= '9')
+            || c == '_'
+            || (c > '\x7f' && c.is_xid_continue())
+    }
+}
--- a/src/libsyntax/Cargo.toml
+++ b/src/libsyntax/Cargo.toml
@ -18,6 +18,7 @@ lazy_static = "1.0.0"
 syntax_pos = { path = "../libsyntax_pos" }
 errors = { path = "../librustc_errors", package = "rustc_errors" }
 rustc_data_structures = { path = "../librustc_data_structures" }
+rustc_lexer = { path = "../librustc_lexer" }
 rustc_macros = { path = "../librustc_macros" }
 rustc_target = { path = "../librustc_target" }
 smallvec = { version = "0.6.7", features = ["union", "may_dangle"] }
--- a/src/libsyntax/parse/lexer/comments.rs
+++ b/src/libsyntax/parse/lexer/comments.rs
@ -2,11 +2,10 @@

 use crate::ast;
 use crate::source_map::SourceMap;
-use crate::parse::lexer::{is_block_doc_comment, is_pattern_whitespace};
-use crate::parse::lexer::{self, ParseSess, StringReader};
+use crate::parse::lexer::is_block_doc_comment;
+use crate::parse::lexer::ParseSess;

 use syntax_pos::{BytePos, CharPos, Pos, FileName};
-use log::debug;

 use std::usize;

@ -135,66 +134,6 @@ fn horizontal_trim(lines: Vec<String>) -> Vec<String> {
    panic!("not a doc-comment: {}", comment);
 }

-fn push_blank_line_comment(rdr: &StringReader<'_>, comments: &mut Vec<Comment>) {
-    debug!(">>> blank-line comment");
-    comments.push(Comment {
-        style: BlankLine,
-        lines: Vec::new(),
-        pos: rdr.pos,
-    });
-}
-
-fn consume_whitespace_counting_blank_lines(
-    rdr: &mut StringReader<'_>,
-    comments: &mut Vec<Comment>
-) {
-    while is_pattern_whitespace(rdr.ch) && !rdr.is_eof() {
-        if rdr.ch_is('\n') {
-            push_blank_line_comment(rdr, &mut *comments);
-        }
-        rdr.bump();
-    }
-}
-
-fn read_shebang_comment(rdr: &mut StringReader<'_>,
-                        code_to_the_left: bool,
-                        comments: &mut Vec<Comment>) {
-    debug!(">>> shebang comment");
-    let p = rdr.pos;
-    debug!("<<< shebang comment");
-    comments.push(Comment {
-        style: if code_to_the_left { Trailing } else { Isolated },
-        lines: vec![rdr.read_one_line_comment()],
-        pos: p,
-    });
-}
-
-fn read_line_comments(rdr: &mut StringReader<'_>,
-                      code_to_the_left: bool,
-                      comments: &mut Vec<Comment>) {
-    debug!(">>> line comments");
-    let p = rdr.pos;
-    let mut lines: Vec<String> = Vec::new();
-    while rdr.ch_is('/') && rdr.nextch_is('/') {
-        let line = rdr.read_one_line_comment();
-        debug!("{}", line);
-        // Doc comments are not put in comments.
-        if is_doc_comment(&line[..]) {
-            break;
-        }
-        lines.push(line);
-        rdr.consume_non_eol_whitespace();
-    }
-    debug!("<<< line comments");
-    if !lines.is_empty() {
-        comments.push(Comment {
-            style: if code_to_the_left { Trailing } else { Isolated },
-            lines,
-            pos: p,
-        });
-    }
-}
-
 /// Returns `None` if the first `col` chars of `s` contain a non-whitespace char.
 /// Otherwise returns `Some(k)` where `k` is first char offset after that leading
 /// whitespace. Note that `k` may be outside bounds of `s`.
@ -209,170 +148,103 @@ fn all_whitespace(s: &str, col: CharPos) -> Option<usize> {
    Some(idx)
 }

-fn trim_whitespace_prefix_and_push_line(lines: &mut Vec<String>, s: String, col: CharPos) {
+/// Strips the leading whitespace that `all_whitespace` finds within the first `col`
+/// chars of `s`. Returns `s` unchanged when `all_whitespace` yields `None`, and `""`
+/// when the offset it yields is at or past the end of `s`.
+fn trim_whitespace_prefix(s: &str, col: CharPos) -> &str {
    let len = s.len();
-    let s1 = match all_whitespace(&s[..], col) {
-        Some(col) => {
-            if col < len {
-                s[col..len].to_string()
-            } else {
-                String::new()
-            }
-        }
+    // `s` is already a `&str`; pass it directly instead of borrowing it again.
+    match all_whitespace(s, col) {
+        Some(col) => if col < len { &s[col..] } else { "" },
        None => s,
-    };
-    debug!("pushing line: {}", s1);
-    lines.push(s1);
+    }
 }

-fn read_block_comment(rdr: &mut StringReader<'_>,
-                      code_to_the_left: bool,
-                      comments: &mut Vec<Comment>) {
-    debug!(">>> block comment");
-    let p = rdr.pos;
-    let mut lines: Vec<String> = Vec::new();
-
-    // Count the number of chars since the start of the line by rescanning.
-    let src_index = rdr.src_index(rdr.source_file.line_begin_pos(rdr.pos));
-    let end_src_index = rdr.src_index(rdr.pos);
-    assert!(src_index <= end_src_index,
-        "src_index={}, end_src_index={}, line_begin_pos={}",
-        src_index, end_src_index, rdr.source_file.line_begin_pos(rdr.pos).to_u32());
-
-    let col = CharPos(rdr.src[src_index..end_src_index].chars().count());
-
-    rdr.bump();
-    rdr.bump();
-
-    let mut curr_line = String::from("/*");
-
-    // doc-comments are not really comments, they are attributes
-    if (rdr.ch_is('*') && !rdr.nextch_is('*')) || rdr.ch_is('!') {
-        while !(rdr.ch_is('*') && rdr.nextch_is('/')) && !rdr.is_eof() {
-            curr_line.push(rdr.ch.unwrap());
-            rdr.bump();
-        }
-        if !rdr.is_eof() {
-            curr_line.push_str("*/");
-            rdr.bump();
-            rdr.bump();
-        }
-        if is_block_doc_comment(&curr_line[..]) {
-            return;
-        }
-        assert!(!curr_line.contains('\n'));
-        lines.push(curr_line);
-    } else {
-        let mut level: isize = 1;
-        while level > 0 {
-            debug!("=== block comment level {}", level);
-            if rdr.is_eof() {
-                rdr.fatal_span_(rdr.pos, rdr.pos, "unterminated block comment").raise();
-            }
-            if rdr.ch_is('\n') {
-                trim_whitespace_prefix_and_push_line(&mut lines, curr_line, col);
-                curr_line = String::new();
-                rdr.bump();
-            } else {
-                curr_line.push(rdr.ch.unwrap());
-                if rdr.ch_is('/') && rdr.nextch_is('*') {
-                    rdr.bump();
-                    rdr.bump();
-                    curr_line.push('*');
-                    level += 1;
-                } else {
-                    if rdr.ch_is('*') && rdr.nextch_is('/') {
-                        rdr.bump();
-                        rdr.bump();
-                        curr_line.push('/');
-                        level -= 1;
-                    } else {
-                        rdr.bump();
-                    }
-                }
-            }
-        }
-        if !curr_line.is_empty() {
-            trim_whitespace_prefix_and_push_line(&mut lines, curr_line, col);
-        }
+/// Splits `text` into lines with `str::lines`. The first line is stored as-is;
+/// every later line is passed through `trim_whitespace_prefix` with `col` first.
+fn split_block_comment_into_lines(
+    text: &str,
+    col: CharPos,
+) -> Vec<String> {
+    let mut res: Vec<String> = vec![];
+    let mut lines = text.lines();
+    // The first line (if there is one) is pushed without any trimming.
+    res.extend(lines.next().map(|it| it.to_string()));
+    // Every remaining line goes through `trim_whitespace_prefix` before it is stored.
+    for line in lines {
+        res.push(trim_whitespace_prefix(line, col).to_string())
    }
-
-    let mut style = if code_to_the_left {
-        Trailing
-    } else {
-        Isolated
-    };
-    rdr.consume_non_eol_whitespace();
-    if !rdr.is_eof() && !rdr.ch_is('\n') && lines.len() == 1 {
-        style = Mixed;
-    }
-    debug!("<<< block comment");
-    comments.push(Comment {
-        style,
-        lines,
-        pos: p,
-    });
-}
-
-
-fn consume_comment(rdr: &mut StringReader<'_>,
-                   comments: &mut Vec<Comment>,
-                   code_to_the_left: &mut bool,
-                   anything_to_the_left: &mut bool) {
-    debug!(">>> consume comment");
-    if rdr.ch_is('/') && rdr.nextch_is('/') {
-        read_line_comments(rdr, *code_to_the_left, comments);
-        *code_to_the_left = false;
-        *anything_to_the_left = false;
-    } else if rdr.ch_is('/') && rdr.nextch_is('*') {
-        read_block_comment(rdr, *code_to_the_left, comments);
-        *anything_to_the_left = true;
-    } else if rdr.ch_is('#') && rdr.nextch_is('!') {
-        read_shebang_comment(rdr, *code_to_the_left, comments);
-        *code_to_the_left = false;
-        *anything_to_the_left = false;
-    } else {
-        panic!();
-    }
-    debug!("<<< consume comment");
+    res
 }

 // it appears this function is called only from pprust... that's
 // probably not a good thing.
-pub fn gather_comments(sess: &ParseSess, path: FileName, src: String) -> Vec<Comment>
-{
+/// Registers `src` as a source file in a fresh `SourceMap`, tokenizes its text with
+/// `rustc_lexer::tokenize`, and returns the comments found, in source order.
+///
+/// Line and block comments for which `is_doc_comment` / `is_block_doc_comment` returns
+/// true are skipped. Every newline after the first inside a single whitespace token
+/// adds a `BlankLine` entry.
+pub fn gather_comments(sess: &ParseSess, path: FileName, src: String) -> Vec<Comment> {
    let cm = SourceMap::new(sess.source_map().path_mapping().clone());
    let source_file = cm.new_source_file(path, src);
-    let mut rdr = lexer::StringReader::new(sess, source_file, None);
+    let text = (*source_file.src.as_ref().unwrap()).clone();

+    let text: &str = text.as_str();
+    let start_bpos = source_file.start_pos;
+    // Byte offset of the current token within `text`.
+    let mut pos = 0;
    let mut comments: Vec<Comment> = Vec::new();
-    let mut code_to_the_left = false; // Only code
-    let mut anything_to_the_left = false; // Code or comments
+    // Set by any token that is neither whitespace nor a comment; cleared again by a
+    // whitespace token that contains a newline.
+    let mut code_to_the_left = false;

-    while !rdr.is_eof() {
-        loop {
-            // Eat all the whitespace and count blank lines.
-            rdr.consume_non_eol_whitespace();
-            if rdr.ch_is('\n') {
-                if anything_to_the_left {
-                    rdr.bump(); // The line is not blank, do not count.
+    // If `strip_shebang` reports a length, record that prefix as an `Isolated` comment
+    // and start tokenizing after it.
+    if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
+        comments.push(Comment {
+            style: Isolated,
+            lines: vec![text[..shebang_len].to_string()],
+            pos: start_bpos,
+        });
+        pos += shebang_len;
+    }
+
+    for token in rustc_lexer::tokenize(&text[pos..]) {
+        let token_text = &text[pos..pos + token.len];
+        match token.kind {
+            rustc_lexer::TokenKind::Whitespace => {
+                // The first newline only ends the current line; each further newline in
+                // the same whitespace token is recorded as a `BlankLine` at its offset.
+                if let Some(mut idx) = token_text.find('\n') {
+                    code_to_the_left = false;
+                    while let Some(next_newline) = &token_text[idx + 1..].find('\n') {
+                        idx = idx + 1 + next_newline;
+                        comments.push(Comment {
+                            style: BlankLine,
+                            lines: vec![],
+                            pos: start_bpos + BytePos((pos + idx) as u32),
+                        });
+                    }
                }
-                consume_whitespace_counting_blank_lines(&mut rdr, &mut comments);
-                code_to_the_left = false;
-                anything_to_the_left = false;
            }
-            // Eat one comment group
-            if rdr.peeking_at_comment() {
-                consume_comment(&mut rdr, &mut comments,
-                                &mut code_to_the_left, &mut anything_to_the_left);
-            } else {
-                break
+            rustc_lexer::TokenKind::BlockComment { terminated: _ } => {
+                if !is_block_doc_comment(token_text) {
+                    // Only an immediate `\r` or `\n` after the comment counts as "nothing
+                    // to the right". Any other char, and also end of input (`None`),
+                    // falls into the `_` arm and is treated as code to the right.
+                    let code_to_the_right = match text[pos + token.len..].chars().next() {
+                        Some('\r') | Some('\n') => false,
+                        _ => true,
+                    };
+                    let style = match (code_to_the_left, code_to_the_right) {
+                        (true, true) | (false, true) => Mixed,
+                        (false, false) => Isolated,
+                        (true, false) => Trailing,
+                    };
+
+                    // Count the number of chars since the start of the line by rescanning.
+                    let pos_in_file = start_bpos + BytePos(pos as u32);
+                    let line_begin_in_file = source_file.line_begin_pos(pos_in_file);
+                    let line_begin_pos = (line_begin_in_file - start_bpos).to_usize();
+                    let col = CharPos(text[line_begin_pos..pos].chars().count());
+
+                    let lines = split_block_comment_into_lines(token_text, col);
+                    comments.push(Comment { style, lines, pos: pos_in_file })
+                }
+            }
+            rustc_lexer::TokenKind::LineComment => {
+                if !is_doc_comment(token_text) {
+                    comments.push(Comment {
+                        style: if code_to_the_left { Trailing } else { Isolated },
+                        lines: vec![token_text.to_string()],
+                        pos: start_bpos + BytePos(pos as u32),
+                    })
+                }
+            }
+            // Every other token kind counts as code on the current line.
+            _ => {
+                code_to_the_left = true;
            }
        }
-
-        rdr.next_token();
-        code_to_the_left = true;
-        anything_to_the_left = true;
+        // Advance to the start of the next token.
+        pos += token.len;
    }

    comments
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
--- a/src/libsyntax/parse/lexer/unicode_chars.rs
+++ b/src/libsyntax/parse/lexer/unicode_chars.rs
@ -3,7 +3,7 @@

 use super::StringReader;
 use errors::{Applicability, DiagnosticBuilder};
-use syntax_pos::{Pos, Span, NO_EXPANSION};
+use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};

 #[rustfmt::skip] // for line breaks
 const UNICODE_ARRAY: &[(char, &str, char)] = &[
@ -327,6 +327,7 @@

 crate fn check_for_substitution<'a>(
    reader: &StringReader<'a>,
+    pos: BytePos,
    ch: char,
    err: &mut DiagnosticBuilder<'a>,
 ) -> bool {
@ -335,19 +336,19 @@
        None => return false,
    };

-    let span = Span::new(reader.pos, reader.next_pos, NO_EXPANSION);
+    let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);

    let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
        Some((_ascii_char, ascii_name)) => ascii_name,
        None => {
            let msg = format!("substitution character not found for '{}'", ch);
            reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
-            return false
-        },
+            return false;
+        }
    };

    // special help suggestion for "directed" double quotes
-    if let Some(s) = reader.peek_delimited('“', '”') {
+    if let Some(s) = peek_delimited(&reader.src[reader.src_index(pos)..], '“', '”') {
        let msg = format!(
            "Unicode characters '“' (Left Double Quotation Mark) and \
             '”' (Right Double Quotation Mark) look like '{}' ({}), but are not",
@ -355,8 +356,8 @@
        );
        err.span_suggestion(
            Span::new(
-                reader.pos,
-                reader.next_pos + Pos::from_usize(s.len()) + Pos::from_usize('”'.len_utf8()),
+                pos,
+                pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()),
                NO_EXPANSION,
            ),
            &msg,
@ -368,26 +369,18 @@
            "Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
            ch, u_name, ascii_char, ascii_name
        );
-        err.span_suggestion(
-            span,
-            &msg,
-            ascii_char.to_string(),
-            Applicability::MaybeIncorrect,
-        );
+        err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
    }
    true
 }

-impl StringReader<'_> {
-    /// Immutably extract string if found at current position with given delimiters
-    fn peek_delimited(&self, from_ch: char, to_ch: char) -> Option<&str> {
-        let tail = &self.src[self.src_index(self.pos)..];
-        let mut chars = tail.chars();
-        let first_char = chars.next()?;
-        if first_char != from_ch {
-            return None;
-        }
-        let last_char_idx = chars.as_str().find(to_ch)?;
-        Some(&chars.as_str()[..last_char_idx])
+/// Extracts the string between a leading `from_ch` and the first `to_ch` that follows
+/// it in `text`. The delimiters themselves are not part of the returned slice.
+///
+/// Returns `None` if `text` is empty, does not start with `from_ch`, or contains no
+/// `to_ch` after its first character.
+fn peek_delimited(text: &str, from_ch: char, to_ch: char) -> Option<&str> {
+    let mut chars = text.chars();
+    let first_char = chars.next()?;
+    if first_char != from_ch {
+        return None;
    }
+    // `chars.as_str()` is the rest of `text` after the opening delimiter.
+    let last_char_idx = chars.as_str().find(to_ch)?;
+    Some(&chars.as_str()[..last_char_idx])
 }
--- a/src/libsyntax/util/parser_testing.rs
+++ b/src/libsyntax/util/parser_testing.rs
@ -1,7 +1,7 @@
 use crate::ast::{self, Ident};
 use crate::source_map::FilePathMapping;
 use crate::parse::{ParseSess, PResult, source_file_to_stream};
-use crate::parse::{lexer, new_parser_from_source_str};
+use crate::parse::new_parser_from_source_str;
 use crate::parse::parser::Parser;
 use crate::ptr::P;
 use crate::tokenstream::TokenStream;
@ -113,14 +113,14 @@ pub fn matches_codepattern(a : &str, b : &str) -> bool {
 }

 /// Advances the given peekable `Iterator` until it reaches a non-whitespace character
+/// or the iterator is exhausted.
-fn scan_for_non_ws_or_end<I: Iterator<Item= char>>(iter: &mut Peekable<I>) {
-    while lexer::is_pattern_whitespace(iter.peek().cloned()) {
+fn scan_for_non_ws_or_end<I: Iterator<Item = char>>(iter: &mut Peekable<I>) {
+    // `peek` yields `None` at the end of input, which ends the loop as well.
+    while iter.peek().map_or(false, |&c| is_pattern_whitespace(c)) {
        iter.next();
    }
 }

 pub fn is_pattern_whitespace(c: char) -> bool {
-    lexer::is_pattern_whitespace(Some(c))
+    // Thin wrapper: the answer comes entirely from `rustc_lexer`'s `is_whitespace`.
+    rustc_lexer::character_properties::is_whitespace(c)
 }

 #[cfg(test)]
--- a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs
+++ b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs
@ -1,4 +1,5 @@
 const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
 //~^ ERROR expected at least one digit in exponent
+//~| ERROR unknown start of token: \u{2212}

 fn main() {}
--- a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr
+++ b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr
@ -1,4 +1,10 @@
 error: expected at least one digit in exponent
+  --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:47
+   |
+LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
+   |                                               ^^^^^^
+
+error: unknown start of token: \u{2212}
  --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
   |
 LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
@ -8,5 +14,5 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it
 LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
   |                                                     ^

-error: aborting due to previous error
+error: aborting due to 2 previous errors

--- a/src/test/ui/parser/lex-bad-numeric-literals.stderr
+++ b/src/test/ui/parser/lex-bad-numeric-literals.stderr
@ -53,10 +53,10 @@ LL |     0o;
   |     ^^

 error: expected at least one digit in exponent
-  --> $DIR/lex-bad-numeric-literals.rs:12:8
+  --> $DIR/lex-bad-numeric-literals.rs:12:5
   |
 LL |     1e+;
-   |        ^
+   |     ^^^

 error: hexadecimal float literal is not supported
  --> $DIR/lex-bad-numeric-literals.rs:13:5
--- a/src/test/ui/parser/raw-byte-string-eof.stderr
+++ b/src/test/ui/parser/raw-byte-string-eof.stderr
@ -1,8 +1,8 @@
 error: unterminated raw string
-  --> $DIR/raw-byte-string-eof.rs:2:6
+  --> $DIR/raw-byte-string-eof.rs:2:5
   |
 LL |     br##"a"#;
-   |      ^ unterminated raw string
+   |     ^ unterminated raw string
   |
   = note: this raw string should be terminated with `"##`

--- a/src/test/ui/parser/raw-byte-string-literals.stderr
+++ b/src/test/ui/parser/raw-byte-string-literals.stderr
@ -11,10 +11,10 @@ LL |     br"é";
   |        ^

 error: found invalid character; only `#` is allowed in raw string delimitation: ~
-  --> $DIR/raw-byte-string-literals.rs:6:6
+  --> $DIR/raw-byte-string-literals.rs:6:5
   |
 LL |     br##~"a"~##;
-   |      ^^^
+   |     ^^^^^

 error: aborting due to 3 previous errors

--- a/src/test/ui/parser/raw-str-delim.stderr
+++ b/src/test/ui/parser/raw-str-delim.stderr
@ -2,7 +2,7 @@ error: found invalid character; only `#` is allowed in raw string delimitation:
  --> $DIR/raw-str-delim.rs:2:5
   |
 LL |     r#~"#"~#
-   |     ^^
+   |     ^^^

 error: aborting due to previous error