1 //! Lexing `&str` into a sequence of Rust tokens.
3 //! Note that strictly speaking the parser in this crate is not required to work
4 //! on tokens which originated from text. Macros, e.g., can synthesize tokens out
5 //! of thin air. So, ideally, the lexer should be an orthogonal crate. It is, however,
6 //! convenient to include a text-based lexer here!
8 //! Note that these tokens, unlike the tokens we feed into the parser, do
9 //! include info about comments and whitespace.
14 SyntaxKind::{self, *},
/// A lexed string: the source text together with per-token data.
///
/// `LexedStr::new` initialises the fields `text`, `kind`, `start` and `error`;
/// only part of the field list is visible in this listing.
18 pub struct LexedStr<'a> {
/// Token kinds; `push` appends one entry per recorded token.
20 kind: Vec<SyntaxKind>,
30 impl<'a> LexedStr<'a> {
/// Lexes `text` into a `LexedStr`.
///
/// If `rustc_lexer::strip_shebang` reports a shebang, a `SHEBANG` token is
/// recorded first. The text from `offset` onwards is then split with
/// `rustc_lexer::tokenize`, each token is converted with `from_rustc`, and any
/// reported error message is stored as a `LexError`. An `EOF` entry is pushed
/// after the loop.
///
/// NOTE(review): the statements that declare and advance `offset` are not
/// visible in this listing.
31 pub fn new(text: &'a str) -> LexedStr<'a> {
32 let mut res = LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() };
35 if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
36 res.push(SHEBANG, offset);
39 for token in rustc_lexer::tokenize(&text[offset..]) {
// Slice out exactly this token's text, starting at the current offset.
40 let token_text = &text[offset..][..token.len];
42 let (kind, err) = from_rustc(&token.kind, token_text);
43 res.push(kind, offset);
46 if let Some(err) = err {
// Computed after the `push` above.
// NOTE(review): presumably `len()` yields the index of the token just
// pushed here — confirm against the body of `len`.
47 let token = res.len() as u32;
48 let msg = err.to_string();
49 res.error.push(LexError { msg, token });
// Trailing `EOF` entry; its start offset is the offset reached after the loop.
52 res.push(EOF, offset);
/// Lexes `text` as one token.
///
/// Takes the first token via `rustc_lexer::first_token` and checks whether it
/// spans the whole input. On success, returns the token's `SyntaxKind` and an
/// owned copy of the error message, if `from_rustc` produced one.
///
/// NOTE(review): the bodies of the early-exit branches are not visible in this
/// listing; presumably they return `None` — confirm.
57 pub fn single_token(text: &'a str) -> Option<(SyntaxKind, Option<String>)> {
62 let token = rustc_lexer::first_token(text);
// The first token does not cover all of `text`.
63 if token.len != text.len() {
67 let (kind, err) = from_rustc(&token.kind, text);
// Convert the `&'static str` message into an owned `String`.
68 Some((kind, err.map(|it| it.to_owned())))
/// Returns a string slice borrowed from `self`.
///
/// NOTE(review): the body is not visible in this listing; presumably this is
/// the original source text — confirm.
71 pub fn as_str(&self) -> &str {
/// Returns the number of tokens.
///
/// NOTE(review): the body is not visible in this listing. `range_text` accepts
/// `r.end == self.len()` and indexes `start[r.end]`, so this count presumably
/// excludes the trailing `EOF` entry — confirm.
75 pub fn len(&self) -> usize {
/// Reports whether there are no tokens.
///
/// NOTE(review): the body is not visible in this listing; presumably it is
/// `self.len() == 0` — confirm.
79 pub fn is_empty(&self) -> bool {
/// Returns the `SyntaxKind` for token index `i`.
///
/// # Panics
///
/// Panics if `i >= self.len()`.
83 pub fn kind(&self, i: usize) -> SyntaxKind {
84 assert!(i < self.len());
/// Returns the text of token `i`, i.e. `range_text` over the one-token range
/// `i..i + 1`.
///
/// # Panics
///
/// Panics under the same conditions as `range_text`.
88 pub fn text(&self, i: usize) -> &str {
89 self.range_text(i..i + 1)
/// Returns the source text covered by the token range `r`.
///
/// # Panics
///
/// Panics if `r` is empty (`r.start >= r.end`) or if `r.end > self.len()`.
91 pub fn range_text(&self, r: ops::Range<usize>) -> &str {
92 assert!(r.start < r.end && r.end <= self.len());
// Start offset of the first token in the range.
93 let lo = self.start[r.start] as usize;
// Start offset of the entry just past the range; `r.end` may equal `len()`.
94 let hi = self.start[r.end] as usize;
/// Returns a range of offsets into the source text for token `i`, built from
/// the start offsets of entries `i` and `i + 1`.
///
/// # Panics
///
/// Panics if `i >= self.len()`.
99 pub fn text_range(&self, i: usize) -> ops::Range<usize> {
100 assert!(i < self.len());
101 let lo = self.start[i] as usize;
// The next entry's start offset marks where this token ends.
102 let hi = self.start[i + 1] as usize;
/// Returns the start offset recorded for entry `i`.
///
/// Unlike the other accessors, `i == self.len()` is accepted.
///
/// # Panics
///
/// Panics if `i > self.len()`.
105 pub fn text_start(&self, i: usize) -> usize {
106 assert!(i <= self.len());
107 self.start[i] as usize
/// Returns the length of token `i`, derived from `text_range(i)`.
///
/// NOTE(review): the final expression is not visible in this listing;
/// presumably `r.end - r.start` — confirm.
///
/// # Panics
///
/// Panics if `i >= self.len()`.
109 pub fn text_len(&self, i: usize) -> usize {
110 assert!(i < self.len());
111 let r = self.text_range(i);
/// Returns the error message recorded for token `i`, or `None` if there is none.
///
/// # Panics
///
/// Panics if `i >= self.len()`.
115 pub fn error(&self, i: usize) -> Option<&str> {
116 assert!(i < self.len());
// Binary search keyed on the token index, so `self.error` must be sorted by
// `token`; `new` appends errors while walking the tokens in order.
117 let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
118 Some(self.error[err].msg.as_str())
/// Iterates over all recorded errors as `(token index, message)` pairs, in the
/// order they are stored.
121 pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
122 self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
/// Appends one token: its kind to `self.kind` and its start offset to
/// `self.start`.
125 fn push(&mut self, kind: SyntaxKind, offset: usize) {
126 self.kind.push(kind);
// `as u32` narrows silently: an offset above `u32::MAX` would be truncated.
127 self.start.push(offset as u32);
131 /// Returns `SyntaxKind` and an optional tokenize error message.
///
/// `kind` is the token kind reported by `rustc_lexer`. The token's text is
/// consulted to tell `_` and keywords apart from ordinary identifiers.
///
/// NOTE(review): the function header line, the declaration of `err`, and the
/// guards and results of some arms are not visible in this listing.
133 kind: &rustc_lexer::TokenKind,
135 ) -> (SyntaxKind, Option<&'static str>) {
136 // A note on an intended tradeoff:
137 // We drop some useful information here (see patterns with double dots `..`)
138 // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
139 // being `u16` that come from `rowan::SyntaxKind`.
// Line comments map to `COMMENT` regardless of doc style.
144 rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
// NOTE(review): the guard for this message is not visible; presumably it is
// `!terminated` — confirm.
145 rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
147 err = "Missing trailing `*/` symbols to terminate the block comment";
152 rustc_lexer::TokenKind::Whitespace => WHITESPACE,
// A lone `_` arrives as an identifier but gets its own kind.
154 rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
// Keywords also arrive as identifiers; fall back to `IDENT` when the text is
// not a keyword.
155 rustc_lexer::TokenKind::Ident => SyntaxKind::from_keyword(token_text).unwrap_or(IDENT),
// Raw identifiers skip the keyword lookup and are always `IDENT`.
157 rustc_lexer::TokenKind::RawIdent => IDENT,
// Literals have their own kind/error mapping; return its result directly.
158 rustc_lexer::TokenKind::Literal { kind, .. } => return from_rustc_literal(kind),
160 rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
161 if *starts_with_number {
162 err = "Lifetime name cannot start with a number";
// Punctuation: one-to-one mapping onto the `T![..]` kinds.
167 rustc_lexer::TokenKind::Semi => T![;],
168 rustc_lexer::TokenKind::Comma => T![,],
169 rustc_lexer::TokenKind::Dot => T![.],
170 rustc_lexer::TokenKind::OpenParen => T!['('],
171 rustc_lexer::TokenKind::CloseParen => T![')'],
172 rustc_lexer::TokenKind::OpenBrace => T!['{'],
173 rustc_lexer::TokenKind::CloseBrace => T!['}'],
174 rustc_lexer::TokenKind::OpenBracket => T!['['],
175 rustc_lexer::TokenKind::CloseBracket => T![']'],
176 rustc_lexer::TokenKind::At => T![@],
177 rustc_lexer::TokenKind::Pound => T![#],
178 rustc_lexer::TokenKind::Tilde => T![~],
179 rustc_lexer::TokenKind::Question => T![?],
180 rustc_lexer::TokenKind::Colon => T![:],
181 rustc_lexer::TokenKind::Dollar => T![$],
182 rustc_lexer::TokenKind::Eq => T![=],
183 rustc_lexer::TokenKind::Bang => T![!],
184 rustc_lexer::TokenKind::Lt => T![<],
185 rustc_lexer::TokenKind::Gt => T![>],
186 rustc_lexer::TokenKind::Minus => T![-],
187 rustc_lexer::TokenKind::And => T![&],
188 rustc_lexer::TokenKind::Or => T![|],
189 rustc_lexer::TokenKind::Plus => T![+],
190 rustc_lexer::TokenKind::Star => T![*],
191 rustc_lexer::TokenKind::Slash => T![/],
192 rustc_lexer::TokenKind::Caret => T![^],
193 rustc_lexer::TokenKind::Percent => T![%],
// Anything the lexer could not classify becomes `ERROR`.
194 rustc_lexer::TokenKind::Unknown => ERROR,
// An empty message string stands for "no error".
198 let err = if err.is_empty() { None } else { Some(err) };
/// Maps a `rustc_lexer::LiteralKind` to a `SyntaxKind` and an optional static
/// error message.
///
/// NOTE(review): the declaration of `err`, the guards in front of most
/// messages, and the kind each arm evaluates to are not visible in this
/// listing. Presumably each message is set only when the bound flag signals
/// the problem it describes — confirm.
202 fn from_rustc_literal(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
205 let syntax_kind = match *kind {
206 rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
208 err = "Missing digits after the integer base prefix";
212 rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
214 err = "Missing digits after the exponent symbol";
218 rustc_lexer::LiteralKind::Char { terminated } => {
220 err = "Missing trailing `'` symbol to terminate the character literal";
224 rustc_lexer::LiteralKind::Byte { terminated } => {
226 err = "Missing trailing `'` symbol to terminate the byte literal";
230 rustc_lexer::LiteralKind::Str { terminated } => {
232 err = "Missing trailing `\"` symbol to terminate the string literal";
236 rustc_lexer::LiteralKind::ByteStr { terminated } => {
238 err = "Missing trailing `\"` symbol to terminate the byte string literal";
// Raw strings carry a structured error; translate it into a message.
242 rustc_lexer::LiteralKind::RawStr { err: raw_str_err, .. } => {
243 if let Some(raw_str_err) = raw_str_err {
244 err = match raw_str_err {
245 rustc_lexer::RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw string literal",
// The wording depends on whether `expected` equals `found`.
246 rustc_lexer::RawStrError::NoTerminator { expected, found, .. } => if expected == found {
247 "Missing trailing `\"` to terminate the raw string literal"
249 "Missing trailing `\"` with `#` symbols to terminate the raw string literal"
251 rustc_lexer::RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
// Same translation as for raw strings, with "raw byte string" wording.
256 rustc_lexer::LiteralKind::RawByteStr { err: raw_str_err, .. } => {
257 if let Some(raw_str_err) = raw_str_err {
258 err = match raw_str_err {
259 rustc_lexer::RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw byte string literal",
260 rustc_lexer::RawStrError::NoTerminator { expected, found, .. } => if expected == found {
261 "Missing trailing `\"` to terminate the raw byte string literal"
263 "Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"
265 rustc_lexer::RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols",
// An empty message string stands for "no error".
273 let err = if err.is_empty() { None } else { Some(err) };