1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 #![feature(plugin, rustc_private, str_char, collections)]
19 use std::collections::HashMap;
22 use std::io::{BufRead, Read};
26 use syntax::parse::lexer;
27 use rustc::session::{self, config};
28 use rustc::middle::cstore::DummyCrateStore;
32 use syntax::ast::Name;
34 use syntax::codemap::Pos;
35 use syntax::parse::token;
36 use syntax::parse::lexer::TokenAndSpan;
// Parses an ANTLR token-definition listing (one "NAME=<num>" entry per line)
// into a map from the numeric token id (kept as a string) to a prototype
// rustc token. Payload-carrying prototypes use dummy values (Name(0),
// token::Plus, …) — only the token *kind* matters; real payloads are filled
// in later from the actual token content.
// NOTE(review): this is a partial listing — gaps in the embedded source line
// numbers mark elided lines (the `match val { … }` head, several arms, the
// closing braces and the final return are not visible here).
38 fn parse_token_list(file: &str) -> HashMap<String, token::Token> {
// Placeholder identifier token; the concrete name is substituted when an
// actual token is parsed.
39 fn id() -> token::Token {
40 token::Ident(ast::Ident::with_empty_ctxt(Name(0)), token::Plain)
43 let mut res = HashMap::new();
// ANTLR uses -1 as its EOF token id.
45 res.insert("-1".to_string(), token::Eof);
47 for line in file.split('\n') {
// Split each "NAME=num" entry at the last '='.
48 let eq = match line.trim().rfind('=') {
53 let val = &line[..eq];
54 let num = &line[eq + 1..];
// Map the ANTLR token name to a rustc prototype token.
57 "SHR" => token::BinOp(token::Shr),
58 "DOLLAR" => token::Dollar,
60 "STAR" => token::BinOp(token::Star),
61 "FLOAT_SUFFIX" => id(),
63 "SHL" => token::BinOp(token::Shl),
64 "LBRACE" => token::OpenDelim(token::Brace),
65 "RARROW" => token::RArrow,
66 "LIT_STR" => token::Literal(token::Str_(Name(0)), None),
67 "DOTDOT" => token::DotDot,
68 "MOD_SEP" => token::ModSep,
69 "DOTDOTDOT" => token::DotDotDot,
71 "AND" => token::BinOp(token::And),
72 "LPAREN" => token::OpenDelim(token::Paren),
73 "ANDAND" => token::AndAnd,
75 "LBRACKET" => token::OpenDelim(token::Bracket),
76 "LIT_STR_RAW" => token::Literal(token::StrRaw(Name(0), 0), None),
77 "RPAREN" => token::CloseDelim(token::Paren),
78 "SLASH" => token::BinOp(token::Slash),
79 "COMMA" => token::Comma,
80 "LIFETIME" => token::Lifetime(ast::Ident::with_empty_ctxt(Name(0))),
81 "CARET" => token::BinOp(token::Caret),
82 "TILDE" => token::Tilde,
84 "PLUS" => token::BinOp(token::Plus),
85 "LIT_CHAR" => token::Literal(token::Char(Name(0)), None),
86 "LIT_BYTE" => token::Literal(token::Byte(Name(0)), None),
88 "RBRACKET" => token::CloseDelim(token::Bracket),
89 "COMMENT" => token::Comment,
90 "DOC_COMMENT" => token::DocComment(Name(0)),
92 "EQEQ" => token::EqEq,
95 "PERCENT" => token::BinOp(token::Percent),
96 "RBRACE" => token::CloseDelim(token::Brace),
// ANTLR collapses all binops to one token; `Plus` stands in for any of them.
97 "BINOP" => token::BinOp(token::Plus),
98 "POUND" => token::Pound,
99 "OROR" => token::OrOr,
100 "LIT_INTEGER" => token::Literal(token::Integer(Name(0)), None),
101 "BINOPEQ" => token::BinOpEq(token::Plus),
102 "LIT_FLOAT" => token::Literal(token::Float(Name(0)), None),
103 "WHITESPACE" => token::Whitespace,
104 "UNDERSCORE" => token::Underscore,
105 "MINUS" => token::BinOp(token::Minus),
106 "SEMI" => token::Semi,
107 "COLON" => token::Colon,
108 "FAT_ARROW" => token::FatArrow,
109 "OR" => token::BinOp(token::Or),
112 "LIT_BYTE_STR" => token::Literal(token::ByteStr(Name(0)), None),
113 "LIT_BYTE_STR_RAW" => token::Literal(token::ByteStrRaw(Name(0), 0), None),
114 "QUESTION" => token::Question,
115 "SHEBANG" => token::Shebang(Name(0)),
// Key the map by the numeric id string (as it appears in the ANTLR dump).
119 res.insert(num.to_string(), tok);
122 debug!("Token map: {:?}", res);
// Maps a binary-operator source string (e.g. "%") to the corresponding
// `token::BinOpToken`; panics on anything unrecognized.
// NOTE(review): only the "%" arm is visible in this partial listing — the
// remaining operator arms (lines 127–137 of the original) are elided.
126 fn str_to_binop(s: &str) -> token::BinOpToken {
132 "%" => token::Percent,
138 _ => panic!("Bad binop str `{}`", s),
142 /// Assuming a string/byte string literal, strip out the leading/trailing
143 /// hashes and surrounding quotes/raw/byte prefix.
144 fn fix(mut lit: &str) -> ast::Name {
// Raw prefix: `r`, possibly followed by `b` for a raw byte string.
// (Branch bodies — presumably re-slicing `lit` past the prefix — are elided
// in this listing; confirm against upstream.)
145 if lit.char_at(0) == 'r' {
146 if lit.char_at(1) == 'b' {
// Plain byte string: strip the `b` prefix.
151 } else if lit.char_at(0) == 'b' {
// Raw literals repeat the same number of `#`s on both ends; count them once
// so both runs can be sliced off together with the quotes.
155 let leading_hashes = count(lit);
157 // +1/-1 to adjust for single quotes
158 parse::token::intern(&lit[leading_hashes + 1..lit.len() - leading_hashes - 1])
161 /// Assuming a char/byte literal, strip the 'b' prefix and the single quotes.
162 fn fixchar(mut lit: &str) -> ast::Name {
// Byte literal: drop the leading `b` first (branch body elided in this
// listing — presumably re-slices `lit` past the prefix; confirm upstream).
163 if lit.char_at(0) == 'b' {
// Intern the content between the surrounding single quotes.
167 parse::token::intern(&lit[1..lit.len() - 1])
/// Returns the number of leading `#` characters in `lit` — the hash count of
/// a raw string literal such as `###"…"###` (after any `r`/`b` prefix has
/// already been stripped by the caller).
///
/// Fix: this listing's fused line-number prefixes and missing closing brace
/// are removed, restoring the function to well-formed Rust.
fn count(lit: &str) -> usize {
    lit.chars().take_while(|c| *c == '#').count()
}
// Parses one line of the ANTLR token dump (format shown in the regex comment
// below) into a rustc token-and-span, using `tokens` (id → prototype token)
// to decide the kind and the captured content to fill in the real payload.
// NOTE(review): partial listing — the remaining parameters of the signature,
// several match-arm continuations, and the function tail are elided.
174 fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize],
178 // \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
// Manually locate the field delimiters instead of using a regex.
179 let start = s.find("[@").unwrap();
180 let comma = start + s[start..].find(",").unwrap();
181 let colon = comma + s[comma..].find(":").unwrap();
182 let content_start = colon + s[colon..].find("='").unwrap();
183 // Use rfind instead of find, because we don't want to stop at the content
184 let content_end = content_start + s[content_start..].rfind("',<").unwrap();
185 let toknum_end = content_end + s[content_end..].find(">,").unwrap();
// Slice out the captured fields: start/end positions, literal content, and
// the numeric token id.
187 let start = &s[comma + 1 .. colon];
188 let end = &s[colon + 1 .. content_start];
189 let content = &s[content_start + 2 .. content_end];
190 let toknum = &s[content_end + 3 .. toknum_end];
192 let not_found = format!("didn't find token {:?} in the map", toknum);
// NOTE(review): `¬_found` below appears to be a mojibake of `&not_found`
// ("&not" → "¬") — confirm against the upstream source.
193 let proto_tok = tokens.get(toknum).expect(¬_found[..]);
195 let nm = parse::token::intern(content);
197 debug!("What we got: content (`{}`), proto: {:?}", content, proto_tok);
// Rebuild the prototype token with the real payload taken from `content`.
199 let real_tok = match *proto_tok {
200 token::BinOp(..) => token::BinOp(str_to_binop(content)),
// For "op=" tokens, drop the trailing '=' before mapping the operator.
201 token::BinOpEq(..) => token::BinOpEq(str_to_binop(&content[..content.len() - 1])),
202 token::Literal(token::Str_(..), n) => token::Literal(token::Str_(fix(content)), n),
203 token::Literal(token::StrRaw(..), n) => token::Literal(token::StrRaw(fix(content),
205 token::Literal(token::Char(..), n) => token::Literal(token::Char(fixchar(content)), n),
206 token::Literal(token::Byte(..), n) => token::Literal(token::Byte(fixchar(content)), n),
207 token::DocComment(..) => token::DocComment(nm),
208 token::Literal(token::Integer(..), n) => token::Literal(token::Integer(nm), n),
209 token::Literal(token::Float(..), n) => token::Literal(token::Float(nm), n),
210 token::Literal(token::ByteStr(..), n) => token::Literal(token::ByteStr(nm), n),
211 token::Literal(token::ByteStrRaw(..), n) => token::Literal(token::ByteStrRaw(fix(content),
213 token::Ident(..) => token::Ident(ast::Ident::with_empty_ctxt(nm),
215 token::Lifetime(..) => token::Lifetime(ast::Ident::with_empty_ctxt(nm)),
// Positional bookkeeping: EOF gets a special start offset (value elided
// here), and a leading BOM shifts everything by one position.
219 let start_offset = if real_tok == token::Eof {
225 let offset = if has_bom { 1 } else { 0 };
227 let mut lo = start.parse::<u32>().unwrap() - start_offset - offset;
228 let mut hi = end.parse::<u32>().unwrap() + 1 - offset;
230 // Adjust the span: For each surrogate pair already encountered, subtract one position.
231 lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
232 hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32;
// Build the rustc span from the adjusted positions.
234 let sp = codemap::Span {
235 lo: codemap::BytePos(lo),
236 hi: codemap::BytePos(hi),
237 expn_id: codemap::NO_EXPANSION
// Token comparison used by the lock-step check: `Ident` tokens compare by
// their interned id (style flag ignored).
// NOTE(review): the non-`Ident` arm(s) are elided in this partial listing.
246 fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
248 &token::Ident(id, _) => match b {
249 &token::Ident(id2, _) => id == id2,
// Compares an ANTLR span (already in char positions) against a rustc span
// (byte positions) by converting the rustc endpoints to file char positions;
// expansion ids must match as well.
256 fn span_cmp(antlr_sp: codemap::Span, rust_sp: codemap::Span, cm: &codemap::CodeMap) -> bool {
257 antlr_sp.expn_id == rust_sp.expn_id &&
258 antlr_sp.lo.to_usize() == cm.bytepos_to_file_charpos(rust_sp.lo).to_usize() &&
259 antlr_sp.hi.to_usize() == cm.bytepos_to_file_charpos(rust_sp.hi).to_usize()
// Pulls the next token from rustc's string lexer.
// NOTE(review): the body is elided in this listing; the local `use` suggests
// it delegates to `Reader::next_token` — confirm against upstream.
263 fn next(r: &mut lexer::StringReader) -> TokenAndSpan {
264 use syntax::parse::lexer::Reader;
// Interior of `main` (the `fn main()` header line is elided in this listing).
// Driver: lexes argv[1] with rustc's lexer, reads an ANTLR token dump from
// stdin (token-id map from argv[2]), and asserts both token streams agree.
268 let mut args = env::args().skip(1);
269 let filename = args.next().unwrap();
// parse-fail test inputs are not expected to lex cleanly; skip them.
270 if filename.find("parse-fail").is_some() {
275 let mut code = String::new();
276 File::open(&Path::new(&filename)).unwrap().read_to_string(&mut code).unwrap();
// Record positions of chars above U+FFFF: ANTLR counts them as two (UTF-16
// surrogate pair) positions, so spans are adjusted in parse_antlr_token.
278 let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
279 .filter(|&(_, c)| c as usize > 0xFFFF)
285 let has_bom = code.starts_with("\u{feff}");
287 debug!("Pairs: {:?}", surrogate_pairs_pos);
// Build a minimal rustc session solely to drive the string lexer.
289 let options = config::basic_options();
290 let session = session::build_session(options, None,
291 syntax::diagnostics::registry::Registry::new(&[]),
292 Rc::new(DummyCrateStore));
293 let filemap = session.parse_sess.codemap().new_filemap(String::from("<n/a>"), code);
294 let mut lexer = lexer::StringReader::new(session.diagnostic(), filemap);
295 let cm = session.codemap();
// Load the ANTLR "NAME=num" token-id map from the second argument.
298 let mut token_file = File::open(&Path::new(&args.next().unwrap())).unwrap();
299 let mut token_list = String::new();
300 token_file.read_to_string(&mut token_list).unwrap();
301 let token_map = parse_token_list(&token_list[..]);
// The ANTLR token dump arrives on stdin, one token per line.
303 let stdin = std::io::stdin();
304 let lock = stdin.lock();
305 let lines = lock.lines();
306 let antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
308 &surrogate_pairs_pos[..],
// Lock-step comparison: for each ANTLR token, pull the next rustc token and
// compare span and kind.
311 for antlr_tok in antlr_tokens {
312 let rustc_tok = next(&mut lexer);
// Both lexers agree the input is exhausted.
313 if rustc_tok.tok == token::Eof && antlr_tok.tok == token::Eof {
317 assert!(span_cmp(antlr_tok.sp, rustc_tok.sp, cm), "{:?} and {:?} have different spans",
// Local macro: for the payload-carrying token kinds listed below, compare
// via `tok_cmp` (name mismatches only produce a warning); any other kind
// must be exactly equal or the run panics.
321 macro_rules! matches {
323 match rustc_tok.tok {
324 $($x => match antlr_tok.tok {
326 if !tok_cmp(&rustc_tok.tok, &antlr_tok.tok) {
327 // FIXME #15677: needs more robust escaping in
329 warn!("Different names for {:?} and {:?}", rustc_tok, antlr_tok);
332 _ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok)
334 ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", antlr_tok, rustc_tok)
// Token kinds whose payloads are compared leniently by the macro above.
340 token::Literal(token::Byte(..), _),
341 token::Literal(token::Char(..), _),
342 token::Literal(token::Integer(..), _),
343 token::Literal(token::Float(..), _),
344 token::Literal(token::Str_(..), _),
345 token::Literal(token::StrRaw(..), _),
346 token::Literal(token::ByteStr(..), _),
347 token::Literal(token::ByteStrRaw(..), _),
350 token::Interpolated(..),
351 token::DocComment(..),