src/grammar/verify.rs

   1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 #![feature(plugin, rustc_private, str_char, collections)]
  12
  13 extern crate syntax;
  14 extern crate rustc;
  15
  16 #[macro_use]
  17 extern crate log;
  18
  19 use std::collections::HashMap;
  20 use std::env;
  21 use std::fs::File;
  22 use std::io::{BufRead, Read};
  23 use std::path::Path;
  24
  25 use syntax::parse;
  26 use syntax::parse::lexer;
  27 use rustc::session::{self, config};
  28 use rustc::middle::cstore::DummyCrateStore;
  29
  30 use std::rc::Rc;
  31 use syntax::ast;
  32 use syntax::ast::Name;
  33 use syntax::codemap;
  34 use syntax::codemap::Pos;
  35 use syntax::parse::token;
  36 use syntax::parse::lexer::TokenAndSpan;
  37
  38 fn parse_token_list(file: &str) -> HashMap<String, token::Token> {
  39     fn id() -> token::Token {
  40         token::Ident(ast::Ident::with_empty_ctxt(Name(0)), token::Plain)
  41     }
  42
  43     let mut res = HashMap::new();
  44
  45     res.insert("-1".to_string(), token::Eof);
  46
  47     for line in file.split('\n') {
  48         let eq = match line.trim().rfind('=') {
  49             Some(val) => val,
  50             None => continue
  51         };
  52
  53         let val = &line[..eq];
  54         let num = &line[eq + 1..];
  55
  56         let tok = match val {
  57             "SHR"               => token::BinOp(token::Shr),
  58             "DOLLAR"            => token::Dollar,
  59             "LT"                => token::Lt,
  60             "STAR"              => token::BinOp(token::Star),
  61             "FLOAT_SUFFIX"      => id(),
  62             "INT_SUFFIX"        => id(),
  63             "SHL"               => token::BinOp(token::Shl),
  64             "LBRACE"            => token::OpenDelim(token::Brace),
  65             "RARROW"            => token::RArrow,
  66             "LIT_STR"           => token::Literal(token::Str_(Name(0)), None),
  67             "DOTDOT"            => token::DotDot,
  68             "MOD_SEP"           => token::ModSep,
  69             "DOTDOTDOT"         => token::DotDotDot,
  70             "NOT"               => token::Not,
  71             "AND"               => token::BinOp(token::And),
  72             "LPAREN"            => token::OpenDelim(token::Paren),
  73             "ANDAND"            => token::AndAnd,
  74             "AT"                => token::At,
  75             "LBRACKET"          => token::OpenDelim(token::Bracket),
  76             "LIT_STR_RAW"       => token::Literal(token::StrRaw(Name(0), 0), None),
  77             "RPAREN"            => token::CloseDelim(token::Paren),
  78             "SLASH"             => token::BinOp(token::Slash),
  79             "COMMA"             => token::Comma,
  80             "LIFETIME"          => token::Lifetime(ast::Ident::with_empty_ctxt(Name(0))),
  81             "CARET"             => token::BinOp(token::Caret),
  82             "TILDE"             => token::Tilde,
  83             "IDENT"             => id(),
  84             "PLUS"              => token::BinOp(token::Plus),
  85             "LIT_CHAR"          => token::Literal(token::Char(Name(0)), None),
  86             "LIT_BYTE"          => token::Literal(token::Byte(Name(0)), None),
  87             "EQ"                => token::Eq,
  88             "RBRACKET"          => token::CloseDelim(token::Bracket),
  89             "COMMENT"           => token::Comment,
  90             "DOC_COMMENT"       => token::DocComment(Name(0)),
  91             "DOT"               => token::Dot,
  92             "EQEQ"              => token::EqEq,
  93             "NE"                => token::Ne,
  94             "GE"                => token::Ge,
  95             "PERCENT"           => token::BinOp(token::Percent),
  96             "RBRACE"            => token::CloseDelim(token::Brace),
  97             "BINOP"             => token::BinOp(token::Plus),
  98             "POUND"             => token::Pound,
  99             "OROR"              => token::OrOr,
 100             "LIT_INTEGER"       => token::Literal(token::Integer(Name(0)), None),
 101             "BINOPEQ"           => token::BinOpEq(token::Plus),
 102             "LIT_FLOAT"         => token::Literal(token::Float(Name(0)), None),
 103             "WHITESPACE"        => token::Whitespace,
 104             "UNDERSCORE"        => token::Underscore,
 105             "MINUS"             => token::BinOp(token::Minus),
 106             "SEMI"              => token::Semi,
 107             "COLON"             => token::Colon,
 108             "FAT_ARROW"         => token::FatArrow,
 109             "OR"                => token::BinOp(token::Or),
 110             "GT"                => token::Gt,
 111             "LE"                => token::Le,
 112             "LIT_BYTE_STR"      => token::Literal(token::ByteStr(Name(0)), None),
 113             "LIT_BYTE_STR_RAW"  => token::Literal(token::ByteStrRaw(Name(0), 0), None),
 114             "QUESTION"          => token::Question,
 115             "SHEBANG"           => token::Shebang(Name(0)),
 116             _                   => continue,
 117         };
 118
 119         res.insert(num.to_string(), tok);
 120     }
 121
 122     debug!("Token map: {:?}", res);
 123     res
 124 }
 125
 126 fn str_to_binop(s: &str) -> token::BinOpToken {
 127     match s {
 128         "+"     => token::Plus,
 129         "/"     => token::Slash,
 130         "-"     => token::Minus,
 131         "*"     => token::Star,
 132         "%"     => token::Percent,
 133         "^"     => token::Caret,
 134         "&"     => token::And,
 135         "|"     => token::Or,
 136         "<<"    => token::Shl,
 137         ">>"    => token::Shr,
 138         _       => panic!("Bad binop str `{}`", s),
 139     }
 140 }
 141
 142 /// Assuming a string/byte string literal, strip out the leading/trailing
 143 /// hashes and surrounding quotes/raw/byte prefix.
 144 fn fix(mut lit: &str) -> ast::Name {
 145     if lit.char_at(0) == 'r' {
 146         if lit.char_at(1) == 'b' {
 147             lit = &lit[2..]
 148         } else {
 149             lit = &lit[1..];
 150         }
 151     } else if lit.char_at(0) == 'b' {
 152         lit = &lit[1..];
 153     }
 154
 155     let leading_hashes = count(lit);
 156
 157     // +1/-1 to adjust for single quotes
 158     parse::token::intern(&lit[leading_hashes + 1..lit.len() - leading_hashes - 1])
 159 }
 160
 161 /// Assuming a char/byte literal, strip the 'b' prefix and the single quotes.
 162 fn fixchar(mut lit: &str) -> ast::Name {
 163     if lit.char_at(0) == 'b' {
 164         lit = &lit[1..];
 165     }
 166
 167     parse::token::intern(&lit[1..lit.len() - 1])
 168 }
 169
 170 fn count(lit: &str) -> usize {
 171     lit.chars().take_while(|c| *c == '#').count()
 172 }
 173
 174 fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize],
 175                      has_bom: bool)
 176                      -> TokenAndSpan {
 177     // old regex:
 178     // \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
 179     let start = s.find("[@").unwrap();
 180     let comma = start + s[start..].find(",").unwrap();
 181     let colon = comma + s[comma..].find(":").unwrap();
 182     let content_start = colon + s[colon..].find("='").unwrap();
 183     // Use rfind instead of find, because we don't want to stop at the content
 184     let content_end = content_start + s[content_start..].rfind("',<").unwrap();
 185     let toknum_end = content_end + s[content_end..].find(">,").unwrap();
 186
 187     let start = &s[comma + 1 .. colon];
 188     let end = &s[colon + 1 .. content_start];
 189     let content = &s[content_start + 2 .. content_end];
 190     let toknum = &s[content_end + 3 .. toknum_end];
 191
 192     let not_found = format!("didn't find token {:?} in the map", toknum);
 193     let proto_tok = tokens.get(toknum).expect(&not_found[..]);
 194
 195     let nm = parse::token::intern(content);
 196
 197     debug!("What we got: content (`{}`), proto: {:?}", content, proto_tok);
 198
 199     let real_tok = match *proto_tok {
 200         token::BinOp(..)           => token::BinOp(str_to_binop(content)),
 201         token::BinOpEq(..)         => token::BinOpEq(str_to_binop(&content[..content.len() - 1])),
 202         token::Literal(token::Str_(..), n)      => token::Literal(token::Str_(fix(content)), n),
 203         token::Literal(token::StrRaw(..), n)    => token::Literal(token::StrRaw(fix(content),
 204                                                                              count(content)), n),
 205         token::Literal(token::Char(..), n)      => token::Literal(token::Char(fixchar(content)), n),
 206         token::Literal(token::Byte(..), n)      => token::Literal(token::Byte(fixchar(content)), n),
 207         token::DocComment(..)      => token::DocComment(nm),
 208         token::Literal(token::Integer(..), n)   => token::Literal(token::Integer(nm), n),
 209         token::Literal(token::Float(..), n)     => token::Literal(token::Float(nm), n),
 210         token::Literal(token::ByteStr(..), n)    => token::Literal(token::ByteStr(nm), n),
 211         token::Literal(token::ByteStrRaw(..), n) => token::Literal(token::ByteStrRaw(fix(content),
 212                                                                                 count(content)), n),
 213         token::Ident(..)           => token::Ident(ast::Ident::with_empty_ctxt(nm),
 214                                                    token::ModName),
 215         token::Lifetime(..)        => token::Lifetime(ast::Ident::with_empty_ctxt(nm)),
 216         ref t => t.clone()
 217     };
 218
 219     let start_offset = if real_tok == token::Eof {
 220         1
 221     } else {
 222         0
 223     };
 224
 225     let offset = if has_bom { 1 } else { 0 };
 226
 227     let mut lo = start.parse::<u32>().unwrap() - start_offset - offset;
 228     let mut hi = end.parse::<u32>().unwrap() + 1 - offset;
 229
 230     // Adjust the span: For each surrogate pair already encountered, subtract one position.
 231     lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
 232     hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32;
 233
 234     let sp = codemap::Span {
 235         lo: codemap::BytePos(lo),
 236         hi: codemap::BytePos(hi),
 237         expn_id: codemap::NO_EXPANSION
 238     };
 239
 240     TokenAndSpan {
 241         tok: real_tok,
 242         sp: sp
 243     }
 244 }
 245
 246 fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
 247     match a {
 248         &token::Ident(id, _) => match b {
 249                 &token::Ident(id2, _) => id == id2,
 250                 _ => false
 251         },
 252         _ => a == b
 253     }
 254 }
 255
 256 fn span_cmp(antlr_sp: codemap::Span, rust_sp: codemap::Span, cm: &codemap::CodeMap) -> bool {
 257     antlr_sp.expn_id == rust_sp.expn_id &&
 258         antlr_sp.lo.to_usize() == cm.bytepos_to_file_charpos(rust_sp.lo).to_usize() &&
 259         antlr_sp.hi.to_usize() == cm.bytepos_to_file_charpos(rust_sp.hi).to_usize()
 260 }
 261
 262 fn main() {
 263     fn next(r: &mut lexer::StringReader) -> TokenAndSpan {
 264         use syntax::parse::lexer::Reader;
 265         r.next_token()
 266     }
 267
 268     let mut args = env::args().skip(1);
 269     let filename = args.next().unwrap();
 270     if filename.find("parse-fail").is_some() {
 271         return;
 272     }
 273
 274     // Rust's lexer
 275     let mut code = String::new();
 276     File::open(&Path::new(&filename)).unwrap().read_to_string(&mut code).unwrap();
 277
 278     let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
 279                                                      .filter(|&(_, c)| c as usize > 0xFFFF)
 280                                                      .map(|(n, _)| n)
 281                                                      .enumerate()
 282                                                      .map(|(x, n)| x + n)
 283                                                      .collect();
 284
 285     let has_bom = code.starts_with("\u{feff}");
 286
 287     debug!("Pairs: {:?}", surrogate_pairs_pos);
 288
 289     let options = config::basic_options();
 290     let session = session::build_session(options, None,
 291                                          syntax::diagnostics::registry::Registry::new(&[]),
 292                                          Rc::new(DummyCrateStore));
 293     let filemap = session.parse_sess.codemap().new_filemap(String::from("<n/a>"), code);
 294     let mut lexer = lexer::StringReader::new(session.diagnostic(), filemap);
 295     let cm = session.codemap();
 296
 297     // ANTLR
 298     let mut token_file = File::open(&Path::new(&args.next().unwrap())).unwrap();
 299     let mut token_list = String::new();
 300     token_file.read_to_string(&mut token_list).unwrap();
 301     let token_map = parse_token_list(&token_list[..]);
 302
 303     let stdin = std::io::stdin();
 304     let lock = stdin.lock();
 305     let lines = lock.lines();
 306     let antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
 307                                                        &token_map,
 308                                                        &surrogate_pairs_pos[..],
 309                                                        has_bom));
 310
 311     for antlr_tok in antlr_tokens {
 312         let rustc_tok = next(&mut lexer);
 313         if rustc_tok.tok == token::Eof && antlr_tok.tok == token::Eof {
 314             continue
 315         }
 316
 317         assert!(span_cmp(antlr_tok.sp, rustc_tok.sp, cm), "{:?} and {:?} have different spans",
 318                 rustc_tok,
 319                 antlr_tok);
 320
 321         macro_rules! matches {
 322             ( $($x:pat),+ ) => (
 323                 match rustc_tok.tok {
 324                     $($x => match antlr_tok.tok {
 325                         $x => {
 326                             if !tok_cmp(&rustc_tok.tok, &antlr_tok.tok) {
 327                                 // FIXME #15677: needs more robust escaping in
 328                                 // antlr
 329                                 warn!("Different names for {:?} and {:?}", rustc_tok, antlr_tok);
 330                             }
 331                         }
 332                         _ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok)
 333                     },)*
 334                     ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", antlr_tok, rustc_tok)
 335                 }
 336             )
 337         }
 338
 339         matches!(
 340             token::Literal(token::Byte(..), _),
 341             token::Literal(token::Char(..), _),
 342             token::Literal(token::Integer(..), _),
 343             token::Literal(token::Float(..), _),
 344             token::Literal(token::Str_(..), _),
 345             token::Literal(token::StrRaw(..), _),
 346             token::Literal(token::ByteStr(..), _),
 347             token::Literal(token::ByteStrRaw(..), _),
 348             token::Ident(..),
 349             token::Lifetime(..),
 350             token::Interpolated(..),
 351             token::DocComment(..),
 352             token::Shebang(..)
 353         );
 354     }
 355 }