1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
21 use std::collections::HashMap;
26 use syntax::parse::lexer;
27 use rustc::session::{self, config};
30 use syntax::ast::Name;
31 use syntax::parse::token;
32 use syntax::parse::lexer::TokenAndSpan;
// Parses a token-list file of `NAME=NUM` lines (produced for the ANTLR
// grammar) into a map from the ANTLR token number (kept as a string) to a
// prototype rustc token. Payload-carrying prototypes (literals, idents,
// lifetimes) use dummy `Name(0)` payloads; the real payload is filled in
// later from the matched text in `parse_antlr_token`.
// NOTE(review): interior lines of the original function are elided in this
// excerpt (the `match` header on `val`, several arms, closing braces, and
// the final return of `res`) — comments only describe visible lines.
34 fn parse_token_list(file: &str) -> HashMap<String, token::Token> {
// Helper: a throwaway identifier token used for names that only need to
// lex as "some identifier" (e.g. FLOAT_SUFFIX below).
35 fn id() -> token::Token {
36 token::Ident(ast::Ident { name: Name(0), ctxt: 0, }, token::Plain)
39 let mut res = HashMap::new();
// ANTLR reports EOF as token type -1, so seed the map with it.
41 res.insert("-1".to_string(), token::Eof);
43 for line in file.split('\n') {
// Split each line at the LAST '=' into name (left) and number (right).
44 let eq = match line.trim().rfind('=') {
49 let val = line.slice_to(eq);
50 let num = line.slice_from(eq + 1);
// Map the grammar's symbolic token name to a prototype rustc token.
53 "SHR" => token::BinOp(token::Shr),
54 "DOLLAR" => token::Dollar,
56 "STAR" => token::BinOp(token::Star),
57 "FLOAT_SUFFIX" => id(),
59 "SHL" => token::BinOp(token::Shl),
60 "LBRACE" => token::OpenDelim(token::Brace),
61 "RARROW" => token::RArrow,
62 "LIT_STR" => token::Literal(token::Str_(Name(0)), None),
63 "DOTDOT" => token::DotDot,
64 "MOD_SEP" => token::ModSep,
65 "DOTDOTDOT" => token::DotDotDot,
67 "AND" => token::BinOp(token::And),
68 "LPAREN" => token::OpenDelim(token::Paren),
69 "ANDAND" => token::AndAnd,
71 "LBRACKET" => token::OpenDelim(token::Bracket),
72 "LIT_STR_RAW" => token::Literal(token::StrRaw(Name(0), 0), None),
73 "RPAREN" => token::CloseDelim(token::Paren),
74 "SLASH" => token::BinOp(token::Slash),
75 "COMMA" => token::Comma,
76 "LIFETIME" => token::Lifetime(ast::Ident { name: Name(0), ctxt: 0 }),
77 "CARET" => token::BinOp(token::Caret),
78 "TILDE" => token::Tilde,
80 "PLUS" => token::BinOp(token::Plus),
81 "LIT_CHAR" => token::Literal(token::Char(Name(0)), None),
82 "LIT_BYTE" => token::Literal(token::Byte(Name(0)), None),
84 "RBRACKET" => token::CloseDelim(token::Bracket),
85 "COMMENT" => token::Comment,
86 "DOC_COMMENT" => token::DocComment(Name(0)),
88 "EQEQ" => token::EqEq,
91 "PERCENT" => token::BinOp(token::Percent),
92 "RBRACE" => token::CloseDelim(token::Brace),
// `Plus` is an arbitrary stand-in for BINOP/BINOPEQ; the actual operator
// is recovered later from the matched text via `str_to_binop`.
93 "BINOP" => token::BinOp(token::Plus),
94 "POUND" => token::Pound,
95 "OROR" => token::OrOr,
96 "LIT_INTEGER" => token::Literal(token::Integer(Name(0)), None),
97 "BINOPEQ" => token::BinOpEq(token::Plus),
98 "LIT_FLOAT" => token::Literal(token::Float(Name(0)), None),
99 "WHITESPACE" => token::Whitespace,
100 "UNDERSCORE" => token::Underscore,
101 "MINUS" => token::BinOp(token::Minus),
102 "SEMI" => token::Semi,
103 "COLON" => token::Colon,
104 "FAT_ARROW" => token::FatArrow,
105 "OR" => token::BinOp(token::Or),
108 "LIT_BINARY" => token::Literal(token::Binary(Name(0)), None),
109 "LIT_BINARY_RAW" => token::Literal(token::BinaryRaw(Name(0), 0), None),
110 "QUESTION" => token::Question,
// Key by the numeric id so ANTLR's `<toknum>` output can be looked up
// directly as a string, with no re-parsing.
114 res.insert(num.to_string(), tok);
117 debug!("Token map: {:?}", res);
// Maps an operator's source text (e.g. "%") to the corresponding
// `token::BinOpToken`; panics on unrecognized input.
// NOTE(review): most match arms are elided in this excerpt — only the
// "%" arm and the catch-all panic are visible.
121 fn str_to_binop(s: &str) -> token::BinOpToken {
127 "%" => token::Percent,
// Unknown operator text is a hard error: the token list and the matched
// content are expected to agree.
133 _ => panic!("Bad binop str `{}`", s),
137 /// Assuming a string/binary literal, strip out the leading/trailing
138 /// hashes and surrounding quotes/raw/binary prefix.
///
/// Returns the interned `Name` of the bare literal contents. Accepts the
/// `r`, `rb`, and `b` prefixes; `count` determines how many leading `#`s
/// a raw literal carries so the same number can be dropped from the tail.
139 fn fix(mut lit: &str) -> ast::Name {
140 if lit.char_at(0) == 'r' {
// `rb"..."` — raw binary literal: drop both prefix characters.
141 if lit.char_at(1) == 'b' {
142 lit = lit.slice_from(2)
// Plain raw literal `r"..."`: drop just the `r`.
144 lit = lit.slice_from(1);
// Binary literal `b"..."`: drop the `b`.
146 } else if lit.char_at(0) == 'b' {
147 lit = lit.slice_from(1);
150 let leading_hashes = count(lit);
// +1/-1 to adjust for single quotes
// NOTE(review): "single quotes" here presumably means the surrounding
// quote characters in general (string literals use `"`), trimmed along
// with the hashes — confirm against the original file.
153 parse::token::intern(lit.slice(leading_hashes + 1, lit.len() - leading_hashes - 1))
156 /// Assuming a char/byte literal, strip the 'b' prefix and the single quotes.
///
/// Returns the interned `Name` of the bare literal contents.
157 fn fixchar(mut lit: &str) -> ast::Name {
// Byte literal `b'...'`: drop the `b` prefix first.
158 if lit.char_at(0) == 'b' {
159 lit = lit.slice_from(1);
// Trim one quote character from each end and intern what remains.
162 parse::token::intern(lit.slice(1, lit.len() - 1))
// Counts the leading '#' characters of a raw-literal body — i.e. how
// many hashes delimit an `r#"..."#`-style literal after its prefix has
// been stripped by `fix`.
165 fn count(lit: &str) -> usize {
166 lit.chars().take_while(|c| *c == '#').count()
// Parses one line of ANTLR token-dump output — the
// `[@seq,start:end='content',<toknum>,line:col]` format — into a rustc
// `TokenAndSpan`, using `tokens` (from `parse_token_list`) to find the
// prototype token for the numeric type and then rebuilding its payload
// from the matched `content` text.
// NOTE(review): interior lines are elided in this excerpt (regex
// construction around the pattern, several match arms' continuations,
// the `offset` computation body, and the final return) — comments only
// describe visible lines.
169 fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>) -> TokenAndSpan {
// Named groups pull out the sequence number, byte range, matched text,
// and numeric token type (which may be -1 for EOF).
171 r"\[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]"
174 let m = re.captures(s).expect(format!("The regex didn't match {}", s).as_slice());
175 let start = m.name("start").unwrap_or("");
176 let end = m.name("end").unwrap_or("");
177 let toknum = m.name("toknum").unwrap_or("");
178 let content = m.name("content").unwrap_or("");
// The prototype only fixes the token *kind*; its payload is a dummy.
180 let proto_tok = tokens.get(toknum).expect(format!("didn't find token {:?} in the map",
183 let nm = parse::token::intern(content);
185 debug!("What we got: content (`{}`), proto: {:?}", content, proto_tok);
// Rebuild the real token: substitute the actual operator / literal /
// identifier payload derived from `content` into the prototype's shape.
187 let real_tok = match *proto_tok {
188 token::BinOp(..) => token::BinOp(str_to_binop(content)),
// BINOPEQ's text ends in '='; strip it before operator lookup.
189 token::BinOpEq(..) => token::BinOpEq(str_to_binop(content.slice_to(
190 content.len() - 1))),
191 token::Literal(token::Str_(..), n) => token::Literal(token::Str_(fix(content)), n),
192 token::Literal(token::StrRaw(..), n) => token::Literal(token::StrRaw(fix(content),
194 token::Literal(token::Char(..), n) => token::Literal(token::Char(fixchar(content)), n),
195 token::Literal(token::Byte(..), n) => token::Literal(token::Byte(fixchar(content)), n),
196 token::DocComment(..) => token::DocComment(nm),
197 token::Literal(token::Integer(..), n) => token::Literal(token::Integer(nm), n),
198 token::Literal(token::Float(..), n) => token::Literal(token::Float(nm), n),
199 token::Literal(token::Binary(..), n) => token::Literal(token::Binary(nm), n),
200 token::Literal(token::BinaryRaw(..), n) => token::Literal(token::BinaryRaw(fix(content),
202 token::Ident(..) => token::Ident(ast::Ident { name: nm, ctxt: 0 },
204 token::Lifetime(..) => token::Lifetime(ast::Ident { name: nm, ctxt: 0 }),
// EOF spans need an adjustment relative to ordinary tokens; the value
// chosen is on the elided lines — TODO confirm against the original.
208 let offset = if real_tok == token::Eof
// Convert ANTLR's inclusive byte positions into a rustc span. `hi` is
// exclusive, hence the +1 on `end`.
215 let sp = syntax::codemap::Span {
216 lo: syntax::codemap::BytePos(start.parse::<u32>().unwrap() - offset),
217 hi: syntax::codemap::BytePos(end.parse::<u32>().unwrap() + 1),
218 expn_id: syntax::codemap::NO_EXPANSION
// Compares two tokens for the purposes of this checker. For identifiers
// only the interned name is compared (the hygiene context / style carried
// alongside is ignored); handling of other token kinds is on lines elided
// from this excerpt.
227 fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
229 &token::Ident(id, _) => match b {
230 &token::Ident(id2, _) => id == id2,
// Pulls the next token (with its span) from rustc's own lexer.
// The `Reader` trait must be in scope for the method call; the body
// itself is on lines elided from this excerpt.
238 fn next(r: &mut lexer::StringReader) -> TokenAndSpan {
239 use syntax::parse::lexer::Reader;
// --- Driver (interior of `fn main`; the header line is elided in this
// excerpt, as are many interior lines — comments describe visible code
// only). Reads the token list (argv[2]) and the source file (argv[1]),
// lexes the source with rustc's lexer, reads the ANTLR token dump from
// stdin, and asserts the two token streams agree.
243 let args = std::os::args();
// argv[2]: the NAME=NUM token-list file consumed by parse_token_list.
245 let mut token_file = File::open(&Path::new(args[2].as_slice()));
246 let token_map = parse_token_list(token_file.read_to_string().unwrap().as_slice());
// The ANTLR token dump arrives on stdin, one token per line; parse
// lazily so comparison happens as lines stream in.
248 let mut stdin = std::io::stdin();
249 let mut lock = stdin.lock();
250 let lines = lock.lines();
251 let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().as_slice().trim(),
// argv[1]: the Rust source to lex with rustc's own lexer.
254 let code = File::open(&Path::new(args[1].as_slice())).unwrap().read_to_string().unwrap();
// Minimal session/filemap setup needed to drive the string lexer.
255 let options = config::basic_options();
256 let session = session::build_session(options, None,
257 syntax::diagnostics::registry::Registry::new(&[]));
258 let filemap = parse::string_to_filemap(&session.parse_sess,
260 String::from_str("<n/a>"));
261 let mut lexer = lexer::StringReader::new(session.diagnostic(), filemap);
// Walk the two streams in lockstep; both must hit EOF together and every
// pair of tokens must match in both span and kind.
263 for antlr_tok in antlr_tokens {
264 let rustc_tok = next(&mut lexer);
265 if rustc_tok.tok == token::Eof && antlr_tok.tok == token::Eof {
269 assert!(rustc_tok.sp == antlr_tok.sp, "{:?} and {:?} have different spans", rustc_tok,
// Local macro: for the listed token patterns, compare via tok_cmp (which
// ignores payload details the two lexers can legitimately disagree on)
// and only warn on name mismatches; everything else must be exactly equal.
272 macro_rules! matches {
274 match rustc_tok.tok {
275 $($x => match antlr_tok.tok {
277 if !tok_cmp(&rustc_tok.tok, &antlr_tok.tok) {
278 // FIXME #15677: needs more robust escaping in
280 warn!("Different names for {:?} and {:?}", rustc_tok, antlr_tok);
283 _ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok)
285 ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", rustc_tok, antlr_tok)
// Token kinds given the lenient (tok_cmp/warn) treatment: all literal
// forms, plus interpolated tokens and doc comments (whose payloads the
// ANTLR side cannot reproduce exactly).
291 token::Literal(token::Byte(..), _),
292 token::Literal(token::Char(..), _),
293 token::Literal(token::Integer(..), _),
294 token::Literal(token::Float(..), _),
295 token::Literal(token::Str_(..), _),
296 token::Literal(token::StrRaw(..), _),
297 token::Literal(token::Binary(..), _),
298 token::Literal(token::BinaryRaw(..), _),
301 token::Interpolated(..),
302 token::DocComment(..),