1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
12 pub use self::Repeater::*;
13 pub use self::Greed::*;
14 use self::BuildAst::*;
23 /// Static data containing Unicode ranges for general categories and scripts.
24 use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
26 /// The maximum number of repetitions allowed with the `{n,m}` syntax.
27 static MAX_REPEAT: uint = 1000;
29 /// Error corresponds to something that can go wrong while parsing
30 /// a regular expression.
32 /// (Once an expression is compiled, it is not possible to produce an error
33 /// via searching, splitting or replacing.)
35 /// The *approximate* character index of where the error occurred.
37 /// A message describing the error.
41 impl fmt::Show for Error {
42 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
43 write!(f, "Regex syntax error near position {}: {}",
48 /// Represents the abstract syntax of a regular expression.
49 /// It is showable so that error messages resulting from a bug can provide
50 /// useful information.
51 /// It is cloneable so that expressions can be repeated for the counted
52 /// repetition feature. (No other copying is done.)
54 /// Note that this representation prevents one from reproducing the regex as
55 /// it was typed. (But it could be used to reproduce an equivalent regex.)
56 #[derive(Show, Clone)]
61 AstClass(Vec<(char, char)>, Flags),
65 Capture(uint, Option<String>, Box<Ast>),
66 // Represent concatenation as a flat vector to avoid blowing the
67 // stack in the compiler.
69 Alt(Box<Ast>, Box<Ast>),
70 Rep(Box<Ast>, Repeater, Greed),
73 #[derive(Show, PartialEq, Clone)]
80 #[derive(Copy, Show, Clone)]
87 pub fn is_greedy(&self) -> bool {
94 fn swap(self, swapped: bool) -> Greed {
95 if !swapped { return self }
103 /// BuildAst is a regrettable type that represents intermediate state for
104 /// constructing an abstract syntax tree. Its central purpose is to facilitate
105 /// parsing groups and alternations while also maintaining a stack of flag
110 Paren(Flags, uint, String), // '('
115 fn paren(&self) -> bool {
117 Paren(_, _, _) => true,
122 fn flags(&self) -> Flags {
124 Paren(flags, _, _) => flags,
125 _ => panic!("Cannot get flags from {}", self),
129 fn capture(&self) -> Option<uint> {
131 Paren(_, 0, _) => None,
132 Paren(_, c, _) => Some(c),
133 _ => panic!("Cannot get capture group from {}", self),
137 fn capture_name(&self) -> Option<String> {
139 Paren(_, 0, _) => None,
140 Paren(_, _, ref name) => {
147 _ => panic!("Cannot get capture name from {}", self),
151 fn bar(&self) -> bool {
158 fn unwrap(self) -> Result<Ast, Error> {
161 _ => panic!("Tried to unwrap non-AST item: {}", self),
166 /// Flags represents all options that can be twiddled by a user in an
170 pub const FLAG_EMPTY: u8 = 0;
171 pub const FLAG_NOCASE: u8 = 1 << 0; // i
172 pub const FLAG_MULTI: u8 = 1 << 1; // m
173 pub const FLAG_DOTNL: u8 = 1 << 2; // s
174 pub const FLAG_SWAP_GREED: u8 = 1 << 3; // U
175 pub const FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary
178 // The input, parsed only as a sequence of UTF8 code points.
180 // The index of the current character in the input.
182 // The intermediate state representing the AST.
183 stack: Vec<BuildAst>,
184 // The current set of flags.
186 // The total number of capture groups.
187 // Incremented each time an opening left paren is seen (assuming it is
188 // opening a capture group).
190 // A set of all capture group names used only to detect duplicates.
194 pub fn parse(s: &str) -> Result<Ast, Error> {
196 chars: s.chars().collect(),
205 impl<'a> Parser<'a> {
206 fn parse(&mut self) -> Result<Ast, Error> {
207 if self.chars.len() == 0 {
213 '?' | '*' | '+' => try!(self.push_repeater(c)),
215 let ast = try!(self.parse_escape());
218 '{' => try!(self.parse_counted()),
219 '[' => match self.try_parse_ascii() {
220 None => try!(self.parse_class()),
221 Some(class) => self.push(class),
224 if self.peek_is(1, '?') {
225 try!(self.expect('?'));
226 try!(self.parse_group_opts())
229 self.stack.push(Paren(self.flags,
236 self.pos_last(false, |x| x.paren() || x.bar()));
237 try!(self.concat(catfrom));
239 let altfrom = try!(self.pos_last(false, |x| x.paren()));
240 // Before we smush the alternates together and pop off the
241 // left paren, let's grab the old flags and see if we
243 let (cap, cap_name, oldflags) = {
244 let paren = &self.stack[altfrom-1];
245 (paren.capture(), paren.capture_name(), paren.flags())
247 try!(self.alternate(altfrom));
248 self.flags = oldflags;
250 // If this was a capture, pop what we just pushed in
251 // alternate and make it a capture.
253 let ast = try!(self.pop_ast());
254 self.push(Capture(cap.unwrap(), cap_name, box ast));
259 self.pos_last(true, |x| x.paren() || x.bar()));
260 try!(self.concat(catfrom));
262 self.stack.push(Bar);
264 _ => try!(self.push_literal(c)),
266 if !self.next_char() {
271 // Try to improve error handling. At this point, there should be
272 // no remaining open parens.
273 if self.stack.iter().any(|x| x.paren()) {
274 return self.err("Unclosed parenthesis.")
276 let catfrom = try!(self.pos_last(true, |x| x.bar()));
277 try!(self.concat(catfrom));
278 try!(self.alternate(0));
280 assert!(self.stack.len() == 1);
284 fn noteof(&mut self, expected: &str) -> Result<(), Error> {
285 match self.next_char() {
288 self.err(format!("Expected {} but got EOF.",
289 expected).index(&FullRange))
294 fn expect(&mut self, expected: char) -> Result<(), Error> {
295 match self.next_char() {
296 true if self.cur() == expected => Ok(()),
297 true => self.err(format!("Expected '{}' but got '{}'.",
298 expected, self.cur()).index(&FullRange)),
300 self.err(format!("Expected '{}' but got EOF.",
301 expected).index(&FullRange))
306 fn next_char(&mut self) -> bool {
308 self.chari < self.chars.len()
311 fn pop_ast(&mut self) -> Result<Ast, Error> {
312 match self.stack.pop().unwrap().unwrap() {
318 fn push(&mut self, ast: Ast) {
319 self.stack.push(Expr(ast))
322 fn push_repeater(&mut self, c: char) -> Result<(), Error> {
323 match self.stack.last() {
324 Some(&Expr(..)) => (),
325 // self.stack is empty, or the top item is not an Expr
326 _ => return self.err("A repeat operator must be preceded by a valid expression."),
328 let rep: Repeater = match c {
329 '?' => ZeroOne, '*' => ZeroMore, '+' => OneMore,
330 _ => panic!("Not a valid repeater operator."),
334 Some('*') | Some('+') =>
336 "Double repeat operators are not supported."),
339 let ast = try!(self.pop_ast());
341 Begin(_) | End(_) | WordBoundary(_) =>
343 "Repeat arguments cannot be empty width assertions."),
346 let greed = try!(self.get_next_greedy());
347 self.push(Rep(box ast, rep, greed));
351 fn push_literal(&mut self, c: char) -> Result<(), Error> {
352 let flags = self.flags;
355 self.push(Dot(flags))
358 self.push(Begin(flags))
361 self.push(End(flags))
364 self.push(Literal(c, flags))
370 // Parses all forms of character classes.
371 // Assumes that '[' is the current character.
372 fn parse_class(&mut self) -> Result<(), Error> {
374 if self.peek_is(1, '^') {
375 try!(self.expect('^'));
380 let mut ranges: Vec<(char, char)> = vec!();
381 let mut alts: Vec<Ast> = vec!();
383 while self.peek_is(1, '-') {
384 try!(self.expect('-'));
385 ranges.push(('-', '-'))
388 try!(self.noteof("a closing ']' or a non-empty character class)"));
389 let mut c = self.cur();
392 match self.try_parse_ascii() {
393 Some(AstClass(asciis, flags)) => {
394 alts.push(AstClass(asciis, flags ^ negated));
398 panic!("Expected Class AST but got '{}'", ast),
399 // Just drop down and try to add as a regular character.
403 match try!(self.parse_escape()) {
404 AstClass(asciis, flags) => {
405 alts.push(AstClass(asciis, flags ^ negated));
408 Literal(c2, _) => c = c2, // process below
409 Begin(_) | End(_) | WordBoundary(_) =>
411 "\\A, \\z, \\b and \\B are not valid escape \
412 sequences inside a character class."),
413 ast => panic!("Unexpected AST item '{}'", ast),
416 ']' if ranges.len() > 0 || alts.len() > 0 => {
417 if ranges.len() > 0 {
418 let flags = negated | (self.flags & FLAG_NOCASE);
419 let mut ast = AstClass(combine_ranges(ranges), flags);
420 for alt in alts.into_iter() {
421 ast = Alt(box alt, box ast)
424 } else if alts.len() > 0 {
425 let mut ast = alts.pop().unwrap();
426 for alt in alts.into_iter() {
427 ast = Alt(box alt, box ast)
436 if self.peek_is(1, '-') && !self.peek_is(2, ']') {
437 try!(self.expect('-'));
438 // The regex can't end here.
439 try!(self.noteof("not a ']'"));
440 // End the range with a single character or character escape.
441 let mut c2 = self.cur();
443 match try!(self.parse_escape()) {
444 Literal(c3, _) => c2 = c3, // allow literal escapes below
446 return self.err(format!("Expected a literal, but got {}.",
447 ast).index(&FullRange)),
451 return self.err(format!("Invalid character class \
454 c2).index(&FullRange))
456 ranges.push((c, self.cur()))
463 // Tries to parse an ASCII character class of the form [:name:].
464 // If successful, returns an AST character class corresponding to name
465 // and moves the parser to the final ']' character.
466 // If unsuccessful, no state is changed and None is returned.
467 // Assumes that '[' is the current character.
468 fn try_parse_ascii(&mut self) -> Option<Ast> {
469 if !self.peek_is(1, ':') {
473 match self.pos(']') {
477 if self.chars[closer-1] != ':' {
480 if closer - self.chari <= 3 {
483 let mut name_start = self.chari + 2;
485 if self.peek_is(2, '^') {
491 let name = self.slice(name_start, closer - 1);
492 match find_class(ASCII_CLASSES, name.index(&FullRange)) {
496 let flags = negated | (self.flags & FLAG_NOCASE);
497 Some(AstClass(combine_ranges(ranges), flags))
502 // Parses counted repetition. Supports:
503 // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}?
504 // Assumes that '{' is the current character.
505 // Returns either an error or moves the parser to the final '}' character.
506 // (Or the '?' character if not greedy.)
507 fn parse_counted(&mut self) -> Result<(), Error> {
508 // Scan until the closing '}' and grab the stuff in {}.
509 let start = self.chari;
511 match self.pos('}') {
514 return self.err(format!("No closing brace for counted \
515 repetition starting at position \
517 start).index(&FullRange))
521 let greed = try!(self.get_next_greedy());
522 let inner = self.chars.index(&((start+1)..closer)).iter().cloned()
523 .collect::<String>();
525 // Parse the min and max values from the regex.
526 let (mut min, mut max): (uint, Option<uint>);
527 if !inner.contains(",") {
528 min = try!(self.parse_uint(inner.index(&FullRange)));
531 let pieces: Vec<&str> = inner.splitn(1, ',').collect();
532 let (smin, smax) = (pieces[0], pieces[1]);
534 return self.err("Max repetitions cannot be specified \
535 without min repetitions.")
537 min = try!(self.parse_uint(smin));
542 Some(try!(self.parse_uint(smax)))
546 // Do some bounds checking and make sure max >= min.
547 if min > MAX_REPEAT {
548 return self.err(format!(
549 "{} exceeds maximum allowed repetitions ({})",
550 min, MAX_REPEAT).index(&FullRange));
553 let m = max.unwrap();
555 return self.err(format!(
556 "{} exceeds maximum allowed repetitions ({})",
557 m, MAX_REPEAT).index(&FullRange));
560 return self.err(format!(
561 "Max repetitions ({}) cannot be smaller than min \
562 repetitions ({}).", m, min).index(&FullRange));
566 // Now manipulate the AST be repeating elements.
568 // Require N copies of what's on the stack and then repeat it.
569 let ast = try!(self.pop_ast());
570 for _ in iter::range(0, min) {
571 self.push(ast.clone())
573 self.push(Rep(box ast, ZeroMore, greed));
575 // Require N copies of what's on the stack and then repeat it
576 // up to M times optionally.
577 let ast = try!(self.pop_ast());
578 for _ in iter::range(0, min) {
579 self.push(ast.clone())
582 for _ in iter::range(min, max.unwrap()) {
583 self.push(Rep(box ast.clone(), ZeroOne, greed))
586 // It's possible that we popped something off the stack but
587 // never put anything back on it. To keep things simple, add
588 // a no-op expression.
589 if min == 0 && (max.is_none() || max == Some(0)) {
596 // Parses all escape sequences.
597 // Assumes that '\' is the current character.
598 fn parse_escape(&mut self) -> Result<Ast, Error> {
599 try!(self.noteof("an escape sequence following a '\\'"));
603 return Ok(Literal(c, FLAG_EMPTY))
606 'a' => Ok(Literal('\x07', FLAG_EMPTY)),
607 'f' => Ok(Literal('\x0C', FLAG_EMPTY)),
608 't' => Ok(Literal('\t', FLAG_EMPTY)),
609 'n' => Ok(Literal('\n', FLAG_EMPTY)),
610 'r' => Ok(Literal('\r', FLAG_EMPTY)),
611 'v' => Ok(Literal('\x0B', FLAG_EMPTY)),
612 'A' => Ok(Begin(FLAG_EMPTY)),
613 'z' => Ok(End(FLAG_EMPTY)),
614 'b' => Ok(WordBoundary(FLAG_EMPTY)),
615 'B' => Ok(WordBoundary(FLAG_NEGATED)),
616 '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())),
617 'x' => Ok(try!(self.parse_hex())),
618 'p' | 'P' => Ok(try!(self.parse_unicode_name())),
619 'd' | 'D' | 's' | 'S' | 'w' | 'W' => {
620 let ranges = perl_unicode_class(c);
621 let mut flags = self.flags & FLAG_NOCASE;
622 if c.is_uppercase() { flags |= FLAG_NEGATED }
623 Ok(AstClass(ranges, flags))
626 self.err(format!("Invalid escape sequence '\\\\{}'", c).index(&FullRange))
631 // Parses a Unicode character class name, either of the form \pF where
632 // F is a one letter Unicode class name or of the form \p{name} where
633 // name is the Unicode class name.
634 // Assumes that \p or \P has been read (and 'p' or 'P' is the current
636 fn parse_unicode_name(&mut self) -> Result<Ast, Error> {
637 let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY };
638 let mut name: String;
639 if self.peek_is(1, '{') {
640 try!(self.expect('{'));
642 match self.pos('}') {
644 None => return self.err(format!(
645 "Missing '}}' for unclosed '{{' at position {}",
646 self.chari).index(&FullRange)),
648 if closer - self.chari + 1 == 0 {
649 return self.err("No Unicode class name found.")
651 name = self.slice(self.chari + 1, closer);
654 if self.chari + 1 >= self.chars.len() {
655 return self.err("No single letter Unicode class name found.")
657 name = self.slice(self.chari + 1, self.chari + 2);
660 match find_class(UNICODE_CLASSES, name.index(&FullRange)) {
662 return self.err(format!("Could not find Unicode class '{}'",
663 name).index(&FullRange))
666 Ok(AstClass(ranges, negated | (self.flags & FLAG_NOCASE)))
671 // Parses an octal number, up to 3 digits.
672 // Assumes that \n has been read, where n is the first digit.
673 fn parse_octal(&mut self) -> Result<Ast, Error> {
674 let start = self.chari;
675 let mut end = start + 1;
676 let (d2, d3) = (self.peek(1), self.peek(2));
677 if d2 >= Some('0') && d2 <= Some('7') {
678 try!(self.noteof("expected octal character in [0-7]"));
680 if d3 >= Some('0') && d3 <= Some('7') {
681 try!(self.noteof("expected octal character in [0-7]"));
685 let s = self.slice(start, end);
686 match num::from_str_radix::<u32>(s.index(&FullRange), 8) {
687 Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
689 self.err(format!("Could not parse '{}' as octal number.",
690 s).index(&FullRange))
695 // Parse a hex number. Either exactly two digits or anything in {}.
696 // Assumes that \x has been read.
697 fn parse_hex(&mut self) -> Result<Ast, Error> {
698 if !self.peek_is(1, '{') {
699 try!(self.expect('{'));
700 return self.parse_hex_two()
702 let start = self.chari + 2;
704 match self.pos('}') {
706 return self.err(format!("Missing '}}' for unclosed \
707 '{{' at position {}",
708 start).index(&FullRange))
713 self.parse_hex_digits(self.slice(start, closer).index(&FullRange))
716 // Parses a two-digit hex number.
717 // Assumes that \xn has been read, where n is the first digit and is the
718 // current character.
719 // After return, parser will point at the second digit.
720 fn parse_hex_two(&mut self) -> Result<Ast, Error> {
721 let (start, end) = (self.chari, self.chari + 2);
722 let bad = self.slice(start - 2, self.chars.len());
723 try!(self.noteof(format!("Invalid hex escape sequence '{}'",
725 self.parse_hex_digits(self.slice(start, end).as_slice())
728 // Parses `s` as a hexadecimal number.
729 fn parse_hex_digits(&self, s: &str) -> Result<Ast, Error> {
730 match num::from_str_radix::<u32>(s, 16) {
731 Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
733 self.err(format!("Could not parse '{}' as hex number.", s).index(&FullRange))
738 // Parses a named capture.
739 // Assumes that '(?P<' has been consumed and that the current character
741 // When done, parser will be at the closing '>' character.
742 fn parse_named_capture(&mut self) -> Result<(), Error> {
743 try!(self.noteof("a capture name"));
745 match self.pos('>') {
747 None => return self.err("Capture name must end with '>'."),
749 if closer - self.chari == 0 {
750 return self.err("Capture names must have at least 1 character.")
752 let name = self.slice(self.chari, closer);
753 if !name.chars().all(is_valid_cap) {
755 "Capture names can only have underscores, letters and digits.")
757 if self.names.contains(&name) {
758 return self.err(format!("Duplicate capture group name '{}'.",
759 name).index(&FullRange))
761 self.names.push(name.clone());
764 self.stack.push(Paren(self.flags, self.caps, name));
768 // Parses non-capture groups and options.
769 // Assumes that '(?' has already been consumed and '?' is the current
771 fn parse_group_opts(&mut self) -> Result<(), Error> {
772 if self.peek_is(1, 'P') && self.peek_is(2, '<') {
773 try!(self.expect('P'));
774 try!(self.expect('<'));
775 return self.parse_named_capture()
777 let start = self.chari;
778 let mut flags = self.flags;
780 let mut saw_flag = false;
783 "expected non-empty set of flags or closing ')'"));
785 'i' => { flags = flags | FLAG_NOCASE; saw_flag = true},
786 'm' => { flags = flags | FLAG_MULTI; saw_flag = true},
787 's' => { flags = flags | FLAG_DOTNL; saw_flag = true},
788 'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true},
791 return self.err(format!(
792 "Cannot negate flags twice in '{}'.",
793 self.slice(start, self.chari + 1)).index(&FullRange))
797 flags = flags ^ flags;
802 return self.err(format!(
803 "A valid flag does not follow negation in '{}'",
804 self.slice(start, self.chari + 1)).index(&FullRange))
806 flags = flags ^ flags;
808 if self.cur() == ':' {
809 // Save the old flags with the opening paren.
810 self.stack.push(Paren(self.flags, 0, "".to_string()));
815 _ => return self.err(format!(
816 "Unrecognized flag '{}'.", self.cur()).index(&FullRange)),
821 // Peeks at the next character and returns whether it's ungreedy or not.
822 // If it is, then the next character is consumed.
823 fn get_next_greedy(&mut self) -> Result<Greed, Error> {
824 Ok(if self.peek_is(1, '?') {
825 try!(self.expect('?'));
829 }.swap(self.flags & FLAG_SWAP_GREED > 0))
832 // Searches the stack (starting at the top) until it finds an expression
833 // for which `pred` returns true. The index of that expression in the
834 // stack is returned.
835 // If there's no match, then one of two things happens depending on the
836 // values of `allow_start`. When it's true, then `0` will be returned.
837 // Otherwise, an error will be returned.
838 // Generally, `allow_start` is only true when you're *not* expecting an
839 // opening parenthesis.
840 fn pos_last<P>(&self, allow_start: bool, pred: P) -> Result<uint, Error> where
841 P: FnMut(&BuildAst) -> bool,
843 let from = match self.stack.iter().rev().position(pred) {
849 return self.err("No matching opening parenthesis.")
853 // Adjust index since 'from' is for the reversed stack.
854 // Also, don't include the '(' or '|'.
855 Ok(self.stack.len() - from)
858 // concat starts at `from` in the parser's stack and concatenates all
859 // expressions up to the top of the stack. The resulting concatenation is
860 // then pushed on to the stack.
861 // Usually `from` corresponds to the position of an opening parenthesis,
862 // a '|' (alternation) or the start of the entire expression.
863 fn concat(&mut self, from: uint) -> Result<(), Error> {
864 let ast = try!(self.build_from(from, concat_flatten));
869 // concat starts at `from` in the parser's stack and alternates all
870 // expressions up to the top of the stack. The resulting alternation is
871 // then pushed on to the stack.
872 // Usually `from` corresponds to the position of an opening parenthesis
873 // or the start of the entire expression.
874 // This will also drop any opening parens or alternation bars found in
875 // the intermediate AST.
876 fn alternate(&mut self, mut from: uint) -> Result<(), Error> {
877 // Unlike in the concatenation case, we want 'build_from' to continue
878 // all the way to the opening left paren (so it will be popped off and
879 // thrown away). But be careful with overflow---we can't count on the
880 // open paren to be there.
881 if from > 0 { from = from - 1}
882 let ast = try!(self.build_from(from, |l,r| Alt(box l, box r)));
887 // build_from combines all AST elements starting at 'from' in the
888 // parser's stack using 'mk' to combine them. If any such element is not an
889 // AST then it is popped off the stack and ignored.
890 fn build_from<F>(&mut self, from: uint, mut mk: F) -> Result<Ast, Error> where
891 F: FnMut(Ast, Ast) -> Ast,
893 if from >= self.stack.len() {
894 return self.err("Empty group or alternate not allowed.")
897 let mut combined = try!(self.pop_ast());
898 let mut i = self.stack.len();
901 match self.stack.pop().unwrap() {
902 Expr(x) => combined = mk(x, combined),
909 fn parse_uint(&self, s: &str) -> Result<uint, Error> {
910 match s.parse::<uint>() {
913 self.err(format!("Expected an unsigned integer but got '{}'.",
914 s).index(&FullRange))
919 fn char_from_u32(&self, n: u32) -> Result<char, Error> {
920 match char::from_u32(n) {
923 self.err(format!("Could not decode '{}' to unicode \
924 character.", n).index(&FullRange))
929 fn pos(&self, c: char) -> Option<uint> {
931 .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i)
934 fn err<T>(&self, msg: &str) -> Result<T, Error> {
937 msg: msg.to_string(),
941 fn peek(&self, offset: uint) -> Option<char> {
942 if self.chari + offset >= self.chars.len() {
945 Some(self.chars[self.chari + offset])
948 fn peek_is(&self, offset: uint, is: char) -> bool {
949 self.peek(offset) == Some(is)
952 fn cur(&self) -> char {
953 self.chars[self.chari]
956 fn slice(&self, start: uint, end: uint) -> String {
957 self.chars.index(&(start..end)).iter().cloned().collect()
961 // Given an unordered collection of character ranges, combine_ranges returns
962 // an ordered sequence of character ranges where no two ranges overlap. They
963 // are ordered from least to greatest (using start position).
964 fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> {
965 // Returns true iff the two character classes overlap or share a boundary.
966 // e.g., ('a', 'g') and ('h', 'm') would return true.
967 fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool {
968 cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1
971 // This is currently O(n^2), but I think with sufficient cleverness,
972 // it can be reduced to O(n) **if necessary**.
973 let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len());
974 for (us, ue) in unordered.into_iter() {
975 let (mut us, mut ue) = (us, ue);
977 let mut which: Option<uint> = None;
978 for (i, &(os, oe)) in ordered.iter().enumerate() {
979 if should_merge((us, ue), (os, oe)) {
980 us = cmp::min(us, os);
981 ue = cmp::max(ue, oe);
987 None => ordered.push((us, ue)),
988 Some(i) => ordered[i] = (us, ue),
995 // Constructs a Unicode friendly Perl character class from \d, \s or \w
996 // (or any of their negated forms). Note that this does not handle negation.
997 fn perl_unicode_class(which: char) -> Vec<(char, char)> {
998 match which.to_lowercase() {
999 'd' => PERLD.to_vec(),
1000 's' => PERLS.to_vec(),
1001 'w' => PERLW.to_vec(),
1002 _ => unreachable!(),
1006 // Returns a concatenation of two expressions. This also guarantees that a
1007 // `Cat` expression will never be a direct child of another `Cat` expression.
1008 fn concat_flatten(x: Ast, y: Ast) -> Ast {
1010 (Cat(mut xs), Cat(ys)) => { xs.extend(ys.into_iter()); Cat(xs) }
1011 (Cat(mut xs), ast) => { xs.push(ast); Cat(xs) }
1012 (ast, Cat(mut xs)) => { xs.insert(0, ast); Cat(xs) }
1013 (ast1, ast2) => Cat(vec!(ast1, ast2)),
1017 pub fn is_punct(c: char) -> bool {
1019 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
1020 '[' | ']' | '{' | '}' | '^' | '$' => true,
1025 fn is_valid_cap(c: char) -> bool {
1026 c == '_' || (c >= '0' && c <= '9')
1027 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
1030 fn find_class(classes: NamedClasses, name: &str) -> Option<Vec<(char, char)>> {
1031 match classes.binary_search_by(|&(s, _)| s.cmp(name)) {
1032 Ok(i) => Some(classes[i].1.to_vec()),
1037 type Class = &'static [(char, char)];
1038 type NamedClasses = &'static [(&'static str, &'static Class)];
1040 static ASCII_CLASSES: NamedClasses = &[
1041 // Classes must be in alphabetical order so that bsearch works.
1042 // [:alnum:] alphanumeric (== [0-9A-Za-z])
1043 // [:alpha:] alphabetic (== [A-Za-z])
1044 // [:ascii:] ASCII (== [\x00-\x7F])
1045 // [:blank:] blank (== [\t ])
1046 // [:cntrl:] control (== [\x00-\x1F\x7F])
1047 // [:digit:] digits (== [0-9])
1048 // [:graph:] graphical (== [!-~])
1049 // [:lower:] lower case (== [a-z])
1050 // [:print:] printable (== [ -~] == [ [:graph:]])
1051 // [:punct:] punctuation (== [!-/:-@[-`{-~])
1052 // [:space:] whitespace (== [\t\n\v\f\r ])
1053 // [:upper:] upper case (== [A-Z])
1054 // [:word:] word characters (== [0-9A-Za-z_])
1055 // [:xdigit:] hex digit (== [0-9A-Fa-f])
1056 // Taken from: http://golang.org/pkg/regex/syntax/
1070 ("xdigit", &XDIGIT),
1073 static ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')];
1074 static ALPHA: Class = &[('A', 'Z'), ('a', 'z')];
1075 static ASCII: Class = &[('\x00', '\x7F')];
1076 static BLANK: Class = &[(' ', ' '), ('\t', '\t')];
1077 static CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')];
1078 static DIGIT: Class = &[('0', '9')];
1079 static GRAPH: Class = &[('!', '~')];
1080 static LOWER: Class = &[('a', 'z')];
1081 static PRINT: Class = &[(' ', '~')];
1082 static PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')];
1083 static SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'),
1084 ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')];
1085 static UPPER: Class = &[('A', 'Z')];
1086 static WORD: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')];
1087 static XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')];