1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
12 pub use self::Repeater::*;
13 pub use self::Greed::*;
14 use self::BuildAst::*;
22 /// Static data containing Unicode ranges for general categories and scripts.
23 use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
25 /// The maximum number of repetitions allowed with the `{n,m}` syntax.
26 static MAX_REPEAT: uint = 1000;
28 /// Error corresponds to something that can go wrong while parsing
29 /// a regular expression.
31 /// (Once an expression is compiled, it is not possible to produce an error
32 /// via searching, splitting or replacing.)
34 /// The *approximate* character index of where the error occurred.
36 /// A message describing the error.
40 impl fmt::Show for Error {
41 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
42 write!(f, "Regex syntax error near position {}: {}",
47 /// Represents the abstract syntax of a regular expression.
48 /// It is showable so that error messages resulting from a bug can provide
49 /// useful information.
50 /// It is cloneable so that expressions can be repeated for the counted
51 /// repetition feature. (No other copying is done.)
53 /// Note that this representation prevents one from reproducing the regex as
54 /// it was typed. (But it could be used to reproduce an equivalent regex.)
55 #[derive(Show, Clone)]
60 AstClass(Vec<(char, char)>, Flags),
64 Capture(uint, Option<String>, Box<Ast>),
65 // Represent concatenation as a flat vector to avoid blowing the
66 // stack in the compiler.
68 Alt(Box<Ast>, Box<Ast>),
69 Rep(Box<Ast>, Repeater, Greed),
72 #[derive(Show, PartialEq, Clone)]
79 #[derive(Copy, Show, Clone)]
86 pub fn is_greedy(&self) -> bool {
93 fn swap(self, swapped: bool) -> Greed {
94 if !swapped { return self }
102 /// BuildAst is a regrettable type that represents intermediate state for
103 /// constructing an abstract syntax tree. Its central purpose is to facilitate
104 /// parsing groups and alternations while also maintaining a stack of flag
109 Paren(Flags, uint, String), // '('
114 fn paren(&self) -> bool {
116 Paren(_, _, _) => true,
121 fn flags(&self) -> Flags {
123 Paren(flags, _, _) => flags,
124 _ => panic!("Cannot get flags from {}", self),
128 fn capture(&self) -> Option<uint> {
130 Paren(_, 0, _) => None,
131 Paren(_, c, _) => Some(c),
132 _ => panic!("Cannot get capture group from {}", self),
136 fn capture_name(&self) -> Option<String> {
138 Paren(_, 0, _) => None,
139 Paren(_, _, ref name) => {
146 _ => panic!("Cannot get capture name from {}", self),
150 fn bar(&self) -> bool {
157 fn unwrap(self) -> Result<Ast, Error> {
160 _ => panic!("Tried to unwrap non-AST item: {}", self),
165 /// Flags represents all options that can be twiddled by a user in an
169 pub const FLAG_EMPTY: u8 = 0;
170 pub const FLAG_NOCASE: u8 = 1 << 0; // i
171 pub const FLAG_MULTI: u8 = 1 << 1; // m
172 pub const FLAG_DOTNL: u8 = 1 << 2; // s
173 pub const FLAG_SWAP_GREED: u8 = 1 << 3; // U
174 pub const FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary
177 // The input, parsed only as a sequence of UTF8 code points.
179 // The index of the current character in the input.
181 // The intermediate state representing the AST.
182 stack: Vec<BuildAst>,
183 // The current set of flags.
185 // The total number of capture groups.
186 // Incremented each time an opening left paren is seen (assuming it is
187 // opening a capture group).
189 // A set of all capture group names used only to detect duplicates.
193 pub fn parse(s: &str) -> Result<Ast, Error> {
195 chars: s.chars().collect(),
204 impl<'a> Parser<'a> {
205 fn parse(&mut self) -> Result<Ast, Error> {
206 if self.chars.len() == 0 {
212 '?' | '*' | '+' => try!(self.push_repeater(c)),
214 let ast = try!(self.parse_escape());
217 '{' => try!(self.parse_counted()),
218 '[' => match self.try_parse_ascii() {
219 None => try!(self.parse_class()),
220 Some(class) => self.push(class),
223 if self.peek_is(1, '?') {
224 try!(self.expect('?'));
225 try!(self.parse_group_opts())
228 self.stack.push(Paren(self.flags,
235 self.pos_last(false, |x| x.paren() || x.bar()));
236 try!(self.concat(catfrom));
238 let altfrom = try!(self.pos_last(false, |x| x.paren()));
239 // Before we smush the alternates together and pop off the
240 // left paren, let's grab the old flags and see if we
242 let (cap, cap_name, oldflags) = {
243 let paren = &self.stack[altfrom-1];
244 (paren.capture(), paren.capture_name(), paren.flags())
246 try!(self.alternate(altfrom));
247 self.flags = oldflags;
249 // If this was a capture, pop what we just pushed in
250 // alternate and make it a capture.
252 let ast = try!(self.pop_ast());
253 self.push(Capture(cap.unwrap(), cap_name, box ast));
258 self.pos_last(true, |x| x.paren() || x.bar()));
259 try!(self.concat(catfrom));
261 self.stack.push(Bar);
263 _ => try!(self.push_literal(c)),
265 if !self.next_char() {
270 // Try to improve error handling. At this point, there should be
271 // no remaining open parens.
272 if self.stack.iter().any(|x| x.paren()) {
273 return self.err("Unclosed parenthesis.")
275 let catfrom = try!(self.pos_last(true, |x| x.bar()));
276 try!(self.concat(catfrom));
277 try!(self.alternate(0));
279 assert!(self.stack.len() == 1);
283 fn noteof(&mut self, expected: &str) -> Result<(), Error> {
284 match self.next_char() {
287 self.err(format!("Expected {} but got EOF.",
293 fn expect(&mut self, expected: char) -> Result<(), Error> {
294 match self.next_char() {
295 true if self.cur() == expected => Ok(()),
296 true => self.err(format!("Expected '{}' but got '{}'.",
297 expected, self.cur())[]),
299 self.err(format!("Expected '{}' but got EOF.",
305 fn next_char(&mut self) -> bool {
307 self.chari < self.chars.len()
310 fn pop_ast(&mut self) -> Result<Ast, Error> {
311 match self.stack.pop().unwrap().unwrap() {
317 fn push(&mut self, ast: Ast) {
318 self.stack.push(Expr(ast))
321 fn push_repeater(&mut self, c: char) -> Result<(), Error> {
322 match self.stack.last() {
323 Some(&Expr(..)) => (),
324 // self.stack is empty, or the top item is not an Expr
325 _ => return self.err("A repeat operator must be preceded by a valid expression."),
327 let rep: Repeater = match c {
328 '?' => ZeroOne, '*' => ZeroMore, '+' => OneMore,
329 _ => panic!("Not a valid repeater operator."),
333 Some('*') | Some('+') =>
335 "Double repeat operators are not supported."),
338 let ast = try!(self.pop_ast());
340 Begin(_) | End(_) | WordBoundary(_) =>
342 "Repeat arguments cannot be empty width assertions."),
345 let greed = try!(self.get_next_greedy());
346 self.push(Rep(box ast, rep, greed));
350 fn push_literal(&mut self, c: char) -> Result<(), Error> {
351 let flags = self.flags;
354 self.push(Dot(flags))
357 self.push(Begin(flags))
360 self.push(End(flags))
363 self.push(Literal(c, flags))
369 // Parses all forms of character classes.
370 // Assumes that '[' is the current character.
371 fn parse_class(&mut self) -> Result<(), Error> {
373 if self.peek_is(1, '^') {
374 try!(self.expect('^'));
379 let mut ranges: Vec<(char, char)> = vec!();
380 let mut alts: Vec<Ast> = vec!();
382 while self.peek_is(1, '-') {
383 try!(self.expect('-'));
384 ranges.push(('-', '-'))
387 try!(self.noteof("a closing ']' or a non-empty character class)"));
388 let mut c = self.cur();
391 match self.try_parse_ascii() {
392 Some(AstClass(asciis, flags)) => {
393 alts.push(AstClass(asciis, flags ^ negated));
397 panic!("Expected Class AST but got '{}'", ast),
398 // Just drop down and try to add as a regular character.
402 match try!(self.parse_escape()) {
403 AstClass(asciis, flags) => {
404 alts.push(AstClass(asciis, flags ^ negated));
407 Literal(c2, _) => c = c2, // process below
408 Begin(_) | End(_) | WordBoundary(_) =>
410 "\\A, \\z, \\b and \\B are not valid escape \
411 sequences inside a character class."),
412 ast => panic!("Unexpected AST item '{}'", ast),
415 ']' if ranges.len() > 0 || alts.len() > 0 => {
416 if ranges.len() > 0 {
417 let flags = negated | (self.flags & FLAG_NOCASE);
418 let mut ast = AstClass(combine_ranges(ranges), flags);
419 for alt in alts.into_iter() {
420 ast = Alt(box alt, box ast)
423 } else if alts.len() > 0 {
424 let mut ast = alts.pop().unwrap();
425 for alt in alts.into_iter() {
426 ast = Alt(box alt, box ast)
435 if self.peek_is(1, '-') && !self.peek_is(2, ']') {
436 try!(self.expect('-'));
437 // The regex can't end here.
438 try!(self.noteof("not a ']'"));
439 // End the range with a single character or character escape.
440 let mut c2 = self.cur();
442 match try!(self.parse_escape()) {
443 Literal(c3, _) => c2 = c3, // allow literal escapes below
445 return self.err(format!("Expected a literal, but got {}.",
450 return self.err(format!("Invalid character class \
455 ranges.push((c, self.cur()))
462 // Tries to parse an ASCII character class of the form [:name:].
463 // If successful, returns an AST character class corresponding to name
464 // and moves the parser to the final ']' character.
465 // If unsuccessful, no state is changed and None is returned.
466 // Assumes that '[' is the current character.
467 fn try_parse_ascii(&mut self) -> Option<Ast> {
468 if !self.peek_is(1, ':') {
472 match self.pos(']') {
476 if self.chars[closer-1] != ':' {
479 if closer - self.chari <= 3 {
482 let mut name_start = self.chari + 2;
484 if self.peek_is(2, '^') {
490 let name = self.slice(name_start, closer - 1);
491 match find_class(ASCII_CLASSES, name[]) {
495 let flags = negated | (self.flags & FLAG_NOCASE);
496 Some(AstClass(combine_ranges(ranges), flags))
501 // Parses counted repetition. Supports:
502 // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}?
503 // Assumes that '{' is the current character.
504 // Returns either an error or moves the parser to the final '}' character.
505 // (Or the '?' character if not greedy.)
506 fn parse_counted(&mut self) -> Result<(), Error> {
507 // Scan until the closing '}' and grab the stuff in {}.
508 let start = self.chari;
510 match self.pos('}') {
513 return self.err(format!("No closing brace for counted \
514 repetition starting at position \
520 let greed = try!(self.get_next_greedy());
521 let inner = self.chars[start+1..closer].iter().cloned()
522 .collect::<String>();
524 // Parse the min and max values from the regex.
525 let (mut min, mut max): (uint, Option<uint>);
526 if !inner.contains(",") {
527 min = try!(self.parse_uint(inner[]));
530 let pieces: Vec<&str> = inner.splitn(1, ',').collect();
531 let (smin, smax) = (pieces[0], pieces[1]);
533 return self.err("Max repetitions cannot be specified \
534 without min repetitions.")
536 min = try!(self.parse_uint(smin));
541 Some(try!(self.parse_uint(smax)))
545 // Do some bounds checking and make sure max >= min.
546 if min > MAX_REPEAT {
547 return self.err(format!(
548 "{} exceeds maximum allowed repetitions ({})",
552 let m = max.unwrap();
554 return self.err(format!(
555 "{} exceeds maximum allowed repetitions ({})",
559 return self.err(format!(
560 "Max repetitions ({}) cannot be smaller than min \
561 repetitions ({}).", m, min)[]);
565 // Now manipulate the AST be repeating elements.
567 // Require N copies of what's on the stack and then repeat it.
568 let ast = try!(self.pop_ast());
569 for _ in iter::range(0, min) {
570 self.push(ast.clone())
572 self.push(Rep(box ast, ZeroMore, greed));
574 // Require N copies of what's on the stack and then repeat it
575 // up to M times optionally.
576 let ast = try!(self.pop_ast());
577 for _ in iter::range(0, min) {
578 self.push(ast.clone())
581 for _ in iter::range(min, max.unwrap()) {
582 self.push(Rep(box ast.clone(), ZeroOne, greed))
585 // It's possible that we popped something off the stack but
586 // never put anything back on it. To keep things simple, add
587 // a no-op expression.
588 if min == 0 && (max.is_none() || max == Some(0)) {
595 // Parses all escape sequences.
596 // Assumes that '\' is the current character.
597 fn parse_escape(&mut self) -> Result<Ast, Error> {
598 try!(self.noteof("an escape sequence following a '\\'"));
602 return Ok(Literal(c, FLAG_EMPTY))
605 'a' => Ok(Literal('\x07', FLAG_EMPTY)),
606 'f' => Ok(Literal('\x0C', FLAG_EMPTY)),
607 't' => Ok(Literal('\t', FLAG_EMPTY)),
608 'n' => Ok(Literal('\n', FLAG_EMPTY)),
609 'r' => Ok(Literal('\r', FLAG_EMPTY)),
610 'v' => Ok(Literal('\x0B', FLAG_EMPTY)),
611 'A' => Ok(Begin(FLAG_EMPTY)),
612 'z' => Ok(End(FLAG_EMPTY)),
613 'b' => Ok(WordBoundary(FLAG_EMPTY)),
614 'B' => Ok(WordBoundary(FLAG_NEGATED)),
615 '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())),
616 'x' => Ok(try!(self.parse_hex())),
617 'p' | 'P' => Ok(try!(self.parse_unicode_name())),
618 'd' | 'D' | 's' | 'S' | 'w' | 'W' => {
619 let ranges = perl_unicode_class(c);
620 let mut flags = self.flags & FLAG_NOCASE;
621 if c.is_uppercase() { flags |= FLAG_NEGATED }
622 Ok(AstClass(ranges, flags))
625 self.err(format!("Invalid escape sequence '\\\\{}'", c)[])
630 // Parses a Unicode character class name, either of the form \pF where
631 // F is a one letter Unicode class name or of the form \p{name} where
632 // name is the Unicode class name.
633 // Assumes that \p or \P has been read (and 'p' or 'P' is the current
635 fn parse_unicode_name(&mut self) -> Result<Ast, Error> {
636 let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY };
637 let mut name: String;
638 if self.peek_is(1, '{') {
639 try!(self.expect('{'));
641 match self.pos('}') {
643 None => return self.err(format!(
644 "Missing '}}' for unclosed '{{' at position {}",
647 if closer - self.chari + 1 == 0 {
648 return self.err("No Unicode class name found.")
650 name = self.slice(self.chari + 1, closer);
653 if self.chari + 1 >= self.chars.len() {
654 return self.err("No single letter Unicode class name found.")
656 name = self.slice(self.chari + 1, self.chari + 2);
659 match find_class(UNICODE_CLASSES, name[]) {
661 return self.err(format!("Could not find Unicode class '{}'",
665 Ok(AstClass(ranges, negated | (self.flags & FLAG_NOCASE)))
670 // Parses an octal number, up to 3 digits.
671 // Assumes that \n has been read, where n is the first digit.
672 fn parse_octal(&mut self) -> Result<Ast, Error> {
673 let start = self.chari;
674 let mut end = start + 1;
675 let (d2, d3) = (self.peek(1), self.peek(2));
676 if d2 >= Some('0') && d2 <= Some('7') {
677 try!(self.noteof("expected octal character in [0-7]"));
679 if d3 >= Some('0') && d3 <= Some('7') {
680 try!(self.noteof("expected octal character in [0-7]"));
684 let s = self.slice(start, end);
685 match num::from_str_radix::<u32>(s[], 8) {
686 Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
688 self.err(format!("Could not parse '{}' as octal number.",
694 // Parse a hex number. Either exactly two digits or anything in {}.
695 // Assumes that \x has been read.
696 fn parse_hex(&mut self) -> Result<Ast, Error> {
697 if !self.peek_is(1, '{') {
698 try!(self.expect('{'));
699 return self.parse_hex_two()
701 let start = self.chari + 2;
703 match self.pos('}') {
705 return self.err(format!("Missing '}}' for unclosed \
706 '{{' at position {}",
712 self.parse_hex_digits(self.slice(start, closer)[])
715 // Parses a two-digit hex number.
716 // Assumes that \xn has been read, where n is the first digit and is the
717 // current character.
718 // After return, parser will point at the second digit.
719 fn parse_hex_two(&mut self) -> Result<Ast, Error> {
720 let (start, end) = (self.chari, self.chari + 2);
721 let bad = self.slice(start - 2, self.chars.len());
722 try!(self.noteof(format!("Invalid hex escape sequence '{}'",
724 self.parse_hex_digits(self.slice(start, end).as_slice())
727 // Parses `s` as a hexadecimal number.
728 fn parse_hex_digits(&self, s: &str) -> Result<Ast, Error> {
729 match num::from_str_radix::<u32>(s, 16) {
730 Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
732 self.err(format!("Could not parse '{}' as hex number.", s)[])
737 // Parses a named capture.
738 // Assumes that '(?P<' has been consumed and that the current character
740 // When done, parser will be at the closing '>' character.
741 fn parse_named_capture(&mut self) -> Result<(), Error> {
742 try!(self.noteof("a capture name"));
744 match self.pos('>') {
746 None => return self.err("Capture name must end with '>'."),
748 if closer - self.chari == 0 {
749 return self.err("Capture names must have at least 1 character.")
751 let name = self.slice(self.chari, closer);
752 if !name.chars().all(is_valid_cap) {
754 "Capture names can only have underscores, letters and digits.")
756 if self.names.contains(&name) {
757 return self.err(format!("Duplicate capture group name '{}'.",
760 self.names.push(name.clone());
763 self.stack.push(Paren(self.flags, self.caps, name));
767 // Parses non-capture groups and options.
768 // Assumes that '(?' has already been consumed and '?' is the current
770 fn parse_group_opts(&mut self) -> Result<(), Error> {
771 if self.peek_is(1, 'P') && self.peek_is(2, '<') {
772 try!(self.expect('P'));
773 try!(self.expect('<'));
774 return self.parse_named_capture()
776 let start = self.chari;
777 let mut flags = self.flags;
779 let mut saw_flag = false;
782 "expected non-empty set of flags or closing ')'"));
784 'i' => { flags = flags | FLAG_NOCASE; saw_flag = true},
785 'm' => { flags = flags | FLAG_MULTI; saw_flag = true},
786 's' => { flags = flags | FLAG_DOTNL; saw_flag = true},
787 'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true},
790 return self.err(format!(
791 "Cannot negate flags twice in '{}'.",
792 self.slice(start, self.chari + 1))[])
796 flags = flags ^ flags;
801 return self.err(format!(
802 "A valid flag does not follow negation in '{}'",
803 self.slice(start, self.chari + 1))[])
805 flags = flags ^ flags;
807 if self.cur() == ':' {
808 // Save the old flags with the opening paren.
809 self.stack.push(Paren(self.flags, 0, "".to_string()));
814 _ => return self.err(format!(
815 "Unrecognized flag '{}'.", self.cur())[]),
820 // Peeks at the next character and returns whether it's ungreedy or not.
821 // If it is, then the next character is consumed.
822 fn get_next_greedy(&mut self) -> Result<Greed, Error> {
823 Ok(if self.peek_is(1, '?') {
824 try!(self.expect('?'));
828 }.swap(self.flags & FLAG_SWAP_GREED > 0))
831 // Searches the stack (starting at the top) until it finds an expression
832 // for which `pred` returns true. The index of that expression in the
833 // stack is returned.
834 // If there's no match, then one of two things happens depending on the
835 // values of `allow_start`. When it's true, then `0` will be returned.
836 // Otherwise, an error will be returned.
837 // Generally, `allow_start` is only true when you're *not* expecting an
838 // opening parenthesis.
839 fn pos_last<P>(&self, allow_start: bool, pred: P) -> Result<uint, Error> where
840 P: FnMut(&BuildAst) -> bool,
842 let from = match self.stack.iter().rev().position(pred) {
848 return self.err("No matching opening parenthesis.")
852 // Adjust index since 'from' is for the reversed stack.
853 // Also, don't include the '(' or '|'.
854 Ok(self.stack.len() - from)
857 // concat starts at `from` in the parser's stack and concatenates all
858 // expressions up to the top of the stack. The resulting concatenation is
859 // then pushed on to the stack.
860 // Usually `from` corresponds to the position of an opening parenthesis,
861 // a '|' (alternation) or the start of the entire expression.
862 fn concat(&mut self, from: uint) -> Result<(), Error> {
863 let ast = try!(self.build_from(from, concat_flatten));
868 // concat starts at `from` in the parser's stack and alternates all
869 // expressions up to the top of the stack. The resulting alternation is
870 // then pushed on to the stack.
871 // Usually `from` corresponds to the position of an opening parenthesis
872 // or the start of the entire expression.
873 // This will also drop any opening parens or alternation bars found in
874 // the intermediate AST.
875 fn alternate(&mut self, mut from: uint) -> Result<(), Error> {
876 // Unlike in the concatenation case, we want 'build_from' to continue
877 // all the way to the opening left paren (so it will be popped off and
878 // thrown away). But be careful with overflow---we can't count on the
879 // open paren to be there.
880 if from > 0 { from = from - 1}
881 let ast = try!(self.build_from(from, |l,r| Alt(box l, box r)));
886 // build_from combines all AST elements starting at 'from' in the
887 // parser's stack using 'mk' to combine them. If any such element is not an
888 // AST then it is popped off the stack and ignored.
889 fn build_from<F>(&mut self, from: uint, mut mk: F) -> Result<Ast, Error> where
890 F: FnMut(Ast, Ast) -> Ast,
892 if from >= self.stack.len() {
893 return self.err("Empty group or alternate not allowed.")
896 let mut combined = try!(self.pop_ast());
897 let mut i = self.stack.len();
900 match self.stack.pop().unwrap() {
901 Expr(x) => combined = mk(x, combined),
908 fn parse_uint(&self, s: &str) -> Result<uint, Error> {
909 match s.parse::<uint>() {
912 self.err(format!("Expected an unsigned integer but got '{}'.",
918 fn char_from_u32(&self, n: u32) -> Result<char, Error> {
919 match char::from_u32(n) {
922 self.err(format!("Could not decode '{}' to unicode \
928 fn pos(&self, c: char) -> Option<uint> {
930 .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i)
933 fn err<T>(&self, msg: &str) -> Result<T, Error> {
936 msg: msg.to_string(),
940 fn peek(&self, offset: uint) -> Option<char> {
941 if self.chari + offset >= self.chars.len() {
944 Some(self.chars[self.chari + offset])
947 fn peek_is(&self, offset: uint, is: char) -> bool {
948 self.peek(offset) == Some(is)
951 fn cur(&self) -> char {
952 self.chars[self.chari]
955 fn slice(&self, start: uint, end: uint) -> String {
956 self.chars[start..end].iter().cloned().collect()
960 // Given an unordered collection of character ranges, combine_ranges returns
961 // an ordered sequence of character ranges where no two ranges overlap. They
962 // are ordered from least to greatest (using start position).
963 fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> {
964 // Returns true iff the two character classes overlap or share a boundary.
965 // e.g., ('a', 'g') and ('h', 'm') would return true.
966 fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool {
967 cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1
970 // This is currently O(n^2), but I think with sufficient cleverness,
971 // it can be reduced to O(n) **if necessary**.
972 let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len());
973 for (us, ue) in unordered.into_iter() {
974 let (mut us, mut ue) = (us, ue);
976 let mut which: Option<uint> = None;
977 for (i, &(os, oe)) in ordered.iter().enumerate() {
978 if should_merge((us, ue), (os, oe)) {
979 us = cmp::min(us, os);
980 ue = cmp::max(ue, oe);
986 None => ordered.push((us, ue)),
987 Some(i) => ordered[i] = (us, ue),
994 // Constructs a Unicode friendly Perl character class from \d, \s or \w
995 // (or any of their negated forms). Note that this does not handle negation.
996 fn perl_unicode_class(which: char) -> Vec<(char, char)> {
997 match which.to_lowercase() {
998 'd' => PERLD.to_vec(),
999 's' => PERLS.to_vec(),
1000 'w' => PERLW.to_vec(),
1001 _ => unreachable!(),
1005 // Returns a concatenation of two expressions. This also guarantees that a
1006 // `Cat` expression will never be a direct child of another `Cat` expression.
1007 fn concat_flatten(x: Ast, y: Ast) -> Ast {
1009 (Cat(mut xs), Cat(ys)) => { xs.extend(ys.into_iter()); Cat(xs) }
1010 (Cat(mut xs), ast) => { xs.push(ast); Cat(xs) }
1011 (ast, Cat(mut xs)) => { xs.insert(0, ast); Cat(xs) }
1012 (ast1, ast2) => Cat(vec!(ast1, ast2)),
1016 pub fn is_punct(c: char) -> bool {
1018 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
1019 '[' | ']' | '{' | '}' | '^' | '$' => true,
1024 fn is_valid_cap(c: char) -> bool {
1025 c == '_' || (c >= '0' && c <= '9')
1026 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
1029 fn find_class(classes: NamedClasses, name: &str) -> Option<Vec<(char, char)>> {
1030 match classes.binary_search_by(|&(s, _)| s.cmp(name)) {
1031 Ok(i) => Some(classes[i].1.to_vec()),
1036 type Class = &'static [(char, char)];
1037 type NamedClasses = &'static [(&'static str, &'static Class)];
1039 static ASCII_CLASSES: NamedClasses = &[
1040 // Classes must be in alphabetical order so that bsearch works.
1041 // [:alnum:] alphanumeric (== [0-9A-Za-z])
1042 // [:alpha:] alphabetic (== [A-Za-z])
1043 // [:ascii:] ASCII (== [\x00-\x7F])
1044 // [:blank:] blank (== [\t ])
1045 // [:cntrl:] control (== [\x00-\x1F\x7F])
1046 // [:digit:] digits (== [0-9])
1047 // [:graph:] graphical (== [!-~])
1048 // [:lower:] lower case (== [a-z])
1049 // [:print:] printable (== [ -~] == [ [:graph:]])
1050 // [:punct:] punctuation (== [!-/:-@[-`{-~])
1051 // [:space:] whitespace (== [\t\n\v\f\r ])
1052 // [:upper:] upper case (== [A-Z])
1053 // [:word:] word characters (== [0-9A-Za-z_])
1054 // [:xdigit:] hex digit (== [0-9A-Fa-f])
1055 // Taken from: http://golang.org/pkg/regex/syntax/
1069 ("xdigit", &XDIGIT),
1072 static ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')];
1073 static ALPHA: Class = &[('A', 'Z'), ('a', 'z')];
1074 static ASCII: Class = &[('\x00', '\x7F')];
1075 static BLANK: Class = &[(' ', ' '), ('\t', '\t')];
1076 static CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')];
1077 static DIGIT: Class = &[('0', '9')];
1078 static GRAPH: Class = &[('!', '~')];
1079 static LOWER: Class = &[('a', 'z')];
1080 static PRINT: Class = &[(' ', '~')];
1081 static PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')];
1082 static SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'),
1083 ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')];
1084 static UPPER: Class = &[('A', 'Z')];
1085 static WORD: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')];
1086 static XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')];