1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
18 /// Static data containing Unicode ranges for general categories and scripts.
19 use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
20 #[allow(visible_private_types)]
23 /// The maximum number of repetitions allowed with the `{n,m}` syntax.
24 static MAX_REPEAT: uint = 1000;
26 /// Error corresponds to something that can go wrong while parsing
27 /// a regular expression.
29 /// (Once an expression is compiled, it is not possible to produce an error
30 /// via searching, splitting or replacing.)
32 /// The *approximate* character index of where the error occurred.
34 /// A message describing the error.
38 impl fmt::Show for Error {
39 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
40 write!(f, "Regex syntax error near position {}: {}",
45 /// Represents the abstract syntax of a regular expression.
46 /// It is showable so that error messages resulting from a bug can provide
47 /// useful information.
48 /// It is cloneable so that expressions can be repeated for the counted
49 /// repetition feature. (No other copying is done.)
51 /// Note that this representation prevents one from reproducing the regex as
52 /// it was typed. (But it could be used to reproduce an equivalent regex.)
53 #[deriving(Show, Clone)]
58 Class(Vec<(char, char)>, Flags),
62 Capture(uint, Option<String>, Box<Ast>),
63 // Represent concatenation as a flat vector to avoid blowing the
64 // stack in the compiler.
66 Alt(Box<Ast>, Box<Ast>),
67 Rep(Box<Ast>, Repeater, Greed),
70 #[deriving(Show, PartialEq, Clone)]
77 #[deriving(Show, Clone)]
84 pub fn is_greedy(&self) -> bool {
91 fn swap(self, swapped: bool) -> Greed {
92 if !swapped { return self }
100 /// BuildAst is a regrettable type that represents intermediate state for
101 /// constructing an abstract syntax tree. Its central purpose is to facilitate
102 /// parsing groups and alternations while also maintaining a stack of flag
107 Paren(Flags, uint, String), // '('
112 fn paren(&self) -> bool {
114 Paren(_, _, _) => true,
119 fn flags(&self) -> Flags {
121 Paren(flags, _, _) => flags,
122 _ => fail!("Cannot get flags from {}", self),
126 fn capture(&self) -> Option<uint> {
128 Paren(_, 0, _) => None,
129 Paren(_, c, _) => Some(c),
130 _ => fail!("Cannot get capture group from {}", self),
134 fn capture_name(&self) -> Option<String> {
136 Paren(_, 0, _) => None,
137 Paren(_, _, ref name) => {
144 _ => fail!("Cannot get capture name from {}", self),
148 fn bar(&self) -> bool {
155 fn unwrap(self) -> Result<Ast, Error> {
158 _ => fail!("Tried to unwrap non-AST item: {}", self),
163 /// Flags represents all options that can be twiddled by a user in an
167 pub static FLAG_EMPTY: u8 = 0;
168 pub static FLAG_NOCASE: u8 = 1 << 0; // i
169 pub static FLAG_MULTI: u8 = 1 << 1; // m
170 pub static FLAG_DOTNL: u8 = 1 << 2; // s
171 pub static FLAG_SWAP_GREED: u8 = 1 << 3; // U
172 pub static FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary
175 // The input, parsed only as a sequence of UTF8 code points.
177 // The index of the current character in the input.
179 // The intermediate state representing the AST.
180 stack: Vec<BuildAst>,
181 // The current set of flags.
183 // The total number of capture groups.
184 // Incremented each time an opening left paren is seen (assuming it is
185 // opening a capture group).
187 // A set of all capture group names used only to detect duplicates.
191 pub fn parse(s: &str) -> Result<Ast, Error> {
193 chars: s.chars().collect(),
202 impl<'a> Parser<'a> {
203 fn parse(&mut self) -> Result<Ast, Error> {
204 if self.chars.len() == 0 {
210 '?' | '*' | '+' => try!(self.push_repeater(c)),
212 let ast = try!(self.parse_escape());
215 '{' => try!(self.parse_counted()),
216 '[' => match self.try_parse_ascii() {
217 None => try!(self.parse_class()),
218 Some(class) => self.push(class),
221 if self.peek_is(1, '?') {
222 try!(self.expect('?'))
223 try!(self.parse_group_opts())
226 self.stack.push(Paren(self.flags,
233 self.pos_last(false, |x| x.paren() || x.bar()));
234 try!(self.concat(catfrom));
236 let altfrom = try!(self.pos_last(false, |x| x.paren()));
237 // Before we smush the alternates together and pop off the
238 // left paren, let's grab the old flags and see if we
240 let (cap, cap_name, oldflags) = {
241 let paren = self.stack.get(altfrom-1);
242 (paren.capture(), paren.capture_name(), paren.flags())
244 try!(self.alternate(altfrom));
245 self.flags = oldflags;
247 // If this was a capture, pop what we just pushed in
248 // alternate and make it a capture.
250 let ast = try!(self.pop_ast());
251 self.push(Capture(cap.unwrap(), cap_name, box ast));
256 self.pos_last(true, |x| x.paren() || x.bar()));
257 try!(self.concat(catfrom));
259 self.stack.push(Bar);
261 _ => try!(self.push_literal(c)),
263 if !self.next_char() {
268 // Try to improve error handling. At this point, there should be
269 // no remaining open parens.
270 if self.stack.iter().any(|x| x.paren()) {
271 return self.err("Unclosed parenthesis.")
273 let catfrom = try!(self.pos_last(true, |x| x.bar()));
274 try!(self.concat(catfrom));
275 try!(self.alternate(0));
277 assert!(self.stack.len() == 1);
281 fn noteof(&mut self, expected: &str) -> Result<(), Error> {
282 match self.next_char() {
285 self.err(format!("Expected {} but got EOF.",
286 expected).as_slice())
291 fn expect(&mut self, expected: char) -> Result<(), Error> {
292 match self.next_char() {
293 true if self.cur() == expected => Ok(()),
294 true => self.err(format!("Expected '{}' but got '{}'.",
295 expected, self.cur()).as_slice()),
297 self.err(format!("Expected '{}' but got EOF.",
298 expected).as_slice())
303 fn next_char(&mut self) -> bool {
305 self.chari < self.chars.len()
308 fn pop_ast(&mut self) -> Result<Ast, Error> {
309 match self.stack.pop().unwrap().unwrap() {
315 fn push(&mut self, ast: Ast) {
316 self.stack.push(Ast(ast))
319 fn push_repeater(&mut self, c: char) -> Result<(), Error> {
320 if self.stack.len() == 0 {
322 "A repeat operator must be preceded by a valid expression.")
324 let rep: Repeater = match c {
325 '?' => ZeroOne, '*' => ZeroMore, '+' => OneMore,
326 _ => fail!("Not a valid repeater operator."),
330 Some('*') | Some('+') =>
332 "Double repeat operators are not supported."),
335 let ast = try!(self.pop_ast());
337 Begin(_) | End(_) | WordBoundary(_) =>
339 "Repeat arguments cannot be empty width assertions."),
342 let greed = try!(self.get_next_greedy());
343 self.push(Rep(box ast, rep, greed));
347 fn push_literal(&mut self, c: char) -> Result<(), Error> {
348 let flags = self.flags;
351 self.push(Dot(flags))
354 self.push(Begin(flags))
357 self.push(End(flags))
360 self.push(Literal(c, flags))
366 // Parses all forms of character classes.
367 // Assumes that '[' is the current character.
368 fn parse_class(&mut self) -> Result<(), Error> {
370 if self.peek_is(1, '^') {
371 try!(self.expect('^'))
376 let mut ranges: Vec<(char, char)> = vec!();
377 let mut alts: Vec<Ast> = vec!();
379 if self.peek_is(1, ']') {
380 try!(self.expect(']'))
381 ranges.push((']', ']'))
383 while self.peek_is(1, '-') {
384 try!(self.expect('-'))
385 ranges.push(('-', '-'))
388 try!(self.noteof("a closing ']' or a non-empty character class)"))
389 let mut c = self.cur();
392 match self.try_parse_ascii() {
393 Some(Class(asciis, flags)) => {
394 alts.push(Class(asciis, flags ^ negated));
398 fail!("Expected Class AST but got '{}'", ast),
399 // Just drop down and try to add as a regular character.
403 match try!(self.parse_escape()) {
404 Class(asciis, flags) => {
405 alts.push(Class(asciis, flags ^ negated));
408 Literal(c2, _) => c = c2, // process below
409 Begin(_) | End(_) | WordBoundary(_) =>
411 "\\A, \\z, \\b and \\B are not valid escape \
412 sequences inside a character class."),
413 ast => fail!("Unexpected AST item '{}'", ast),
420 if ranges.len() > 0 {
421 let flags = negated | (self.flags & FLAG_NOCASE);
422 let mut ast = Class(combine_ranges(ranges), flags);
423 for alt in alts.move_iter() {
424 ast = Alt(box alt, box ast)
427 } else if alts.len() > 0 {
428 let mut ast = alts.pop().unwrap();
429 for alt in alts.move_iter() {
430 ast = Alt(box alt, box ast)
437 if self.peek_is(1, '-') && !self.peek_is(2, ']') {
438 try!(self.expect('-'))
439 try!(self.noteof("not a ']'"))
442 return self.err(format!("Invalid character class \
447 ranges.push((c, self.cur()))
456 // Tries to parse an ASCII character class of the form [:name:].
457 // If successful, returns an AST character class corresponding to name
458 // and moves the parser to the final ']' character.
459 // If unsuccessful, no state is changed and None is returned.
460 // Assumes that '[' is the current character.
461 fn try_parse_ascii(&mut self) -> Option<Ast> {
462 if !self.peek_is(1, ':') {
466 match self.pos(']') {
470 if *self.chars.get(closer-1) != ':' {
473 if closer - self.chari <= 3 {
476 let mut name_start = self.chari + 2;
478 if self.peek_is(2, '^') {
484 let name = self.slice(name_start, closer - 1);
485 match find_class(ASCII_CLASSES, name.as_slice()) {
489 let flags = negated | (self.flags & FLAG_NOCASE);
490 Some(Class(combine_ranges(ranges), flags))
495 // Parses counted repetition. Supports:
496 // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}?
497 // Assumes that '{' is the current character.
498 // Returns either an error or moves the parser to the final '}' character.
499 // (Or the '?' character if not greedy.)
500 fn parse_counted(&mut self) -> Result<(), Error> {
501 // Scan until the closing '}' and grab the stuff in {}.
502 let start = self.chari;
504 match self.pos('}') {
507 return self.err(format!("No closing brace for counted \
508 repetition starting at position \
514 let greed = try!(self.get_next_greedy());
515 let inner = str::from_chars(
516 self.chars.as_slice().slice(start + 1, closer));
518 // Parse the min and max values from the regex.
519 let (mut min, mut max): (uint, Option<uint>);
520 if !inner.as_slice().contains(",") {
521 min = try!(self.parse_uint(inner.as_slice()));
524 let pieces: Vec<&str> = inner.as_slice().splitn(',', 1).collect();
525 let (smin, smax) = (*pieces.get(0), *pieces.get(1));
527 return self.err("Max repetitions cannot be specified \
528 without min repetitions.")
530 min = try!(self.parse_uint(smin));
535 Some(try!(self.parse_uint(smax)))
539 // Do some bounds checking and make sure max >= min.
540 if min > MAX_REPEAT {
541 return self.err(format!(
542 "{} exceeds maximum allowed repetitions ({})",
543 min, MAX_REPEAT).as_slice());
546 let m = max.unwrap();
548 return self.err(format!(
549 "{} exceeds maximum allowed repetitions ({})",
550 m, MAX_REPEAT).as_slice());
553 return self.err(format!(
554 "Max repetitions ({}) cannot be smaller than min \
555 repetitions ({}).", m, min).as_slice());
559 // Now manipulate the AST be repeating elements.
561 // Require N copies of what's on the stack and then repeat it.
562 let ast = try!(self.pop_ast());
563 for _ in iter::range(0, min) {
564 self.push(ast.clone())
566 self.push(Rep(box ast, ZeroMore, greed));
568 // Require N copies of what's on the stack and then repeat it
569 // up to M times optionally.
570 let ast = try!(self.pop_ast());
571 for _ in iter::range(0, min) {
572 self.push(ast.clone())
575 for _ in iter::range(min, max.unwrap()) {
576 self.push(Rep(box ast.clone(), ZeroOne, greed))
579 // It's possible that we popped something off the stack but
580 // never put anything back on it. To keep things simple, add
581 // a no-op expression.
582 if min == 0 && (max.is_none() || max == Some(0)) {
589 // Parses all escape sequences.
590 // Assumes that '\' is the current character.
591 fn parse_escape(&mut self) -> Result<Ast, Error> {
592 try!(self.noteof("an escape sequence following a '\\'"))
596 return Ok(Literal(c, FLAG_EMPTY))
599 'a' => Ok(Literal('\x07', FLAG_EMPTY)),
600 'f' => Ok(Literal('\x0C', FLAG_EMPTY)),
601 't' => Ok(Literal('\t', FLAG_EMPTY)),
602 'n' => Ok(Literal('\n', FLAG_EMPTY)),
603 'r' => Ok(Literal('\r', FLAG_EMPTY)),
604 'v' => Ok(Literal('\x0B', FLAG_EMPTY)),
605 'A' => Ok(Begin(FLAG_EMPTY)),
606 'z' => Ok(End(FLAG_EMPTY)),
607 'b' => Ok(WordBoundary(FLAG_EMPTY)),
608 'B' => Ok(WordBoundary(FLAG_NEGATED)),
609 '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())),
610 'x' => Ok(try!(self.parse_hex())),
611 'p' | 'P' => Ok(try!(self.parse_unicode_name())),
612 'd' | 'D' | 's' | 'S' | 'w' | 'W' => {
613 let ranges = perl_unicode_class(c);
614 let mut flags = self.flags & FLAG_NOCASE;
615 if c.is_uppercase() { flags |= FLAG_NEGATED }
616 Ok(Class(ranges, flags))
619 self.err(format!("Invalid escape sequence '\\\\{}'",
625 // Parses a unicode character class name, either of the form \pF where
626 // F is a one letter unicode class name or of the form \p{name} where
627 // name is the unicode class name.
628 // Assumes that \p or \P has been read (and 'p' or 'P' is the current
630 fn parse_unicode_name(&mut self) -> Result<Ast, Error> {
631 let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY };
632 let mut name: String;
633 if self.peek_is(1, '{') {
634 try!(self.expect('{'))
636 match self.pos('}') {
639 None => return self.err(format!(
640 "Missing '\\}' for unclosed '\\{' at position {}",
641 self.chari).as_slice()),
643 None => return self.err(format!(
644 "Missing '}}' for unclosed '{{' at position {}",
645 self.chari).as_slice()),
647 if closer - self.chari + 1 == 0 {
648 return self.err("No Unicode class name found.")
650 name = self.slice(self.chari + 1, closer);
653 if self.chari + 1 >= self.chars.len() {
654 return self.err("No single letter Unicode class name found.")
656 name = self.slice(self.chari + 1, self.chari + 2);
659 match find_class(UNICODE_CLASSES, name.as_slice()) {
661 return self.err(format!("Could not find Unicode class '{}'",
665 Ok(Class(ranges, negated | (self.flags & FLAG_NOCASE)))
670 // Parses an octal number, up to 3 digits.
671 // Assumes that \n has been read, where n is the first digit.
672 fn parse_octal(&mut self) -> Result<Ast, Error> {
673 let start = self.chari;
674 let mut end = start + 1;
675 let (d2, d3) = (self.peek(1), self.peek(2));
676 if d2 >= Some('0') && d2 <= Some('7') {
677 try!(self.noteof("expected octal character in [0-7]"))
679 if d3 >= Some('0') && d3 <= Some('7') {
680 try!(self.noteof("expected octal character in [0-7]"))
684 let s = self.slice(start, end);
685 match num::from_str_radix::<u32>(s.as_slice(), 8) {
686 Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
688 self.err(format!("Could not parse '{}' as octal number.",
694 // Parse a hex number. Either exactly two digits or anything in {}.
695 // Assumes that \x has been read.
696 fn parse_hex(&mut self) -> Result<Ast, Error> {
697 if !self.peek_is(1, '{') {
698 try!(self.expect('{'))
699 return self.parse_hex_two()
701 let start = self.chari + 2;
703 match self.pos('}') {
706 return self.err(format!("Missing '\\}' for unclosed \
707 '\\{' at position {}",
712 return self.err(format!("Missing '}}' for unclosed \
713 '{{' at position {}",
719 self.parse_hex_digits(self.slice(start, closer).as_slice())
722 // Parses a two-digit hex number.
723 // Assumes that \xn has been read, where n is the first digit and is the
724 // current character.
725 // After return, parser will point at the second digit.
726 fn parse_hex_two(&mut self) -> Result<Ast, Error> {
727 let (start, end) = (self.chari, self.chari + 2);
728 let bad = self.slice(start - 2, self.chars.len());
729 try!(self.noteof(format!("Invalid hex escape sequence '{}'",
731 self.parse_hex_digits(self.slice(start, end).as_slice())
734 // Parses `s` as a hexadecimal number.
735 fn parse_hex_digits(&self, s: &str) -> Result<Ast, Error> {
736 match num::from_str_radix::<u32>(s, 16) {
737 Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
739 self.err(format!("Could not parse '{}' as hex number.",
745 // Parses a named capture.
746 // Assumes that '(?P<' has been consumed and that the current character
748 // When done, parser will be at the closing '>' character.
749 fn parse_named_capture(&mut self) -> Result<(), Error> {
750 try!(self.noteof("a capture name"))
752 match self.pos('>') {
754 None => return self.err("Capture name must end with '>'."),
756 if closer - self.chari == 0 {
757 return self.err("Capture names must have at least 1 character.")
759 let name = self.slice(self.chari, closer);
760 if !name.as_slice().chars().all(is_valid_cap) {
762 "Capture names can only have underscores, letters and digits.")
764 if self.names.contains(&name) {
765 return self.err(format!("Duplicate capture group name '{}'.",
768 self.names.push(name.clone());
771 self.stack.push(Paren(self.flags, self.caps, name));
775 // Parses non-capture groups and options.
776 // Assumes that '(?' has already been consumed and '?' is the current
778 fn parse_group_opts(&mut self) -> Result<(), Error> {
779 if self.peek_is(1, 'P') && self.peek_is(2, '<') {
780 try!(self.expect('P')) try!(self.expect('<'))
781 return self.parse_named_capture()
783 let start = self.chari;
784 let mut flags = self.flags;
786 let mut saw_flag = false;
788 try!(self.noteof("expected non-empty set of flags or closing ')'"))
790 'i' => { flags = flags | FLAG_NOCASE; saw_flag = true},
791 'm' => { flags = flags | FLAG_MULTI; saw_flag = true},
792 's' => { flags = flags | FLAG_DOTNL; saw_flag = true},
793 'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true},
796 return self.err(format!(
797 "Cannot negate flags twice in '{}'.",
798 self.slice(start, self.chari + 1)).as_slice())
802 flags = flags ^ flags;
807 return self.err(format!(
808 "A valid flag does not follow negation in '{}'",
809 self.slice(start, self.chari + 1)).as_slice())
811 flags = flags ^ flags;
813 if self.cur() == ':' {
814 // Save the old flags with the opening paren.
815 self.stack.push(Paren(self.flags, 0, "".to_string()));
820 _ => return self.err(format!(
821 "Unrecognized flag '{}'.", self.cur()).as_slice()),
826 // Peeks at the next character and returns whether it's ungreedy or not.
827 // If it is, then the next character is consumed.
828 fn get_next_greedy(&mut self) -> Result<Greed, Error> {
829 Ok(if self.peek_is(1, '?') {
830 try!(self.expect('?'))
834 }.swap(self.flags & FLAG_SWAP_GREED > 0))
837 // Searches the stack (starting at the top) until it finds an expression
838 // for which `pred` returns true. The index of that expression in the
839 // stack is returned.
840 // If there's no match, then one of two things happens depending on the
841 // values of `allow_start`. When it's true, then `0` will be returned.
842 // Otherwise, an error will be returned.
843 // Generally, `allow_start` is only true when you're *not* expecting an
844 // opening parenthesis.
845 fn pos_last(&self, allow_start: bool, pred: |&BuildAst| -> bool)
846 -> Result<uint, Error> {
847 let from = match self.stack.iter().rev().position(pred) {
853 return self.err("No matching opening parenthesis.")
857 // Adjust index since 'from' is for the reversed stack.
858 // Also, don't include the '(' or '|'.
859 Ok(self.stack.len() - from)
862 // concat starts at `from` in the parser's stack and concatenates all
863 // expressions up to the top of the stack. The resulting concatenation is
864 // then pushed on to the stack.
865 // Usually `from` corresponds to the position of an opening parenthesis,
866 // a '|' (alternation) or the start of the entire expression.
867 fn concat(&mut self, from: uint) -> Result<(), Error> {
868 let ast = try!(self.build_from(from, concat_flatten));
873 // concat starts at `from` in the parser's stack and alternates all
874 // expressions up to the top of the stack. The resulting alternation is
875 // then pushed on to the stack.
876 // Usually `from` corresponds to the position of an opening parenthesis
877 // or the start of the entire expression.
878 // This will also drop any opening parens or alternation bars found in
879 // the intermediate AST.
880 fn alternate(&mut self, mut from: uint) -> Result<(), Error> {
881 // Unlike in the concatenation case, we want 'build_from' to continue
882 // all the way to the opening left paren (so it will be popped off and
883 // thrown away). But be careful with overflow---we can't count on the
884 // open paren to be there.
885 if from > 0 { from = from - 1}
886 let ast = try!(self.build_from(from, |l,r| Alt(box l, box r)));
891 // build_from combines all AST elements starting at 'from' in the
892 // parser's stack using 'mk' to combine them. If any such element is not an
893 // AST then it is popped off the stack and ignored.
894 fn build_from(&mut self, from: uint, mk: |Ast, Ast| -> Ast)
895 -> Result<Ast, Error> {
896 if from >= self.stack.len() {
897 return self.err("Empty group or alternate not allowed.")
900 let mut combined = try!(self.pop_ast());
901 let mut i = self.stack.len();
904 match self.stack.pop().unwrap() {
905 Ast(x) => combined = mk(x, combined),
912 fn parse_uint(&self, s: &str) -> Result<uint, Error> {
913 match from_str::<uint>(s) {
916 self.err(format!("Expected an unsigned integer but got '{}'.",
922 fn char_from_u32(&self, n: u32) -> Result<char, Error> {
923 match char::from_u32(n) {
926 self.err(format!("Could not decode '{}' to unicode \
933 fn pos(&self, c: char) -> Option<uint> {
935 .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i)
938 fn err<T>(&self, msg: &str) -> Result<T, Error> {
941 msg: msg.to_string(),
945 fn peek(&self, offset: uint) -> Option<char> {
946 if self.chari + offset >= self.chars.len() {
949 Some(*self.chars.get(self.chari + offset))
952 fn peek_is(&self, offset: uint, is: char) -> bool {
953 self.peek(offset) == Some(is)
956 fn cur(&self) -> char {
957 *self.chars.get(self.chari)
960 fn slice(&self, start: uint, end: uint) -> String {
961 str::from_chars(self.chars.as_slice().slice(start, end)).to_string()
965 // Given an unordered collection of character ranges, combine_ranges returns
966 // an ordered sequence of character ranges where no two ranges overlap. They
967 // are ordered from least to greatest (using start position).
968 fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> {
969 // Returns true iff the two character classes overlap or share a boundary.
970 // e.g., ('a', 'g') and ('h', 'm') would return true.
971 fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool {
972 cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1
975 // This is currently O(n^2), but I think with sufficient cleverness,
976 // it can be reduced to O(n) **if necessary**.
977 let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len());
978 for (us, ue) in unordered.move_iter() {
979 let (mut us, mut ue) = (us, ue);
981 let mut which: Option<uint> = None;
982 for (i, &(os, oe)) in ordered.iter().enumerate() {
983 if should_merge((us, ue), (os, oe)) {
984 us = cmp::min(us, os);
985 ue = cmp::max(ue, oe);
991 None => ordered.push((us, ue)),
992 Some(i) => *ordered.get_mut(i) = (us, ue),
999 // Constructs a Unicode friendly Perl character class from \d, \s or \w
1000 // (or any of their negated forms). Note that this does not handle negation.
1001 fn perl_unicode_class(which: char) -> Vec<(char, char)> {
1002 match which.to_lowercase() {
1003 'd' => Vec::from_slice(PERLD),
1004 's' => Vec::from_slice(PERLS),
1005 'w' => Vec::from_slice(PERLW),
1006 _ => unreachable!(),
1010 // Returns a concatenation of two expressions. This also guarantees that a
1011 // `Cat` expression will never be a direct child of another `Cat` expression.
1012 fn concat_flatten(x: Ast, y: Ast) -> Ast {
1014 (Cat(mut xs), Cat(ys)) => { xs.push_all_move(ys); Cat(xs) }
1015 (Cat(mut xs), ast) => { xs.push(ast); Cat(xs) }
1016 (ast, Cat(mut xs)) => { xs.unshift(ast); Cat(xs) }
1017 (ast1, ast2) => Cat(vec!(ast1, ast2)),
1021 pub fn is_punct(c: char) -> bool {
1023 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
1024 '[' | ']' | '{' | '}' | '^' | '$' => true,
1029 fn is_valid_cap(c: char) -> bool {
1030 c == '_' || (c >= '0' && c <= '9')
1031 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
1034 fn find_class(classes: NamedClasses, name: &str) -> Option<Vec<(char, char)>> {
1035 match classes.bsearch(|&(s, _)| s.cmp(&name)) {
1036 Some(i) => Some(Vec::from_slice(classes[i].val1())),
1041 type Class = &'static [(char, char)];
1042 type NamedClasses = &'static [(&'static str, Class)];
1044 static ASCII_CLASSES: NamedClasses = &[
1045 // Classes must be in alphabetical order so that bsearch works.
1046 // [:alnum:] alphanumeric (== [0-9A-Za-z])
1047 // [:alpha:] alphabetic (== [A-Za-z])
1048 // [:ascii:] ASCII (== [\x00-\x7F])
1049 // [:blank:] blank (== [\t ])
1050 // [:cntrl:] control (== [\x00-\x1F\x7F])
1051 // [:digit:] digits (== [0-9])
1052 // [:graph:] graphical (== [!-~])
1053 // [:lower:] lower case (== [a-z])
1054 // [:print:] printable (== [ -~] == [ [:graph:]])
1055 // [:punct:] punctuation (== [!-/:-@[-`{-~])
1056 // [:space:] whitespace (== [\t\n\v\f\r ])
1057 // [:upper:] upper case (== [A-Z])
1058 // [:word:] word characters (== [0-9A-Za-z_])
1059 // [:xdigit:] hex digit (== [0-9A-Fa-f])
1060 // Taken from: http://golang.org/pkg/regex/syntax/
1061 ("alnum", &[('0', '9'), ('A', 'Z'), ('a', 'z')]),
1062 ("alpha", &[('A', 'Z'), ('a', 'z')]),
1063 ("ascii", &[('\x00', '\x7F')]),
1064 ("blank", &[(' ', ' '), ('\t', '\t')]),
1065 ("cntrl", &[('\x00', '\x1F'), ('\x7F', '\x7F')]),
1066 ("digit", &[('0', '9')]),
1067 ("graph", &[('!', '~')]),
1068 ("lower", &[('a', 'z')]),
1069 ("print", &[(' ', '~')]),
1070 ("punct", &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]),
1071 ("space", &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'),
1072 ('\r', '\r'), (' ', ' ')]),
1073 ("upper", &[('A', 'Z')]),
1074 ("word", &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]),
1075 ("xdigit", &[('0', '9'), ('A', 'F'), ('a', 'f')]),