1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
17 /// Static data containing Unicode ranges for general categories and scripts.
18 use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
20 /// The maximum number of repetitions allowed with the `{n,m}` syntax.
21 static MAX_REPEAT: uint = 1000;
23 /// Error corresponds to something that can go wrong while parsing
24 /// a regular expression.
26 /// (Once an expression is compiled, it is not possible to produce an error
27 /// via searching, splitting or replacing.)
29 /// The *approximate* character index of where the error occurred.
31 /// A message describing the error.
35 impl fmt::Show for Error {
36 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
37 write!(f, "Regex syntax error near position {}: {}",
42 /// Represents the abstract syntax of a regular expression.
43 /// It is showable so that error messages resulting from a bug can provide
44 /// useful information.
45 /// It is cloneable so that expressions can be repeated for the counted
46 /// repetition feature. (No other copying is done.)
48 /// Note that this representation prevents one from reproducing the regex as
49 /// it was typed. (But it could be used to reproduce an equivalent regex.)
50 #[deriving(Show, Clone)]
55 Class(Vec<(char, char)>, Flags),
59 Capture(uint, Option<String>, Box<Ast>),
60 // Represent concatenation as a flat vector to avoid blowing the
61 // stack in the compiler.
63 Alt(Box<Ast>, Box<Ast>),
64 Rep(Box<Ast>, Repeater, Greed),
67 #[deriving(Show, PartialEq, Clone)]
74 #[deriving(Show, Clone)]
81 pub fn is_greedy(&self) -> bool {
88 fn swap(self, swapped: bool) -> Greed {
89 if !swapped { return self }
97 /// BuildAst is a regrettable type that represents intermediate state for
98 /// constructing an abstract syntax tree. Its central purpose is to facilitate
99 /// parsing groups and alternations while also maintaining a stack of flag
104 Paren(Flags, uint, String), // '('
109 fn paren(&self) -> bool {
111 Paren(_, _, _) => true,
116 fn flags(&self) -> Flags {
118 Paren(flags, _, _) => flags,
119 _ => fail!("Cannot get flags from {}", self),
123 fn capture(&self) -> Option<uint> {
125 Paren(_, 0, _) => None,
126 Paren(_, c, _) => Some(c),
127 _ => fail!("Cannot get capture group from {}", self),
131 fn capture_name(&self) -> Option<String> {
133 Paren(_, 0, _) => None,
134 Paren(_, _, ref name) => {
141 _ => fail!("Cannot get capture name from {}", self),
145 fn bar(&self) -> bool {
152 fn unwrap(self) -> Result<Ast, Error> {
155 _ => fail!("Tried to unwrap non-AST item: {}", self),
160 /// Flags represents all options that can be twiddled by a user in an
164 pub static FLAG_EMPTY: u8 = 0;
165 pub static FLAG_NOCASE: u8 = 1 << 0; // i
166 pub static FLAG_MULTI: u8 = 1 << 1; // m
167 pub static FLAG_DOTNL: u8 = 1 << 2; // s
168 pub static FLAG_SWAP_GREED: u8 = 1 << 3; // U
169 pub static FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary
172 // The input, parsed only as a sequence of UTF8 code points.
174 // The index of the current character in the input.
176 // The intermediate state representing the AST.
177 stack: Vec<BuildAst>,
178 // The current set of flags.
180 // The total number of capture groups.
181 // Incremented each time an opening left paren is seen (assuming it is
182 // opening a capture group).
184 // A set of all capture group names used only to detect duplicates.
188 pub fn parse(s: &str) -> Result<Ast, Error> {
190 chars: s.chars().collect(),
199 impl<'a> Parser<'a> {
200 fn parse(&mut self) -> Result<Ast, Error> {
201 if self.chars.len() == 0 {
207 '?' | '*' | '+' => try!(self.push_repeater(c)),
209 let ast = try!(self.parse_escape());
212 '{' => try!(self.parse_counted()),
213 '[' => match self.try_parse_ascii() {
214 None => try!(self.parse_class()),
215 Some(class) => self.push(class),
218 if self.peek_is(1, '?') {
219 try!(self.expect('?'))
220 try!(self.parse_group_opts())
223 self.stack.push(Paren(self.flags,
230 self.pos_last(false, |x| x.paren() || x.bar()));
231 try!(self.concat(catfrom));
233 let altfrom = try!(self.pos_last(false, |x| x.paren()));
234 // Before we smush the alternates together and pop off the
235 // left paren, let's grab the old flags and see if we
237 let (cap, cap_name, oldflags) = {
238 let paren = self.stack.get(altfrom-1);
239 (paren.capture(), paren.capture_name(), paren.flags())
241 try!(self.alternate(altfrom));
242 self.flags = oldflags;
244 // If this was a capture, pop what we just pushed in
245 // alternate and make it a capture.
247 let ast = try!(self.pop_ast());
248 self.push(Capture(cap.unwrap(), cap_name, box ast));
253 self.pos_last(true, |x| x.paren() || x.bar()));
254 try!(self.concat(catfrom));
256 self.stack.push(Bar);
258 _ => try!(self.push_literal(c)),
260 if !self.next_char() {
265 // Try to improve error handling. At this point, there should be
266 // no remaining open parens.
267 if self.stack.iter().any(|x| x.paren()) {
268 return self.err("Unclosed parenthesis.")
270 let catfrom = try!(self.pos_last(true, |x| x.bar()));
271 try!(self.concat(catfrom));
272 try!(self.alternate(0));
274 assert!(self.stack.len() == 1);
278 fn noteof(&mut self, expected: &str) -> Result<(), Error> {
279 match self.next_char() {
282 self.err(format!("Expected {} but got EOF.",
283 expected).as_slice())
288 fn expect(&mut self, expected: char) -> Result<(), Error> {
289 match self.next_char() {
290 true if self.cur() == expected => Ok(()),
291 true => self.err(format!("Expected '{}' but got '{}'.",
292 expected, self.cur()).as_slice()),
294 self.err(format!("Expected '{}' but got EOF.",
295 expected).as_slice())
300 fn next_char(&mut self) -> bool {
302 self.chari < self.chars.len()
305 fn pop_ast(&mut self) -> Result<Ast, Error> {
306 match self.stack.pop().unwrap().unwrap() {
312 fn push(&mut self, ast: Ast) {
313 self.stack.push(Ast(ast))
316 fn push_repeater(&mut self, c: char) -> Result<(), Error> {
317 if self.stack.len() == 0 {
319 "A repeat operator must be preceded by a valid expression.")
321 let rep: Repeater = match c {
322 '?' => ZeroOne, '*' => ZeroMore, '+' => OneMore,
323 _ => fail!("Not a valid repeater operator."),
327 Some('*') | Some('+') =>
329 "Double repeat operators are not supported."),
332 let ast = try!(self.pop_ast());
334 Begin(_) | End(_) | WordBoundary(_) =>
336 "Repeat arguments cannot be empty width assertions."),
339 let greed = try!(self.get_next_greedy());
340 self.push(Rep(box ast, rep, greed));
344 fn push_literal(&mut self, c: char) -> Result<(), Error> {
345 let flags = self.flags;
348 self.push(Dot(flags))
351 self.push(Begin(flags))
354 self.push(End(flags))
357 self.push(Literal(c, flags))
363 // Parses all forms of character classes.
364 // Assumes that '[' is the current character.
365 fn parse_class(&mut self) -> Result<(), Error> {
367 if self.peek_is(1, '^') {
368 try!(self.expect('^'))
373 let mut ranges: Vec<(char, char)> = vec!();
374 let mut alts: Vec<Ast> = vec!();
376 if self.peek_is(1, ']') {
377 try!(self.expect(']'))
378 ranges.push((']', ']'))
380 while self.peek_is(1, '-') {
381 try!(self.expect('-'))
382 ranges.push(('-', '-'))
385 try!(self.noteof("a closing ']' or a non-empty character class)"))
386 let mut c = self.cur();
389 match self.try_parse_ascii() {
390 Some(Class(asciis, flags)) => {
391 alts.push(Class(asciis, flags ^ negated));
395 fail!("Expected Class AST but got '{}'", ast),
396 // Just drop down and try to add as a regular character.
400 match try!(self.parse_escape()) {
401 Class(asciis, flags) => {
402 alts.push(Class(asciis, flags ^ negated));
405 Literal(c2, _) => c = c2, // process below
406 Begin(_) | End(_) | WordBoundary(_) =>
408 "\\A, \\z, \\b and \\B are not valid escape \
409 sequences inside a character class."),
410 ast => fail!("Unexpected AST item '{}'", ast),
417 if ranges.len() > 0 {
418 let flags = negated | (self.flags & FLAG_NOCASE);
419 let mut ast = Class(combine_ranges(ranges), flags);
420 for alt in alts.move_iter() {
421 ast = Alt(box alt, box ast)
424 } else if alts.len() > 0 {
425 let mut ast = alts.pop().unwrap();
426 for alt in alts.move_iter() {
427 ast = Alt(box alt, box ast)
434 if self.peek_is(1, '-') && !self.peek_is(2, ']') {
435 try!(self.expect('-'))
436 try!(self.noteof("not a ']'"))
439 return self.err(format!("Invalid character class \
444 ranges.push((c, self.cur()))
453 // Tries to parse an ASCII character class of the form [:name:].
454 // If successful, returns an AST character class corresponding to name
455 // and moves the parser to the final ']' character.
456 // If unsuccessful, no state is changed and None is returned.
457 // Assumes that '[' is the current character.
458 fn try_parse_ascii(&mut self) -> Option<Ast> {
459 if !self.peek_is(1, ':') {
463 match self.pos(']') {
467 if *self.chars.get(closer-1) != ':' {
470 if closer - self.chari <= 3 {
473 let mut name_start = self.chari + 2;
475 if self.peek_is(2, '^') {
481 let name = self.slice(name_start, closer - 1);
482 match find_class(ASCII_CLASSES, name.as_slice()) {
486 let flags = negated | (self.flags & FLAG_NOCASE);
487 Some(Class(combine_ranges(ranges), flags))
492 // Parses counted repetition. Supports:
493 // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}?
494 // Assumes that '{' is the current character.
495 // Returns either an error or moves the parser to the final '}' character.
496 // (Or the '?' character if not greedy.)
497 fn parse_counted(&mut self) -> Result<(), Error> {
498 // Scan until the closing '}' and grab the stuff in {}.
499 let start = self.chari;
501 match self.pos('}') {
504 return self.err(format!("No closing brace for counted \
505 repetition starting at position \
511 let greed = try!(self.get_next_greedy());
512 let inner = String::from_chars(
513 self.chars.as_slice().slice(start + 1, closer));
515 // Parse the min and max values from the regex.
516 let (mut min, mut max): (uint, Option<uint>);
517 if !inner.as_slice().contains(",") {
518 min = try!(self.parse_uint(inner.as_slice()));
521 let pieces: Vec<&str> = inner.as_slice().splitn(',', 1).collect();
522 let (smin, smax) = (*pieces.get(0), *pieces.get(1));
524 return self.err("Max repetitions cannot be specified \
525 without min repetitions.")
527 min = try!(self.parse_uint(smin));
532 Some(try!(self.parse_uint(smax)))
536 // Do some bounds checking and make sure max >= min.
537 if min > MAX_REPEAT {
538 return self.err(format!(
539 "{} exceeds maximum allowed repetitions ({})",
540 min, MAX_REPEAT).as_slice());
543 let m = max.unwrap();
545 return self.err(format!(
546 "{} exceeds maximum allowed repetitions ({})",
547 m, MAX_REPEAT).as_slice());
550 return self.err(format!(
551 "Max repetitions ({}) cannot be smaller than min \
552 repetitions ({}).", m, min).as_slice());
556 // Now manipulate the AST be repeating elements.
558 // Require N copies of what's on the stack and then repeat it.
559 let ast = try!(self.pop_ast());
560 for _ in iter::range(0, min) {
561 self.push(ast.clone())
563 self.push(Rep(box ast, ZeroMore, greed));
565 // Require N copies of what's on the stack and then repeat it
566 // up to M times optionally.
567 let ast = try!(self.pop_ast());
568 for _ in iter::range(0, min) {
569 self.push(ast.clone())
572 for _ in iter::range(min, max.unwrap()) {
573 self.push(Rep(box ast.clone(), ZeroOne, greed))
576 // It's possible that we popped something off the stack but
577 // never put anything back on it. To keep things simple, add
578 // a no-op expression.
579 if min == 0 && (max.is_none() || max == Some(0)) {
586 // Parses all escape sequences.
587 // Assumes that '\' is the current character.
588 fn parse_escape(&mut self) -> Result<Ast, Error> {
589 try!(self.noteof("an escape sequence following a '\\'"))
593 return Ok(Literal(c, FLAG_EMPTY))
596 'a' => Ok(Literal('\x07', FLAG_EMPTY)),
597 'f' => Ok(Literal('\x0C', FLAG_EMPTY)),
598 't' => Ok(Literal('\t', FLAG_EMPTY)),
599 'n' => Ok(Literal('\n', FLAG_EMPTY)),
600 'r' => Ok(Literal('\r', FLAG_EMPTY)),
601 'v' => Ok(Literal('\x0B', FLAG_EMPTY)),
602 'A' => Ok(Begin(FLAG_EMPTY)),
603 'z' => Ok(End(FLAG_EMPTY)),
604 'b' => Ok(WordBoundary(FLAG_EMPTY)),
605 'B' => Ok(WordBoundary(FLAG_NEGATED)),
606 '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())),
607 'x' => Ok(try!(self.parse_hex())),
608 'p' | 'P' => Ok(try!(self.parse_unicode_name())),
609 'd' | 'D' | 's' | 'S' | 'w' | 'W' => {
610 let ranges = perl_unicode_class(c);
611 let mut flags = self.flags & FLAG_NOCASE;
612 if c.is_uppercase() { flags |= FLAG_NEGATED }
613 Ok(Class(ranges, flags))
616 self.err(format!("Invalid escape sequence '\\\\{}'",
622 // Parses a unicode character class name, either of the form \pF where
623 // F is a one letter unicode class name or of the form \p{name} where
624 // name is the unicode class name.
625 // Assumes that \p or \P has been read (and 'p' or 'P' is the current
627 fn parse_unicode_name(&mut self) -> Result<Ast, Error> {
628 let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY };
629 let mut name: String;
630 if self.peek_is(1, '{') {
631 try!(self.expect('{'))
633 match self.pos('}') {
635 None => return self.err(format!(
636 "Missing '}}' for unclosed '{{' at position {}",
637 self.chari).as_slice()),
639 if closer - self.chari + 1 == 0 {
640 return self.err("No Unicode class name found.")
642 name = self.slice(self.chari + 1, closer);
645 if self.chari + 1 >= self.chars.len() {
646 return self.err("No single letter Unicode class name found.")
648 name = self.slice(self.chari + 1, self.chari + 2);
651 match find_class(UNICODE_CLASSES, name.as_slice()) {
653 return self.err(format!("Could not find Unicode class '{}'",
657 Ok(Class(ranges, negated | (self.flags & FLAG_NOCASE)))
662 // Parses an octal number, up to 3 digits.
663 // Assumes that \n has been read, where n is the first digit.
664 fn parse_octal(&mut self) -> Result<Ast, Error> {
665 let start = self.chari;
666 let mut end = start + 1;
667 let (d2, d3) = (self.peek(1), self.peek(2));
668 if d2 >= Some('0') && d2 <= Some('7') {
669 try!(self.noteof("expected octal character in [0-7]"))
671 if d3 >= Some('0') && d3 <= Some('7') {
672 try!(self.noteof("expected octal character in [0-7]"))
676 let s = self.slice(start, end);
677 match num::from_str_radix::<u32>(s.as_slice(), 8) {
678 Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
680 self.err(format!("Could not parse '{}' as octal number.",
686 // Parse a hex number. Either exactly two digits or anything in {}.
687 // Assumes that \x has been read.
688 fn parse_hex(&mut self) -> Result<Ast, Error> {
689 if !self.peek_is(1, '{') {
690 try!(self.expect('{'))
691 return self.parse_hex_two()
693 let start = self.chari + 2;
695 match self.pos('}') {
697 return self.err(format!("Missing '}}' for unclosed \
698 '{{' at position {}",
704 self.parse_hex_digits(self.slice(start, closer).as_slice())
707 // Parses a two-digit hex number.
708 // Assumes that \xn has been read, where n is the first digit and is the
709 // current character.
710 // After return, parser will point at the second digit.
711 fn parse_hex_two(&mut self) -> Result<Ast, Error> {
712 let (start, end) = (self.chari, self.chari + 2);
713 let bad = self.slice(start - 2, self.chars.len());
714 try!(self.noteof(format!("Invalid hex escape sequence '{}'",
716 self.parse_hex_digits(self.slice(start, end).as_slice())
719 // Parses `s` as a hexadecimal number.
720 fn parse_hex_digits(&self, s: &str) -> Result<Ast, Error> {
721 match num::from_str_radix::<u32>(s, 16) {
722 Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
724 self.err(format!("Could not parse '{}' as hex number.",
730 // Parses a named capture.
731 // Assumes that '(?P<' has been consumed and that the current character
733 // When done, parser will be at the closing '>' character.
734 fn parse_named_capture(&mut self) -> Result<(), Error> {
735 try!(self.noteof("a capture name"))
737 match self.pos('>') {
739 None => return self.err("Capture name must end with '>'."),
741 if closer - self.chari == 0 {
742 return self.err("Capture names must have at least 1 character.")
744 let name = self.slice(self.chari, closer);
745 if !name.as_slice().chars().all(is_valid_cap) {
747 "Capture names can only have underscores, letters and digits.")
749 if self.names.contains(&name) {
750 return self.err(format!("Duplicate capture group name '{}'.",
753 self.names.push(name.clone());
756 self.stack.push(Paren(self.flags, self.caps, name));
760 // Parses non-capture groups and options.
761 // Assumes that '(?' has already been consumed and '?' is the current
763 fn parse_group_opts(&mut self) -> Result<(), Error> {
764 if self.peek_is(1, 'P') && self.peek_is(2, '<') {
765 try!(self.expect('P')) try!(self.expect('<'))
766 return self.parse_named_capture()
768 let start = self.chari;
769 let mut flags = self.flags;
771 let mut saw_flag = false;
773 try!(self.noteof("expected non-empty set of flags or closing ')'"))
775 'i' => { flags = flags | FLAG_NOCASE; saw_flag = true},
776 'm' => { flags = flags | FLAG_MULTI; saw_flag = true},
777 's' => { flags = flags | FLAG_DOTNL; saw_flag = true},
778 'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true},
781 return self.err(format!(
782 "Cannot negate flags twice in '{}'.",
783 self.slice(start, self.chari + 1)).as_slice())
787 flags = flags ^ flags;
792 return self.err(format!(
793 "A valid flag does not follow negation in '{}'",
794 self.slice(start, self.chari + 1)).as_slice())
796 flags = flags ^ flags;
798 if self.cur() == ':' {
799 // Save the old flags with the opening paren.
800 self.stack.push(Paren(self.flags, 0, "".to_string()));
805 _ => return self.err(format!(
806 "Unrecognized flag '{}'.", self.cur()).as_slice()),
811 // Peeks at the next character and returns whether it's ungreedy or not.
812 // If it is, then the next character is consumed.
813 fn get_next_greedy(&mut self) -> Result<Greed, Error> {
814 Ok(if self.peek_is(1, '?') {
815 try!(self.expect('?'))
819 }.swap(self.flags & FLAG_SWAP_GREED > 0))
822 // Searches the stack (starting at the top) until it finds an expression
823 // for which `pred` returns true. The index of that expression in the
824 // stack is returned.
825 // If there's no match, then one of two things happens depending on the
826 // values of `allow_start`. When it's true, then `0` will be returned.
827 // Otherwise, an error will be returned.
828 // Generally, `allow_start` is only true when you're *not* expecting an
829 // opening parenthesis.
830 fn pos_last(&self, allow_start: bool, pred: |&BuildAst| -> bool)
831 -> Result<uint, Error> {
832 let from = match self.stack.iter().rev().position(pred) {
838 return self.err("No matching opening parenthesis.")
842 // Adjust index since 'from' is for the reversed stack.
843 // Also, don't include the '(' or '|'.
844 Ok(self.stack.len() - from)
847 // concat starts at `from` in the parser's stack and concatenates all
848 // expressions up to the top of the stack. The resulting concatenation is
849 // then pushed on to the stack.
850 // Usually `from` corresponds to the position of an opening parenthesis,
851 // a '|' (alternation) or the start of the entire expression.
852 fn concat(&mut self, from: uint) -> Result<(), Error> {
853 let ast = try!(self.build_from(from, concat_flatten));
858 // concat starts at `from` in the parser's stack and alternates all
859 // expressions up to the top of the stack. The resulting alternation is
860 // then pushed on to the stack.
861 // Usually `from` corresponds to the position of an opening parenthesis
862 // or the start of the entire expression.
863 // This will also drop any opening parens or alternation bars found in
864 // the intermediate AST.
865 fn alternate(&mut self, mut from: uint) -> Result<(), Error> {
866 // Unlike in the concatenation case, we want 'build_from' to continue
867 // all the way to the opening left paren (so it will be popped off and
868 // thrown away). But be careful with overflow---we can't count on the
869 // open paren to be there.
870 if from > 0 { from = from - 1}
871 let ast = try!(self.build_from(from, |l,r| Alt(box l, box r)));
876 // build_from combines all AST elements starting at 'from' in the
877 // parser's stack using 'mk' to combine them. If any such element is not an
878 // AST then it is popped off the stack and ignored.
879 fn build_from(&mut self, from: uint, mk: |Ast, Ast| -> Ast)
880 -> Result<Ast, Error> {
881 if from >= self.stack.len() {
882 return self.err("Empty group or alternate not allowed.")
885 let mut combined = try!(self.pop_ast());
886 let mut i = self.stack.len();
889 match self.stack.pop().unwrap() {
890 Ast(x) => combined = mk(x, combined),
897 fn parse_uint(&self, s: &str) -> Result<uint, Error> {
898 match from_str::<uint>(s) {
901 self.err(format!("Expected an unsigned integer but got '{}'.",
907 fn char_from_u32(&self, n: u32) -> Result<char, Error> {
908 match char::from_u32(n) {
911 self.err(format!("Could not decode '{}' to unicode \
918 fn pos(&self, c: char) -> Option<uint> {
920 .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i)
923 fn err<T>(&self, msg: &str) -> Result<T, Error> {
926 msg: msg.to_string(),
930 fn peek(&self, offset: uint) -> Option<char> {
931 if self.chari + offset >= self.chars.len() {
934 Some(*self.chars.get(self.chari + offset))
937 fn peek_is(&self, offset: uint, is: char) -> bool {
938 self.peek(offset) == Some(is)
941 fn cur(&self) -> char {
942 *self.chars.get(self.chari)
945 fn slice(&self, start: uint, end: uint) -> String {
946 String::from_chars(self.chars.as_slice().slice(start, end))
950 // Given an unordered collection of character ranges, combine_ranges returns
951 // an ordered sequence of character ranges where no two ranges overlap. They
952 // are ordered from least to greatest (using start position).
953 fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> {
954 // Returns true iff the two character classes overlap or share a boundary.
955 // e.g., ('a', 'g') and ('h', 'm') would return true.
956 fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool {
957 cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1
960 // This is currently O(n^2), but I think with sufficient cleverness,
961 // it can be reduced to O(n) **if necessary**.
962 let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len());
963 for (us, ue) in unordered.move_iter() {
964 let (mut us, mut ue) = (us, ue);
966 let mut which: Option<uint> = None;
967 for (i, &(os, oe)) in ordered.iter().enumerate() {
968 if should_merge((us, ue), (os, oe)) {
969 us = cmp::min(us, os);
970 ue = cmp::max(ue, oe);
976 None => ordered.push((us, ue)),
977 Some(i) => *ordered.get_mut(i) = (us, ue),
984 // Constructs a Unicode friendly Perl character class from \d, \s or \w
985 // (or any of their negated forms). Note that this does not handle negation.
986 fn perl_unicode_class(which: char) -> Vec<(char, char)> {
987 match which.to_lowercase() {
988 'd' => Vec::from_slice(PERLD),
989 's' => Vec::from_slice(PERLS),
990 'w' => Vec::from_slice(PERLW),
995 // Returns a concatenation of two expressions. This also guarantees that a
996 // `Cat` expression will never be a direct child of another `Cat` expression.
997 fn concat_flatten(x: Ast, y: Ast) -> Ast {
999 (Cat(mut xs), Cat(ys)) => { xs.push_all_move(ys); Cat(xs) }
1000 (Cat(mut xs), ast) => { xs.push(ast); Cat(xs) }
1001 (ast, Cat(mut xs)) => { xs.unshift(ast); Cat(xs) }
1002 (ast1, ast2) => Cat(vec!(ast1, ast2)),
1006 pub fn is_punct(c: char) -> bool {
1008 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
1009 '[' | ']' | '{' | '}' | '^' | '$' => true,
1014 fn is_valid_cap(c: char) -> bool {
1015 c == '_' || (c >= '0' && c <= '9')
1016 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
1019 fn find_class(classes: NamedClasses, name: &str) -> Option<Vec<(char, char)>> {
1020 match classes.bsearch(|&(s, _)| s.cmp(&name)) {
1021 Some(i) => Some(Vec::from_slice(classes[i].val1())),
1026 type Class = &'static [(char, char)];
1027 type NamedClasses = &'static [(&'static str, Class)];
1029 static ASCII_CLASSES: NamedClasses = &[
1030 // Classes must be in alphabetical order so that bsearch works.
1031 // [:alnum:] alphanumeric (== [0-9A-Za-z])
1032 // [:alpha:] alphabetic (== [A-Za-z])
1033 // [:ascii:] ASCII (== [\x00-\x7F])
1034 // [:blank:] blank (== [\t ])
1035 // [:cntrl:] control (== [\x00-\x1F\x7F])
1036 // [:digit:] digits (== [0-9])
1037 // [:graph:] graphical (== [!-~])
1038 // [:lower:] lower case (== [a-z])
1039 // [:print:] printable (== [ -~] == [ [:graph:]])
1040 // [:punct:] punctuation (== [!-/:-@[-`{-~])
1041 // [:space:] whitespace (== [\t\n\v\f\r ])
1042 // [:upper:] upper case (== [A-Z])
1043 // [:word:] word characters (== [0-9A-Za-z_])
1044 // [:xdigit:] hex digit (== [0-9A-Fa-f])
1045 // Taken from: http://golang.org/pkg/regex/syntax/
1046 ("alnum", &[('0', '9'), ('A', 'Z'), ('a', 'z')]),
1047 ("alpha", &[('A', 'Z'), ('a', 'z')]),
1048 ("ascii", &[('\x00', '\x7F')]),
1049 ("blank", &[(' ', ' '), ('\t', '\t')]),
1050 ("cntrl", &[('\x00', '\x1F'), ('\x7F', '\x7F')]),
1051 ("digit", &[('0', '9')]),
1052 ("graph", &[('!', '~')]),
1053 ("lower", &[('a', 'z')]),
1054 ("print", &[(' ', '~')]),
1055 ("punct", &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]),
1056 ("space", &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'),
1057 ('\r', '\r'), (' ', ' ')]),
1058 ("upper", &[('A', 'Z')]),
1059 ("word", &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]),
1060 ("xdigit", &[('0', '9'), ('A', 'F'), ('a', 'f')]),