1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // ignore-lexer-test FIXME #15679
13 //! This crate provides a native implementation of regular expressions that is
14 //! heavily based on RE2 both in syntax and in implementation. Notably,
15 //! backreferences and arbitrary lookahead/lookbehind assertions are not
16 //! provided. In return, regular expression searching provided by this package
17 //! has excellent worst case performance. The specific syntax supported is
18 //! documented further down.
20 //! This crate's documentation provides some simple examples, describes Unicode
21 //! support and exhaustively lists the supported syntax. For more specific
22 //! details on the API, please see the documentation for the `Regex` type.
24 //! # First example: find a date
26 //! General use of regular expressions in this package involves compiling an
27 //! expression and then using it to search, split or replace text. For example,
28 //! to confirm that some text resembles a date:
32 //! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") {
34 //! Err(err) => panic!("{}", err),
36 //! assert_eq!(re.is_match("2014-01-01"), true);
39 //! Notice the use of the `^` and `$` anchors. In this crate, every expression
40 //! is executed with an implicit `.*?` at the beginning and end, which allows
41 //! it to match anywhere in the text. Anchors can be used to ensure that the
42 //! full text matches an expression.
44 //! This example also demonstrates the utility of [raw
45 //! strings](../reference.html#character-and-string-literals) in Rust, which
46 //! are just like regular strings except they are prefixed with an `r` and do
47 //! not process any escape sequences. For example, `"\\d"` is the same
48 //! expression as `r"\d"`.
50 //! # The `regex!` macro
52 //! Rust's compile time meta-programming facilities provide a way to write a
53 //! `regex!` macro which compiles regular expressions *when your program
54 //! compiles*. Said differently, if you only use `regex!` to build regular
55 //! expressions in your program, then your program cannot compile with an
56 //! invalid regular expression. Moreover, the `regex!` macro compiles the
57 //! given expression to native Rust code, which makes it much faster for searching text.
60 //! Since `regex!` provides compiled regular expressions that are both safer
61 //! and faster to use, you should use them whenever possible. The only
62 //! requirement for using them is that you have a string literal corresponding
63 //! to your expression. Otherwise, it is indistinguishable from an expression
64 //! compiled at runtime with `Regex::new`.
66 //! To use the `regex!` macro, you must enable the `phase` feature and import
67 //! the `regex_macros` crate as a syntax extension:
70 //! #![feature(phase)]
72 //! extern crate regex_macros;
73 //! extern crate regex;
76 //! let re = regex!(r"^\d{4}-\d{2}-\d{2}$");
77 //! assert_eq!(re.is_match("2014-01-01"), true);
81 //! There are a few things worth mentioning about using the `regex!` macro.
82 //! Firstly, the `regex!` macro *only* accepts string *literals*.
83 //! Secondly, the `regex` crate *must* be linked with the name `regex` since
84 //! the generated code depends on finding symbols in the `regex` crate.
86 //! The only downside of using the `regex!` macro is that it can increase the
87 //! size of your program's binary since it generates specialized Rust code.
88 //! The extra size probably won't be significant for a small number of
89 //! expressions, but 100+ calls to `regex!` will probably result in a
90 //! noticeably bigger binary.
92 //! # Example: iterating over capture groups
94 //! This crate provides convenient iterators for matching an expression
95 //! repeatedly against a search string to find successive non-overlapping
96 //! matches. For example, to find all dates in a string and be able to access
97 //! them by their component pieces:
100 //! # #![feature(phase)]
101 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
103 //! let re = regex!(r"(\d{4})-(\d{2})-(\d{2})");
104 //! let text = "2012-03-14, 2013-01-01 and 2014-07-05";
105 //! for cap in re.captures_iter(text) {
106 //! println!("Month: {} Day: {} Year: {}",
107 //! cap.at(2).unwrap_or(""), cap.at(3).unwrap_or(""),
108 //! cap.at(1).unwrap_or(""));
111 //! // Month: 03 Day: 14 Year: 2012
112 //! // Month: 01 Day: 01 Year: 2013
113 //! // Month: 07 Day: 05 Year: 2014
117 //! Notice that the year is in the capture group indexed at `1`. This is
118 //! because the *entire match* is stored in the capture group at index `0`.
120 //! # Example: replacement with named capture groups
122 //! Building on the previous example, perhaps we'd like to rearrange the date
123 //! formats. This can be done with text replacement. But to make the code
124 //! clearer, we can *name* our capture groups and use those names as variables
125 //! in our replacement text:
128 //! # #![feature(phase)]
129 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
131 //! let re = regex!(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})");
132 //! let before = "2012-03-14, 2013-01-01 and 2014-07-05";
133 //! let after = re.replace_all(before, "$m/$d/$y");
134 //! assert_eq!(after.as_slice(), "03/14/2012, 01/01/2013 and 07/05/2014");
138 //! The `replace` methods are actually polymorphic in the replacement, which
139 //! provides more flexibility than is seen here. (See the documentation for
140 //! `Regex::replace` for more details.)
142 //! # Pay for what you use
144 //! With respect to searching text with a regular expression, there are three
145 //! questions that can be asked:
147 //! 1. Does the text match this expression?
148 //! 2. If so, where does it match?
149 //! 3. Where are the submatches?
151 //! Generally speaking, this crate could provide a function to answer only #3,
152 //! which would subsume #1 and #2 automatically. However, it can be
153 //! significantly more expensive to compute the location of submatches, so it's
154 //! best not to do it if you don't need to.
156 //! Therefore, only use what you need. For example, don't use `find` if you
157 //! only need to test if an expression matches a string. (Use `is_match` instead.)
162 //! This implementation executes regular expressions **only** on sequences of
163 //! Unicode code points while exposing match locations as byte indices into the search string.
166 //! Currently, only naive case folding is supported. Namely, when matching
167 //! case insensitively, the characters are first converted to their uppercase
168 //! forms and then compared.
170 //! Regular expressions themselves are also **only** interpreted as a sequence
171 //! of Unicode code points. This means you can use Unicode characters
172 //! directly in your expression:
175 //! # #![feature(phase)]
176 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
178 //! let re = regex!(r"(?i)Δ+");
179 //! assert_eq!(re.find("ΔδΔ"), Some((0, 6)));
183 //! Finally, Unicode general categories and scripts are available as character
184 //! classes. For example, you can match a sequence of numerals, Greek or
185 //! Cherokee letters:
188 //! # #![feature(phase)]
189 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
191 //! let re = regex!(r"[\pN\p{Greek}\p{Cherokee}]+");
192 //! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23)));
198 //! The syntax supported in this crate is almost in an exact correspondence
199 //! with the syntax supported by RE2.
201 //! ## Matching one character
203 //! <pre class="rust">
204 //! . any character except new line (includes new line with s flag)
205 //! [xyz] A character class matching either x, y or z.
206 //! [^xyz] A character class matching any character except x, y and z.
207 //! [a-z] A character class matching any character in range a-z.
208 //! \d Perl character class ([0-9])
209 //! \D Negated Perl character class ([^0-9])
210 //! [:alpha:] ASCII character class ([A-Za-z])
211 //! [:^alpha:] Negated ASCII character class ([^A-Za-z])
212 //! \pN One letter name Unicode character class
213 //! \p{Greek} Unicode character class (general category or script)
214 //! \PN Negated one letter name Unicode character class
215 //! \P{Greek} Negated Unicode character class (general category or script)
218 //! Any named character class may appear inside a bracketed `[...]` character
219 //! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral character.
224 //! <pre class="rust">
225 //! xy concatenation (x followed by y)
226 //! x|y alternation (x or y, prefer x)
231 //! <pre class="rust">
232 //! x* zero or more of x (greedy)
233 //! x+ one or more of x (greedy)
234 //! x? zero or one of x (greedy)
235 //! x*? zero or more of x (ungreedy)
236 //! x+? one or more of x (ungreedy)
237 //! x?? zero or one of x (ungreedy)
238 //! x{n,m} at least n x and at most m x (greedy)
239 //! x{n,} at least n x (greedy)
241 //! x{n,m}? at least n x and at most m x (ungreedy)
242 //! x{n,}? at least n x (ungreedy)
243 //! x{n}? exactly n x
248 //! <pre class="rust">
249 //! ^ the beginning of text (or start-of-line with multi-line mode)
250 //! $ the end of text (or end-of-line with multi-line mode)
251 //! \A only the beginning of text (even with multi-line mode enabled)
252 //! \z only the end of text (even with multi-line mode enabled)
253 //! \b a Unicode word boundary (\w on one side and \W, \A, or \z on other)
254 //! \B not a Unicode word boundary
257 //! ## Grouping and flags
259 //! <pre class="rust">
260 //! (exp) numbered capture group (indexed by opening parenthesis)
261 //! (?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
262 //! (?:exp) non-capturing group
263 //! (?flags) set flags within current group
264 //! (?flags:exp) set flags for exp (non-capturing)
267 //! Flags are each a single character. For example, `(?x)` sets the flag `x`
268 //! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
269 //! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
270 //! the `x` flag and clears the `y` flag.
272 //! All flags are by default disabled. They are:
274 //! <pre class="rust">
275 //! i case insensitive
276 //! m multi-line mode: ^ and $ match begin/end of line
277 //! s allow . to match \n
278 //! U swap the meaning of x* and x*?
281 //! Here's an example that matches case insensitively for only part of the expression:
285 //! # #![feature(phase)]
286 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
288 //! let re = regex!(r"(?i)a+(?-i)b+");
289 //! let cap = re.captures("AaAaAbbBBBb").unwrap();
290 //! assert_eq!(cap.at(0), Some("AaAaAbb"));
294 //! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches `b`.
297 //! ## Escape sequences
299 //! <pre class="rust">
300 //! \* literal *, works for any punctuation character: \.+*?()|[]{}^$
302 //! \f form feed (\x0C)
303 //! \t horizontal tab
305 //! \r carriage return
306 //! \v vertical tab (\x0B)
307 //! \123 octal character code (up to three digits)
308 //! \x7F hex character code (exactly two digits)
309 //! \x{10FFFF} any hex character code corresponding to a Unicode code point
312 //! ## Perl character classes (Unicode friendly)
314 //! These classes are based on the definitions provided in
315 //! [UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
317 //! <pre class="rust">
318 //! \d digit (\p{Nd})
320 //! \s whitespace (\p{White_Space})
321 //! \S not whitespace
322 //! \w word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
323 //! \W not word character
326 //! ## ASCII character classes
328 //! <pre class="rust">
329 //! [:alnum:] alphanumeric ([0-9A-Za-z])
330 //! [:alpha:] alphabetic ([A-Za-z])
331 //! [:ascii:] ASCII ([\x00-\x7F])
332 //! [:blank:] blank ([\t ])
333 //! [:cntrl:] control ([\x00-\x1F\x7F])
334 //! [:digit:] digits ([0-9])
335 //! [:graph:] graphical ([!-~])
336 //! [:lower:] lower case ([a-z])
337 //! [:print:] printable ([ -~])
338 //! [:punct:] punctuation ([!-/:-@[-`{-~])
339 //! [:space:] whitespace ([\t\n\v\f\r ])
340 //! [:upper:] upper case ([A-Z])
341 //! [:word:] word characters ([0-9A-Za-z_])
342 //! [:xdigit:] hex digit ([0-9A-Fa-f])
345 //! # Untrusted input
347 //! There are two factors to consider here: untrusted regular expressions and
348 //! untrusted search text.
350 //! Currently, there are no counter-measures in place to prevent a malicious
351 //! user from writing an expression that may use a lot of resources. One such
352 //! example is to repeat counted repetitions: `((a{100}){100}){100}` will try
353 //! to repeat the `a` instruction `100^3` times. Essentially, this means it's
354 //! very easy for an attacker to exhaust your system's memory if they are
355 //! allowed to execute arbitrary regular expressions. A possible solution to
356 //! this is to impose a hard limit on the size of a compiled expression, but it
357 //! does not yet exist.
359 //! The story is a bit better with untrusted search text, since this crate's
360 //! implementation provides `O(nm)` search where `n` is the number of
361 //! characters in the search text and `m` is the number of instructions in a
362 //! compiled expression.
364 #![crate_name = "regex"]
365 #![crate_type = "rlib"]
366 #![crate_type = "dylib"]
367 #![experimental = "use the crates.io `regex` library instead"]
368 #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
369 html_favicon_url = "http://www.rust-lang.org/favicon.ico",
370 html_root_url = "http://doc.rust-lang.org/nightly/",
371 html_playground_url = "http://play.rust-lang.org/")]
373 #![allow(unknown_features)]
374 #![feature(macro_rules, phase, slicing_syntax, globs)]
375 #![feature(unboxed_closures)]
376 #![feature(associated_types)]
377 #![deny(missing_docs)]
380 extern crate "test" as stdtest;
384 // During tests, this links with the `regex` crate so that the `regex!` macro can be tested.
389 // Unicode tables for character classes are defined in libunicode
390 extern crate unicode;
392 pub use parse::Error;
393 pub use re::{Regex, Captures, SubCaptures, SubCapturesPos};
394 pub use re::{FindCaptures, FindMatches};
395 pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN};
396 pub use re::{quote, is_match};
406 /// The `native` module exists to support the `regex!` macro. Do not use.
409 // Exporting this stuff is bad form, but it's necessary for two reasons.
410 // Firstly, the `regex!` syntax extension is in a different crate and
411 // requires access to the representation of a regex (particularly the
412 // instruction set) in order to compile to native Rust. This could be
413 // mitigated if `regex!` was defined in the same crate, but this has
414 // undesirable consequences (such as requiring a dependency on `libsyntax`).
417 // Secondly, the code generated by `regex!` must *also* be able
418 // to access various functions in this crate to reduce code duplication
419 // and to provide a value with precisely the same `Regex` type in this
420 // crate. This, AFAIK, is impossible to mitigate.
422 // On the bright side, `rustdoc` lets us hide this from the public API documentation.
426 OneChar, CharClass, Any, Save, Jump, Split,
427 Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
430 FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL,
431 FLAG_SWAP_GREED, FLAG_NEGATED,
433 pub use re::{Dynamic, ExDynamic, Native, ExNative};
435 MatchKind, Exists, Location, Submatches,
436 StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
437 CharReader, find_prefix,