src/libregex/lib.rs

   1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10 //
  11 // ignore-lexer-test FIXME #15679
  12
  13 //! This crate provides a native implementation of regular expressions that is
  14 //! heavily based on RE2 both in syntax and in implementation. Notably,
  15 //! backreferences and arbitrary lookahead/lookbehind assertions are not
  16 //! provided. In return, regular expression searching provided by this package
  17 //! has excellent worst case performance. The specific syntax supported is
  18 //! documented further down.
  19 //!
  20 //! This crate's documentation provides some simple examples, describes Unicode
  21 //! support and exhaustively lists the supported syntax. For more specific
  22 //! details on the API, please see the documentation for the `Regex` type.
  23 //!
  24 //! # First example: find a date
  25 //!
  26 //! General use of regular expressions in this package involves compiling an
  27 //! expression and then using it to search, split or replace text. For example,
  28 //! to confirm that some text resembles a date:
  29 //!
  30 //! ```rust
  31 //! use regex::Regex;
  32 //! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") {
  33 //!     Ok(re) => re,
  34 //!     Err(err) => panic!("{}", err),
  35 //! };
  36 //! assert_eq!(re.is_match("2014-01-01"), true);
  37 //! ```
  38 //!
  39 //! Notice the use of the `^` and `$` anchors. In this crate, every expression
  40 //! is executed with an implicit `.*?` at the beginning and end, which allows
  41 //! it to match anywhere in the text. Anchors can be used to ensure that the
  42 //! full text matches an expression.
  43 //!
  44 //! This example also demonstrates the utility of [raw
  45 //! strings](../reference.html#character-and-string-literals) in Rust, which
  46 //! are just like regular strings except they are prefixed with an `r` and do
  47 //! not process any escape sequences. For example, `"\\d"` is the same
  48 //! expression as `r"\d"`.
  49 //!
  50 //! # The `regex!` macro
  51 //!
  52 //! Rust's compile time meta-programming facilities provide a way to write a
  53 //! `regex!` macro which compiles regular expressions *when your program
  54 //! compiles*. Said differently, if you only use `regex!` to build regular
  55 //! expressions in your program, then your program cannot compile with an
  56 //! invalid regular expression. Moreover, the `regex!` macro compiles the
  57 //! given expression to native Rust code, which makes it much faster for
  58 //! searching text.
  59 //!
  60 //! Since `regex!` provides compiled regular expressions that are both safer
  61 //! and faster to use, you should use them whenever possible. The only
  62 //! requirement for using them is that you have a string literal corresponding
  63 //! to your expression. Otherwise, it is indistinguishable from an expression
  64 //! compiled at runtime with `Regex::new`.
  65 //!
  66 //! To use the `regex!` macro, you must enable the `phase` feature and import
  67 //! the `regex_macros` crate as a syntax extension:
  68 //!
  69 //! ```rust
  70 //! #![feature(phase)]
  71 //! #[phase(plugin)]
  72 //! extern crate regex_macros;
  73 //! extern crate regex;
  74 //!
  75 //! fn main() {
  76 //!     let re = regex!(r"^\d{4}-\d{2}-\d{2}$");
  77 //!     assert_eq!(re.is_match("2014-01-01"), true);
  78 //! }
  79 //! ```
  80 //!
  81 //! There are a few things worth mentioning about using the `regex!` macro.
  82 //! Firstly, the `regex!` macro *only* accepts string *literals*.
  83 //! Secondly, the `regex` crate *must* be linked with the name `regex` since
  84 //! the generated code depends on finding symbols in the `regex` crate.
  85 //!
  86 //! The only downside of using the `regex!` macro is that it can increase the
  87 //! size of your program's binary since it generates specialized Rust code.
  88 //! The extra size probably won't be significant for a small number of
  89 //! expressions, but 100+ calls to `regex!` will probably result in a
  90 //! noticeably bigger binary.
  91 //!
  92 //! # Example: iterating over capture groups
  93 //!
  94 //! This crate provides convenient iterators for matching an expression
  95 //! repeatedly against a search string to find successive non-overlapping
  96 //! matches. For example, to find all dates in a string and be able to access
  97 //! them by their component pieces:
  98 //!
  99 //! ```rust
 100 //! # #![feature(phase)]
 101 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
 102 //! # fn main() {
 103 //! let re = regex!(r"(\d{4})-(\d{2})-(\d{2})");
 104 //! let text = "2012-03-14, 2013-01-01 and 2014-07-05";
 105 //! for cap in re.captures_iter(text) {
 106 //!     println!("Month: {} Day: {} Year: {}",
 107 //!              cap.at(2).unwrap_or(""), cap.at(3).unwrap_or(""),
 108 //!              cap.at(1).unwrap_or(""));
 109 //! }
 110 //! // Output:
 111 //! // Month: 03 Day: 14 Year: 2012
 112 //! // Month: 01 Day: 01 Year: 2013
 113 //! // Month: 07 Day: 05 Year: 2014
 114 //! # }
 115 //! ```
 116 //!
 117 //! Notice that the year is in the capture group indexed at `1`. This is
 118 //! because the *entire match* is stored in the capture group at index `0`.
 119 //!
 120 //! # Example: replacement with named capture groups
 121 //!
 122 //! Building on the previous example, perhaps we'd like to rearrange the date
 123 //! formats. This can be done with text replacement. But to make the code
 124 //! clearer, we can *name* our capture groups and use those names as variables
 125 //! in our replacement text:
 126 //!
 127 //! ```rust
 128 //! # #![feature(phase)]
 129 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
 130 //! # fn main() {
 131 //! let re = regex!(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})");
 132 //! let before = "2012-03-14, 2013-01-01 and 2014-07-05";
 133 //! let after = re.replace_all(before, "$m/$d/$y");
 134 //! assert_eq!(after.as_slice(), "03/14/2012, 01/01/2013 and 07/05/2014");
 135 //! # }
 136 //! ```
 137 //!
 138 //! The `replace` methods are actually polymorphic in the replacement, which
 139 //! provides more flexibility than is seen here. (See the documentation for
 140 //! `Regex::replace` for more details.)
 141 //!
 142 //! # Pay for what you use
 143 //!
 144 //! With respect to searching text with a regular expression, there are three
 145 //! questions that can be asked:
 146 //!
 147 //! 1. Does the text match this expression?
 148 //! 2. If so, where does it match?
 149 //! 3. Where are the submatches?
 150 //!
 151 //! Generally speaking, this crate could provide a function to answer only #3,
 152 //! which would subsume #1 and #2 automatically. However, it can be
 153 //! significantly more expensive to compute the location of submatches, so it's
 154 //! best not to do it if you don't need to.
 155 //!
 156 //! Therefore, only use what you need. For example, don't use `find` if you
 157 //! only need to test if an expression matches a string. (Use `is_match`
 158 //! instead.)
 159 //!
 160 //! # Unicode
 161 //!
 162 //! This implementation executes regular expressions **only** on sequences of
 163 //! Unicode code points while exposing match locations as byte indices into the
 164 //! search string.
 165 //!
 166 //! Currently, only naive case folding is supported. Namely, when matching
 167 //! case insensitively, the characters are first converted to their uppercase
 168 //! forms and then compared.
 169 //!
 170 //! Regular expressions themselves are also **only** interpreted as a sequence
 171 //! of Unicode code points. This means you can use Unicode characters
 172 //! directly in your expression:
 173 //!
 174 //! ```rust
 175 //! # #![feature(phase)]
 176 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
 177 //! # fn main() {
 178 //! let re = regex!(r"(?i)Δ+");
 179 //! assert_eq!(re.find("ΔδΔ"), Some((0, 6)));
 180 //! # }
 181 //! ```
 182 //!
 183 //! Finally, Unicode general categories and scripts are available as character
 184 //! classes. For example, you can match a sequence of numerals, Greek or
 185 //! Cherokee letters:
 186 //!
 187 //! ```rust
 188 //! # #![feature(phase)]
 189 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
 190 //! # fn main() {
 191 //! let re = regex!(r"[\pN\p{Greek}\p{Cherokee}]+");
 192 //! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23)));
 193 //! # }
 194 //! ```
 195 //!
 196 //! # Syntax
 197 //!
 198 //! The syntax supported in this crate corresponds almost exactly to the
 199 //! syntax supported by RE2.
 200 //!
 201 //! ## Matching one character
 202 //!
 203 //! <pre class="rust">
 204 //! .           any character except new line (includes new line with s flag)
 205 //! [xyz]       A character class matching either x, y or z.
 206 //! [^xyz]      A character class matching any character except x, y and z.
 207 //! [a-z]       A character class matching any character in range a-z.
 208 //! \d          Perl character class (\p{Nd})
 209 //! \D          Negated Perl character class (\P{Nd})
 210 //! [:alpha:]   ASCII character class ([A-Za-z])
 211 //! [:^alpha:]  Negated ASCII character class ([^A-Za-z])
 212 //! \pN         One letter name Unicode character class
 213 //! \p{Greek}   Unicode character class (general category or script)
 214 //! \PN         Negated one letter name Unicode character class
 215 //! \P{Greek}   Negated Unicode character class (general category or script)
 216 //! </pre>
 217 //!
 218 //! Any named character class may appear inside a bracketed `[...]` character
 219 //! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral
 220 //! character.
 221 //!
 222 //! ## Composites
 223 //!
 224 //! <pre class="rust">
 225 //! xy    concatenation (x followed by y)
 226 //! x|y   alternation (x or y, prefer x)
 227 //! </pre>
 228 //!
 229 //! ## Repetitions
 230 //!
 231 //! <pre class="rust">
 232 //! x*        zero or more of x (greedy)
 233 //! x+        one or more of x (greedy)
 234 //! x?        zero or one of x (greedy)
 235 //! x*?       zero or more of x (ungreedy)
 236 //! x+?       one or more of x (ungreedy)
 237 //! x??       zero or one of x (ungreedy)
 238 //! x{n,m}    at least n x and at most m x (greedy)
 239 //! x{n,}     at least n x (greedy)
 240 //! x{n}      exactly n x
 241 //! x{n,m}?   at least n x and at most m x (ungreedy)
 242 //! x{n,}?    at least n x (ungreedy)
 243 //! x{n}?     exactly n x
 244 //! </pre>
 245 //!
 246 //! ## Empty matches
 247 //!
 248 //! <pre class="rust">
 249 //! ^     the beginning of text (or start-of-line with multi-line mode)
 250 //! $     the end of text (or end-of-line with multi-line mode)
 251 //! \A    only the beginning of text (even with multi-line mode enabled)
 252 //! \z    only the end of text (even with multi-line mode enabled)
 253 //! \b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
 254 //! \B    not a Unicode word boundary
 255 //! </pre>
 256 //!
 257 //! ## Grouping and flags
 258 //!
 259 //! <pre class="rust">
 260 //! (exp)          numbered capture group (indexed by opening parenthesis)
 261 //! (?P&lt;name&gt;exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
 262 //! (?:exp)        non-capturing group
 263 //! (?flags)       set flags within current group
 264 //! (?flags:exp)   set flags for exp (non-capturing)
 265 //! </pre>
 266 //!
 267 //! Flags are each a single character. For example, `(?x)` sets the flag `x`
 268 //! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
 269 //! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
 270 //! the `x` flag and clears the `y` flag.
 271 //!
 272 //! All flags are by default disabled. They are:
 273 //!
 274 //! <pre class="rust">
 275 //! i     case insensitive
 276 //! m     multi-line mode: ^ and $ match begin/end of line
 277 //! s     allow . to match \n
 278 //! U     swap the meaning of x* and x*?
 279 //! </pre>
 280 //!
 281 //! Here's an example that matches case insensitively for only part of the
 282 //! expression:
 283 //!
 284 //! ```rust
 285 //! # #![feature(phase)]
 286 //! # extern crate regex; #[phase(plugin)] extern crate regex_macros;
 287 //! # fn main() {
 288 //! let re = regex!(r"(?i)a+(?-i)b+");
 289 //! let cap = re.captures("AaAaAbbBBBb").unwrap();
 290 //! assert_eq!(cap.at(0), Some("AaAaAbb"));
 291 //! # }
 292 //! ```
 293 //!
 294 //! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches
 295 //! `b`.
 296 //!
 297 //! ## Escape sequences
 298 //!
 299 //! <pre class="rust">
 300 //! \*         literal *, works for any punctuation character: \.+*?()|[]{}^$
 301 //! \a         bell (\x07)
 302 //! \f         form feed (\x0C)
 303 //! \t         horizontal tab
 304 //! \n         new line
 305 //! \r         carriage return
 306 //! \v         vertical tab (\x0B)
 307 //! \123       octal character code (up to three digits)
 308 //! \x7F       hex character code (exactly two digits)
 309 //! \x{10FFFF} any hex character code corresponding to a Unicode code point
 310 //! </pre>
 311 //!
 312 //! ## Perl character classes (Unicode friendly)
 313 //!
 314 //! These classes are based on the definitions provided in
 315 //! [UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
 316 //!
 317 //! <pre class="rust">
 318 //! \d     digit (\p{Nd})
 319 //! \D     not digit
 320 //! \s     whitespace (\p{White_Space})
 321 //! \S     not whitespace
 322 //! \w     word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
 323 //! \W     not word character
 324 //! </pre>
 325 //!
 326 //! ## ASCII character classes
 327 //!
 328 //! <pre class="rust">
 329 //! [:alnum:]    alphanumeric ([0-9A-Za-z])
 330 //! [:alpha:]    alphabetic ([A-Za-z])
 331 //! [:ascii:]    ASCII ([\x00-\x7F])
 332 //! [:blank:]    blank ([\t ])
 333 //! [:cntrl:]    control ([\x00-\x1F\x7F])
 334 //! [:digit:]    digits ([0-9])
 335 //! [:graph:]    graphical ([!-~])
 336 //! [:lower:]    lower case ([a-z])
 337 //! [:print:]    printable ([ -~])
 338 //! [:punct:]    punctuation ([!-/:-@[-`{-~])
 339 //! [:space:]    whitespace ([\t\n\v\f\r ])
 340 //! [:upper:]    upper case ([A-Z])
 341 //! [:word:]     word characters ([0-9A-Za-z_])
 342 //! [:xdigit:]   hex digit ([0-9A-Fa-f])
 343 //! </pre>
 344 //!
 345 //! # Untrusted input
 346 //!
 347 //! There are two factors to consider here: untrusted regular expressions and
 348 //! untrusted search text.
 349 //!
 350 //! Currently, there are no counter-measures in place to prevent a malicious
 351 //! user from writing an expression that may use a lot of resources. One such
 352 //! example is to nest counted repetitions: `((a{100}){100}){100}` will try
 353 //! to repeat the `a` instruction `100^3` times. Essentially, this means it's
 354 //! very easy for an attacker to exhaust your system's memory if they are
 355 //! allowed to execute arbitrary regular expressions. A possible solution to
 356 //! this is to impose a hard limit on the size of a compiled expression, but it
 357 //! does not yet exist.
 358 //!
 359 //! The story is a bit better with untrusted search text, since this crate's
 360 //! implementation provides `O(nm)` search where `n` is the number of
 361 //! characters in the search text and `m` is the number of instructions in a
 362 //! compiled expression.
 363
 364 #![crate_name = "regex"]
 365 #![crate_type = "rlib"]
 366 #![crate_type = "dylib"]
 367 #![experimental = "use the crates.io `regex` library instead"]
 368 #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
 369        html_favicon_url = "http://www.rust-lang.org/favicon.ico",
 370        html_root_url = "http://doc.rust-lang.org/nightly/",
 371        html_playground_url = "http://play.rust-lang.org/")]
 372
 373 #![allow(unknown_features)]
 374 #![feature(macro_rules, phase, slicing_syntax, globs)]
 375 #![feature(unboxed_closures)]
 376 #![feature(associated_types)]
 377 #![deny(missing_docs)]
 378
 379 #[cfg(test)]
 380 extern crate "test" as stdtest;
 381 #[cfg(test)]
 382 extern crate rand;
 383
 384 // During tests, this links with the `regex` crate so that the `regex!` macro
 385 // can be tested.
 386 #[cfg(test)]
 387 extern crate regex;
 388
 389 // Unicode tables for character classes are defined in libunicode
 390 extern crate unicode;
 391
 392 pub use parse::Error;
 393 pub use re::{Regex, Captures, SubCaptures, SubCapturesPos};
 394 pub use re::{FindCaptures, FindMatches};
 395 pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN};
 396 pub use re::{quote, is_match};
 397
 398 mod compile;
 399 mod parse;
 400 mod re;
 401 mod vm;
 402
 403 #[cfg(test)]
 404 mod test;
 405
 406 /// Support module for the `regex!` macro, hidden from the docs. Do not use.
 407 #[doc(hidden)]
 408 pub mod native {
 409     // Re-exporting these internals is bad form, but it's necessary for two
 410     // reasons. Firstly, the `regex!` syntax extension lives in a different
 411     // crate and requires access to the representation of a regex
 412     // (particularly the instruction set) in order to compile it to native
 413     // Rust. This could be mitigated if `regex!` were defined in this crate,
 414     // but that has undesirable consequences (such as requiring a dependency
 415     // on `libsyntax`).
 416     //
 417     // Secondly, the code generated by `regex!` must *also* be able to access
 418     // various functions in this crate, both to reduce code duplication and
 419     // to produce a value with precisely the same `Regex` type as this crate.
 420     // This, as far as I know, is impossible to mitigate.
 421     //
 422     // On the bright side, `rustdoc` lets us hide this module from the public
 423     // API documentation (see the `#[doc(hidden)]` attribute above).
 424     pub use compile::{
 425         Program,
 426         OneChar, CharClass, Any, Save, Jump, Split,
 427         Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
 428     };
 429     pub use parse::{
 430         FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL,
 431         FLAG_SWAP_GREED, FLAG_NEGATED,
 432     };
 433     pub use re::{Dynamic, ExDynamic, Native, ExNative};
 434     pub use vm::{
 435         MatchKind, Exists, Location, Submatches,
 436         StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
 437         CharReader, find_prefix,
 438     };
 439 }