1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // Enable this to squash warnings due to exporting pieces of the representation
12 // for use with the regex! macro. See lib.rs for explanation.
13 #![allow(visible_private_types)]
19 Nothing, Literal, Dot, AstClass, Begin, End, WordBoundary, Capture, Cat, Alt,
21 ZeroOne, ZeroMore, OneMore,
26 #[deriving(Show, Clone)]
28 // When a Match instruction is executed, the current thread is successful.
31 // The OneChar instruction matches a literal character.
32 // The flags indicate whether to do a case insensitive match.
35 // The CharClass instruction tries to match one input character against
36 // the range of characters given.
37 // The flags indicate whether to do a case insensitive match and whether
38 // the character class is negated or not.
39 CharClass(Vec<(char, char)>, Flags),
41 // Matches any character except new lines.
42 // The flags indicate whether to include the '\n' character.
45 // Matches the beginning of the string, consumes no characters.
46 // The flags indicate whether it matches if the preceding character
50 // Matches the end of the string, consumes no characters.
51 // The flags indicate whether it matches if the proceeding character
55 // Matches a word boundary (\w on one side and \W \A or \z on the other),
56 // and consumes no character.
57 // The flags indicate whether this matches a word boundary or something
58 // that isn't a word boundary.
59 EmptyWordBoundary(Flags),
61 // Saves the current position in the input string to the Nth save slot.
64 // Jumps to the instruction at the index given.
67 // Jumps to the instruction at the first index given. If that leads to
68 // a failing state, then the instruction at the second index given is
70 Split(InstIdx, InstIdx),
73 /// Program represents a compiled regular expression. Once an expression is
74 /// compiled, its representation is immutable and will never change.
76 /// All of the data in a compiled expression is wrapped in "MaybeStatic" or
77 /// "MaybeOwned" types so that a `Program` can be represented as static data.
78 /// (This makes it convenient and efficient for use with the `regex!` macro.)
81 /// A sequence of instructions.
83 /// If the regular expression requires a literal prefix in order to have a
84 /// match, that prefix is stored here. (It's used in the VM to implement
90 /// Compiles a Regex given its AST.
91 pub fn new(ast: parse::Ast) -> (Program, Vec<Option<String>>) {
92 let mut c = Compiler {
93 insts: Vec::with_capacity(100),
94 names: Vec::with_capacity(10),
97 c.insts.push(Save(0));
99 c.insts.push(Save(1));
102 // Try to discover a literal string prefix.
103 // This is a bit hacky since we have to skip over the initial
104 // 'Save' instruction.
105 let mut pre = String::with_capacity(5);
106 for inst in c.insts.slice_from(1).iter() {
108 OneChar(c, FLAG_EMPTY) => pre.push_char(c),
113 let Compiler { insts, names } = c;
121 /// Returns the total number of capture groups in the regular expression.
122 /// This includes the zeroth capture.
123 pub fn num_captures(&self) -> uint {
125 for inst in self.insts.iter() {
127 Save(c) => n = cmp::max(n, c+1),
131 // There's exactly 2 Save slots for every capture.
136 struct Compiler<'r> {
138 names: Vec<Option<String>>,
141 // The compiler implemented here is extremely simple. Most of the complexity
142 // in this crate is in the parser or the VM.
143 // The only tricky thing here is patching jump/split instructions to point to
144 // the right instruction.
145 impl<'r> Compiler<'r> {
146 fn compile(&mut self, ast: parse::Ast) {
149 Literal(c, flags) => self.push(OneChar(c, flags)),
150 Dot(nl) => self.push(Any(nl)),
151 AstClass(ranges, flags) =>
152 self.push(CharClass(ranges, flags)),
153 Begin(flags) => self.push(EmptyBegin(flags)),
154 End(flags) => self.push(EmptyEnd(flags)),
155 WordBoundary(flags) => self.push(EmptyWordBoundary(flags)),
156 Capture(cap, name, x) => {
157 let len = self.names.len();
159 self.names.grow(10 + cap - len, &None)
161 *self.names.get_mut(cap) = name;
163 self.push(Save(2 * cap));
165 self.push(Save(2 * cap + 1));
168 for x in xs.into_iter() {
173 let split = self.empty_split(); // push: split 0, 0
174 let j1 = self.insts.len();
175 self.compile(*x); // push: insts for x
176 let jmp = self.empty_jump(); // push: jmp 0
177 let j2 = self.insts.len();
178 self.compile(*y); // push: insts for y
179 let j3 = self.insts.len();
181 self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2
182 self.set_jump(jmp, j3); // jmp 0 -> jmp j3
184 Rep(x, ZeroOne, g) => {
185 let split = self.empty_split();
186 let j1 = self.insts.len();
188 let j2 = self.insts.len();
191 self.set_split(split, j1, j2);
193 self.set_split(split, j2, j1);
196 Rep(x, ZeroMore, g) => {
197 let j1 = self.insts.len();
198 let split = self.empty_split();
199 let j2 = self.insts.len();
201 let jmp = self.empty_jump();
202 let j3 = self.insts.len();
204 self.set_jump(jmp, j1);
206 self.set_split(split, j2, j3);
208 self.set_split(split, j3, j2);
211 Rep(x, OneMore, g) => {
212 let j1 = self.insts.len();
214 let split = self.empty_split();
215 let j2 = self.insts.len();
218 self.set_split(split, j1, j2);
220 self.set_split(split, j2, j1);
226 /// Appends the given instruction to the program.
228 fn push(&mut self, x: Inst) {
232 /// Appends an *empty* `Split` instruction to the program and returns
233 /// the index of that instruction. (The index can then be used to "patch"
234 /// the actual locations of the split in later.)
236 fn empty_split(&mut self) -> InstIdx {
237 self.insts.push(Split(0, 0));
241 /// Sets the left and right locations of a `Split` instruction at index
242 /// `i` to `pc1` and `pc2`, respectively.
243 /// If the instruction at index `i` isn't a `Split` instruction, then
244 /// `fail!` is called.
246 fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) {
247 let split = self.insts.get_mut(i);
249 Split(_, _) => *split = Split(pc1, pc2),
250 _ => fail!("BUG: Invalid split index."),
254 /// Appends an *empty* `Jump` instruction to the program and returns the
255 /// index of that instruction.
257 fn empty_jump(&mut self) -> InstIdx {
258 self.insts.push(Jump(0));
262 /// Sets the location of a `Jump` instruction at index `i` to `pc`.
263 /// If the instruction at index `i` isn't a `Jump` instruction, then
264 /// `fail!` is called.
266 fn set_jump(&mut self, i: InstIdx, pc: InstIdx) {
267 let jmp = self.insts.get_mut(i);
269 Jump(_) => *jmp = Jump(pc),
270 _ => fail!("BUG: Invalid jump index."),