]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/awk/re.c
merge
[plan9front.git] / sys / src / cmd / awk / re.c
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <u.h>
26 #include <libc.h>
27 #include <ctype.h>
28 #include <bio.h>
29 #include <regexp.h>
30 #include "awk.h"
31 #include "y.tab.h"
32
33         /* This file provides the interface between the main body of
34          * awk and the pattern matching package.  It preprocesses
35          * patterns prior to compilation to provide awk-like semantics
36          * to character sequences not supported by the pattern package.
37          * The following conversions are performed:
38          *
39          *      "()"            ->      "[]*"
40          *      "[-"            ->      "[\-"
41          *      "[^-"           ->      "[^\-"
42          *      "-]"            ->      "\-]"
43          *      "[]"            ->      "[]*"
44          *      "\xdddd"        ->      "\z" where 'z' is the UTF sequence
45          *                                      for the hex value
46          *      "\ddd"          ->      "\o" where 'o' is a char octal value
47          *      "\b"            ->      "\B"    where 'B' is backspace
48          *      "\t"            ->      "\T"    where 'T' is tab
49          *      "\f"            ->      "\F"    where 'F' is form feed
50          *      "\n"            ->      "\N"    where 'N' is newline
51          *      "\r"            ->      "\C"    where 'C' is cr
52          */
53
54 #define MAXRE   512
55
56 static char     re[MAXRE];      /* copy buffer */
57
58 char    *patbeg;
59 int     patlen;                 /* number of chars in pattern */
60
61 #define NPATS   20              /* number of slots in pattern cache */
62
63 static struct pat_list          /* dynamic pattern cache */
64 {
65         char    *re;
66         int     use;
67         Reprog  *program;
68 } pattern[NPATS];
69
70 static int npats;               /* cache fill level */
71
72         /* Compile a pattern */
73 void
74 *compre(char *pat)
75 {
76         int i, j, inclass;
77         char c, *p, *s;
78         Reprog *program;
79
80         if (!compile_time) {    /* search cache for dynamic pattern */
81                 for (i = 0; i < npats; i++)
82                         if (!strcmp(pat, pattern[i].re)) {
83                                 pattern[i].use++;
84                                 return((void *) pattern[i].program);
85                         }
86         }
87                 /* Preprocess Pattern for compilation */
88         p = re;
89         s = pat;
90         inclass = 0;
91         while (c = *s++) {
92                 if (c == '\\') {
93                         quoted(&s, &p, re+MAXRE);
94                         continue;
95                 }
96                 else if (!inclass && c == '(' && *s == ')') {
97                         if (p < re+MAXRE-2) {   /* '()' -> '[]*' */
98                                 *p++ = '[';
99                                 *p++ = ']';
100                                 c = '*';
101                                 s++;
102                         }
103                         else overflow();
104                 }
105                 else if (c == '['){                     /* '[-' -> '[\-' */
106                         inclass = 1;
107                         if (*s == '-') {
108                                 if (p < re+MAXRE-2) {
109                                         *p++ = '[';
110                                         *p++ = '\\';
111                                         c = *s++;
112                                 }
113                                 else overflow();
114                         }                               /* '[^-' -> '[^\-'*/
115                         else if (*s == '^' && s[1] == '-'){
116                                 if (p < re+MAXRE-3) {
117                                         *p++ = '[';
118                                         *p++ = *s++;
119                                         *p++ = '\\';
120                                         c = *s++;
121                                 }
122                                 else overflow();
123                         }
124                         else if (*s == '['){            /* skip '[[' */
125                                 if (p < re+MAXRE-1)
126                                         *p++ = c;
127                                 else overflow();
128                                 c = *s++;
129                         }
130                         else if (*s == '^' && s[1] == '[') {    /* skip '[^['*/
131                                 if (p < re+MAXRE-2) {
132                                         *p++ = c;
133                                         *p++ = *s++;
134                                         c = *s++;
135                                 }
136                                 else overflow();
137                         }
138                         else if (*s == ']') {           /* '[]' -> '[]*' */
139                                 if (p < re+MAXRE-2) {
140                                         *p++ = c;
141                                         *p++ = *s++;
142                                         c = '*';
143                                         inclass = 0;
144                                 }
145                                 else overflow();
146                         }
147                 }
148                 else if (c == '-' && *s == ']') {       /* '-]' -> '\-]' */
149                         if (p < re+MAXRE-1)
150                                 *p++ = '\\';
151                         else overflow();
152                 }
153                 else if (c == ']')
154                         inclass = 0;
155                 if (p < re+MAXRE-1)
156                         *p++ = c;
157                 else overflow();
158         }
159         *p = 0;
160         program = regcomp(re);          /* compile pattern */
161         if (!compile_time) {
162                 if (npats < NPATS)      /* Room in cache */
163                         i = npats++;
164                 else {                  /* Throw out least used */
165                         int use = pattern[0].use;
166                         i = 0;
167                         for (j = 1; j < NPATS; j++) {
168                                 if (pattern[j].use < use) {
169                                         use = pattern[j].use;
170                                         i = j;
171                                 }
172                         }
173                         xfree(pattern[i].program);
174                         xfree(pattern[i].re);
175                 }
176                 pattern[i].re = tostring(pat);
177                 pattern[i].program = program;
178                 pattern[i].use = 1;
179         }
180         return((void *) program);
181 }
182
183         /* T/F match indication - matched string not exported */
184 int
185 match(void *p, char *s, char *)
186 {
187         return regexec((Reprog *) p, (char *) s, 0, 0);
188 }
189
190         /* match and delimit the matched string */
191 int
192 pmatch(void *p, char *s, char *start)
193 {
194         Resub m;
195
196         m.sp = start;
197         m.ep = 0;
198         if (regexec((Reprog *) p, (char *) s, &m, 1)) {
199                 patbeg = m.sp;
200                 patlen = m.ep-m.sp;
201                 return 1;
202         }
203         patlen = -1;
204         patbeg = start;
205         return 0;
206 }
207
208         /* perform a non-empty match */
209 int
210 nematch(void *p, char *s, char *start)
211 {
212         if (pmatch(p, s, start) == 1 && patlen > 0)
213                 return 1;
214         patlen = -1;
215         patbeg = start; 
216         return 0;
217 }
218 /* in the parsing of regular expressions, metacharacters like . have */
219 /* to be seen literally;  \056 is not a metacharacter. */
220
/*
 * Evaluate the hexadecimal digits at *pp.
 *
 * At most four digits are consumed.  *pp is advanced past the digits
 * that were used and their value is returned; when *pp does not start
 * with a hex digit nothing is consumed and 0 is returned.
 */
int
hexstr(char **pp)
{
	char *p = *pp;
	int c, i, n = 0;

	for (i = 0; i < 4; i++) {
		/* ctype functions need an unsigned char value */
		c = (unsigned char)p[i];
		if (!isxdigit(c))
			break;
		if (isdigit(c))
			n = 16 * n + c - '0';
		else if ('a' <= c && c <= 'f')
			n = 16 * n + c - 'a' + 10;
		else if ('A' <= c && c <= 'F')
			n = 16 * n + c - 'A' + 10;
	}
	*pp += i;
	return n;
}
238
239         /* look for awk-specific escape sequences */
240
241 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
242
243 void
244 quoted(char **s, char **to, char *end)  /* handle escaped sequence */
245 {
246         char *p = *s;
247         char *t = *to;
248         Rune c;
249
250         switch(c = *p++) {
251         case 't':
252                 c = '\t';
253                 break;
254         case 'n':
255                 c = '\n';
256                 break;
257         case 'f':
258                 c = '\f';
259                 break;
260         case 'r':
261                 c = '\r';
262                 break;
263         case 'b':
264                 c = '\b';
265                 break;
266         default:
267                 if (t < end-1)          /* all else must be escaped */
268                         *t++ = '\\';
269                 if (c == 'x') {         /* hexadecimal goo follows */
270                         c = hexstr(&p);
271                         if (t < end-UTFmax)
272                                 t += runelen(c);
273                         else overflow();
274                         *to = t;
275                         *s = p;
276                         return;
277                 } else if (isoctdigit(c)) {     /* \d \dd \ddd */
278                         c -= '0';
279                         if (isoctdigit(*p)) {
280                                 c = 8 * c + *p++ - '0';
281                                 if (isoctdigit(*p))
282                                         c = 8 * c + *p++ - '0';
283                         }
284                 }
285                 break;
286         }
287         if (t < end-1)
288                 *t++ = c;
289         *s = p;
290         *to = t;
291 }
292
/*
 * Error handler for the pattern package: hands the message s to
 * FATAL.  Presumably invoked by the regexp library when a pattern
 * cannot be compiled -- nothing in this file calls it directly.
 */
void
regerror(char *s)
{
	FATAL("%s", s);
}
300
/*
 * Report that a preprocessed pattern no longer fits in its buffer.
 */
void
overflow(void)
{
	char *msg = "regular expression too big";

	FATAL("%s", msg);
}