]> git.lizzy.rs Git - plan9front.git/blob - sys/src/libregexp/regcomp.c
libregexp: simplify regular expression vm implementation
[plan9front.git] / sys / src / libregexp / regcomp.c
1 #include <u.h>
2 #include <libc.h>
3 #include <regexp.h>
4 #include "regimpl.h"
5
6 int instrcnt[] = {
7         [TANY]    2,
8         [TBOL]    1,
9         [TCAT]    0,
10         [TCLASS]  1,
11         [TEOL]    1,
12         [TNOTNL]  1,
13         [TOR]     2,
14         [TPLUS]   1,
15         [TQUES]   1,
16         [TRUNE]   1,
17         [TSTAR]   2,
18         [TSUB]    2
19 };
20
21 static Renode*
22 node(Parselex *plex, int op, Renode *l, Renode *r)
23 {
24         Renode *n;
25
26         plex->instrs += instrcnt[op];
27         n = plex->next++;
28         n->op = op;
29         n->left = l;
30         n->right = r;
31         return n;
32 }
33
34 static Renode*
35 e3(Parselex *plex)
36 {
37         char error[128];
38         Renode *n;
39
40         switch(lex(plex)) {
41         case LANY:
42                 return node(plex, TANY, nil, nil);
43         case LBOL:
44                 return node(plex, TBOL, nil, nil);
45         case LEOL:
46                 return node(plex, TEOL, nil, nil);
47         case LRUNE:
48                 n = node(plex, TRUNE, nil, nil);
49                 n->r = plex->rune;
50                 return n;
51         case LCLASS:
52                 if(plex->nc)
53                         return buildclassn(plex);
54                 return buildclass(plex);
55         case LLPAR:
56                 n = e0(plex);
57                 n = node(plex, TSUB, n, nil);
58                 if(lex(plex) != LRPAR) {
59                         snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig);
60                         regerror(error);
61                         longjmp(plex->exitenv, 1);
62                 }
63                 return n;
64         default:
65                 if(plex->rune)
66                         snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune);
67                 else
68                         snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig);
69                 regerror(error);
70                 longjmp(plex->exitenv, 1);
71         }
72         return nil;
73 }
74
75 static Renode*
76 e2(Parselex *plex)
77 {
78         Renode *n;
79
80         n = e3(plex);
81         while(lex(plex) == LREP) {
82                 switch(plex->rune) {
83                 case L'*':
84                         n = node(plex, TSTAR, n, nil);
85                         break;
86                 case L'+':
87                         n = node(plex, TPLUS, n, nil);
88                         break;
89                 case L'?':
90                         n = node(plex, TQUES, n, nil);
91                         break;
92                 }
93         }
94         plex->peek = 1;
95         return n;
96 }
97
98 static Renode*
99 invert(Renode *n)
100 {
101         Renode *n1;
102
103         if(n->op != TCAT)
104                 return n;
105         while(n->left->op == TCAT) {
106                 n1 = n->left;
107                 n->left = n1->right;
108                 n1->right = n;
109                 n = n1;
110         }
111         return n;
112 }
113
114 static Renode*
115 e1(Parselex *plex)
116 {
117         Renode *n;
118         int sym;
119
120         n = e2(plex);
121         for(;;) {
122                 sym = lex(plex);
123                 if(sym == LEND || sym == LOR || sym == LRPAR)
124                         break;
125                 plex->peek = 1;
126                 n = node(plex, TCAT, n, e2(plex));
127         }
128         plex->peek = 1;
129         return invert(n);
130 }
131
132 static Renode*
133 e0(Parselex *plex)
134 {
135         Renode *n;
136
137         n = e1(plex);
138         for(;;) {
139                 if(lex(plex) != LOR)
140                         break;
141                 n = node(plex, TOR, n, e1(plex));
142         }
143         plex->peek = 1;
144         return n;
145 }
146
147 static Parselex*
148 initplex(Parselex *plex, char *regstr, int lit)
149 {
150         plex->getnextr = lit ? getnextrlit : getnextr;
151         plex->rawexp = plex->orig = regstr;
152         plex->sub = 0;
153         plex->instrs = 0;
154         plex->peek = 0;
155         plex->done = 0;
156         return plex;
157 }
158
159 static Reprog*
160 regcomp1(char *regstr, int nl, int lit)
161 {
162         Reprog *reprog;
163         Parselex plex;
164         Renode *parsetr;
165         int regstrlen, maxthr;
166
167         regstrlen = utflen(regstr);
168         initplex(&plex, regstr, lit);
169         plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2);
170         if(plex.nodes == nil)
171                 return nil;
172         plex.next = plex.nodes;
173
174         if(setjmp(plex.exitenv) != 0) {
175                 free(plex.nodes);
176                 return nil;
177         }
178
179         maxthr = regstrlen + 1;
180         parsetr = node(&plex, TSUB, e0(&plex), nil);
181
182 //      prtree(parsetr, 0, 1);
183         reprog = malloc(sizeof(Reprog) +
184                         sizeof(Reinst) * plex.instrs +
185                         sizeof(Rethread) * maxthr);
186         reprog->len = plex.instrs;
187         reprog->nthr = maxthr;
188         reprog->startinst = compile(parsetr, reprog, nl);
189         reprog->threads = (Rethread*)(reprog->startinst + reprog->len);
190         reprog->regstr = regstr;
191
192         free(plex.nodes);
193         return reprog;
194 }
195
196 Reprog*
197 regcomp(char *str)
198 {
199         return regcomp1(str, 0, 0);
200 }
201
202 Reprog*
203 regcomplit(char *str)
204 {
205         return regcomp1(str, 0, 1);
206 }
207
208 Reprog*
209 regcompnl(char *str)
210 {
211         return regcomp1(str, 1, 0);
212 }
213
214 static Reinst*
215 compile1(Renode *renode, Reinst *reinst, int *sub, int nl)
216 {
217         Reinst *i;
218         int s;
219
220 Tailcall:
221         if(renode == nil)
222                 return reinst;
223         switch(renode->op) {
224         case TCLASS:
225                 reinst->op = OCLASS;
226                 reinst->r = renode->r;
227                 reinst->r1 = renode->r1;
228                 reinst->a = reinst + 1 + renode->nclass;
229                 renode = renode->left;
230                 reinst++;
231                 goto Tailcall;
232         case TCAT:
233                 reinst = compile1(renode->left, reinst, sub, nl);
234                 renode = renode->right;
235                 goto Tailcall;
236         case TOR:
237                 reinst->op = OSPLIT;
238                 reinst->a = reinst + 1;
239                 i = compile1(renode->left, reinst->a, sub, nl);
240                 reinst->b = i + 1;
241                 i->op = OJMP;
242                 i->a = compile1(renode->right, reinst->b, sub, nl);
243                 return i->a;
244         case TSTAR:
245                 reinst->op = OSPLIT;
246                 reinst->a = reinst + 1;
247                 i = compile1(renode->left, reinst->a, sub, nl);
248                 reinst->b = i + 1;
249                 i->op = OJMP;
250                 i->a = reinst;
251                 return reinst->b;
252         case TPLUS:
253                 i = reinst;
254                 reinst = compile1(renode->left, reinst, sub, nl);
255                 reinst->op = OSPLIT;
256                 reinst->a = i;
257                 reinst->b = reinst + 1;
258                 return reinst->b;
259         case TQUES:
260                 reinst->op = OSPLIT;
261                 reinst->a = reinst + 1;
262                 reinst->b = compile1(renode->left, reinst->a, sub, nl);
263                 return reinst->b;
264         case TSUB:
265                 reinst->op = OSAVE;
266                 reinst->sub = s = (*sub)++;
267                 reinst = compile1(renode->left, reinst+1, sub, nl);
268                 reinst->op = OUNSAVE;
269                 reinst->sub = s;
270                 return reinst + 1;
271         case TANY:
272                 if(nl == 0)
273                         reinst++->op = ONOTNL;
274                 reinst->op = OANY;
275                 return reinst + 1;
276         case TRUNE:
277                 reinst->op = ORUNE;
278                 reinst->r = renode->r;
279                 return reinst + 1;
280         case TNOTNL:
281                 reinst->op = ONOTNL;
282                 return reinst + 1;
283         case TEOL:
284                 reinst->op = OEOL;
285                 return reinst + 1;
286         case TBOL:
287                 reinst->op = OBOL;
288                 return reinst + 1;
289         }
290         return nil;
291 }
292
293 static Reinst*
294 compile(Renode *parsetr, Reprog *reprog, int nl)
295 {
296         Reinst *reinst, *end;
297         int sub;
298
299         sub = 0;
300         reinst = (Reinst*)(reprog+1);
301         end = compile1(parsetr, reinst, &sub, nl);
302         assert(reinst + reprog->len == end);
303         return reinst;
304 }
305
306 static void
307 getnextr(Parselex *l)
308 {
309         l->literal = 0;
310         if(l->done) {
311                 l->rune = 0;
312                 return;
313         }
314         l->rawexp += chartorune(&l->rune, l->rawexp);
315         if(l->rune == L'\\') {
316                 l->rawexp += chartorune(&l->rune, l->rawexp);
317                 l->literal = 1;
318         }
319         if(*l->rawexp == 0)
320                 l->done = 1;
321         return;
322 }
323
324 static void
325 getnextrlit(Parselex *l)
326 {
327         l->literal = 1;
328         if(l->done) {
329                 l->literal = 0;
330                 l->rune = 0;
331                 return;
332         }
333         l->rawexp += chartorune(&l->rune, l->rawexp);
334         if(*l->rawexp == 0)
335                 l->done = 1;
336         return;
337 }
338
339 static int
340 lex(Parselex *l)
341 {
342         if(l->peek) {
343                 l->peek = 0;
344                 return l->peeklex;
345         }
346         l->getnextr(l);
347         if(l->literal)
348                 return l->peeklex = LRUNE;
349         switch(l->rune){
350         case 0:
351                 return l->peeklex = LEND;
352         case L'*':
353         case L'?':
354         case L'+':
355                 return l->peeklex = LREP;
356         case L'|':
357                 return l->peeklex = LOR;
358         case L'.':
359                 return l->peeklex = LANY;
360         case L'(':
361                 return l->peeklex = LLPAR;
362         case L')':
363                 return l->peeklex = LRPAR;
364         case L'^':
365                 return l->peeklex = LBOL;
366         case L'$':
367                 return l->peeklex = LEOL;
368         case L'[':
369                 getclass(l);
370                 return l->peeklex = LCLASS;
371         }
372         return l->peeklex = LRUNE;
373 }
374
375 static int
376 pcmp(void *va, void *vb)
377 {
378         vlong n;
379         Rune *a, *b;
380
381         a = va;
382         b = vb;
383
384         n = (vlong)b[0] - (vlong)a[0];
385         if(n)
386                 return n;
387         return (vlong)b[1] - (vlong)a[1];
388 }
389
390 static void
391 getclass(Parselex *l)
392 {
393         Rune *p, *q, t;
394
395         l->nc = 0;
396         getnextrlit(l);
397         if(l->rune == L'^') {
398                 l->nc = 1;
399                 getnextrlit(l);
400         }
401         p = l->cpairs;
402         for(;;) {
403                 *p = l->rune;
404                 if(l->rune == L']')
405                         break;
406                 if(l->rune == L'-') {
407                         regerror("Malformed class");
408                         longjmp(l->exitenv, 1);
409                 }
410                 if(l->rune == '\\') {
411                         getnextrlit(l);
412                         *p = l->rune;
413                 }
414                 if(l->rune == 0) {
415                         regerror("No closing ] for class");
416                         longjmp(l->exitenv, 1);
417                 }
418                 getnextrlit(l);
419                 if(l->rune == L'-') {
420                         getnextrlit(l);
421                         if(l->rune == L']') {
422                                 regerror("Malformed class");
423                                 longjmp(l->exitenv, 1);
424                         }
425                         if(l->rune == L'-') {
426                                 regerror("Malformed class");
427                                 longjmp(l->exitenv, 1);
428                         }
429                         if(l->rune == L'\\')
430                                 getnextrlit(l);
431                         p[1] = l->rune;
432                         if(p[0] > p[1]) {
433                                 t = p[0];
434                                 p[0] = p[1];
435                                 p[1] = t;
436                         }
437                         getnextrlit(l);
438                 } else
439                         p[1] = p[0];
440                 if(p >= l->cpairs + nelem(l->cpairs) - 2) {
441                         regerror("Class too big\n");
442                         longjmp(l->exitenv, 1);
443                 }
444                 p += 2;
445         }
446         *p = L'\0';
447         qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp);
448         q = l->cpairs;
449         for(p = l->cpairs+2; *p != 0; p += 2) {
450                 if(p[1] < q[0] - 1) {
451                         q[2] = p[0];
452                         q[3] = p[1];
453                         q += 2;
454                         continue;
455                 }
456                 q[0] = p[0];
457                 if(p[1] > q[1])
458                         q[1] = p[1];
459         }
460         q[2] = 0;
461 }
462
463 /* classes are in descending order */
464 static Renode*
465 buildclassn(Parselex *l)
466 {
467         Renode *n;
468         Rune *p;
469         int i;
470
471         i = 0;
472         p = l->cpairs;
473         n = node(l, TCLASS, nil, nil);
474         n->r = p[1] + 1;
475         n->r1 = Runemax;
476         n->nclass = i++;
477
478         for(; *p != 0; p += 2) {
479                 n = node(l, TCLASS, n, nil);
480                 n->r = p[3] + 1;
481                 n->r1 = p[0] - 1;
482                 n->nclass = i++;
483         }
484         n->r = 0;
485         return node(l, TCAT, node(l, TNOTNL, nil, nil), n);
486 }
487
488 static Renode*
489 buildclass(Parselex *l)
490 {
491         Renode *n;
492         Rune *p;
493         int i;
494
495         i = 0;
496         n = node(l, TCLASS, nil, nil);
497         n->r = Runemax + 1;
498         n->nclass = i++;
499
500         for(p = l->cpairs; *p != 0; p += 2) {
501                 n = node(l, TCLASS, n, nil);
502                 n->r = p[0];
503                 n->r1 = p[1];
504                 n->nclass = i++;
505         }
506         return n;
507 }
508
509 static void
510 prtree(Renode *tree, int d, int f)
511 {
512         int i;
513
514         if(tree == nil)
515                 return;
516         if(f)
517         for(i = 0; i < d; i++)
518                 print("\t");
519         switch(tree->op) {
520         case TCAT:
521                 prtree(tree->left, d, 0);
522                 prtree(tree->right, d, 1);
523                 break;
524         case TOR:
525                 print("TOR\n");
526                 prtree(tree->left, d+1, 1);
527                 for(i = 0; i < d; i++)
528                         print("\t");
529                 print("|\n");
530                 prtree(tree->right, d+1, 1);
531                 break;
532         case TSTAR:
533                 print("*\n");
534                 prtree(tree->left, d+1, 1);
535                 break;
536         case TPLUS:
537                 print("+\n");
538                 prtree(tree->left, d+1, 1);
539                 break;
540         case TQUES:
541                 print("?\n");
542                 prtree(tree->left, d+1, 1);
543                 break;
544         case TANY:
545                 print(".\n");
546                 prtree(tree->left, d+1, 1);
547                 break;
548         case TBOL:
549                 print("^\n");
550                 break;
551         case TEOL:
552                 print("$\n");
553                 break;
554         case TSUB:
555                 print("TSUB\n");
556                 prtree(tree->left, d+1, 1);
557                 break;
558         case TRUNE:
559                 print("TRUNE: %C\n", tree->r);
560                 break;
561         case TNOTNL:
562                 print("TNOTNL: !\\n\n");
563                 break;
564         case TCLASS:
565                 print("CLASS: %C-%C\n", tree->r, tree->r1);
566                 prtree(tree->left, d, 1);
567                 break;
568         }
569 }