]> git.lizzy.rs Git - plan9front.git/blob - sys/src/libregexp/regcomp.c
New libregexp and APE ported to native
[plan9front.git] / sys / src / libregexp / regcomp.c
1 #include <u.h>
2 #include <libc.h>
3 #include <regexp.h>
4 #include "regimpl.h"
5
6 int instrcnt[] = {
7         [TANY]    2,
8         [TBOL]    1,
9         [TCAT]    0,
10         [TCLASS]  1,
11         [TEOL]    1,
12         [TNOTNL]  1,
13         [TOR]     2,
14         [TPLUS]   1,
15         [TQUES]   1,
16         [TRUNE]   1,
17         [TSTAR]   2,
18         [TSUB]    2
19 };
20
21 static Renode*
22 node(Parselex *plex, int op, Renode *l, Renode *r)
23 {
24         Renode *n;
25
26         plex->instrs += instrcnt[op];
27         n = plex->next++;
28         n->op = op;
29         n->left = l;
30         n->right = r;
31         return n;
32 }
33
34 static Renode*
35 e3(Parselex *plex)
36 {
37         char error[128];
38         Renode *n;
39
40         switch(lex(plex)) {
41         case LANY:
42                 return node(plex, TANY, nil, nil);
43         case LBOL:
44                 return node(plex, TBOL, nil, nil);
45         case LEOL:
46                 return node(plex, TEOL, nil, nil);
47         case LRUNE:
48                 n = node(plex, TRUNE, nil, nil);
49                 n->r = plex->rune;
50                 return n;
51         case LCLASS:
52                 if(plex->nc)
53                         return buildclassn(plex);
54                 return buildclass(plex);
55         case LLPAR:
56                 n = e0(plex);
57                 n = node(plex, TSUB, n, nil);
58                 if(lex(plex) != LRPAR) {
59                         snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig);
60                         regerror(error);
61                         longjmp(plex->exitenv, 1);
62                 }
63                 return n;
64         default:
65                 if(plex->rune)
66                         snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune);
67                 else
68                         snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig);
69                 regerror(error);
70                 longjmp(plex->exitenv, 1);
71         }
72         return nil;
73 }
74
75 static Renode*
76 e2(Parselex *plex)
77 {
78         Renode *n;
79
80         n = e3(plex);
81         if(lex(plex) == LREP) {
82                 switch(plex->rune) {
83                 case L'*':
84                         return node(plex, TSTAR, n, nil);
85                 case L'+':
86                         return node(plex, TPLUS, n, nil);
87                 case L'?':
88                         return node(plex, TQUES, n, nil);
89                 }
90         }
91         plex->peek = 1;
92         return n;
93 }
94
95 static Renode*
96 invert(Renode *n)
97 {
98         Renode *n1;
99
100         if(n->op != TCAT)
101                 return n;
102         while(n->left->op == TCAT) {
103                 n1 = n->left;
104                 n->left = n1->right;
105                 n1->right = n;
106                 n = n1;
107         }
108         return n;
109 }
110
111 static Renode*
112 e1(Parselex *plex)
113 {
114         Renode *n;
115         int sym;
116
117         n = e2(plex);
118         for(;;) {
119                 sym = lex(plex);
120                 if(sym == LEND || sym == LOR || sym == LRPAR)
121                         break;
122                 plex->peek = 1;
123                 n = node(plex, TCAT, n, e2(plex));
124         }
125         plex->peek = 1;
126         return invert(n);
127 }
128
129 static Renode*
130 e0(Parselex *plex)
131 {
132         Renode *n;
133
134         n = e1(plex);
135         for(;;) {
136                 if(lex(plex) != LOR)
137                         break;
138                 n = node(plex, TOR, n, e1(plex));
139         }
140         plex->peek = 1;
141         return n;
142 }
143
144 static Parselex*
145 initplex(Parselex *plex, char *regstr, int lit)
146 {
147         plex->getnextr = lit ? getnextrlit : getnextr;
148         plex->rawexp = plex->orig = regstr;
149         plex->sub = 0;
150         plex->instrs = 0;
151         plex->peek = 0;
152         plex->done = 0;
153         return plex;
154 }
155
156 static int
157 maxthreads(Renode *tree)
158 {
159         tree = tree->left;
160         if(tree->op == TCAT)
161                 tree = tree->left;
162         if(tree->op == TBOL)
163                 return 2;
164         return -1;
165 }
166
167 static Reprog*
168 regcomp1(char *regstr, int nl, int lit)
169 {
170         Reprog *reprog;
171         Parselex plex;
172         Renode *parsetr;
173         int regstrlen, maxthr;
174
175         regstrlen = utflen(regstr);
176         initplex(&plex, regstr, lit);
177         plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2);
178         if(plex.nodes == nil)
179                 return nil;
180         plex.next = plex.nodes;
181
182         if(setjmp(plex.exitenv) != 0) {
183                 free(plex.nodes);
184                 return nil;
185         }
186
187         parsetr = node(&plex, TSUB, e0(&plex), nil);
188         maxthr = maxthreads(parsetr);
189         if(maxthr == -1)
190                 maxthr = regstrlen;
191
192         prtree(parsetr, 0, 1);
193         reprog = malloc(sizeof(Reprog) +
194                         sizeof(Reinst) * plex.instrs +
195                         sizeof(Rethread) * maxthr +
196                         sizeof(Rethread*) * maxthr);
197         reprog->len = plex.instrs;
198         reprog->nthr = maxthr;
199         reprog->startinst = compile(parsetr, reprog, nl);
200         reprog->threads = (Rethread*)(reprog->startinst + reprog->len);
201         reprog->thrpool = (Rethread**)(reprog->threads + reprog->nthr);
202         reprog->regstr = regstr;
203
204         free(plex.nodes);
205         return reprog;
206 }
207
208 Reprog*
209 regcomp(char *str)
210 {
211         return regcomp1(str, 0, 0);
212 }
213
214 Reprog*
215 regcomplit(char *str)
216 {
217         return regcomp1(str, 0, 1);
218 }
219
220 Reprog*
221 regcompnl(char *str)
222 {
223         return regcomp1(str, 1, 0);
224 }
225
226 static Reinst*
227 compile1(Renode *renode, Reinst *reinst, int *sub, int nl)
228 {
229         Reinst *i;
230         int s;
231
232 Tailcall:
233         if(renode == nil)
234                 return reinst;
235         switch(renode->op) {
236         case TCLASS:
237                 reinst->op = OCLASS;
238                 reinst->r = renode->r;
239                 reinst->r1 = renode->r1;
240                 reinst->a = reinst + 1 + renode->nclass;
241                 renode = renode->left;
242                 reinst++;
243                 goto Tailcall;
244         case TCAT:
245                 reinst = compile1(renode->left, reinst, sub, nl);
246                 renode = renode->right;
247                 goto Tailcall;
248         case TOR:
249                 reinst->op = OSPLIT;
250                 reinst->a = reinst + 1;
251                 i = compile1(renode->left, reinst->a, sub, nl);
252                 reinst->b = i + 1;
253                 i->op = OJMP;
254                 i->a = compile1(renode->right, reinst->b, sub, nl);
255                 return i->a;
256         case TSTAR:
257                 reinst->op = OSPLIT;
258                 reinst->a = reinst + 1;
259                 i = compile1(renode->left, reinst->a, sub, nl);
260                 reinst->b = i + 1;
261                 i->op = OJMP;
262                 i->a = reinst;
263                 return reinst->b;
264         case TPLUS:
265                 i = reinst;
266                 reinst = compile1(renode->left, reinst, sub, nl);
267                 reinst->op = OSPLIT;
268                 reinst->a = i;
269                 reinst->b = reinst + 1;
270                 return reinst->b;
271         case TQUES:
272                 reinst->op = OSPLIT;
273                 reinst->a = reinst + 1;
274                 reinst->b = compile1(renode->left, reinst->a, sub, nl);
275                 return reinst->b;
276         case TSUB:
277                 reinst->op = OSAVE;
278                 reinst->sub = s = (*sub)++;
279                 reinst = compile1(renode->left, reinst+1, sub, nl);
280                 reinst->op = OUNSAVE;
281                 reinst->sub = s;
282                 return reinst + 1;
283         case TANY:
284                 if(nl == 0)
285                         reinst++->op = ONOTNL;
286                 reinst->op = OANY;
287                 return reinst + 1;
288         case TRUNE:
289                 reinst->op = ORUNE;
290                 reinst->r = renode->r;
291                 return reinst + 1;
292         case TNOTNL:
293                 reinst->op = ONOTNL;
294                 return reinst + 1;
295         case TEOL:
296                 reinst->op = OEOL;
297                 return reinst + 1;
298         case TBOL:
299                 reinst->op = OBOL;
300                 return reinst + 1;
301         }
302         return nil;
303 }
304
305 static Reinst*
306 compile(Renode *parsetr, Reprog *reprog, int nl)
307 {
308         Reinst *reinst;
309         int sub;
310
311         sub = 0;
312         reinst = (Reinst*)(reprog+1);
313         compile1(parsetr, reinst, &sub, nl);
314         return reinst;
315 }
316
317 static void
318 getnextr(Parselex *l)
319 {
320         l->literal = 0;
321         if(l->done) {
322                 l->rune = 0;
323                 return;
324         }
325         l->rawexp += chartorune(&l->rune, l->rawexp);
326         if(l->rune == L'\\') {
327                 l->rawexp += chartorune(&l->rune, l->rawexp);
328                 l->literal = 1;
329         }
330         if(*l->rawexp == 0)
331                 l->done = 1;
332         return;
333 }
334
335 static void
336 getnextrlit(Parselex *l)
337 {
338         l->literal = 1;
339         if(l->done) {
340                 l->literal = 0;
341                 l->rune = 0;
342                 return;
343         }
344         l->rawexp += chartorune(&l->rune, l->rawexp);
345         if(*l->rawexp == 0)
346                 l->done = 1;
347         return;
348 }
349
350 static int
351 lex(Parselex *l)
352 {
353         if(l->peek) {
354                 l->peek = 0;
355                 return l->peeklex;
356         }
357         l->getnextr(l);
358         if(l->literal)
359                 return l->peeklex = LRUNE;
360         switch(l->rune){
361         case 0:
362                 return l->peeklex = LEND;
363         case L'*':
364         case L'?':
365         case L'+':
366                 return l->peeklex = LREP;
367         case L'|':
368                 return l->peeklex = LOR;
369         case L'.':
370                 return l->peeklex = LANY;
371         case L'(':
372                 return l->peeklex = LLPAR;
373         case L')':
374                 return l->peeklex = LRPAR;
375         case L'^':
376                 return l->peeklex = LBOL;
377         case L'$':
378                 return l->peeklex = LEOL;
379         case L'[':
380                 getclass(l);
381                 return l->peeklex = LCLASS;
382         }
383         return l->peeklex = LRUNE;
384 }
385
386 static int
387 pcmp(void *va, void *vb)
388 {
389         vlong n;
390         Rune *a, *b;
391
392         a = va;
393         b = vb;
394
395         n = (vlong)b[0] - (vlong)a[0];
396         if(n)
397                 return n;
398         return (vlong)b[1] - (vlong)a[1];
399 }
400
401 static void
402 getclass(Parselex *l)
403 {
404         Rune *p, *q, t;
405
406         l->nc = 0;
407         getnextrlit(l);
408         if(l->rune == L'^') {
409                 l->nc = 1;
410                 getnextrlit(l);
411         }
412         p = l->cpairs;
413         for(;;) {
414                 *p = l->rune;
415                 if(l->rune == L']')
416                         break;
417                 if(l->rune == L'-') {
418                         regerror("Malformed class");
419                         longjmp(l->exitenv, 1);
420                 }
421                 if(l->rune == '\\') {
422                         getnextrlit(l);
423                         *p = l->rune;
424                 }
425                 if(l->rune == 0) {
426                         regerror("No closing ] for class");
427                         longjmp(l->exitenv, 1);
428                 }
429                 getnextrlit(l);
430                 if(l->rune == L'-') {
431                         getnextrlit(l);
432                         if(l->rune == L']') {
433                                 regerror("Malformed class");
434                                 longjmp(l->exitenv, 1);
435                         }
436                         if(l->rune == L'-') {
437                                 regerror("Malformed class");
438                                 longjmp(l->exitenv, 1);
439                         }
440                         if(l->rune == L'\\')
441                                 getnextrlit(l);
442                         p[1] = l->rune;
443                         if(p[0] > p[1]) {
444                                 t = p[0];
445                                 p[0] = p[1];
446                                 p[1] = t;
447                         }
448                         getnextrlit(l);
449                 } else
450                         p[1] = p[0];
451                 if(p >= l->cpairs + nelem(l->cpairs) - 2) {
452                         regerror("Class too big\n");
453                         longjmp(l->exitenv, 1);
454                 }
455                 p += 2;
456         }
457         *p = L'\0';
458         qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp);
459         q = l->cpairs;
460         for(p = l->cpairs+2; *p != 0; p += 2) {
461                 if(p[1] < q[0] - 1) {
462                         q[2] = p[0];
463                         q[3] = p[1];
464                         q += 2;
465                         continue;
466                 }
467                 q[0] = p[0];
468                 if(p[1] > q[1])
469                         q[1] = p[1];
470         }
471         q[2] = 0;
472 }
473
474 /* classes are in descending order */
475 static Renode*
476 buildclassn(Parselex *l)
477 {
478         Renode *n;
479         Rune *p;
480         int i;
481
482         i = 0;
483         p = l->cpairs;
484         n = node(l, TCLASS, nil, nil);
485         n->r = p[1] + 1;
486         n->r1 = Runemax;
487         n->nclass = i++;
488
489         for(; *p != 0; p += 2) {
490                 n = node(l, TCLASS, n, nil);
491                 n->r = p[3] + 1;
492                 n->r1 = p[0] - 1;
493                 n->nclass = i++;
494         }
495         n->r = 0;
496         return node(l, TCAT, node(l, TNOTNL, nil, nil), n);
497 }
498
499 static Renode*
500 buildclass(Parselex *l)
501 {
502         Renode *n;
503         Rune *p;
504         int i;
505
506         i = 0;
507         n = node(l, TCLASS, nil, nil);
508         n->r = Runemax + 1;
509         n->nclass = i++;
510
511         for(p = l->cpairs; *p != 0; p += 2) {
512                 n = node(l, TCLASS, n, nil);
513                 n->r = p[0];
514                 n->r1 = p[1];
515                 n->nclass = i++;
516         }
517         return n;
518 }
519
520 static void
521 prtree(Renode *tree, int d, int f)
522 {
523         int i;
524
525         if(tree == nil)
526                 return;
527         if(f)
528         for(i = 0; i < d; i++)
529                 print("\t");
530         switch(tree->op) {
531         case TCAT:
532                 prtree(tree->left, d, 0);
533                 prtree(tree->right, d, 1);
534                 break;
535         case TOR:
536                 print("TOR\n");
537                 prtree(tree->left, d+1, 1);
538                 for(i = 0; i < d; i++)
539                         print("\t");
540                 print("|\n");
541                 prtree(tree->right, d+1, 1);
542                 break;
543         case TSTAR:
544                 print("*\n");
545                 prtree(tree->left, d+1, 1);
546                 break;
547         case TPLUS:
548                 print("+\n");
549                 prtree(tree->left, d+1, 1);
550                 break;
551         case TQUES:
552                 print("?\n");
553                 prtree(tree->left, d+1, 1);
554                 break;
555         case TANY:
556                 print(".\n");
557                 prtree(tree->left, d+1, 1);
558                 break;
559         case TBOL:
560                 print("^\n");
561                 break;
562         case TEOL:
563                 print("$\n");
564                 break;
565         case TSUB:
566                 print("TSUB\n");
567                 prtree(tree->left, d+1, 1);
568                 break;
569         case TRUNE:
570                 print("TRUNE: %C\n", tree->r);
571                 break;
572         case TNOTNL:
573                 print("TNOTNL: !\\n\n");
574                 break;
575         case TCLASS:
576                 print("CLASS: %C-%C\n", tree->r, tree->r1);
577                 prtree(tree->left, d, 1);
578                 break;
579         }
580 }