]> git.lizzy.rs Git - plan9front.git/blob - sys/src/libregexp/regcomp.c
check program size
[plan9front.git] / sys / src / libregexp / regcomp.c
1 #include <u.h>
2 #include <libc.h>
3 #include <regexp.h>
4 #include "regimpl.h"
5
6 int instrcnt[] = {
7         [TANY]    2,
8         [TBOL]    1,
9         [TCAT]    0,
10         [TCLASS]  1,
11         [TEOL]    1,
12         [TNOTNL]  1,
13         [TOR]     2,
14         [TPLUS]   1,
15         [TQUES]   1,
16         [TRUNE]   1,
17         [TSTAR]   2,
18         [TSUB]    2
19 };
20
21 static Renode*
22 node(Parselex *plex, int op, Renode *l, Renode *r)
23 {
24         Renode *n;
25
26         plex->instrs += instrcnt[op];
27         n = plex->next++;
28         n->op = op;
29         n->left = l;
30         n->right = r;
31         return n;
32 }
33
34 static Renode*
35 e3(Parselex *plex)
36 {
37         char error[128];
38         Renode *n;
39
40         switch(lex(plex)) {
41         case LANY:
42                 return node(plex, TANY, nil, nil);
43         case LBOL:
44                 return node(plex, TBOL, nil, nil);
45         case LEOL:
46                 return node(plex, TEOL, nil, nil);
47         case LRUNE:
48                 n = node(plex, TRUNE, nil, nil);
49                 n->r = plex->rune;
50                 return n;
51         case LCLASS:
52                 if(plex->nc)
53                         return buildclassn(plex);
54                 return buildclass(plex);
55         case LLPAR:
56                 n = e0(plex);
57                 n = node(plex, TSUB, n, nil);
58                 if(lex(plex) != LRPAR) {
59                         snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig);
60                         regerror(error);
61                         longjmp(plex->exitenv, 1);
62                 }
63                 return n;
64         default:
65                 if(plex->rune)
66                         snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune);
67                 else
68                         snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig);
69                 regerror(error);
70                 longjmp(plex->exitenv, 1);
71         }
72         return nil;
73 }
74
75 static Renode*
76 e2(Parselex *plex)
77 {
78         Renode *n;
79
80         n = e3(plex);
81         if(lex(plex) == LREP) {
82                 switch(plex->rune) {
83                 case L'*':
84                         return node(plex, TSTAR, n, nil);
85                 case L'+':
86                         return node(plex, TPLUS, n, nil);
87                 case L'?':
88                         return node(plex, TQUES, n, nil);
89                 }
90         }
91         plex->peek = 1;
92         return n;
93 }
94
95 static Renode*
96 invert(Renode *n)
97 {
98         Renode *n1;
99
100         if(n->op != TCAT)
101                 return n;
102         while(n->left->op == TCAT) {
103                 n1 = n->left;
104                 n->left = n1->right;
105                 n1->right = n;
106                 n = n1;
107         }
108         return n;
109 }
110
111 static Renode*
112 e1(Parselex *plex)
113 {
114         Renode *n;
115         int sym;
116
117         n = e2(plex);
118         for(;;) {
119                 sym = lex(plex);
120                 if(sym == LEND || sym == LOR || sym == LRPAR)
121                         break;
122                 plex->peek = 1;
123                 n = node(plex, TCAT, n, e2(plex));
124         }
125         plex->peek = 1;
126         return invert(n);
127 }
128
129 static Renode*
130 e0(Parselex *plex)
131 {
132         Renode *n;
133
134         n = e1(plex);
135         for(;;) {
136                 if(lex(plex) != LOR)
137                         break;
138                 n = node(plex, TOR, n, e1(plex));
139         }
140         plex->peek = 1;
141         return n;
142 }
143
144 static Parselex*
145 initplex(Parselex *plex, char *regstr, int lit)
146 {
147         plex->getnextr = lit ? getnextrlit : getnextr;
148         plex->rawexp = plex->orig = regstr;
149         plex->sub = 0;
150         plex->instrs = 0;
151         plex->peek = 0;
152         plex->done = 0;
153         return plex;
154 }
155
156 static int
157 maxthreads(Renode *tree)
158 {
159         tree = tree->left;
160         if(tree->op == TCAT)
161                 tree = tree->left;
162         if(tree->op == TBOL)
163                 return 2;
164         return -1;
165 }
166
167 static Reprog*
168 regcomp1(char *regstr, int nl, int lit)
169 {
170         Reprog *reprog;
171         Parselex plex;
172         Renode *parsetr;
173         int regstrlen, maxthr;
174
175         regstrlen = utflen(regstr);
176         initplex(&plex, regstr, lit);
177         plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2);
178         if(plex.nodes == nil)
179                 return nil;
180         plex.next = plex.nodes;
181
182         if(setjmp(plex.exitenv) != 0) {
183                 free(plex.nodes);
184                 return nil;
185         }
186
187         maxthr = regstrlen;
188         parsetr = node(&plex, TSUB, e0(&plex), nil);
189
190 //      prtree(parsetr, 0, 1);
191 //      fprint(2, "Program size %ld\n", sizeof(Reprog) + sizeof(Reinst) * plex.instrs + sizeof(Rethread) * maxthr);
192         reprog = malloc(sizeof(Reprog) +
193                         sizeof(Reinst) * plex.instrs +
194                         sizeof(Rethread) * maxthr);
195         reprog->len = plex.instrs;
196         reprog->nthr = maxthr;
197         reprog->startinst = compile(parsetr, reprog, nl);
198         reprog->threads = (Rethread*)(reprog->startinst + reprog->len);
199         reprog->regstr = regstr;
200
201         free(plex.nodes);
202         return reprog;
203 }
204
205 Reprog*
206 regcomp(char *str)
207 {
208         return regcomp1(str, 0, 0);
209 }
210
211 Reprog*
212 regcomplit(char *str)
213 {
214         return regcomp1(str, 0, 1);
215 }
216
217 Reprog*
218 regcompnl(char *str)
219 {
220         return regcomp1(str, 1, 0);
221 }
222
223 static Reinst*
224 compile1(Renode *renode, Reinst *reinst, int *sub, int nl)
225 {
226         Reinst *i;
227         int s;
228
229 Tailcall:
230         if(renode == nil)
231                 return reinst;
232         switch(renode->op) {
233         case TCLASS:
234                 reinst->op = OCLASS;
235                 reinst->r = renode->r;
236                 reinst->r1 = renode->r1;
237                 reinst->a = reinst + 1 + renode->nclass;
238                 renode = renode->left;
239                 reinst++;
240                 goto Tailcall;
241         case TCAT:
242                 reinst = compile1(renode->left, reinst, sub, nl);
243                 renode = renode->right;
244                 goto Tailcall;
245         case TOR:
246                 reinst->op = OSPLIT;
247                 reinst->a = reinst + 1;
248                 i = compile1(renode->left, reinst->a, sub, nl);
249                 reinst->b = i + 1;
250                 i->op = OJMP;
251                 i->a = compile1(renode->right, reinst->b, sub, nl);
252                 return i->a;
253         case TSTAR:
254                 reinst->op = OSPLIT;
255                 reinst->a = reinst + 1;
256                 i = compile1(renode->left, reinst->a, sub, nl);
257                 reinst->b = i + 1;
258                 i->op = OJMP;
259                 i->a = reinst;
260                 return reinst->b;
261         case TPLUS:
262                 i = reinst;
263                 reinst = compile1(renode->left, reinst, sub, nl);
264                 reinst->op = OSPLIT;
265                 reinst->a = i;
266                 reinst->b = reinst + 1;
267                 return reinst->b;
268         case TQUES:
269                 reinst->op = OSPLIT;
270                 reinst->a = reinst + 1;
271                 reinst->b = compile1(renode->left, reinst->a, sub, nl);
272                 return reinst->b;
273         case TSUB:
274                 reinst->op = OSAVE;
275                 reinst->sub = s = (*sub)++;
276                 reinst = compile1(renode->left, reinst+1, sub, nl);
277                 reinst->op = OUNSAVE;
278                 reinst->sub = s;
279                 return reinst + 1;
280         case TANY:
281                 if(nl == 0)
282                         reinst++->op = ONOTNL;
283                 reinst->op = OANY;
284                 return reinst + 1;
285         case TRUNE:
286                 reinst->op = ORUNE;
287                 reinst->r = renode->r;
288                 return reinst + 1;
289         case TNOTNL:
290                 reinst->op = ONOTNL;
291                 return reinst + 1;
292         case TEOL:
293                 reinst->op = OEOL;
294                 return reinst + 1;
295         case TBOL:
296                 reinst->op = OBOL;
297                 return reinst + 1;
298         }
299         return nil;
300 }
301
302 static Reinst*
303 compile(Renode *parsetr, Reprog *reprog, int nl)
304 {
305         Reinst *reinst;
306         int sub;
307
308         sub = 0;
309         reinst = (Reinst*)(reprog+1);
310         compile1(parsetr, reinst, &sub, nl);
311         return reinst;
312 }
313
314 static void
315 getnextr(Parselex *l)
316 {
317         l->literal = 0;
318         if(l->done) {
319                 l->rune = 0;
320                 return;
321         }
322         l->rawexp += chartorune(&l->rune, l->rawexp);
323         if(l->rune == L'\\') {
324                 l->rawexp += chartorune(&l->rune, l->rawexp);
325                 l->literal = 1;
326         }
327         if(*l->rawexp == 0)
328                 l->done = 1;
329         return;
330 }
331
332 static void
333 getnextrlit(Parselex *l)
334 {
335         l->literal = 1;
336         if(l->done) {
337                 l->literal = 0;
338                 l->rune = 0;
339                 return;
340         }
341         l->rawexp += chartorune(&l->rune, l->rawexp);
342         if(*l->rawexp == 0)
343                 l->done = 1;
344         return;
345 }
346
347 static int
348 lex(Parselex *l)
349 {
350         if(l->peek) {
351                 l->peek = 0;
352                 return l->peeklex;
353         }
354         l->getnextr(l);
355         if(l->literal)
356                 return l->peeklex = LRUNE;
357         switch(l->rune){
358         case 0:
359                 return l->peeklex = LEND;
360         case L'*':
361         case L'?':
362         case L'+':
363                 return l->peeklex = LREP;
364         case L'|':
365                 return l->peeklex = LOR;
366         case L'.':
367                 return l->peeklex = LANY;
368         case L'(':
369                 return l->peeklex = LLPAR;
370         case L')':
371                 return l->peeklex = LRPAR;
372         case L'^':
373                 return l->peeklex = LBOL;
374         case L'$':
375                 return l->peeklex = LEOL;
376         case L'[':
377                 getclass(l);
378                 return l->peeklex = LCLASS;
379         }
380         return l->peeklex = LRUNE;
381 }
382
383 static int
384 pcmp(void *va, void *vb)
385 {
386         vlong n;
387         Rune *a, *b;
388
389         a = va;
390         b = vb;
391
392         n = (vlong)b[0] - (vlong)a[0];
393         if(n)
394                 return n;
395         return (vlong)b[1] - (vlong)a[1];
396 }
397
398 static void
399 getclass(Parselex *l)
400 {
401         Rune *p, *q, t;
402
403         l->nc = 0;
404         getnextrlit(l);
405         if(l->rune == L'^') {
406                 l->nc = 1;
407                 getnextrlit(l);
408         }
409         p = l->cpairs;
410         for(;;) {
411                 *p = l->rune;
412                 if(l->rune == L']')
413                         break;
414                 if(l->rune == L'-') {
415                         regerror("Malformed class");
416                         longjmp(l->exitenv, 1);
417                 }
418                 if(l->rune == '\\') {
419                         getnextrlit(l);
420                         *p = l->rune;
421                 }
422                 if(l->rune == 0) {
423                         regerror("No closing ] for class");
424                         longjmp(l->exitenv, 1);
425                 }
426                 getnextrlit(l);
427                 if(l->rune == L'-') {
428                         getnextrlit(l);
429                         if(l->rune == L']') {
430                                 regerror("Malformed class");
431                                 longjmp(l->exitenv, 1);
432                         }
433                         if(l->rune == L'-') {
434                                 regerror("Malformed class");
435                                 longjmp(l->exitenv, 1);
436                         }
437                         if(l->rune == L'\\')
438                                 getnextrlit(l);
439                         p[1] = l->rune;
440                         if(p[0] > p[1]) {
441                                 t = p[0];
442                                 p[0] = p[1];
443                                 p[1] = t;
444                         }
445                         getnextrlit(l);
446                 } else
447                         p[1] = p[0];
448                 if(p >= l->cpairs + nelem(l->cpairs) - 2) {
449                         regerror("Class too big\n");
450                         longjmp(l->exitenv, 1);
451                 }
452                 p += 2;
453         }
454         *p = L'\0';
455         qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp);
456         q = l->cpairs;
457         for(p = l->cpairs+2; *p != 0; p += 2) {
458                 if(p[1] < q[0] - 1) {
459                         q[2] = p[0];
460                         q[3] = p[1];
461                         q += 2;
462                         continue;
463                 }
464                 q[0] = p[0];
465                 if(p[1] > q[1])
466                         q[1] = p[1];
467         }
468         q[2] = 0;
469 }
470
471 /* classes are in descending order */
472 static Renode*
473 buildclassn(Parselex *l)
474 {
475         Renode *n;
476         Rune *p;
477         int i;
478
479         i = 0;
480         p = l->cpairs;
481         n = node(l, TCLASS, nil, nil);
482         n->r = p[1] + 1;
483         n->r1 = Runemax;
484         n->nclass = i++;
485
486         for(; *p != 0; p += 2) {
487                 n = node(l, TCLASS, n, nil);
488                 n->r = p[3] + 1;
489                 n->r1 = p[0] - 1;
490                 n->nclass = i++;
491         }
492         n->r = 0;
493         return node(l, TCAT, node(l, TNOTNL, nil, nil), n);
494 }
495
496 static Renode*
497 buildclass(Parselex *l)
498 {
499         Renode *n;
500         Rune *p;
501         int i;
502
503         i = 0;
504         n = node(l, TCLASS, nil, nil);
505         n->r = Runemax + 1;
506         n->nclass = i++;
507
508         for(p = l->cpairs; *p != 0; p += 2) {
509                 n = node(l, TCLASS, n, nil);
510                 n->r = p[0];
511                 n->r1 = p[1];
512                 n->nclass = i++;
513         }
514         return n;
515 }
516
517 static void
518 prtree(Renode *tree, int d, int f)
519 {
520         int i;
521
522         if(tree == nil)
523                 return;
524         if(f)
525         for(i = 0; i < d; i++)
526                 print("\t");
527         switch(tree->op) {
528         case TCAT:
529                 prtree(tree->left, d, 0);
530                 prtree(tree->right, d, 1);
531                 break;
532         case TOR:
533                 print("TOR\n");
534                 prtree(tree->left, d+1, 1);
535                 for(i = 0; i < d; i++)
536                         print("\t");
537                 print("|\n");
538                 prtree(tree->right, d+1, 1);
539                 break;
540         case TSTAR:
541                 print("*\n");
542                 prtree(tree->left, d+1, 1);
543                 break;
544         case TPLUS:
545                 print("+\n");
546                 prtree(tree->left, d+1, 1);
547                 break;
548         case TQUES:
549                 print("?\n");
550                 prtree(tree->left, d+1, 1);
551                 break;
552         case TANY:
553                 print(".\n");
554                 prtree(tree->left, d+1, 1);
555                 break;
556         case TBOL:
557                 print("^\n");
558                 break;
559         case TEOL:
560                 print("$\n");
561                 break;
562         case TSUB:
563                 print("TSUB\n");
564                 prtree(tree->left, d+1, 1);
565                 break;
566         case TRUNE:
567                 print("TRUNE: %C\n", tree->r);
568                 break;
569         case TNOTNL:
570                 print("TNOTNL: !\\n\n");
571                 break;
572         case TCLASS:
573                 print("CLASS: %C-%C\n", tree->r, tree->r1);
574                 prtree(tree->left, d, 1);
575                 break;
576         }
577 }