]> git.lizzy.rs Git - plan9front.git/blob - sys/src/libregexp/regcomp.c
libavl, libregexp: put debug functions back
[plan9front.git] / sys / src / libregexp / regcomp.c
1 #include <u.h>
2 #include <libc.h>
3 #include <regexp.h>
4 #include "regimpl.h"
5
6 int instrcnt[] = {
7         [TANY]    2,
8         [TBOL]    1,
9         [TCAT]    0,
10         [TCLASS]  1,
11         [TEOL]    1,
12         [TNOTNL]  1,
13         [TOR]     2,
14         [TPLUS]   1,
15         [TQUES]   1,
16         [TRUNE]   1,
17         [TSTAR]   2,
18         [TSUB]    2
19 };
20
21 static Renode*
22 node(Parselex *plex, int op, Renode *l, Renode *r)
23 {
24         Renode *n;
25
26         plex->instrs += instrcnt[op];
27         n = plex->next++;
28         n->op = op;
29         n->left = l;
30         n->right = r;
31         return n;
32 }
33
34 static Renode*
35 e3(Parselex *plex)
36 {
37         char error[128];
38         Renode *n;
39
40         switch(lex(plex)) {
41         case LANY:
42                 return node(plex, TANY, nil, nil);
43         case LBOL:
44                 return node(plex, TBOL, nil, nil);
45         case LEOL:
46                 return node(plex, TEOL, nil, nil);
47         case LRUNE:
48                 n = node(plex, TRUNE, nil, nil);
49                 n->r = plex->rune;
50                 return n;
51         case LCLASS:
52                 if(plex->nc)
53                         return buildclassn(plex);
54                 return buildclass(plex);
55         case LLPAR:
56                 n = e0(plex);
57                 n = node(plex, TSUB, n, nil);
58                 if(lex(plex) != LRPAR) {
59                         snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig);
60                         regerror(error);
61                         longjmp(plex->exitenv, 1);
62                 }
63                 return n;
64         default:
65                 if(plex->rune)
66                         snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune);
67                 else
68                         snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig);
69                 regerror(error);
70                 longjmp(plex->exitenv, 1);
71         }
72         return nil;
73 }
74
75 static Renode*
76 e2(Parselex *plex)
77 {
78         Renode *n;
79
80         n = e3(plex);
81         while(lex(plex) == LREP) {
82                 switch(plex->rune) {
83                 case L'*':
84                         n = node(plex, TSTAR, n, nil);
85                         break;
86                 case L'+':
87                         n = node(plex, TPLUS, n, nil);
88                         break;
89                 case L'?':
90                         n = node(plex, TQUES, n, nil);
91                         break;
92                 }
93         }
94         plex->peek = 1;
95         return n;
96 }
97
98 static Renode*
99 invert(Renode *n)
100 {
101         Renode *n1;
102
103         if(n->op != TCAT)
104                 return n;
105         while(n->left->op == TCAT) {
106                 n1 = n->left;
107                 n->left = n1->right;
108                 n1->right = n;
109                 n = n1;
110         }
111         return n;
112 }
113
114 static Renode*
115 e1(Parselex *plex)
116 {
117         Renode *n;
118         int sym;
119
120         n = e2(plex);
121         for(;;) {
122                 sym = lex(plex);
123                 if(sym == LEND || sym == LOR || sym == LRPAR)
124                         break;
125                 plex->peek = 1;
126                 n = node(plex, TCAT, n, e2(plex));
127         }
128         plex->peek = 1;
129         return invert(n);
130 }
131
132 static Renode*
133 e0(Parselex *plex)
134 {
135         Renode *n;
136
137         n = e1(plex);
138         for(;;) {
139                 if(lex(plex) != LOR)
140                         break;
141                 n = node(plex, TOR, n, e1(plex));
142         }
143         plex->peek = 1;
144         return n;
145 }
146
147 static Parselex*
148 initplex(Parselex *plex, char *regstr, int lit)
149 {
150         plex->getnextr = lit ? getnextrlit : getnextr;
151         plex->rawexp = plex->orig = regstr;
152         plex->sub = 0;
153         plex->instrs = 0;
154         plex->peek = 0;
155         plex->done = 0;
156         return plex;
157 }
158
159 static int
160 maxthreads(Renode *tree)
161 {
162         tree = tree->left;
163         if(tree->op == TCAT)
164                 tree = tree->left;
165         if(tree->op == TBOL)
166                 return 2;
167         return -1;
168 }
169
170 static Reprog*
171 regcomp1(char *regstr, int nl, int lit)
172 {
173         Reprog *reprog;
174         Parselex plex;
175         Renode *parsetr;
176         int regstrlen, maxthr;
177
178         regstrlen = utflen(regstr);
179         initplex(&plex, regstr, lit);
180         plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2);
181         if(plex.nodes == nil)
182                 return nil;
183         plex.next = plex.nodes;
184
185         if(setjmp(plex.exitenv) != 0) {
186                 free(plex.nodes);
187                 return nil;
188         }
189
190         maxthr = regstrlen;
191         parsetr = node(&plex, TSUB, e0(&plex), nil);
192
193 //      prtree(parsetr, 0, 1);
194         reprog = malloc(sizeof(Reprog) +
195                         sizeof(Reinst) * plex.instrs +
196                         sizeof(Rethread) * maxthr);
197         reprog->len = plex.instrs;
198         reprog->nthr = maxthr;
199         reprog->startinst = compile(parsetr, reprog, nl);
200         reprog->threads = (Rethread*)(reprog->startinst + reprog->len);
201         reprog->regstr = regstr;
202
203         free(plex.nodes);
204         return reprog;
205 }
206
207 Reprog*
208 regcomp(char *str)
209 {
210         return regcomp1(str, 0, 0);
211 }
212
213 Reprog*
214 regcomplit(char *str)
215 {
216         return regcomp1(str, 0, 1);
217 }
218
219 Reprog*
220 regcompnl(char *str)
221 {
222         return regcomp1(str, 1, 0);
223 }
224
225 static Reinst*
226 compile1(Renode *renode, Reinst *reinst, int *sub, int nl)
227 {
228         Reinst *i;
229         int s;
230
231 Tailcall:
232         if(renode == nil)
233                 return reinst;
234         switch(renode->op) {
235         case TCLASS:
236                 reinst->op = OCLASS;
237                 reinst->r = renode->r;
238                 reinst->r1 = renode->r1;
239                 reinst->a = reinst + 1 + renode->nclass;
240                 renode = renode->left;
241                 reinst++;
242                 goto Tailcall;
243         case TCAT:
244                 reinst = compile1(renode->left, reinst, sub, nl);
245                 renode = renode->right;
246                 goto Tailcall;
247         case TOR:
248                 reinst->op = OSPLIT;
249                 reinst->a = reinst + 1;
250                 i = compile1(renode->left, reinst->a, sub, nl);
251                 reinst->b = i + 1;
252                 i->op = OJMP;
253                 i->a = compile1(renode->right, reinst->b, sub, nl);
254                 return i->a;
255         case TSTAR:
256                 reinst->op = OSPLIT;
257                 reinst->a = reinst + 1;
258                 i = compile1(renode->left, reinst->a, sub, nl);
259                 reinst->b = i + 1;
260                 i->op = OJMP;
261                 i->a = reinst;
262                 return reinst->b;
263         case TPLUS:
264                 i = reinst;
265                 reinst = compile1(renode->left, reinst, sub, nl);
266                 reinst->op = OSPLIT;
267                 reinst->a = i;
268                 reinst->b = reinst + 1;
269                 return reinst->b;
270         case TQUES:
271                 reinst->op = OSPLIT;
272                 reinst->a = reinst + 1;
273                 reinst->b = compile1(renode->left, reinst->a, sub, nl);
274                 return reinst->b;
275         case TSUB:
276                 reinst->op = OSAVE;
277                 reinst->sub = s = (*sub)++;
278                 reinst = compile1(renode->left, reinst+1, sub, nl);
279                 reinst->op = OUNSAVE;
280                 reinst->sub = s;
281                 return reinst + 1;
282         case TANY:
283                 if(nl == 0)
284                         reinst++->op = ONOTNL;
285                 reinst->op = OANY;
286                 return reinst + 1;
287         case TRUNE:
288                 reinst->op = ORUNE;
289                 reinst->r = renode->r;
290                 return reinst + 1;
291         case TNOTNL:
292                 reinst->op = ONOTNL;
293                 return reinst + 1;
294         case TEOL:
295                 reinst->op = OEOL;
296                 return reinst + 1;
297         case TBOL:
298                 reinst->op = OBOL;
299                 return reinst + 1;
300         }
301         return nil;
302 }
303
304 static Reinst*
305 compile(Renode *parsetr, Reprog *reprog, int nl)
306 {
307         Reinst *reinst;
308         int sub;
309
310         sub = 0;
311         reinst = (Reinst*)(reprog+1);
312         compile1(parsetr, reinst, &sub, nl);
313         return reinst;
314 }
315
316 static void
317 getnextr(Parselex *l)
318 {
319         l->literal = 0;
320         if(l->done) {
321                 l->rune = 0;
322                 return;
323         }
324         l->rawexp += chartorune(&l->rune, l->rawexp);
325         if(l->rune == L'\\') {
326                 l->rawexp += chartorune(&l->rune, l->rawexp);
327                 l->literal = 1;
328         }
329         if(*l->rawexp == 0)
330                 l->done = 1;
331         return;
332 }
333
334 static void
335 getnextrlit(Parselex *l)
336 {
337         l->literal = 1;
338         if(l->done) {
339                 l->literal = 0;
340                 l->rune = 0;
341                 return;
342         }
343         l->rawexp += chartorune(&l->rune, l->rawexp);
344         if(*l->rawexp == 0)
345                 l->done = 1;
346         return;
347 }
348
349 static int
350 lex(Parselex *l)
351 {
352         if(l->peek) {
353                 l->peek = 0;
354                 return l->peeklex;
355         }
356         l->getnextr(l);
357         if(l->literal)
358                 return l->peeklex = LRUNE;
359         switch(l->rune){
360         case 0:
361                 return l->peeklex = LEND;
362         case L'*':
363         case L'?':
364         case L'+':
365                 return l->peeklex = LREP;
366         case L'|':
367                 return l->peeklex = LOR;
368         case L'.':
369                 return l->peeklex = LANY;
370         case L'(':
371                 return l->peeklex = LLPAR;
372         case L')':
373                 return l->peeklex = LRPAR;
374         case L'^':
375                 return l->peeklex = LBOL;
376         case L'$':
377                 return l->peeklex = LEOL;
378         case L'[':
379                 getclass(l);
380                 return l->peeklex = LCLASS;
381         }
382         return l->peeklex = LRUNE;
383 }
384
385 static int
386 pcmp(void *va, void *vb)
387 {
388         vlong n;
389         Rune *a, *b;
390
391         a = va;
392         b = vb;
393
394         n = (vlong)b[0] - (vlong)a[0];
395         if(n)
396                 return n;
397         return (vlong)b[1] - (vlong)a[1];
398 }
399
400 static void
401 getclass(Parselex *l)
402 {
403         Rune *p, *q, t;
404
405         l->nc = 0;
406         getnextrlit(l);
407         if(l->rune == L'^') {
408                 l->nc = 1;
409                 getnextrlit(l);
410         }
411         p = l->cpairs;
412         for(;;) {
413                 *p = l->rune;
414                 if(l->rune == L']')
415                         break;
416                 if(l->rune == L'-') {
417                         regerror("Malformed class");
418                         longjmp(l->exitenv, 1);
419                 }
420                 if(l->rune == '\\') {
421                         getnextrlit(l);
422                         *p = l->rune;
423                 }
424                 if(l->rune == 0) {
425                         regerror("No closing ] for class");
426                         longjmp(l->exitenv, 1);
427                 }
428                 getnextrlit(l);
429                 if(l->rune == L'-') {
430                         getnextrlit(l);
431                         if(l->rune == L']') {
432                                 regerror("Malformed class");
433                                 longjmp(l->exitenv, 1);
434                         }
435                         if(l->rune == L'-') {
436                                 regerror("Malformed class");
437                                 longjmp(l->exitenv, 1);
438                         }
439                         if(l->rune == L'\\')
440                                 getnextrlit(l);
441                         p[1] = l->rune;
442                         if(p[0] > p[1]) {
443                                 t = p[0];
444                                 p[0] = p[1];
445                                 p[1] = t;
446                         }
447                         getnextrlit(l);
448                 } else
449                         p[1] = p[0];
450                 if(p >= l->cpairs + nelem(l->cpairs) - 2) {
451                         regerror("Class too big\n");
452                         longjmp(l->exitenv, 1);
453                 }
454                 p += 2;
455         }
456         *p = L'\0';
457         qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp);
458         q = l->cpairs;
459         for(p = l->cpairs+2; *p != 0; p += 2) {
460                 if(p[1] < q[0] - 1) {
461                         q[2] = p[0];
462                         q[3] = p[1];
463                         q += 2;
464                         continue;
465                 }
466                 q[0] = p[0];
467                 if(p[1] > q[1])
468                         q[1] = p[1];
469         }
470         q[2] = 0;
471 }
472
473 /* classes are in descending order */
474 static Renode*
475 buildclassn(Parselex *l)
476 {
477         Renode *n;
478         Rune *p;
479         int i;
480
481         i = 0;
482         p = l->cpairs;
483         n = node(l, TCLASS, nil, nil);
484         n->r = p[1] + 1;
485         n->r1 = Runemax;
486         n->nclass = i++;
487
488         for(; *p != 0; p += 2) {
489                 n = node(l, TCLASS, n, nil);
490                 n->r = p[3] + 1;
491                 n->r1 = p[0] - 1;
492                 n->nclass = i++;
493         }
494         n->r = 0;
495         return node(l, TCAT, node(l, TNOTNL, nil, nil), n);
496 }
497
498 static Renode*
499 buildclass(Parselex *l)
500 {
501         Renode *n;
502         Rune *p;
503         int i;
504
505         i = 0;
506         n = node(l, TCLASS, nil, nil);
507         n->r = Runemax + 1;
508         n->nclass = i++;
509
510         for(p = l->cpairs; *p != 0; p += 2) {
511                 n = node(l, TCLASS, n, nil);
512                 n->r = p[0];
513                 n->r1 = p[1];
514                 n->nclass = i++;
515         }
516         return n;
517 }
518
519 static void
520 prtree(Renode *tree, int d, int f)
521 {
522         int i;
523
524         if(tree == nil)
525                 return;
526         if(f)
527         for(i = 0; i < d; i++)
528                 print("\t");
529         switch(tree->op) {
530         case TCAT:
531                 prtree(tree->left, d, 0);
532                 prtree(tree->right, d, 1);
533                 break;
534         case TOR:
535                 print("TOR\n");
536                 prtree(tree->left, d+1, 1);
537                 for(i = 0; i < d; i++)
538                         print("\t");
539                 print("|\n");
540                 prtree(tree->right, d+1, 1);
541                 break;
542         case TSTAR:
543                 print("*\n");
544                 prtree(tree->left, d+1, 1);
545                 break;
546         case TPLUS:
547                 print("+\n");
548                 prtree(tree->left, d+1, 1);
549                 break;
550         case TQUES:
551                 print("?\n");
552                 prtree(tree->left, d+1, 1);
553                 break;
554         case TANY:
555                 print(".\n");
556                 prtree(tree->left, d+1, 1);
557                 break;
558         case TBOL:
559                 print("^\n");
560                 break;
561         case TEOL:
562                 print("$\n");
563                 break;
564         case TSUB:
565                 print("TSUB\n");
566                 prtree(tree->left, d+1, 1);
567                 break;
568         case TRUNE:
569                 print("TRUNE: %C\n", tree->r);
570                 break;
571         case TNOTNL:
572                 print("TNOTNL: !\\n\n");
573                 break;
574         case TCLASS:
575                 print("CLASS: %C-%C\n", tree->r, tree->r1);
576                 prtree(tree->left, d, 1);
577                 break;
578         }
579 }