]> git.lizzy.rs Git - plan9front.git/blob - sys/src/libregexp/regcomp.c
get rid of size printing
[plan9front.git] / sys / src / libregexp / regcomp.c
1 #include <u.h>
2 #include <libc.h>
3 #include <regexp.h>
4 #include "regimpl.h"
5
6 int instrcnt[] = {
7         [TANY]    2,
8         [TBOL]    1,
9         [TCAT]    0,
10         [TCLASS]  1,
11         [TEOL]    1,
12         [TNOTNL]  1,
13         [TOR]     2,
14         [TPLUS]   1,
15         [TQUES]   1,
16         [TRUNE]   1,
17         [TSTAR]   2,
18         [TSUB]    2
19 };
20
21 static Renode*
22 node(Parselex *plex, int op, Renode *l, Renode *r)
23 {
24         Renode *n;
25
26         plex->instrs += instrcnt[op];
27         n = plex->next++;
28         n->op = op;
29         n->left = l;
30         n->right = r;
31         return n;
32 }
33
34 static Renode*
35 e3(Parselex *plex)
36 {
37         char error[128];
38         Renode *n;
39
40         switch(lex(plex)) {
41         case LANY:
42                 return node(plex, TANY, nil, nil);
43         case LBOL:
44                 return node(plex, TBOL, nil, nil);
45         case LEOL:
46                 return node(plex, TEOL, nil, nil);
47         case LRUNE:
48                 n = node(plex, TRUNE, nil, nil);
49                 n->r = plex->rune;
50                 return n;
51         case LCLASS:
52                 if(plex->nc)
53                         return buildclassn(plex);
54                 return buildclass(plex);
55         case LLPAR:
56                 n = e0(plex);
57                 n = node(plex, TSUB, n, nil);
58                 if(lex(plex) != LRPAR) {
59                         snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig);
60                         regerror(error);
61                         longjmp(plex->exitenv, 1);
62                 }
63                 return n;
64         default:
65                 if(plex->rune)
66                         snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune);
67                 else
68                         snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig);
69                 regerror(error);
70                 longjmp(plex->exitenv, 1);
71         }
72         return nil;
73 }
74
75 static Renode*
76 e2(Parselex *plex)
77 {
78         Renode *n;
79
80         n = e3(plex);
81         if(lex(plex) == LREP) {
82                 switch(plex->rune) {
83                 case L'*':
84                         return node(plex, TSTAR, n, nil);
85                 case L'+':
86                         return node(plex, TPLUS, n, nil);
87                 case L'?':
88                         return node(plex, TQUES, n, nil);
89                 }
90         }
91         plex->peek = 1;
92         return n;
93 }
94
95 static Renode*
96 invert(Renode *n)
97 {
98         Renode *n1;
99
100         if(n->op != TCAT)
101                 return n;
102         while(n->left->op == TCAT) {
103                 n1 = n->left;
104                 n->left = n1->right;
105                 n1->right = n;
106                 n = n1;
107         }
108         return n;
109 }
110
111 static Renode*
112 e1(Parselex *plex)
113 {
114         Renode *n;
115         int sym;
116
117         n = e2(plex);
118         for(;;) {
119                 sym = lex(plex);
120                 if(sym == LEND || sym == LOR || sym == LRPAR)
121                         break;
122                 plex->peek = 1;
123                 n = node(plex, TCAT, n, e2(plex));
124         }
125         plex->peek = 1;
126         return invert(n);
127 }
128
129 static Renode*
130 e0(Parselex *plex)
131 {
132         Renode *n;
133
134         n = e1(plex);
135         for(;;) {
136                 if(lex(plex) != LOR)
137                         break;
138                 n = node(plex, TOR, n, e1(plex));
139         }
140         plex->peek = 1;
141         return n;
142 }
143
144 static Parselex*
145 initplex(Parselex *plex, char *regstr, int lit)
146 {
147         plex->getnextr = lit ? getnextrlit : getnextr;
148         plex->rawexp = plex->orig = regstr;
149         plex->sub = 0;
150         plex->instrs = 0;
151         plex->peek = 0;
152         plex->done = 0;
153         return plex;
154 }
155
156 static int
157 maxthreads(Renode *tree)
158 {
159         tree = tree->left;
160         if(tree->op == TCAT)
161                 tree = tree->left;
162         if(tree->op == TBOL)
163                 return 2;
164         return -1;
165 }
166
167 static Reprog*
168 regcomp1(char *regstr, int nl, int lit)
169 {
170         Reprog *reprog;
171         Parselex plex;
172         Renode *parsetr;
173         int regstrlen, maxthr;
174
175         regstrlen = utflen(regstr);
176         initplex(&plex, regstr, lit);
177         plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2);
178         if(plex.nodes == nil)
179                 return nil;
180         plex.next = plex.nodes;
181
182         if(setjmp(plex.exitenv) != 0) {
183                 free(plex.nodes);
184                 return nil;
185         }
186
187         maxthr = regstrlen;
188         parsetr = node(&plex, TSUB, e0(&plex), nil);
189
190 //      prtree(parsetr, 0, 1);
191         reprog = malloc(sizeof(Reprog) +
192                         sizeof(Reinst) * plex.instrs +
193                         sizeof(Rethread) * maxthr);
194         reprog->len = plex.instrs;
195         reprog->nthr = maxthr;
196         reprog->startinst = compile(parsetr, reprog, nl);
197         reprog->threads = (Rethread*)(reprog->startinst + reprog->len);
198         reprog->regstr = regstr;
199
200         free(plex.nodes);
201         return reprog;
202 }
203
204 Reprog*
205 regcomp(char *str)
206 {
207         return regcomp1(str, 0, 0);
208 }
209
210 Reprog*
211 regcomplit(char *str)
212 {
213         return regcomp1(str, 0, 1);
214 }
215
216 Reprog*
217 regcompnl(char *str)
218 {
219         return regcomp1(str, 1, 0);
220 }
221
222 static Reinst*
223 compile1(Renode *renode, Reinst *reinst, int *sub, int nl)
224 {
225         Reinst *i;
226         int s;
227
228 Tailcall:
229         if(renode == nil)
230                 return reinst;
231         switch(renode->op) {
232         case TCLASS:
233                 reinst->op = OCLASS;
234                 reinst->r = renode->r;
235                 reinst->r1 = renode->r1;
236                 reinst->a = reinst + 1 + renode->nclass;
237                 renode = renode->left;
238                 reinst++;
239                 goto Tailcall;
240         case TCAT:
241                 reinst = compile1(renode->left, reinst, sub, nl);
242                 renode = renode->right;
243                 goto Tailcall;
244         case TOR:
245                 reinst->op = OSPLIT;
246                 reinst->a = reinst + 1;
247                 i = compile1(renode->left, reinst->a, sub, nl);
248                 reinst->b = i + 1;
249                 i->op = OJMP;
250                 i->a = compile1(renode->right, reinst->b, sub, nl);
251                 return i->a;
252         case TSTAR:
253                 reinst->op = OSPLIT;
254                 reinst->a = reinst + 1;
255                 i = compile1(renode->left, reinst->a, sub, nl);
256                 reinst->b = i + 1;
257                 i->op = OJMP;
258                 i->a = reinst;
259                 return reinst->b;
260         case TPLUS:
261                 i = reinst;
262                 reinst = compile1(renode->left, reinst, sub, nl);
263                 reinst->op = OSPLIT;
264                 reinst->a = i;
265                 reinst->b = reinst + 1;
266                 return reinst->b;
267         case TQUES:
268                 reinst->op = OSPLIT;
269                 reinst->a = reinst + 1;
270                 reinst->b = compile1(renode->left, reinst->a, sub, nl);
271                 return reinst->b;
272         case TSUB:
273                 reinst->op = OSAVE;
274                 reinst->sub = s = (*sub)++;
275                 reinst = compile1(renode->left, reinst+1, sub, nl);
276                 reinst->op = OUNSAVE;
277                 reinst->sub = s;
278                 return reinst + 1;
279         case TANY:
280                 if(nl == 0)
281                         reinst++->op = ONOTNL;
282                 reinst->op = OANY;
283                 return reinst + 1;
284         case TRUNE:
285                 reinst->op = ORUNE;
286                 reinst->r = renode->r;
287                 return reinst + 1;
288         case TNOTNL:
289                 reinst->op = ONOTNL;
290                 return reinst + 1;
291         case TEOL:
292                 reinst->op = OEOL;
293                 return reinst + 1;
294         case TBOL:
295                 reinst->op = OBOL;
296                 return reinst + 1;
297         }
298         return nil;
299 }
300
301 static Reinst*
302 compile(Renode *parsetr, Reprog *reprog, int nl)
303 {
304         Reinst *reinst;
305         int sub;
306
307         sub = 0;
308         reinst = (Reinst*)(reprog+1);
309         compile1(parsetr, reinst, &sub, nl);
310         return reinst;
311 }
312
313 static void
314 getnextr(Parselex *l)
315 {
316         l->literal = 0;
317         if(l->done) {
318                 l->rune = 0;
319                 return;
320         }
321         l->rawexp += chartorune(&l->rune, l->rawexp);
322         if(l->rune == L'\\') {
323                 l->rawexp += chartorune(&l->rune, l->rawexp);
324                 l->literal = 1;
325         }
326         if(*l->rawexp == 0)
327                 l->done = 1;
328         return;
329 }
330
331 static void
332 getnextrlit(Parselex *l)
333 {
334         l->literal = 1;
335         if(l->done) {
336                 l->literal = 0;
337                 l->rune = 0;
338                 return;
339         }
340         l->rawexp += chartorune(&l->rune, l->rawexp);
341         if(*l->rawexp == 0)
342                 l->done = 1;
343         return;
344 }
345
346 static int
347 lex(Parselex *l)
348 {
349         if(l->peek) {
350                 l->peek = 0;
351                 return l->peeklex;
352         }
353         l->getnextr(l);
354         if(l->literal)
355                 return l->peeklex = LRUNE;
356         switch(l->rune){
357         case 0:
358                 return l->peeklex = LEND;
359         case L'*':
360         case L'?':
361         case L'+':
362                 return l->peeklex = LREP;
363         case L'|':
364                 return l->peeklex = LOR;
365         case L'.':
366                 return l->peeklex = LANY;
367         case L'(':
368                 return l->peeklex = LLPAR;
369         case L')':
370                 return l->peeklex = LRPAR;
371         case L'^':
372                 return l->peeklex = LBOL;
373         case L'$':
374                 return l->peeklex = LEOL;
375         case L'[':
376                 getclass(l);
377                 return l->peeklex = LCLASS;
378         }
379         return l->peeklex = LRUNE;
380 }
381
382 static int
383 pcmp(void *va, void *vb)
384 {
385         vlong n;
386         Rune *a, *b;
387
388         a = va;
389         b = vb;
390
391         n = (vlong)b[0] - (vlong)a[0];
392         if(n)
393                 return n;
394         return (vlong)b[1] - (vlong)a[1];
395 }
396
397 static void
398 getclass(Parselex *l)
399 {
400         Rune *p, *q, t;
401
402         l->nc = 0;
403         getnextrlit(l);
404         if(l->rune == L'^') {
405                 l->nc = 1;
406                 getnextrlit(l);
407         }
408         p = l->cpairs;
409         for(;;) {
410                 *p = l->rune;
411                 if(l->rune == L']')
412                         break;
413                 if(l->rune == L'-') {
414                         regerror("Malformed class");
415                         longjmp(l->exitenv, 1);
416                 }
417                 if(l->rune == '\\') {
418                         getnextrlit(l);
419                         *p = l->rune;
420                 }
421                 if(l->rune == 0) {
422                         regerror("No closing ] for class");
423                         longjmp(l->exitenv, 1);
424                 }
425                 getnextrlit(l);
426                 if(l->rune == L'-') {
427                         getnextrlit(l);
428                         if(l->rune == L']') {
429                                 regerror("Malformed class");
430                                 longjmp(l->exitenv, 1);
431                         }
432                         if(l->rune == L'-') {
433                                 regerror("Malformed class");
434                                 longjmp(l->exitenv, 1);
435                         }
436                         if(l->rune == L'\\')
437                                 getnextrlit(l);
438                         p[1] = l->rune;
439                         if(p[0] > p[1]) {
440                                 t = p[0];
441                                 p[0] = p[1];
442                                 p[1] = t;
443                         }
444                         getnextrlit(l);
445                 } else
446                         p[1] = p[0];
447                 if(p >= l->cpairs + nelem(l->cpairs) - 2) {
448                         regerror("Class too big\n");
449                         longjmp(l->exitenv, 1);
450                 }
451                 p += 2;
452         }
453         *p = L'\0';
454         qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp);
455         q = l->cpairs;
456         for(p = l->cpairs+2; *p != 0; p += 2) {
457                 if(p[1] < q[0] - 1) {
458                         q[2] = p[0];
459                         q[3] = p[1];
460                         q += 2;
461                         continue;
462                 }
463                 q[0] = p[0];
464                 if(p[1] > q[1])
465                         q[1] = p[1];
466         }
467         q[2] = 0;
468 }
469
470 /* classes are in descending order */
471 static Renode*
472 buildclassn(Parselex *l)
473 {
474         Renode *n;
475         Rune *p;
476         int i;
477
478         i = 0;
479         p = l->cpairs;
480         n = node(l, TCLASS, nil, nil);
481         n->r = p[1] + 1;
482         n->r1 = Runemax;
483         n->nclass = i++;
484
485         for(; *p != 0; p += 2) {
486                 n = node(l, TCLASS, n, nil);
487                 n->r = p[3] + 1;
488                 n->r1 = p[0] - 1;
489                 n->nclass = i++;
490         }
491         n->r = 0;
492         return node(l, TCAT, node(l, TNOTNL, nil, nil), n);
493 }
494
495 static Renode*
496 buildclass(Parselex *l)
497 {
498         Renode *n;
499         Rune *p;
500         int i;
501
502         i = 0;
503         n = node(l, TCLASS, nil, nil);
504         n->r = Runemax + 1;
505         n->nclass = i++;
506
507         for(p = l->cpairs; *p != 0; p += 2) {
508                 n = node(l, TCLASS, n, nil);
509                 n->r = p[0];
510                 n->r1 = p[1];
511                 n->nclass = i++;
512         }
513         return n;
514 }
515
516 static void
517 prtree(Renode *tree, int d, int f)
518 {
519         int i;
520
521         if(tree == nil)
522                 return;
523         if(f)
524         for(i = 0; i < d; i++)
525                 print("\t");
526         switch(tree->op) {
527         case TCAT:
528                 prtree(tree->left, d, 0);
529                 prtree(tree->right, d, 1);
530                 break;
531         case TOR:
532                 print("TOR\n");
533                 prtree(tree->left, d+1, 1);
534                 for(i = 0; i < d; i++)
535                         print("\t");
536                 print("|\n");
537                 prtree(tree->right, d+1, 1);
538                 break;
539         case TSTAR:
540                 print("*\n");
541                 prtree(tree->left, d+1, 1);
542                 break;
543         case TPLUS:
544                 print("+\n");
545                 prtree(tree->left, d+1, 1);
546                 break;
547         case TQUES:
548                 print("?\n");
549                 prtree(tree->left, d+1, 1);
550                 break;
551         case TANY:
552                 print(".\n");
553                 prtree(tree->left, d+1, 1);
554                 break;
555         case TBOL:
556                 print("^\n");
557                 break;
558         case TEOL:
559                 print("$\n");
560                 break;
561         case TSUB:
562                 print("TSUB\n");
563                 prtree(tree->left, d+1, 1);
564                 break;
565         case TRUNE:
566                 print("TRUNE: %C\n", tree->r);
567                 break;
568         case TNOTNL:
569                 print("TNOTNL: !\\n\n");
570                 break;
571         case TCLASS:
572                 print("CLASS: %C-%C\n", tree->r, tree->r1);
573                 prtree(tree->left, d, 1);
574                 break;
575         }
576 }