]> git.lizzy.rs Git - plan9front.git/blob - sys/src/ape/lib/regexp/regcomp.c
rune(2): add Runeerror reencoding considerations in BUGS section (thanks aiju)
[plan9front.git] / sys / src / ape / lib / regexp / regcomp.c
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <setjmp.h>
4 #include <string.h>
5 #include "regexp.h"
6 #include "regcomp.h"
7
8 #define TRUE    1
9 #define FALSE   0
10
11 /*
12  * Parser Information
13  */
14 typedef
15 struct Node
16 {
17         Reinst* first;
18         Reinst* last;
19 }Node;
20
21 #define NSTACK  20
22 static  Node    andstack[NSTACK];
23 static  Node    *andp;
24 static  int     atorstack[NSTACK];
25 static  int*    atorp;
26 static  int     cursubid;               /* id of current subexpression */
27 static  int     subidstack[NSTACK];     /* parallel to atorstack */
28 static  int*    subidp;
29 static  int     lastwasand;     /* Last token was operand */
30 static  int     nbra;
31 static  char*   exprp;          /* pointer to next character in source expression */
32 static  int     lexdone;
33 static  int     nclass;
34 static  Reclass*classp;
35 static  Reinst* freep;
36 static  int     errors;
37 static  wchar_t yyrune;         /* last lex'd rune */
38 static  Reclass*yyclassp;       /* last lex'd class */
39
40 /* predeclared crap */
41 static  void    operator(int);
42 static  void    pushand(Reinst*, Reinst*);
43 static  void    pushator(int);
44 static  void    evaluntil(int);
45 static  int     bldcclass(void);
46
47 static jmp_buf regkaboom;
48
49 static  void
50 rcerror(char *s)
51 {
52         errors++;
53         regerror(s);
54         longjmp(regkaboom, 1);
55 }
56
57 static  Reinst*
58 newinst(int t)
59 {
60         freep->type = t;
61         freep->l.left = 0;
62         freep->r.right = 0;
63         return freep++;
64 }
65
66 static  void
67 operand(int t)
68 {
69         Reinst *i;
70
71         if(lastwasand)
72                 operator(CAT);  /* catenate is implicit */
73         i = newinst(t);
74
75         if(t == CCLASS || t == NCCLASS)
76                 i->r.cp = yyclassp;
77         if(t == RUNE)
78                 i->r.r = yyrune;
79
80         pushand(i, i);
81         lastwasand = TRUE;
82 }
83
84 static  void
85 operator(int t)
86 {
87         if(t==RBRA && --nbra<0)
88                 rcerror("unmatched right paren");
89         if(t==LBRA){
90                 if(++cursubid >= NSUBEXP)
91                         rcerror ("too many subexpressions");
92                 nbra++;
93                 if(lastwasand)
94                         operator(CAT);
95         } else
96                 evaluntil(t);
97         if(t != RBRA)
98                 pushator(t);
99         lastwasand = FALSE;
100         if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
101                 lastwasand = TRUE;      /* these look like operands */
102 }
103
104 static  void
105 regerr2(char *s, int c)
106 {
107         char buf[100];
108         char *cp = buf;
109         while(*s)
110                 *cp++ = *s++;
111         *cp++ = c;
112         *cp = '\0'; 
113         rcerror(buf);
114 }
115
116 static  void
117 cant(char *s)
118 {
119         char buf[100];
120         strcpy(buf, "can't happen: ");
121         strcat(buf, s);
122         rcerror(buf);
123 }
124
125 static  void
126 pushand(Reinst *f, Reinst *l)
127 {
128         if(andp >= &andstack[NSTACK])
129                 cant("operand stack overflow");
130         andp->first = f;
131         andp->last = l;
132         andp++;
133 }
134
135 static  void
136 pushator(int t)
137 {
138         if(atorp >= &atorstack[NSTACK])
139                 cant("operator stack overflow");
140         *atorp++ = t;
141         *subidp++ = cursubid;
142 }
143
144 static  Node*
145 popand(int op)
146 {
147         Reinst *inst;
148
149         if(andp <= &andstack[0]){
150                 regerr2("missing operand for ", op);
151                 inst = newinst(NOP);
152                 pushand(inst,inst);
153         }
154         return --andp;
155 }
156
157 static  int
158 popator(void)
159 {
160         if(atorp <= &atorstack[0])
161                 cant("operator stack underflow");
162         --subidp;
163         return *--atorp;
164 }
165
166 static  void
167 evaluntil(int pri)
168 {
169         Node *op1, *op2;
170         Reinst *inst1, *inst2;
171
172         while(pri==RBRA || atorp[-1]>=pri){
173                 switch(popator()){
174                 default:
175                         rcerror("unknown operator in evaluntil");
176                         break;
177                 case LBRA:              /* must have been RBRA */
178                         op1 = popand('(');
179                         inst2 = newinst(RBRA);
180                         inst2->r.subid = *subidp;
181                         op1->last->l.next = inst2;
182                         inst1 = newinst(LBRA);
183                         inst1->r.subid = *subidp;
184                         inst1->l.next = op1->first;
185                         pushand(inst1, inst2);
186                         return;
187                 case OR:
188                         op2 = popand('|');
189                         op1 = popand('|');
190                         inst2 = newinst(NOP);
191                         op2->last->l.next = inst2;
192                         op1->last->l.next = inst2;
193                         inst1 = newinst(OR);
194                         inst1->r.right = op1->first;
195                         inst1->l.left = op2->first;
196                         pushand(inst1, inst2);
197                         break;
198                 case CAT:
199                         op2 = popand(0);
200                         op1 = popand(0);
201                         op1->last->l.next = op2->first;
202                         pushand(op1->first, op2->last);
203                         break;
204                 case STAR:
205                         op2 = popand('*');
206                         inst1 = newinst(OR);
207                         op2->last->l.next = inst1;
208                         inst1->r.right = op2->first;
209                         pushand(inst1, inst1);
210                         break;
211                 case PLUS:
212                         op2 = popand('+');
213                         inst1 = newinst(OR);
214                         op2->last->l.next = inst1;
215                         inst1->r.right = op2->first;
216                         pushand(op2->first, inst1);
217                         break;
218                 case QUEST:
219                         op2 = popand('?');
220                         inst1 = newinst(OR);
221                         inst2 = newinst(NOP);
222                         inst1->l.left = inst2;
223                         inst1->r.right = op2->first;
224                         op2->last->l.next = inst2;
225                         pushand(inst1, inst2);
226                         break;
227                 }
228         }
229 }
230
231 static  Reprog*
232 optimize(Reprog *pp)
233 {
234         Reinst *inst, *target;
235         int size;
236         Reprog *npp;
237         int diff;
238
239         /*
240          *  get rid of NOOP chains
241          */
242         for(inst=pp->firstinst; inst->type!=END; inst++){
243                 target = inst->l.next;
244                 while(target->type == NOP)
245                         target = target->l.next;
246                 inst->l.next = target;
247         }
248
249         /*
250          *  The original allocation is for an area larger than
251          *  necessary.  Reallocate to the actual space used
252          *  and then relocate the code.
253          */
254         size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
255         npp = realloc(pp, size);
256         if(npp==0 || npp==pp)
257                 return pp;
258         diff = (char *)npp - (char *)pp;
259         freep = (Reinst *)((char *)freep + diff);
260         for(inst=npp->firstinst; inst<freep; inst++){
261                 switch(inst->type){
262                 case OR:
263                 case STAR:
264                 case PLUS:
265                 case QUEST:
266                 case CCLASS:
267                 case NCCLASS:
268                         *(char **)&inst->r.right += diff;
269                         break;
270                 }
271                 *(char **)&inst->l.left += diff;
272         }
273         *(char **)&npp->startinst += diff;
274         return npp;
275 }
276
277 #ifdef  DEBUG
278 static  void
279 dumpstack(void){
280         Node *stk;
281         int *ip;
282
283         print("operators\n");
284         for(ip=atorstack; ip<atorp; ip++)
285                 print("0%o\n", *ip);
286         print("operands\n");
287         for(stk=andstack; stk<andp; stk++)
288                 print("0%o\t0%o\n", stk->first->type, stk->last->type);
289 }
290
291 static  void
292 dump(Reprog *pp)
293 {
294         Reinst *l;
295         wchar_t *p;
296
297         l = pp->firstinst;
298         do{
299                 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
300                         l->l.left-pp->firstinst, l->l.right-pp->firstinst);
301                 if(l->type == RUNE)
302                         print("\t%C\n", l->r);
303                 else if(l->type == CCLASS || l->type == NCCLASS){
304                         print("\t[");
305                         if(l->type == NCCLASS)
306                                 print("^");
307                         for(p = l->r.cp->spans; p < l->r.cp->end; p += 2)
308                                 if(p[0] == p[1])
309                                         print("%C", p[0]);
310                                 else
311                                         print("%C-%C", p[0], p[1]);
312                         print("]\n");
313                 } else
314                         print("\n");
315         }while(l++->type);
316 }
317 #endif
318
319 static  Reclass*
320 newclass(void)
321 {
322         if(nclass >= NCLASS)
323                 regerr2("too many character classes; limit", NCLASS+'0');
324         return &(classp[nclass++]);
325 }
326
327 static  int
328 nextc(wchar_t *rp)
329 {
330         int n;
331
332         if(lexdone){
333                 *rp = 0;
334                 return 1;
335         }
336         n = mbtowc(rp, exprp, MB_CUR_MAX);
337         if (n <= 0)
338                 n = 1;
339         exprp += n;
340         if(*rp == L'\\'){
341                 n = mbtowc(rp, exprp, MB_CUR_MAX);
342                 if (n <= 0)
343                         n = 1;
344                 exprp += n;
345                 return 1;
346         }
347         if(*rp == 0)
348                 lexdone = 1;
349         return 0;
350 }
351
352 static  int
353 lex(int literal, int dot_type)
354 {
355         int quoted;
356
357         quoted = nextc(&yyrune);
358         if(literal || quoted){
359                 if(yyrune == 0)
360                         return END;
361                 return RUNE;
362         }
363
364         switch(yyrune){
365         case 0:
366                 return END;
367         case L'*':
368                 return STAR;
369         case L'?':
370                 return QUEST;
371         case L'+':
372                 return PLUS;
373         case L'|':
374                 return OR;
375         case L'.':
376                 return dot_type;
377         case L'(':
378                 return LBRA;
379         case L')':
380                 return RBRA;
381         case L'^':
382                 return BOL;
383         case L'$':
384                 return EOL;
385         case L'[':
386                 return bldcclass();
387         }
388         return RUNE;
389 }
390
391 static int
392 bldcclass(void)
393 {
394         int type;
395         wchar_t r[NCCRUNE];
396         wchar_t *p, *ep, *np;
397         wchar_t rune;
398         int quoted;
399
400         /* we have already seen the '[' */
401         type = CCLASS;
402         yyclassp = newclass();
403
404         /* look ahead for negation */
405         ep = r;
406         quoted = nextc(&rune);
407         if(!quoted && rune == L'^'){
408                 type = NCCLASS;
409                 quoted = nextc(&rune);
410                 *ep++ = L'\n';
411                 *ep++ = L'\n';
412         }
413
414         /* parse class into a set of spans */
415         for(; ep<&r[NCCRUNE];){
416                 if(rune == 0){
417                         rcerror("malformed '[]'");
418                         return 0;
419                 }
420                 if(!quoted && rune == L']')
421                         break;
422                 if(!quoted && rune == L'-'){
423                         if(ep == r){
424                                 rcerror("malformed '[]'");
425                                 return 0;
426                         }
427                         quoted = nextc(&rune);
428                         if((!quoted && rune == L']') || rune == 0){
429                                 rcerror("malformed '[]'");
430                                 return 0;
431                         }
432                         *(ep-1) = rune;
433                 } else {
434                         *ep++ = rune;
435                         *ep++ = rune;
436                 }
437                 quoted = nextc(&rune);
438         }
439
440         /* sort on span start */
441         for(p = r; p < ep; p += 2){
442                 for(np = p; np < ep; np += 2)
443                         if(*np < *p){
444                                 rune = np[0];
445                                 np[0] = p[0];
446                                 p[0] = rune;
447                                 rune = np[1];
448                                 np[1] = p[1];
449                                 p[1] = rune;
450                         }
451         }
452
453         /* merge spans */
454         np = yyclassp->spans;
455         p = r;
456         if(r == ep)
457                 yyclassp->end = np;
458         else {
459                 np[0] = *p++;
460                 np[1] = *p++;
461                 for(; p < ep; p += 2)
462                         if(p[0] <= np[1]){
463                                 if(p[1] > np[1])
464                                         np[1] = p[1];
465                         } else {
466                                 np += 2;
467                                 np[0] = p[0];
468                                 np[1] = p[1];
469                         }
470                 yyclassp->end = np+2;
471         }
472
473         return type;
474 }
475
476 static  Reprog*
477 regcomp1(char *s, int literal, int dot_type)
478 {
479         int token;
480         Reprog *pp;
481
482         /* get memory for the program */
483         pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
484         if(pp == 0){
485                 regerror("out of memory");
486                 return 0;
487         }
488         freep = pp->firstinst;
489         classp = pp->class;
490         errors = 0;
491
492         if(setjmp(regkaboom))
493                 goto out;
494
495         /* go compile the sucker */
496         lexdone = 0;
497         exprp = s;
498         nclass = 0;
499         nbra = 0;
500         atorp = atorstack;
501         andp = andstack;
502         subidp = subidstack;
503         lastwasand = FALSE;
504         cursubid = 0;
505
506         /* Start with a low priority operator to prime parser */
507         pushator(START-1);
508         while((token = lex(literal, dot_type)) != END){
509                 if((token&0300) == OPERATOR)
510                         operator(token);
511                 else
512                         operand(token);
513         }
514
515         /* Close with a low priority operator */
516         evaluntil(START);
517
518         /* Force END */
519         operand(END);
520         evaluntil(START);
521 #ifdef DEBUG
522         dumpstack();
523 #endif
524         if(nbra)
525                 rcerror("unmatched left paren");
526         --andp; /* points to first and only operand */
527         pp->startinst = andp->first;
528 #ifdef DEBUG
529         dump(pp);
530 #endif
531         pp = optimize(pp);
532 #ifdef DEBUG
533         print("start: %d\n", andp->first-pp->firstinst);
534         dump(pp);
535 #endif
536 out:
537         if(errors){
538                 free(pp);
539                 pp = 0;
540         }
541         return pp;
542 }
543
544 extern  Reprog*
545 regcomp(char *s)
546 {
547         return regcomp1(s, 0, ANY);
548 }
549
550 extern  Reprog*
551 regcomplit(char *s)
552 {
553         return regcomp1(s, 1, ANY);
554 }
555
556 extern  Reprog*
557 regcompnl(char *s)
558 {
559         return regcomp1(s, 0, ANYNL);
560 }