]> git.lizzy.rs Git - plan9front.git/blob - sys/src/libregexp/regcomp.c
libsec: use tsmemcmp() when comparing hashes, use mpfield() for ecc, use mptober...
[plan9front.git] / sys / src / libregexp / regcomp.c
1 #include <u.h>
2 #include <libc.h>
3 #include "regexp.h"
4 #include "regcomp.h"
5
6 #define TRUE    1
7 #define FALSE   0
8
9 /*
10  * Parser Information
11  */
12 typedef
13 struct Node
14 {
15         Reinst* first;
16         Reinst* last;
17 }Node;
18
19 /* max character classes per program is nelem(reprog->class) */
20 static Reprog   *reprog;
21
22 /* max rune ranges per character class is nelem(classp->spans)/2 */
23 #define NCCRUNE nelem(classp->spans)
24
25 #define NSTACK  20
26 static  Node    andstack[NSTACK];
27 static  Node    *andp;
28 static  int     atorstack[NSTACK];
29 static  int*    atorp;
30 static  int     cursubid;               /* id of current subexpression */
31 static  int     subidstack[NSTACK];     /* parallel to atorstack */
32 static  int*    subidp;
33 static  int     lastwasand;     /* Last token was operand */
34 static  int     nbra;
35 static  char*   exprp;          /* pointer to next character in source expression */
36 static  int     lexdone;
37 static  int     nclass;
38 static  Reclass*classp;
39 static  Reinst* freep;
40 static  int     errors;
41 static  Rune    yyrune;         /* last lex'd rune */
42 static  Reclass*yyclassp;       /* last lex'd class */
43
44 /* predeclared crap */
45 static  void    operator(int);
46 static  void    pushand(Reinst*, Reinst*);
47 static  void    pushator(int);
48 static  void    evaluntil(int);
49 static  int     bldcclass(void);
50
51 static jmp_buf regkaboom;
52
53 static  void
54 rcerror(char *s)
55 {
56         errors++;
57         regerror(s);
58         longjmp(regkaboom, 1);
59 }
60
61 static  Reinst*
62 newinst(int t)
63 {
64         freep->type = t;
65         freep->left = 0;
66         freep->right = 0;
67         return freep++;
68 }
69
70 static  void
71 operand(int t)
72 {
73         Reinst *i;
74
75         if(lastwasand)
76                 operator(CAT);  /* catenate is implicit */
77         i = newinst(t);
78
79         if(t == CCLASS || t == NCCLASS)
80                 i->cp = yyclassp;
81         if(t == RUNE)
82                 i->r = yyrune;
83
84         pushand(i, i);
85         lastwasand = TRUE;
86 }
87
88 static  void
89 operator(int t)
90 {
91         if(t==RBRA && --nbra<0)
92                 rcerror("unmatched right paren");
93         if(t==LBRA){
94                 if(++cursubid >= NSUBEXP)
95                         rcerror ("too many subexpressions");
96                 nbra++;
97                 if(lastwasand)
98                         operator(CAT);
99         } else
100                 evaluntil(t);
101         if(t != RBRA)
102                 pushator(t);
103         lastwasand = FALSE;
104         if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
105                 lastwasand = TRUE;      /* these look like operands */
106 }
107
108 static  void
109 regerr2(char *s, int c)
110 {
111         char buf[100];
112         char *cp = buf;
113         while(*s)
114                 *cp++ = *s++;
115         *cp++ = c;
116         *cp = '\0'; 
117         rcerror(buf);
118 }
119
120 static  void
121 cant(char *s)
122 {
123         char buf[100];
124         strcpy(buf, "can't happen: ");
125         strcat(buf, s);
126         rcerror(buf);
127 }
128
129 static  void
130 pushand(Reinst *f, Reinst *l)
131 {
132         if(andp >= &andstack[NSTACK])
133                 cant("operand stack overflow");
134         andp->first = f;
135         andp->last = l;
136         andp++;
137 }
138
139 static  void
140 pushator(int t)
141 {
142         if(atorp >= &atorstack[NSTACK])
143                 cant("operator stack overflow");
144         *atorp++ = t;
145         *subidp++ = cursubid;
146 }
147
148 static  Node*
149 popand(int op)
150 {
151         Reinst *inst;
152
153         if(andp <= &andstack[0]){
154                 regerr2("missing operand for ", op);
155                 inst = newinst(NOP);
156                 pushand(inst,inst);
157         }
158         return --andp;
159 }
160
161 static  int
162 popator(void)
163 {
164         if(atorp <= &atorstack[0])
165                 cant("operator stack underflow");
166         --subidp;
167         return *--atorp;
168 }
169
170 static  void
171 evaluntil(int pri)
172 {
173         Node *op1, *op2;
174         Reinst *inst1, *inst2;
175
176         while(pri==RBRA || atorp[-1]>=pri){
177                 switch(popator()){
178                 default:
179                         rcerror("unknown operator in evaluntil");
180                         break;
181                 case LBRA:              /* must have been RBRA */
182                         op1 = popand('(');
183                         inst2 = newinst(RBRA);
184                         inst2->subid = *subidp;
185                         op1->last->next = inst2;
186                         inst1 = newinst(LBRA);
187                         inst1->subid = *subidp;
188                         inst1->next = op1->first;
189                         pushand(inst1, inst2);
190                         return;
191                 case OR:
192                         op2 = popand('|');
193                         op1 = popand('|');
194                         inst2 = newinst(NOP);
195                         op2->last->next = inst2;
196                         op1->last->next = inst2;
197                         inst1 = newinst(OR);
198                         inst1->right = op1->first;
199                         inst1->left = op2->first;
200                         pushand(inst1, inst2);
201                         break;
202                 case CAT:
203                         op2 = popand(0);
204                         op1 = popand(0);
205                         op1->last->next = op2->first;
206                         pushand(op1->first, op2->last);
207                         break;
208                 case STAR:
209                         op2 = popand('*');
210                         inst1 = newinst(OR);
211                         op2->last->next = inst1;
212                         inst1->right = op2->first;
213                         pushand(inst1, inst1);
214                         break;
215                 case PLUS:
216                         op2 = popand('+');
217                         inst1 = newinst(OR);
218                         op2->last->next = inst1;
219                         inst1->right = op2->first;
220                         pushand(op2->first, inst1);
221                         break;
222                 case QUEST:
223                         op2 = popand('?');
224                         inst1 = newinst(OR);
225                         inst2 = newinst(NOP);
226                         inst1->left = inst2;
227                         inst1->right = op2->first;
228                         op2->last->next = inst2;
229                         pushand(inst1, inst2);
230                         break;
231                 }
232         }
233 }
234
235 static  Reprog*
236 optimize(Reprog *pp)
237 {
238         Reinst *inst, *target;
239         int size;
240         Reprog *npp;
241         Reclass *cl;
242         int diff;
243
244         /*
245          *  get rid of NOOP chains
246          */
247         for(inst=pp->firstinst; inst->type!=END; inst++){
248                 target = inst->next;
249                 while(target->type == NOP)
250                         target = target->next;
251                 inst->next = target;
252         }
253
254         /*
255          *  The original allocation is for an area larger than
256          *  necessary.  Reallocate to the actual space used
257          *  and then relocate the code.
258          */
259         size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
260         npp = realloc(pp, size);
261         if(npp==0 || npp==pp)
262                 return pp;
263         diff = (char *)npp - (char *)pp;
264         freep = (Reinst *)((char *)freep + diff);
265         for(inst=npp->firstinst; inst<freep; inst++){
266                 switch(inst->type){
267                 case OR:
268                 case STAR:
269                 case PLUS:
270                 case QUEST:
271                         *(char **)&inst->right += diff;
272                         break;
273                 case CCLASS:
274                 case NCCLASS:
275                         *(char **)&inst->right += diff;
276                         cl = inst->cp;
277                         *(char **)&cl->end += diff;
278                         break;
279                 }
280                 *(char **)&inst->left += diff;
281         }
282         *(char **)&npp->startinst += diff;
283         return npp;
284 }
285
286 #ifdef  DEBUG
287 static  void
288 dumpstack(void){
289         Node *stk;
290         int *ip;
291
292         print("operators\n");
293         for(ip=atorstack; ip<atorp; ip++)
294                 print("0%o\n", *ip);
295         print("operands\n");
296         for(stk=andstack; stk<andp; stk++)
297                 print("0%o\t0%o\n", stk->first->type, stk->last->type);
298 }
299
300 static  void
301 dump(Reprog *pp)
302 {
303         Reinst *l;
304         Rune *p;
305
306         l = pp->firstinst;
307         do{
308                 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
309                         l->left-pp->firstinst, l->right-pp->firstinst);
310                 if(l->type == RUNE)
311                         print("\t%C\n", l->r);
312                 else if(l->type == CCLASS || l->type == NCCLASS){
313                         print("\t[");
314                         if(l->type == NCCLASS)
315                                 print("^");
316                         for(p = l->cp->spans; p < l->cp->end; p += 2)
317                                 if(p[0] == p[1])
318                                         print("%C", p[0]);
319                                 else
320                                         print("%C-%C", p[0], p[1]);
321                         print("]\n");
322                 } else
323                         print("\n");
324         }while(l++->type);
325 }
326 #endif
327
328 static  Reclass*
329 newclass(void)
330 {
331         if(nclass >= nelem(reprog->class))
332                 rcerror("too many character classes; increase Reprog.class size");
333         return &(classp[nclass++]);
334 }
335
336 static  int
337 nextc(Rune *rp)
338 {
339         if(lexdone){
340                 *rp = 0;
341                 return 1;
342         }
343         exprp += chartorune(rp, exprp);
344         if(*rp == L'\\'){
345                 exprp += chartorune(rp, exprp);
346                 return 1;
347         }
348         if(*rp == 0)
349                 lexdone = 1;
350         return 0;
351 }
352
353 static  int
354 lex(int literal, int dot_type)
355 {
356         int quoted;
357
358         quoted = nextc(&yyrune);
359         if(literal || quoted){
360                 if(yyrune == 0)
361                         return END;
362                 return RUNE;
363         }
364
365         switch(yyrune){
366         case 0:
367                 return END;
368         case L'*':
369                 return STAR;
370         case L'?':
371                 return QUEST;
372         case L'+':
373                 return PLUS;
374         case L'|':
375                 return OR;
376         case L'.':
377                 return dot_type;
378         case L'(':
379                 return LBRA;
380         case L')':
381                 return RBRA;
382         case L'^':
383                 return BOL;
384         case L'$':
385                 return EOL;
386         case L'[':
387                 return bldcclass();
388         }
389         return RUNE;
390 }
391
392 static int
393 bldcclass(void)
394 {
395         int type;
396         Rune r[NCCRUNE];
397         Rune *p, *ep, *np;
398         Rune rune;
399         int quoted;
400
401         /* we have already seen the '[' */
402         type = CCLASS;
403         yyclassp = newclass();
404
405         /* look ahead for negation */
406         /* SPECIAL CASE!!! negated classes don't match \n */
407         ep = r;
408         quoted = nextc(&rune);
409         if(!quoted && rune == L'^'){
410                 type = NCCLASS;
411                 quoted = nextc(&rune);
412                 *ep++ = L'\n';
413                 *ep++ = L'\n';
414         }
415
416         /* parse class into a set of spans */
417         while(ep < &r[NCCRUNE-1]){
418                 if(rune == 0){
419                         rcerror("malformed '[]'");
420                         return 0;
421                 }
422                 if(!quoted && rune == L']')
423                         break;
424                 if(!quoted && rune == L'-'){
425                         if(ep == r){
426                                 rcerror("malformed '[]'");
427                                 return 0;
428                         }
429                         quoted = nextc(&rune);
430                         if((!quoted && rune == L']') || rune == 0){
431                                 rcerror("malformed '[]'");
432                                 return 0;
433                         }
434                         *(ep-1) = rune;
435                 } else {
436                         *ep++ = rune;
437                         *ep++ = rune;
438                 }
439                 quoted = nextc(&rune);
440         }
441         if(ep >= &r[NCCRUNE-1]) {
442                 rcerror("char class too large; increase Reclass.spans size");
443                 return 0;
444         }
445
446         /* sort on span start */
447         for(p = r; p < ep; p += 2){
448                 for(np = p; np < ep; np += 2)
449                         if(*np < *p){
450                                 rune = np[0];
451                                 np[0] = p[0];
452                                 p[0] = rune;
453                                 rune = np[1];
454                                 np[1] = p[1];
455                                 p[1] = rune;
456                         }
457         }
458
459         /* merge spans */
460         np = yyclassp->spans;
461         p = r;
462         if(r == ep)
463                 yyclassp->end = np;
464         else {
465                 np[0] = *p++;
466                 np[1] = *p++;
467                 for(; p < ep; p += 2)
468                         /* overlapping or adjacent ranges? */
469                         if(p[0] <= np[1] + 1){
470                                 if(p[1] >= np[1])
471                                         np[1] = p[1];   /* coalesce */
472                         } else {
473                                 np += 2;
474                                 np[0] = p[0];
475                                 np[1] = p[1];
476                         }
477                 yyclassp->end = np+2;
478         }
479
480         return type;
481 }
482
483 static  Reprog*
484 regcomp1(char *s, int literal, int dot_type)
485 {
486         int token;
487         Reprog *pp;
488
489         /* get memory for the program */
490         pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
491         if(pp == 0){
492                 regerror("out of memory");
493                 return 0;
494         }
495         freep = pp->firstinst;
496         classp = pp->class;
497         errors = 0;
498
499         if(setjmp(regkaboom))
500                 goto out;
501
502         /* go compile the sucker */
503         lexdone = 0;
504         exprp = s;
505         nclass = 0;
506         nbra = 0;
507         atorp = atorstack;
508         andp = andstack;
509         subidp = subidstack;
510         lastwasand = FALSE;
511         cursubid = 0;
512
513         /* Start with a low priority operator to prime parser */
514         pushator(START-1);
515         while((token = lex(literal, dot_type)) != END){
516                 if((token&0300) == OPERATOR)
517                         operator(token);
518                 else
519                         operand(token);
520         }
521
522         /* Close with a low priority operator */
523         evaluntil(START);
524
525         /* Force END */
526         operand(END);
527         evaluntil(START);
528 #ifdef DEBUG
529         dumpstack();
530 #endif
531         if(nbra)
532                 rcerror("unmatched left paren");
533         --andp; /* points to first and only operand */
534         pp->startinst = andp->first;
535 #ifdef DEBUG
536         dump(pp);
537 #endif
538         pp = optimize(pp);
539 #ifdef DEBUG
540         print("start: %d\n", andp->first-pp->firstinst);
541         dump(pp);
542 #endif
543 out:
544         if(errors){
545                 free(pp);
546                 pp = 0;
547         }
548         return pp;
549 }
550
551 extern  Reprog*
552 regcomp(char *s)
553 {
554         return regcomp1(s, 0, ANY);
555 }
556
557 extern  Reprog*
558 regcomplit(char *s)
559 {
560         return regcomp1(s, 1, ANY);
561 }
562
563 extern  Reprog*
564 regcompnl(char *s)
565 {
566         return regcomp1(s, 0, ANYNL);
567 }