]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/sam/regexp.c
snoopy(8): avoid extra spaces in dhcp filter output
[plan9front.git] / sys / src / cmd / sam / regexp.c
1 #include "sam.h"
2
3 Rangeset        sel;
4 String          lastregexp;
5 /*
6  * Machine Information
7  */
8 typedef struct Inst Inst;
9
10 struct Inst
11 {
12         long    type;   /* <= Runemax ==> literal, otherwise action */
13         union {
14                 int rsid;
15                 int rsubid;
16                 int class;
17                 struct Inst *rother;
18                 struct Inst *rright;
19         } r;
20         union{
21                 struct Inst *lleft;
22                 struct Inst *lnext;
23         } l;
24 };
25 #define sid     r.rsid
26 #define subid   r.rsubid
27 #define rclass  r.class
28 #define other   r.rother
29 #define right   r.rright
30 #define left    l.lleft
31 #define next    l.lnext
32
33 #define NPROG   1024
34 Inst    program[NPROG];
35 Inst    *progp;
36 Inst    *startinst;     /* First inst. of program; might not be program[0] */
37 Inst    *bstartinst;    /* same for backwards machine */
38
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42         Inst    *inst;          /* Instruction of the thread */
43         Rangeset se;
44         Posn    startp;         /* first char of match */
45 };
46
47 #define NLIST   127
48
49 Ilist   *tl, *nl;               /* This list, next list */
50 Ilist   list[2][NLIST+1];       /* +1 for trailing null */
51 static  Rangeset sempty;
52
53 /*
54  * Actions and Tokens
55  *
56  *      0x100xx are operators, value == precedence
57  *      0x200xx are tokens, i.e. operands for operators
58  */
59 enum {
60         OPERATOR = Runemask+1,  /* Bitmask of all operators */
61         START   = OPERATOR,     /* Start, used for marker on stack */
62         RBRA,                   /* Right bracket, ) */
63         LBRA,                   /* Left bracket, ( */
64         OR,                     /* Alternation, | */
65         CAT,                    /* Concatentation, implicit operator */
66         STAR,                   /* Closure, * */
67         PLUS,                   /* a+ == aa* */
68         QUEST,                  /* a? == a|nothing, i.e. 0 or 1 a's */
69
70         ANY     = OPERATOR<<1,  /* Any character but newline, . */
71         NOP,                    /* No operation, internal use only */
72         BOL,                    /* Beginning of line, ^ */
73         EOL,                    /* End of line, $ */
74         CCLASS,                 /* Character class, [] */
75         NCCLASS,                /* Negated character class, [^] */
76         END,                    /* Terminate: match found */
77
78         ISATOR  = OPERATOR,
79         ISAND   = OPERATOR<<1,
80 };
81
82 /*
83  * Parser Information
84  */
85 typedef struct Node Node;
86 struct Node
87 {
88         Inst    *first;
89         Inst    *last;
90 };
91
92 #define NSTACK  20
93 Node    andstack[NSTACK];
94 Node    *andp;
95 int     atorstack[NSTACK];
96 int     *atorp;
97 int     lastwasand;     /* Last token was operand */
98 int     cursubid;
99 int     subidstack[NSTACK];
100 int     *subidp;
101 int     backwards;
102 int     nbra;
103 Rune    *exprp;         /* pointer to next character in source expression */
104 #define DCLASS  10      /* allocation increment */
105 int     nclass;         /* number active */
106 int     Nclass;         /* high water mark */
107 Rune    **class;
108 int     negateclass;
109
110 int     addinst(Ilist *l, Inst *inst, Rangeset *sep);
111 void    newmatch(Rangeset*);
112 void    bnewmatch(Rangeset*);
113 void    pushand(Inst*, Inst*);
114 void    pushator(int);
115 Node    *popand(int);
116 int     popator(void);
117 void    startlex(Rune*);
118 int     lex(void);
119 void    operator(int);
120 void    operand(int);
121 void    evaluntil(int);
122 void    optimize(Inst*);
123 void    bldcclass(void);
124
125 void
126 regerror(Err e)
127 {
128         Strzero(&lastregexp);
129         error(e);
130 }
131
132 void
133 regerror_c(Err e, int c)
134 {
135         Strzero(&lastregexp);
136         error_c(e, c);
137 }
138
139 Inst *
140 newinst(int t)
141 {
142         if(progp >= &program[NPROG])
143                 regerror(Etoolong);
144         progp->type = t;
145         progp->left = 0;
146         progp->right = 0;
147         return progp++;
148 }
149
150 Inst *
151 realcompile(Rune *s)
152 {
153         int token;
154
155         startlex(s);
156         atorp = atorstack;
157         andp = andstack;
158         subidp = subidstack;
159         cursubid = 0;
160         lastwasand = FALSE;
161         /* Start with a low priority operator to prime parser */
162         pushator(START-1);
163         while((token=lex()) != END){
164                 if((token&ISATOR) == OPERATOR)
165                         operator(token);
166                 else
167                         operand(token);
168         }
169         /* Close with a low priority operator */
170         evaluntil(START);
171         /* Force END */
172         operand(END);
173         evaluntil(START);
174         if(nbra)
175                 regerror(Eleftpar);
176         --andp; /* points to first and only operand */
177         return andp->first;
178 }
179
180 void
181 compile(String *s)
182 {
183         int i;
184         Inst *oprogp;
185
186         if(Strcmp(s, &lastregexp)==0)
187                 return;
188         for(i=0; i<nclass; i++)
189                 free(class[i]);
190         nclass = 0;
191         progp = program;
192         backwards = FALSE;
193         startinst = realcompile(s->s);
194         optimize(program);
195         oprogp = progp;
196         backwards = TRUE;
197         bstartinst = realcompile(s->s);
198         optimize(oprogp);
199         Strduplstr(&lastregexp, s);
200 }
201
202 void
203 operand(int t)
204 {
205         Inst *i;
206         if(lastwasand)
207                 operator(CAT);  /* catenate is implicit */
208         i = newinst(t);
209         if(t == CCLASS){
210                 if(negateclass)
211                         i->type = NCCLASS;      /* UGH */
212                 i->rclass = nclass-1;           /* UGH */
213         }
214         pushand(i, i);
215         lastwasand = TRUE;
216 }
217
218 void
219 operator(int t)
220 {
221         if(t==RBRA && --nbra<0)
222                 regerror(Erightpar);
223         if(t==LBRA){
224 /*
225  *              if(++cursubid >= NSUBEXP)
226  *                      regerror(Esubexp);
227  */
228                 cursubid++;     /* silently ignored */
229                 nbra++;
230                 if(lastwasand)
231                         operator(CAT);
232         }else
233                 evaluntil(t);
234         if(t!=RBRA)
235                 pushator(t);
236         lastwasand = FALSE;
237         if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
238                 lastwasand = TRUE;      /* these look like operands */
239 }
240
241 void
242 cant(char *s)
243 {
244         char buf[100];
245
246         sprint(buf, "regexp: can't happen: %s", s);
247         panic(buf);
248 }
249
250 void
251 pushand(Inst *f, Inst *l)
252 {
253         if(andp >= &andstack[NSTACK])
254                 cant("operand stack overflow");
255         andp->first = f;
256         andp->last = l;
257         andp++;
258 }
259
260 void
261 pushator(int t)
262 {
263         if(atorp >= &atorstack[NSTACK])
264                 cant("operator stack overflow");
265         *atorp++=t;
266         if(cursubid >= NSUBEXP)
267                 *subidp++= -1;
268         else
269                 *subidp++=cursubid;
270 }
271
272 Node *
273 popand(int op)
274 {
275         if(andp <= &andstack[0])
276                 if(op)
277                         regerror_c(Emissop, op);
278                 else
279                         regerror(Ebadregexp);
280         return --andp;
281 }
282
283 int
284 popator(void)
285 {
286         if(atorp <= &atorstack[0])
287                 cant("operator stack underflow");
288         --subidp;
289         return *--atorp;
290 }
291
292 void
293 evaluntil(int pri)
294 {
295         Node *op1, *op2, *t;
296         Inst *inst1, *inst2;
297
298         while(pri==RBRA || atorp[-1]>=pri){
299                 switch(popator()){
300                 case LBRA:
301                         op1 = popand('(');
302                         inst2 = newinst(RBRA);
303                         inst2->subid = *subidp;
304                         op1->last->next = inst2;
305                         inst1 = newinst(LBRA);
306                         inst1->subid = *subidp;
307                         inst1->next = op1->first;
308                         pushand(inst1, inst2);
309                         return;         /* must have been RBRA */
310                 default:
311                         panic("unknown regexp operator");
312                         break;
313                 case OR:
314                         op2 = popand('|');
315                         op1 = popand('|');
316                         inst2 = newinst(NOP);
317                         op2->last->next = inst2;
318                         op1->last->next = inst2;
319                         inst1 = newinst(OR);
320                         inst1->right = op1->first;
321                         inst1->left = op2->first;
322                         pushand(inst1, inst2);
323                         break;
324                 case CAT:
325                         op2 = popand(0);
326                         op1 = popand(0);
327                         if(backwards && op2->first->type!=END)
328                                 t = op1, op1 = op2, op2 = t;
329                         op1->last->next = op2->first;
330                         pushand(op1->first, op2->last);
331                         break;
332                 case STAR:
333                         op2 = popand('*');
334                         inst1 = newinst(OR);
335                         op2->last->next = inst1;
336                         inst1->right = op2->first;
337                         pushand(inst1, inst1);
338                         break;
339                 case PLUS:
340                         op2 = popand('+');
341                         inst1 = newinst(OR);
342                         op2->last->next = inst1;
343                         inst1->right = op2->first;
344                         pushand(op2->first, inst1);
345                         break;
346                 case QUEST:
347                         op2 = popand('?');
348                         inst1 = newinst(OR);
349                         inst2 = newinst(NOP);
350                         inst1->left = inst2;
351                         inst1->right = op2->first;
352                         op2->last->next = inst2;
353                         pushand(inst1, inst2);
354                         break;
355                 }
356         }
357 }
358
359
360 void
361 optimize(Inst *start)
362 {
363         Inst *inst, *target;
364
365         for(inst=start; inst->type!=END; inst++){
366                 target = inst->next;
367                 while(target->type == NOP)
368                         target = target->next;
369                 inst->next = target;
370         }
371 }
372
373 #ifdef  DEBUG
374 void
375 dumpstack(void){
376         Node *stk;
377         int *ip;
378
379         dprint("operators\n");
380         for(ip = atorstack; ip<atorp; ip++)
381                 dprint("0%o\n", *ip);
382         dprint("operands\n");
383         for(stk = andstack; stk<andp; stk++)
384                 dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
385 }
386 void
387 dump(void){
388         Inst *l;
389
390         l = program;
391         do{
392                 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
393                         l->left-program, l->right-program);
394         }while(l++->type);
395 }
396 #endif
397
398 void
399 startlex(Rune *s)
400 {
401         exprp = s;
402         nbra = 0;
403 }
404
405
406 int
407 lex(void){
408         int c= *exprp++;
409
410         switch(c){
411         case '\\':
412                 if(*exprp)
413                         if((c= *exprp++)=='n')
414                                 c='\n';
415                 break;
416         case 0:
417                 c = END;
418                 --exprp;        /* In case we come here again */
419                 break;
420         case '*':
421                 c = STAR;
422                 break;
423         case '?':
424                 c = QUEST;
425                 break;
426         case '+':
427                 c = PLUS;
428                 break;
429         case '|':
430                 c = OR;
431                 break;
432         case '.':
433                 c = ANY;
434                 break;
435         case '(':
436                 c = LBRA;
437                 break;
438         case ')':
439                 c = RBRA;
440                 break;
441         case '^':
442                 c = BOL;
443                 break;
444         case '$':
445                 c = EOL;
446                 break;
447         case '[':
448                 c = CCLASS;
449                 bldcclass();
450                 break;
451         }
452         return c;
453 }
454
455 long
456 nextrec(void){
457         if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
458                 regerror(Ebadclass);
459         if(exprp[0] == '\\'){
460                 exprp++;
461                 if(*exprp=='n'){
462                         exprp++;
463                         return '\n';
464                 }
465                 return *exprp++|(Runemask+1);
466         }
467         return *exprp++;
468 }
469
470 void
471 bldcclass(void)
472 {
473         long c1, c2, n, na;
474         Rune *classp;
475
476         classp = emalloc(DCLASS*RUNESIZE);
477         n = 0;
478         na = DCLASS;
479         /* we have already seen the '[' */
480         if(*exprp == '^'){
481                 classp[n++] = '\n';     /* don't match newline in negate case */
482                 negateclass = TRUE;
483                 exprp++;
484         }else
485                 negateclass = FALSE;
486         while((c1 = nextrec()) != ']'){
487                 if(c1 == '-'){
488     Error:
489                         free(classp);
490                         regerror(Ebadclass);
491                 }
492                 if(n+4 >= na){          /* 3 runes plus NUL */
493                         na += DCLASS;
494                         classp = erealloc(classp, na*RUNESIZE);
495                 }
496                 if(*exprp == '-'){
497                         exprp++;        /* eat '-' */
498                         if((c2 = nextrec()) == ']')
499                                 goto Error;
500                         classp[n+0] = Runemax;
501                         classp[n+1] = c1 & Runemask;
502                         classp[n+2] = c2 & Runemask;
503                         n += 3;
504                 }else
505                         classp[n++] = c1 & Runemask;
506         }
507         classp[n] = 0;
508         if(nclass == Nclass){
509                 Nclass += DCLASS;
510                 class = erealloc(class, Nclass*sizeof(Rune*));
511         }
512         class[nclass++] = classp;
513 }
514
515 int
516 classmatch(int classno, int c, int negate)
517 {
518         Rune *p;
519
520         p = class[classno];
521         while(*p){
522                 if(*p == Runemax){
523                         if(p[1]<=c && c<=p[2])
524                                 return !negate;
525                         p += 3;
526                 }else if(*p++ == c)
527                         return !negate;
528         }
529         return negate;
530 }
531
532 /*
533  * Note optimization in addinst:
534  *      *l must be pending when addinst called; if *l has been looked
535  *              at already, the optimization is a bug.
536  */
537 int
538 addinst(Ilist *l, Inst *inst, Rangeset *sep)
539 {
540         Ilist *p;
541
542         for(p = l; p->inst; p++){
543                 if(p->inst==inst){
544                         if((sep)->p[0].p1 < p->se.p[0].p1)
545                                 p->se= *sep;    /* this would be bug */
546                         return 0;       /* It's already there */
547                 }
548         }
549         p->inst = inst;
550         p->se= *sep;
551         (p+1)->inst = 0;
552         return 1;
553 }
554
555 int
556 execute(File *f, Posn startp, Posn eof)
557 {
558         int flag = 0;
559         Inst *inst;
560         Ilist *tlp;
561         Posn p = startp;
562         int nnl = 0, ntl;
563         int c;
564         int wrapped = 0;
565         int startchar = startinst->type<OPERATOR? startinst->type : 0;
566
567         list[0][0].inst = list[1][0].inst = 0;
568         sel.p[0].p1 = -1;
569         /* Execute machine once for each character */
570         for(;;p++){
571         doloop:
572                 c = filereadc(f, p);
573                 if(p>=eof || c<0){
574                         switch(wrapped++){
575                         case 0:         /* let loop run one more click */
576                         case 2:
577                                 break;
578                         case 1:         /* expired; wrap to beginning */
579                                 if(sel.p[0].p1>=0 || eof!=INFINITY)
580                                         goto Return;
581                                 list[0][0].inst = list[1][0].inst = 0;
582                                 p = 0;
583                                 goto doloop;
584                         default:
585                                 goto Return;
586                         }
587                 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
588                         break;
589                 /* fast check for first char */
590                 if(startchar && nnl==0 && c!=startchar)
591                         continue;
592                 tl = list[flag];
593                 nl = list[flag^=1];
594                 nl->inst = 0;
595                 ntl = nnl;
596                 nnl = 0;
597                 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
598                         /* Add first instruction to this list */
599                         sempty.p[0].p1 = p;
600                         if(addinst(tl, startinst, &sempty))
601                         if(++ntl >= NLIST)
602         Overflow:
603                                 error(Eoverflow);
604                 }
605                 /* Execute machine until this list is empty */
606                 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
607         Switchstmt:
608                         switch(inst->type){
609                         default:        /* regular character */
610                                 if(inst->type==c){
611         Addinst:
612                                         if(addinst(nl, inst->next, &tlp->se))
613                                         if(++nnl >= NLIST)
614                                                 goto Overflow;
615                                 }
616                                 break;
617                         case LBRA:
618                                 if(inst->subid>=0)
619                                         tlp->se.p[inst->subid].p1 = p;
620                                 inst = inst->next;
621                                 goto Switchstmt;
622                         case RBRA:
623                                 if(inst->subid>=0)
624                                         tlp->se.p[inst->subid].p2 = p;
625                                 inst = inst->next;
626                                 goto Switchstmt;
627                         case ANY:
628                                 if(c!='\n')
629                                         goto Addinst;
630                                 break;
631                         case BOL:
632                                 if(p==0 || filereadc(f, p - 1)=='\n'){
633         Step:
634                                         inst = inst->next;
635                                         goto Switchstmt;
636                                 }
637                                 break;
638                         case EOL:
639                                 if(c == '\n')
640                                         goto Step;
641                                 break;
642                         case CCLASS:
643                                 if(c>=0 && classmatch(inst->rclass, c, 0))
644                                         goto Addinst;
645                                 break;
646                         case NCCLASS:
647                                 if(c>=0 && classmatch(inst->rclass, c, 1))
648                                         goto Addinst;
649                                 break;
650                         case OR:
651                                 /* evaluate right choice later */
652                                 if(addinst(tlp, inst->right, &tlp->se))
653                                 if(++ntl >= NLIST)
654                                         goto Overflow;
655                                 /* efficiency: advance and re-evaluate */
656                                 inst = inst->left;
657                                 goto Switchstmt;
658                         case END:       /* Match! */
659                                 tlp->se.p[0].p2 = p;
660                                 newmatch(&tlp->se);
661                                 break;
662                         }
663                 }
664         }
665     Return:
666         return sel.p[0].p1>=0;
667 }
668
669 void
670 newmatch(Rangeset *sp)
671 {
672         int i;
673
674         if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
675            (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
676                 for(i = 0; i<NSUBEXP; i++)
677                         sel.p[i] = sp->p[i];
678 }
679
680 int
681 bexecute(File *f, Posn startp)
682 {
683         int flag = 0;
684         Inst *inst;
685         Ilist *tlp;
686         Posn p = startp;
687         int nnl = 0, ntl;
688         int c;
689         int wrapped = 0;
690         int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
691
692         list[0][0].inst = list[1][0].inst = 0;
693         sel.p[0].p1= -1;
694         /* Execute machine once for each character, including terminal NUL */
695         for(;;--p){
696         doloop:
697                 if((c = filereadc(f, p - 1))==-1){
698                         switch(wrapped++){
699                         case 0:         /* let loop run one more click */
700                         case 2:
701                                 break;
702                         case 1:         /* expired; wrap to end */
703                                 if(sel.p[0].p1>=0)
704                         case 3:
705                                         goto Return;
706                                 list[0][0].inst = list[1][0].inst = 0;
707                                 p = f->nc;
708                                 goto doloop;
709                         default:
710                                 goto Return;
711                         }
712                 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
713                         break;
714                 /* fast check for first char */
715                 if(startchar && nnl==0 && c!=startchar)
716                         continue;
717                 tl = list[flag];
718                 nl = list[flag^=1];
719                 nl->inst = 0;
720                 ntl = nnl;
721                 nnl = 0;
722                 if(sel.p[0].p1<0 && (!wrapped || p>startp)){
723                         /* Add first instruction to this list */
724                         /* the minus is so the optimizations in addinst work */
725                         sempty.p[0].p1 = -p;
726                         if(addinst(tl, bstartinst, &sempty))
727                         if(++ntl >= NLIST)
728         Overflow:
729                                 error(Eoverflow);
730                 }
731                 /* Execute machine until this list is empty */
732                 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
733         Switchstmt:
734                         switch(inst->type){
735                         default:        /* regular character */
736                                 if(inst->type == c){
737         Addinst:
738                                         if(addinst(nl, inst->next, &tlp->se))
739                                         if(++nnl >= NLIST)
740                                                 goto Overflow;
741                                 }
742                                 break;
743                         case LBRA:
744                                 if(inst->subid>=0)
745                                         tlp->se.p[inst->subid].p1 = p;
746                                 inst = inst->next;
747                                 goto Switchstmt;
748                         case RBRA:
749                                 if(inst->subid >= 0)
750                                         tlp->se.p[inst->subid].p2 = p;
751                                 inst = inst->next;
752                                 goto Switchstmt;
753                         case ANY:
754                                 if(c != '\n')
755                                         goto Addinst;
756                                 break;
757                         case BOL:
758                                 if(c=='\n' || p==0){
759         Step:
760                                         inst = inst->next;
761                                         goto Switchstmt;
762                                 }
763                                 break;
764                         case EOL:
765                                 if(p==f->nc || filereadc(f, p)=='\n')
766                                         goto Step;
767                                 break;
768                         case CCLASS:
769                                 if(c>=0 && classmatch(inst->rclass, c, 0))
770                                         goto Addinst;
771                                 break;
772                         case NCCLASS:
773                                 if(c>=0 && classmatch(inst->rclass, c, 1))
774                                         goto Addinst;
775                                 break;
776                         case OR:
777                                 /* evaluate right choice later */
778                                 if(addinst(tl, inst->right, &tlp->se))
779                                 if(++ntl >= NLIST)
780                                         goto Overflow;
781                                 /* efficiency: advance and re-evaluate */
782                                 inst = inst->left;
783                                 goto Switchstmt;
784                         case END:       /* Match! */
785                                 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
786                                 tlp->se.p[0].p2 = p;
787                                 bnewmatch(&tlp->se);
788                                 break;
789                         }
790                 }
791         }
792     Return:
793         return sel.p[0].p1>=0;
794 }
795
796 void
797 bnewmatch(Rangeset *sp)
798 {
799         int  i;
800         if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
801                 for(i = 0; i<NSUBEXP; i++){       /* note the reversal; p1<=p2 */
802                         sel.p[i].p1 = sp->p[i].p2;
803                         sel.p[i].p2 = sp->p[i].p1;
804                 }
805 }