]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/sam/regexp.c
merge
[plan9front.git] / sys / src / cmd / sam / regexp.c
1 #include "sam.h"
2
3 Rangeset        sel;
4 String          lastregexp;
5 /*
6  * Machine Information
7  */
8 typedef struct Inst Inst;
9
10 struct Inst
11 {
12         long    type;   /* < 0x10000 ==> literal, otherwise action */
13         union {
14                 int rsid;
15                 int rsubid;
16                 int class;
17                 struct Inst *rother;
18                 struct Inst *rright;
19         } r;
20         union{
21                 struct Inst *lleft;
22                 struct Inst *lnext;
23         } l;
24 };
25 #define sid     r.rsid
26 #define subid   r.rsubid
27 #define rclass  r.class
28 #define other   r.rother
29 #define right   r.rright
30 #define left    l.lleft
31 #define next    l.lnext
32
33 #define NPROG   1024
34 Inst    program[NPROG];
35 Inst    *progp;
36 Inst    *startinst;     /* First inst. of program; might not be program[0] */
37 Inst    *bstartinst;    /* same for backwards machine */
38
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42         Inst    *inst;          /* Instruction of the thread */
43         Rangeset se;
44         Posn    startp;         /* first char of match */
45 };
46
47 #define NLIST   127
48
49 Ilist   *tl, *nl;       /* This list, next list */
50 Ilist   list[2][NLIST+1];       /* +1 for trailing null */
51 static  Rangeset sempty;
52
53 /*
54  * Actions and Tokens
55  *
56  *      0x100xx are operators, value == precedence
57  *      0x200xx are tokens, i.e. operands for operators
58  */
59 #define OPERATOR        0x10000 /* Bitmask of all operators */
60 #define START           0x10000 /* Start, used for marker on stack */
61 #define RBRA            0x10001 /* Right bracket, ) */
62 #define LBRA            0x10002 /* Left bracket, ( */
63 #define OR              0x10003 /* Alternation, | */
64 #define CAT             0x10004 /* Concatentation, implicit operator */
65 #define STAR            0x10005 /* Closure, * */
66 #define PLUS            0x10006 /* a+ == aa* */
67 #define QUEST           0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
68 #define ANY             0x20000 /* Any character but newline, . */
69 #define NOP             0x20001 /* No operation, internal use only */
70 #define BOL             0x20002 /* Beginning of line, ^ */
71 #define EOL             0x20003 /* End of line, $ */
72 #define CCLASS          0x20004 /* Character class, [] */
73 #define NCCLASS         0x20005 /* Negated character class, [^] */
74 #define END             0x20077 /* Terminate: match found */
75
76 #define ISATOR          0x10000
77 #define ISAND           0x20000
78
79 /*
80  * Parser Information
81  */
82 typedef struct Node Node;
83 struct Node
84 {
85         Inst    *first;
86         Inst    *last;
87 };
88
89 #define NSTACK  20
90 Node    andstack[NSTACK];
91 Node    *andp;
92 int     atorstack[NSTACK];
93 int     *atorp;
94 int     lastwasand;     /* Last token was operand */
95 int     cursubid;
96 int     subidstack[NSTACK];
97 int     *subidp;
98 int     backwards;
99 int     nbra;
100 Rune    *exprp;         /* pointer to next character in source expression */
101 #define DCLASS  10      /* allocation increment */
102 int     nclass;         /* number active */
103 int     Nclass;         /* high water mark */
104 Rune    **class;
105 int     negateclass;
106
107 int     addinst(Ilist *l, Inst *inst, Rangeset *sep);
108 void    newmatch(Rangeset*);
109 void    bnewmatch(Rangeset*);
110 void    pushand(Inst*, Inst*);
111 void    pushator(int);
112 Node    *popand(int);
113 int     popator(void);
114 void    startlex(Rune*);
115 int     lex(void);
116 void    operator(int);
117 void    operand(int);
118 void    evaluntil(int);
119 void    optimize(Inst*);
120 void    bldcclass(void);
121
122 void
123 regerror(Err e)
124 {
125         Strzero(&lastregexp);
126         error(e);
127 }
128
129 void
130 regerror_c(Err e, int c)
131 {
132         Strzero(&lastregexp);
133         error_c(e, c);
134 }
135
136 Inst *
137 newinst(int t)
138 {
139         if(progp >= &program[NPROG])
140                 regerror(Etoolong);
141         progp->type = t;
142         progp->left = 0;
143         progp->right = 0;
144         return progp++;
145 }
146
147 Inst *
148 realcompile(Rune *s)
149 {
150         int token;
151
152         startlex(s);
153         atorp = atorstack;
154         andp = andstack;
155         subidp = subidstack;
156         cursubid = 0;
157         lastwasand = FALSE;
158         /* Start with a low priority operator to prime parser */
159         pushator(START-1);
160         while((token=lex()) != END){
161                 if((token&ISATOR) == OPERATOR)
162                         operator(token);
163                 else
164                         operand(token);
165         }
166         /* Close with a low priority operator */
167         evaluntil(START);
168         /* Force END */
169         operand(END);
170         evaluntil(START);
171         if(nbra)
172                 regerror(Eleftpar);
173         --andp; /* points to first and only operand */
174         return andp->first;
175 }
176
177 void
178 compile(String *s)
179 {
180         int i;
181         Inst *oprogp;
182
183         if(Strcmp(s, &lastregexp)==0)
184                 return;
185         for(i=0; i<nclass; i++)
186                 free(class[i]);
187         nclass = 0;
188         progp = program;
189         backwards = FALSE;
190         startinst = realcompile(s->s);
191         optimize(program);
192         oprogp = progp;
193         backwards = TRUE;
194         bstartinst = realcompile(s->s);
195         optimize(oprogp);
196         Strduplstr(&lastregexp, s);
197 }
198
199 void
200 operand(int t)
201 {
202         Inst *i;
203         if(lastwasand)
204                 operator(CAT);  /* catenate is implicit */
205         i = newinst(t);
206         if(t == CCLASS){
207                 if(negateclass)
208                         i->type = NCCLASS;      /* UGH */
209                 i->rclass = nclass-1;           /* UGH */
210         }
211         pushand(i, i);
212         lastwasand = TRUE;
213 }
214
215 void
216 operator(int t)
217 {
218         if(t==RBRA && --nbra<0)
219                 regerror(Erightpar);
220         if(t==LBRA){
221 /*
222  *              if(++cursubid >= NSUBEXP)
223  *                      regerror(Esubexp);
224  */
225                 cursubid++;     /* silently ignored */
226                 nbra++;
227                 if(lastwasand)
228                         operator(CAT);
229         }else
230                 evaluntil(t);
231         if(t!=RBRA)
232                 pushator(t);
233         lastwasand = FALSE;
234         if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
235                 lastwasand = TRUE;      /* these look like operands */
236 }
237
238 void
239 cant(char *s)
240 {
241         char buf[100];
242
243         sprint(buf, "regexp: can't happen: %s", s);
244         panic(buf);
245 }
246
247 void
248 pushand(Inst *f, Inst *l)
249 {
250         if(andp >= &andstack[NSTACK])
251                 cant("operand stack overflow");
252         andp->first = f;
253         andp->last = l;
254         andp++;
255 }
256
257 void
258 pushator(int t)
259 {
260         if(atorp >= &atorstack[NSTACK])
261                 cant("operator stack overflow");
262         *atorp++=t;
263         if(cursubid >= NSUBEXP)
264                 *subidp++= -1;
265         else
266                 *subidp++=cursubid;
267 }
268
269 Node *
270 popand(int op)
271 {
272         if(andp <= &andstack[0])
273                 if(op)
274                         regerror_c(Emissop, op);
275                 else
276                         regerror(Ebadregexp);
277         return --andp;
278 }
279
280 int
281 popator(void)
282 {
283         if(atorp <= &atorstack[0])
284                 cant("operator stack underflow");
285         --subidp;
286         return *--atorp;
287 }
288
289 void
290 evaluntil(int pri)
291 {
292         Node *op1, *op2, *t;
293         Inst *inst1, *inst2;
294
295         while(pri==RBRA || atorp[-1]>=pri){
296                 switch(popator()){
297                 case LBRA:
298                         op1 = popand('(');
299                         inst2 = newinst(RBRA);
300                         inst2->subid = *subidp;
301                         op1->last->next = inst2;
302                         inst1 = newinst(LBRA);
303                         inst1->subid = *subidp;
304                         inst1->next = op1->first;
305                         pushand(inst1, inst2);
306                         return;         /* must have been RBRA */
307                 default:
308                         panic("unknown regexp operator");
309                         break;
310                 case OR:
311                         op2 = popand('|');
312                         op1 = popand('|');
313                         inst2 = newinst(NOP);
314                         op2->last->next = inst2;
315                         op1->last->next = inst2;
316                         inst1 = newinst(OR);
317                         inst1->right = op1->first;
318                         inst1->left = op2->first;
319                         pushand(inst1, inst2);
320                         break;
321                 case CAT:
322                         op2 = popand(0);
323                         op1 = popand(0);
324                         if(backwards && op2->first->type!=END)
325                                 t = op1, op1 = op2, op2 = t;
326                         op1->last->next = op2->first;
327                         pushand(op1->first, op2->last);
328                         break;
329                 case STAR:
330                         op2 = popand('*');
331                         inst1 = newinst(OR);
332                         op2->last->next = inst1;
333                         inst1->right = op2->first;
334                         pushand(inst1, inst1);
335                         break;
336                 case PLUS:
337                         op2 = popand('+');
338                         inst1 = newinst(OR);
339                         op2->last->next = inst1;
340                         inst1->right = op2->first;
341                         pushand(op2->first, inst1);
342                         break;
343                 case QUEST:
344                         op2 = popand('?');
345                         inst1 = newinst(OR);
346                         inst2 = newinst(NOP);
347                         inst1->left = inst2;
348                         inst1->right = op2->first;
349                         op2->last->next = inst2;
350                         pushand(inst1, inst2);
351                         break;
352                 }
353         }
354 }
355
356
357 void
358 optimize(Inst *start)
359 {
360         Inst *inst, *target;
361
362         for(inst=start; inst->type!=END; inst++){
363                 target = inst->next;
364                 while(target->type == NOP)
365                         target = target->next;
366                 inst->next = target;
367         }
368 }
369
370 #ifdef  DEBUG
371 void
372 dumpstack(void){
373         Node *stk;
374         int *ip;
375
376         dprint("operators\n");
377         for(ip = atorstack; ip<atorp; ip++)
378                 dprint("0%o\n", *ip);
379         dprint("operands\n");
380         for(stk = andstack; stk<andp; stk++)
381                 dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
382 }
383 void
384 dump(void){
385         Inst *l;
386
387         l = program;
388         do{
389                 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
390                         l->left-program, l->right-program);
391         }while(l++->type);
392 }
393 #endif
394
395 void
396 startlex(Rune *s)
397 {
398         exprp = s;
399         nbra = 0;
400 }
401
402
403 int
404 lex(void){
405         int c= *exprp++;
406
407         switch(c){
408         case '\\':
409                 if(*exprp)
410                         if((c= *exprp++)=='n')
411                                 c='\n';
412                 break;
413         case 0:
414                 c = END;
415                 --exprp;        /* In case we come here again */
416                 break;
417         case '*':
418                 c = STAR;
419                 break;
420         case '?':
421                 c = QUEST;
422                 break;
423         case '+':
424                 c = PLUS;
425                 break;
426         case '|':
427                 c = OR;
428                 break;
429         case '.':
430                 c = ANY;
431                 break;
432         case '(':
433                 c = LBRA;
434                 break;
435         case ')':
436                 c = RBRA;
437                 break;
438         case '^':
439                 c = BOL;
440                 break;
441         case '$':
442                 c = EOL;
443                 break;
444         case '[':
445                 c = CCLASS;
446                 bldcclass();
447                 break;
448         }
449         return c;
450 }
451
452 long
453 nextrec(void){
454         if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
455                 regerror(Ebadclass);
456         if(exprp[0] == '\\'){
457                 exprp++;
458                 if(*exprp=='n'){
459                         exprp++;
460                         return '\n';
461                 }
462                 return *exprp++|0x10000;
463         }
464         return *exprp++;
465 }
466
467 void
468 bldcclass(void)
469 {
470         long c1, c2, n, na;
471         Rune *classp;
472
473         classp = emalloc(DCLASS*RUNESIZE);
474         n = 0;
475         na = DCLASS;
476         /* we have already seen the '[' */
477         if(*exprp == '^'){
478                 classp[n++] = '\n';     /* don't match newline in negate case */
479                 negateclass = TRUE;
480                 exprp++;
481         }else
482                 negateclass = FALSE;
483         while((c1 = nextrec()) != ']'){
484                 if(c1 == '-'){
485     Error:
486                         free(classp);
487                         regerror(Ebadclass);
488                 }
489                 if(n+4 >= na){          /* 3 runes plus NUL */
490                         na += DCLASS;
491                         classp = erealloc(classp, na*RUNESIZE);
492                 }
493                 if(*exprp == '-'){
494                         exprp++;        /* eat '-' */
495                         if((c2 = nextrec()) == ']')
496                                 goto Error;
497                         classp[n+0] = 0xFFFF;
498                         classp[n+1] = c1;
499                         classp[n+2] = c2;
500                         n += 3;
501                 }else
502                         classp[n++] = c1;
503         }
504         classp[n] = 0;
505         if(nclass == Nclass){
506                 Nclass += DCLASS;
507                 class = erealloc(class, Nclass*sizeof(Rune*));
508         }
509         class[nclass++] = classp;
510 }
511
512 int
513 classmatch(int classno, int c, int negate)
514 {
515         Rune *p;
516
517         p = class[classno];
518         while(*p){
519                 if(*p == 0xFFFF){
520                         if(p[1]<=c && c<=p[2])
521                                 return !negate;
522                         p += 3;
523                 }else if(*p++ == c)
524                         return !negate;
525         }
526         return negate;
527 }
528
529 /*
530  * Note optimization in addinst:
531  *      *l must be pending when addinst called; if *l has been looked
532  *              at already, the optimization is a bug.
533  */
534 int
535 addinst(Ilist *l, Inst *inst, Rangeset *sep)
536 {
537         Ilist *p;
538
539         for(p = l; p->inst; p++){
540                 if(p->inst==inst){
541                         if((sep)->p[0].p1 < p->se.p[0].p1)
542                                 p->se= *sep;    /* this would be bug */
543                         return 0;       /* It's already there */
544                 }
545         }
546         p->inst = inst;
547         p->se= *sep;
548         (p+1)->inst = 0;
549         return 1;
550 }
551
552 int
553 execute(File *f, Posn startp, Posn eof)
554 {
555         int flag = 0;
556         Inst *inst;
557         Ilist *tlp;
558         Posn p = startp;
559         int nnl = 0, ntl;
560         int c;
561         int wrapped = 0;
562         int startchar = startinst->type<OPERATOR? startinst->type : 0;
563
564         list[0][0].inst = list[1][0].inst = 0;
565         sel.p[0].p1 = -1;
566         /* Execute machine once for each character */
567         for(;;p++){
568         doloop:
569                 c = filereadc(f, p);
570                 if(p>=eof || c<0){
571                         switch(wrapped++){
572                         case 0:         /* let loop run one more click */
573                         case 2:
574                                 break;
575                         case 1:         /* expired; wrap to beginning */
576                                 if(sel.p[0].p1>=0 || eof!=INFINITY)
577                                         goto Return;
578                                 list[0][0].inst = list[1][0].inst = 0;
579                                 p = 0;
580                                 goto doloop;
581                         default:
582                                 goto Return;
583                         }
584                 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
585                         break;
586                 /* fast check for first char */
587                 if(startchar && nnl==0 && c!=startchar)
588                         continue;
589                 tl = list[flag];
590                 nl = list[flag^=1];
591                 nl->inst = 0;
592                 ntl = nnl;
593                 nnl = 0;
594                 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
595                         /* Add first instruction to this list */
596                         sempty.p[0].p1 = p;
597                         if(addinst(tl, startinst, &sempty))
598                         if(++ntl >= NLIST)
599         Overflow:
600                                 error(Eoverflow);
601                 }
602                 /* Execute machine until this list is empty */
603                 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
604         Switchstmt:
605                         switch(inst->type){
606                         default:        /* regular character */
607                                 if(inst->type==c){
608         Addinst:
609                                         if(addinst(nl, inst->next, &tlp->se))
610                                         if(++nnl >= NLIST)
611                                                 goto Overflow;
612                                 }
613                                 break;
614                         case LBRA:
615                                 if(inst->subid>=0)
616                                         tlp->se.p[inst->subid].p1 = p;
617                                 inst = inst->next;
618                                 goto Switchstmt;
619                         case RBRA:
620                                 if(inst->subid>=0)
621                                         tlp->se.p[inst->subid].p2 = p;
622                                 inst = inst->next;
623                                 goto Switchstmt;
624                         case ANY:
625                                 if(c!='\n')
626                                         goto Addinst;
627                                 break;
628                         case BOL:
629                                 if(p==0 || filereadc(f, p - 1)=='\n'){
630         Step:
631                                         inst = inst->next;
632                                         goto Switchstmt;
633                                 }
634                                 break;
635                         case EOL:
636                                 if(c == '\n')
637                                         goto Step;
638                                 break;
639                         case CCLASS:
640                                 if(c>=0 && classmatch(inst->rclass, c, 0))
641                                         goto Addinst;
642                                 break;
643                         case NCCLASS:
644                                 if(c>=0 && classmatch(inst->rclass, c, 1))
645                                         goto Addinst;
646                                 break;
647                         case OR:
648                                 /* evaluate right choice later */
649                                 if(addinst(tlp, inst->right, &tlp->se))
650                                 if(++ntl >= NLIST)
651                                         goto Overflow;
652                                 /* efficiency: advance and re-evaluate */
653                                 inst = inst->left;
654                                 goto Switchstmt;
655                         case END:       /* Match! */
656                                 tlp->se.p[0].p2 = p;
657                                 newmatch(&tlp->se);
658                                 break;
659                         }
660                 }
661         }
662     Return:
663         return sel.p[0].p1>=0;
664 }
665
666 void
667 newmatch(Rangeset *sp)
668 {
669         int i;
670
671         if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
672            (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
673                 for(i = 0; i<NSUBEXP; i++)
674                         sel.p[i] = sp->p[i];
675 }
676
677 int
678 bexecute(File *f, Posn startp)
679 {
680         int flag = 0;
681         Inst *inst;
682         Ilist *tlp;
683         Posn p = startp;
684         int nnl = 0, ntl;
685         int c;
686         int wrapped = 0;
687         int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
688
689         list[0][0].inst = list[1][0].inst = 0;
690         sel.p[0].p1= -1;
691         /* Execute machine once for each character, including terminal NUL */
692         for(;;--p){
693         doloop:
694                 if((c = filereadc(f, p - 1))==-1){
695                         switch(wrapped++){
696                         case 0:         /* let loop run one more click */
697                         case 2:
698                                 break;
699                         case 1:         /* expired; wrap to end */
700                                 if(sel.p[0].p1>=0)
701                         case 3:
702                                         goto Return;
703                                 list[0][0].inst = list[1][0].inst = 0;
704                                 p = f->nc;
705                                 goto doloop;
706                         default:
707                                 goto Return;
708                         }
709                 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
710                         break;
711                 /* fast check for first char */
712                 if(startchar && nnl==0 && c!=startchar)
713                         continue;
714                 tl = list[flag];
715                 nl = list[flag^=1];
716                 nl->inst = 0;
717                 ntl = nnl;
718                 nnl = 0;
719                 if(sel.p[0].p1<0 && (!wrapped || p>startp)){
720                         /* Add first instruction to this list */
721                         /* the minus is so the optimizations in addinst work */
722                         sempty.p[0].p1 = -p;
723                         if(addinst(tl, bstartinst, &sempty))
724                         if(++ntl >= NLIST)
725         Overflow:
726                                 error(Eoverflow);
727                 }
728                 /* Execute machine until this list is empty */
729                 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
730         Switchstmt:
731                         switch(inst->type){
732                         default:        /* regular character */
733                                 if(inst->type == c){
734         Addinst:
735                                         if(addinst(nl, inst->next, &tlp->se))
736                                         if(++nnl >= NLIST)
737                                                 goto Overflow;
738                                 }
739                                 break;
740                         case LBRA:
741                                 if(inst->subid>=0)
742                                         tlp->se.p[inst->subid].p1 = p;
743                                 inst = inst->next;
744                                 goto Switchstmt;
745                         case RBRA:
746                                 if(inst->subid >= 0)
747                                         tlp->se.p[inst->subid].p2 = p;
748                                 inst = inst->next;
749                                 goto Switchstmt;
750                         case ANY:
751                                 if(c != '\n')
752                                         goto Addinst;
753                                 break;
754                         case BOL:
755                                 if(c=='\n' || p==0){
756         Step:
757                                         inst = inst->next;
758                                         goto Switchstmt;
759                                 }
760                                 break;
761                         case EOL:
762                                 if(p==f->nc || filereadc(f, p)=='\n')
763                                         goto Step;
764                                 break;
765                         case CCLASS:
766                                 if(c>=0 && classmatch(inst->rclass, c, 0))
767                                         goto Addinst;
768                                 break;
769                         case NCCLASS:
770                                 if(c>=0 && classmatch(inst->rclass, c, 1))
771                                         goto Addinst;
772                                 break;
773                         case OR:
774                                 /* evaluate right choice later */
775                                 if(addinst(tl, inst->right, &tlp->se))
776                                 if(++ntl >= NLIST)
777                                         goto Overflow;
778                                 /* efficiency: advance and re-evaluate */
779                                 inst = inst->left;
780                                 goto Switchstmt;
781                         case END:       /* Match! */
782                                 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
783                                 tlp->se.p[0].p2 = p;
784                                 bnewmatch(&tlp->se);
785                                 break;
786                         }
787                 }
788         }
789     Return:
790         return sel.p[0].p1>=0;
791 }
792
793 void
794 bnewmatch(Rangeset *sp)
795 {
796         int  i;
797         if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
798                 for(i = 0; i<NSUBEXP; i++){       /* note the reversal; p1<=p2 */
799                         sel.p[i].p1 = sp->p[i].p2;
800                         sel.p[i].p2 = sp->p[i].p1;
801                 }
802 }