8 typedef struct Inst Inst;
12 long type; /* <= Runemax ==> literal, otherwise action */
26 #define subid r.rsubid
27 #define rclass r.class
28 #define other r.rother
29 #define right r.rright
36 Inst *startinst; /* First inst. of program; might not be program[0] */
37 Inst *bstartinst; /* same for backwards machine */
39 typedef struct Ilist Ilist;
42 Inst *inst; /* Instruction of the thread */
44 Posn startp; /* first char of match */
49 Ilist *tl, *nl; /* This list, next list */
50 Ilist list[2][NLIST+1]; /* +1 for trailing null */
51 static Rangeset sempty;
56 * 0x100xx are operators, value == precedence
57 * 0x200xx are tokens, i.e. operands for operators
60 OPERATOR = Runemask+1, /* Bitmask of all operators */
61 START = OPERATOR, /* Start, used for marker on stack */
62 RBRA, /* Right bracket, ) */
63 LBRA, /* Left bracket, ( */
64 OR, /* Alternation, | */
65 CAT, /* Concatentation, implicit operator */
66 STAR, /* Closure, * */
68 QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
70 ANY = OPERATOR<<1, /* Any character but newline, . */
71 NOP, /* No operation, internal use only */
72 BOL, /* Beginning of line, ^ */
73 EOL, /* End of line, $ */
74 CCLASS, /* Character class, [] */
75 NCCLASS, /* Negated character class, [^] */
76 END, /* Terminate: match found */
85 typedef struct Node Node;
93 Node andstack[NSTACK];
95 int atorstack[NSTACK];
97 int lastwasand; /* Last token was operand */
99 int subidstack[NSTACK];
103 Rune *exprp; /* pointer to next character in source expression */
104 #define DCLASS 10 /* allocation increment */
105 int nclass; /* number active */
106 int Nclass; /* high water mark */
110 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
111 void newmatch(Rangeset*);
112 void bnewmatch(Rangeset*);
113 void pushand(Inst*, Inst*);
117 void startlex(Rune*);
122 void optimize(Inst*);
123 void bldcclass(void);
128 Strzero(&lastregexp);
133 regerror_c(Err e, int c)
135 Strzero(&lastregexp);
142 if(progp >= &program[NPROG])
161 /* Start with a low priority operator to prime parser */
163 while((token=lex()) != END){
164 if((token&ISATOR) == OPERATOR)
169 /* Close with a low priority operator */
176 --andp; /* points to first and only operand */
186 if(Strcmp(s, &lastregexp)==0)
188 for(i=0; i<nclass; i++)
193 startinst = realcompile(s->s);
197 bstartinst = realcompile(s->s);
199 Strduplstr(&lastregexp, s);
207 operator(CAT); /* catenate is implicit */
211 i->type = NCCLASS; /* UGH */
212 i->rclass = nclass-1; /* UGH */
221 if(t==RBRA && --nbra<0)
225 * if(++cursubid >= NSUBEXP)
228 cursubid++; /* silently ignored */
237 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
238 lastwasand = TRUE; /* these look like operands */
246 sprint(buf, "regexp: can't happen: %s", s);
251 pushand(Inst *f, Inst *l)
253 if(andp >= &andstack[NSTACK])
254 cant("operand stack overflow");
263 if(atorp >= &atorstack[NSTACK])
264 cant("operator stack overflow");
266 if(cursubid >= NSUBEXP)
275 if(andp <= &andstack[0])
277 regerror_c(Emissop, op);
279 regerror(Ebadregexp);
286 if(atorp <= &atorstack[0])
287 cant("operator stack underflow");
298 while(pri==RBRA || atorp[-1]>=pri){
302 inst2 = newinst(RBRA);
303 inst2->subid = *subidp;
304 op1->last->next = inst2;
305 inst1 = newinst(LBRA);
306 inst1->subid = *subidp;
307 inst1->next = op1->first;
308 pushand(inst1, inst2);
309 return; /* must have been RBRA */
311 panic("unknown regexp operator");
316 inst2 = newinst(NOP);
317 op2->last->next = inst2;
318 op1->last->next = inst2;
320 inst1->right = op1->first;
321 inst1->left = op2->first;
322 pushand(inst1, inst2);
327 if(backwards && op2->first->type!=END)
328 t = op1, op1 = op2, op2 = t;
329 op1->last->next = op2->first;
330 pushand(op1->first, op2->last);
335 op2->last->next = inst1;
336 inst1->right = op2->first;
337 pushand(inst1, inst1);
342 op2->last->next = inst1;
343 inst1->right = op2->first;
344 pushand(op2->first, inst1);
349 inst2 = newinst(NOP);
351 inst1->right = op2->first;
352 op2->last->next = inst2;
353 pushand(inst1, inst2);
361 optimize(Inst *start)
365 for(inst=start; inst->type!=END; inst++){
367 while(target->type == NOP)
368 target = target->next;
379 dprint("operators\n");
380 for(ip = atorstack; ip<atorp; ip++)
381 dprint("0%o\n", *ip);
382 dprint("operands\n");
383 for(stk = andstack; stk<andp; stk++)
384 dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
392 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
393 l->left-program, l->right-program);
413 if((c= *exprp++)=='n')
418 --exprp; /* In case we come here again */
457 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
459 if(exprp[0] == '\\'){
465 return *exprp++|(Runemask+1);
476 classp = emalloc(DCLASS*RUNESIZE);
479 /* we have already seen the '[' */
481 classp[n++] = '\n'; /* don't match newline in negate case */
486 while((c1 = nextrec()) != ']'){
492 if(n+4 >= na){ /* 3 runes plus NUL */
494 classp = erealloc(classp, na*RUNESIZE);
497 exprp++; /* eat '-' */
498 if((c2 = nextrec()) == ']')
500 classp[n+0] = Runemax;
501 classp[n+1] = c1 & Runemask;
502 classp[n+2] = c2 & Runemask;
505 classp[n++] = c1 & Runemask;
508 if(nclass == Nclass){
510 class = erealloc(class, Nclass*sizeof(Rune*));
512 class[nclass++] = classp;
516 classmatch(int classno, int c, int negate)
523 if(p[1]<=c && c<=p[2])
533 * Note optimization in addinst:
534 * *l must be pending when addinst called; if *l has been looked
535 * at already, the optimization is a bug.
538 addinst(Ilist *l, Inst *inst, Rangeset *sep)
542 for(p = l; p->inst; p++){
544 if((sep)->p[0].p1 < p->se.p[0].p1)
545 p->se= *sep; /* this would be bug */
546 return 0; /* It's already there */
556 execute(File *f, Posn startp, Posn eof)
565 int startchar = startinst->type<OPERATOR? startinst->type : 0;
567 list[0][0].inst = list[1][0].inst = 0;
569 /* Execute machine once for each character */
575 case 0: /* let loop run one more click */
578 case 1: /* expired; wrap to beginning */
579 if(sel.p[0].p1>=0 || eof!=INFINITY)
581 list[0][0].inst = list[1][0].inst = 0;
587 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
589 /* fast check for first char */
590 if(startchar && nnl==0 && c!=startchar)
597 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
598 /* Add first instruction to this list */
600 if(addinst(tl, startinst, &sempty))
605 /* Execute machine until this list is empty */
606 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
609 default: /* regular character */
612 if(addinst(nl, inst->next, &tlp->se))
619 tlp->se.p[inst->subid].p1 = p;
624 tlp->se.p[inst->subid].p2 = p;
632 if(p==0 || filereadc(f, p - 1)=='\n'){
643 if(c>=0 && classmatch(inst->rclass, c, 0))
647 if(c>=0 && classmatch(inst->rclass, c, 1))
651 /* evaluate right choice later */
652 if(addinst(tlp, inst->right, &tlp->se))
655 /* efficiency: advance and re-evaluate */
658 case END: /* Match! */
666 return sel.p[0].p1>=0;
670 newmatch(Rangeset *sp)
674 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
675 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
676 for(i = 0; i<NSUBEXP; i++)
681 bexecute(File *f, Posn startp)
690 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
692 list[0][0].inst = list[1][0].inst = 0;
694 /* Execute machine once for each character, including terminal NUL */
697 if((c = filereadc(f, p - 1))==-1){
699 case 0: /* let loop run one more click */
702 case 1: /* expired; wrap to end */
706 list[0][0].inst = list[1][0].inst = 0;
712 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
714 /* fast check for first char */
715 if(startchar && nnl==0 && c!=startchar)
722 if(sel.p[0].p1<0 && (!wrapped || p>startp)){
723 /* Add first instruction to this list */
724 /* the minus is so the optimizations in addinst work */
726 if(addinst(tl, bstartinst, &sempty))
731 /* Execute machine until this list is empty */
732 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
735 default: /* regular character */
738 if(addinst(nl, inst->next, &tlp->se))
745 tlp->se.p[inst->subid].p1 = p;
750 tlp->se.p[inst->subid].p2 = p;
765 if(p==f->nc || filereadc(f, p)=='\n')
769 if(c>=0 && classmatch(inst->rclass, c, 0))
773 if(c>=0 && classmatch(inst->rclass, c, 1))
777 /* evaluate right choice later */
778 if(addinst(tl, inst->right, &tlp->se))
781 /* efficiency: advance and re-evaluate */
784 case END: /* Match! */
785 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
793 return sel.p[0].p1>=0;
797 bnewmatch(Rangeset *sp)
800 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
801 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */
802 sel.p[i].p1 = sp->p[i].p2;
803 sel.p[i].p2 = sp->p[i].p1;