8 typedef struct Inst Inst;
12 long type; /* < 0x10000 ==> literal, otherwise action */
26 #define subid r.rsubid
27 #define rclass r.class
28 #define other r.rother
29 #define right r.rright
36 Inst *startinst; /* First inst. of program; might not be program[0] */
37 Inst *bstartinst; /* same for backwards machine */
39 typedef struct Ilist Ilist;
42 Inst *inst; /* Instruction of the thread */
44 Posn startp; /* first char of match */
49 Ilist *tl, *nl; /* This list, next list */
50 Ilist list[2][NLIST+1]; /* +1 for trailing null */
51 static Rangeset sempty;
56 * 0x100xx are operators, value == precedence
57 * 0x200xx are tokens, i.e. operands for operators
59 #define OPERATOR 0x10000 /* Bitmask of all operators */
60 #define START 0x10000 /* Start, used for marker on stack */
61 #define RBRA 0x10001 /* Right bracket, ) */
62 #define LBRA 0x10002 /* Left bracket, ( */
63 #define OR 0x10003 /* Alternation, | */
64 #define CAT 0x10004 /* Concatentation, implicit operator */
65 #define STAR 0x10005 /* Closure, * */
66 #define PLUS 0x10006 /* a+ == aa* */
67 #define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
68 #define ANY 0x20000 /* Any character but newline, . */
69 #define NOP 0x20001 /* No operation, internal use only */
70 #define BOL 0x20002 /* Beginning of line, ^ */
71 #define EOL 0x20003 /* End of line, $ */
72 #define CCLASS 0x20004 /* Character class, [] */
73 #define NCCLASS 0x20005 /* Negated character class, [^] */
74 #define END 0x20077 /* Terminate: match found */
76 #define ISATOR 0x10000
82 typedef struct Node Node;
90 Node andstack[NSTACK];
92 int atorstack[NSTACK];
94 int lastwasand; /* Last token was operand */
96 int subidstack[NSTACK];
100 Rune *exprp; /* pointer to next character in source expression */
101 #define DCLASS 10 /* allocation increment */
102 int nclass; /* number active */
103 int Nclass; /* high water mark */
107 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
108 void newmatch(Rangeset*);
109 void bnewmatch(Rangeset*);
110 void pushand(Inst*, Inst*);
114 void startlex(Rune*);
119 void optimize(Inst*);
120 void bldcclass(void);
125 Strzero(&lastregexp);
130 regerror_c(Err e, int c)
132 Strzero(&lastregexp);
139 if(progp >= &program[NPROG])
158 /* Start with a low priority operator to prime parser */
160 while((token=lex()) != END){
161 if((token&ISATOR) == OPERATOR)
166 /* Close with a low priority operator */
173 --andp; /* points to first and only operand */
183 if(Strcmp(s, &lastregexp)==0)
185 for(i=0; i<nclass; i++)
190 startinst = realcompile(s->s);
194 bstartinst = realcompile(s->s);
196 Strduplstr(&lastregexp, s);
204 operator(CAT); /* catenate is implicit */
208 i->type = NCCLASS; /* UGH */
209 i->rclass = nclass-1; /* UGH */
218 if(t==RBRA && --nbra<0)
222 * if(++cursubid >= NSUBEXP)
225 cursubid++; /* silently ignored */
234 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
235 lastwasand = TRUE; /* these look like operands */
243 sprint(buf, "regexp: can't happen: %s", s);
248 pushand(Inst *f, Inst *l)
250 if(andp >= &andstack[NSTACK])
251 cant("operand stack overflow");
260 if(atorp >= &atorstack[NSTACK])
261 cant("operator stack overflow");
263 if(cursubid >= NSUBEXP)
272 if(andp <= &andstack[0])
274 regerror_c(Emissop, op);
276 regerror(Ebadregexp);
283 if(atorp <= &atorstack[0])
284 cant("operator stack underflow");
295 while(pri==RBRA || atorp[-1]>=pri){
299 inst2 = newinst(RBRA);
300 inst2->subid = *subidp;
301 op1->last->next = inst2;
302 inst1 = newinst(LBRA);
303 inst1->subid = *subidp;
304 inst1->next = op1->first;
305 pushand(inst1, inst2);
306 return; /* must have been RBRA */
308 panic("unknown regexp operator");
313 inst2 = newinst(NOP);
314 op2->last->next = inst2;
315 op1->last->next = inst2;
317 inst1->right = op1->first;
318 inst1->left = op2->first;
319 pushand(inst1, inst2);
324 if(backwards && op2->first->type!=END)
325 t = op1, op1 = op2, op2 = t;
326 op1->last->next = op2->first;
327 pushand(op1->first, op2->last);
332 op2->last->next = inst1;
333 inst1->right = op2->first;
334 pushand(inst1, inst1);
339 op2->last->next = inst1;
340 inst1->right = op2->first;
341 pushand(op2->first, inst1);
346 inst2 = newinst(NOP);
348 inst1->right = op2->first;
349 op2->last->next = inst2;
350 pushand(inst1, inst2);
358 optimize(Inst *start)
362 for(inst=start; inst->type!=END; inst++){
364 while(target->type == NOP)
365 target = target->next;
376 dprint("operators\n");
377 for(ip = atorstack; ip<atorp; ip++)
378 dprint("0%o\n", *ip);
379 dprint("operands\n");
380 for(stk = andstack; stk<andp; stk++)
381 dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
389 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
390 l->left-program, l->right-program);
410 if((c= *exprp++)=='n')
415 --exprp; /* In case we come here again */
454 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
456 if(exprp[0] == '\\'){
462 return *exprp++|0x10000;
473 classp = emalloc(DCLASS*RUNESIZE);
476 /* we have already seen the '[' */
478 classp[n++] = '\n'; /* don't match newline in negate case */
483 while((c1 = nextrec()) != ']'){
489 if(n+4 >= na){ /* 3 runes plus NUL */
491 classp = erealloc(classp, na*RUNESIZE);
494 exprp++; /* eat '-' */
495 if((c2 = nextrec()) == ']')
497 classp[n+0] = 0xFFFF;
505 if(nclass == Nclass){
507 class = erealloc(class, Nclass*sizeof(Rune*));
509 class[nclass++] = classp;
513 classmatch(int classno, int c, int negate)
520 if(p[1]<=c && c<=p[2])
530 * Note optimization in addinst:
531 * *l must be pending when addinst called; if *l has been looked
532 * at already, the optimization is a bug.
535 addinst(Ilist *l, Inst *inst, Rangeset *sep)
539 for(p = l; p->inst; p++){
541 if((sep)->p[0].p1 < p->se.p[0].p1)
542 p->se= *sep; /* this would be bug */
543 return 0; /* It's already there */
553 execute(File *f, Posn startp, Posn eof)
562 int startchar = startinst->type<OPERATOR? startinst->type : 0;
564 list[0][0].inst = list[1][0].inst = 0;
566 /* Execute machine once for each character */
572 case 0: /* let loop run one more click */
575 case 1: /* expired; wrap to beginning */
576 if(sel.p[0].p1>=0 || eof!=INFINITY)
578 list[0][0].inst = list[1][0].inst = 0;
584 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
586 /* fast check for first char */
587 if(startchar && nnl==0 && c!=startchar)
594 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
595 /* Add first instruction to this list */
597 if(addinst(tl, startinst, &sempty))
602 /* Execute machine until this list is empty */
603 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
606 default: /* regular character */
609 if(addinst(nl, inst->next, &tlp->se))
616 tlp->se.p[inst->subid].p1 = p;
621 tlp->se.p[inst->subid].p2 = p;
629 if(p==0 || filereadc(f, p - 1)=='\n'){
640 if(c>=0 && classmatch(inst->rclass, c, 0))
644 if(c>=0 && classmatch(inst->rclass, c, 1))
648 /* evaluate right choice later */
649 if(addinst(tlp, inst->right, &tlp->se))
652 /* efficiency: advance and re-evaluate */
655 case END: /* Match! */
663 return sel.p[0].p1>=0;
667 newmatch(Rangeset *sp)
671 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
672 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
673 for(i = 0; i<NSUBEXP; i++)
678 bexecute(File *f, Posn startp)
687 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
689 list[0][0].inst = list[1][0].inst = 0;
691 /* Execute machine once for each character, including terminal NUL */
694 if((c = filereadc(f, p - 1))==-1){
696 case 0: /* let loop run one more click */
699 case 1: /* expired; wrap to end */
703 list[0][0].inst = list[1][0].inst = 0;
709 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
711 /* fast check for first char */
712 if(startchar && nnl==0 && c!=startchar)
719 if(sel.p[0].p1<0 && (!wrapped || p>startp)){
720 /* Add first instruction to this list */
721 /* the minus is so the optimizations in addinst work */
723 if(addinst(tl, bstartinst, &sempty))
728 /* Execute machine until this list is empty */
729 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
732 default: /* regular character */
735 if(addinst(nl, inst->next, &tlp->se))
742 tlp->se.p[inst->subid].p1 = p;
747 tlp->se.p[inst->subid].p2 = p;
762 if(p==f->nc || filereadc(f, p)=='\n')
766 if(c>=0 && classmatch(inst->rclass, c, 0))
770 if(c>=0 && classmatch(inst->rclass, c, 1))
774 /* evaluate right choice later */
775 if(addinst(tl, inst->right, &tlp->se))
778 /* efficiency: advance and re-evaluate */
781 case END: /* Match! */
782 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
790 return sel.p[0].p1>=0;
794 bnewmatch(Rangeset *sp)
797 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
798 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */
799 sel.p[i].p1 = sp->p[i].p2;
800 sel.p[i].p2 = sp->p[i].p1;