20 typedef struct Inst Inst;
23 uint type; /* <= Runemax+1 ==> literal, otherwise action */
40 Inst *startinst; /* First inst. of program; might not be program[0] */
41 Inst *bstartinst; /* same for backwards machine */
42 Channel *rechan; /* chan(Inst*) */
44 typedef struct Ilist Ilist;
47 Inst *inst; /* Instruction of the thread */
49 uint startp; /* first char of match */
54 Ilist *tl, *nl; /* This list, next list */
55 Ilist list[2][NLIST+1]; /* +1 for trailing null */
56 static Rangeset sempty;
61 * 0x100xx are operators, value == precedence
62 * 0x200xx are tokens, i.e. operands for operators
65 OPERATOR = Runemask+1, /* Bitmask of all operators */
66 START = OPERATOR, /* Start, used for marker on stack */
67 RBRA, /* Right bracket, ) */
68 LBRA, /* Left bracket, ( */
69 OR, /* Alternation, | */
70 CAT, /* Concatentation, implicit operator */
71 STAR, /* Closure, * */
73 QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
75 ANY = OPERATOR<<1, /* Any character but newline, . */
76 NOP, /* No operation, internal use only */
77 BOL, /* Beginning of line, ^ */
78 EOL, /* End of line, $ */
79 CCLASS, /* Character class, [] */
80 NCCLASS, /* Negated character class, [^] */
81 END, /* Terminate: match found */
90 typedef struct Node Node;
98 Node andstack[NSTACK];
100 int atorstack[NSTACK];
102 int lastwasand; /* Last token was operand */
104 int subidstack[NSTACK];
108 Rune *exprp; /* pointer to next character in source expression */
109 #define DCLASS 10 /* allocation increment */
110 int nclass; /* number active */
111 int Nclass; /* high water mark */
115 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
116 void newmatch(Rangeset*);
117 void bnewmatch(Rangeset*);
118 void pushand(Inst*, Inst*);
122 void startlex(Rune*);
127 void optimize(Inst*);
128 void bldcclass(void);
133 rechan = chancreate(sizeof(Inst*), 0);
134 lastregexp = runemalloc(1);
141 warning(nil, "regexp: %s\n", e);
149 if(progp >= &program[NPROG])
150 regerror("expression too long");
158 realcompile(void *arg)
163 threadsetname("regcomp");
171 /* Start with a low priority operator to prime parser */
173 while((token=lex()) != END){
174 if((token&ISATOR) == OPERATOR)
179 /* Close with a low priority operator */
185 regerror("unmatched `('");
186 --andp; /* points to first and only operand */
187 sendp(rechan, andp->first);
191 /* r is null terminated */
198 nr = runestrlen(r)+1;
199 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
202 for(i=0; i<nclass; i++)
208 threadcreate(realcompile, r, STACK);
209 startinst = recvp(rechan);
215 threadcreate(realcompile, r, STACK);
216 bstartinst = recvp(rechan);
217 if(bstartinst == nil)
220 lastregexp = runerealloc(lastregexp, nr);
221 runemove(lastregexp, r, nr);
230 operator(CAT); /* catenate is implicit */
234 i->type = NCCLASS; /* UGH */
235 i->class = nclass-1; /* UGH */
244 if(t==RBRA && --nbra<0)
245 regerror("unmatched `)'");
247 cursubid++; /* silently ignored */
256 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
257 lastwasand = TRUE; /* these look like operands */
261 pushand(Inst *f, Inst *l)
263 if(andp >= &andstack[NSTACK])
264 error("operand stack overflow");
273 if(atorp >= &atorstack[NSTACK])
274 error("operator stack overflow");
276 if(cursubid >= NRange)
287 if(andp <= &andstack[0])
289 sprint(buf, "missing operand for %c", op);
292 regerror("malformed regexp");
299 if(atorp <= &atorstack[0])
300 error("operator stack underflow");
311 while(pri==RBRA || atorp[-1]>=pri){
315 inst2 = newinst(RBRA);
316 inst2->subid = *subidp;
317 op1->last->next = inst2;
318 inst1 = newinst(LBRA);
319 inst1->subid = *subidp;
320 inst1->next = op1->first;
321 pushand(inst1, inst2);
322 return; /* must have been RBRA */
324 error("unknown regexp operator");
329 inst2 = newinst(NOP);
330 op2->last->next = inst2;
331 op1->last->next = inst2;
333 inst1->right = op1->first;
334 inst1->left = op2->first;
335 pushand(inst1, inst2);
340 if(backwards && op2->first->type!=END){
345 op1->last->next = op2->first;
346 pushand(op1->first, op2->last);
351 op2->last->next = inst1;
352 inst1->right = op2->first;
353 pushand(inst1, inst1);
358 op2->last->next = inst1;
359 inst1->right = op2->first;
360 pushand(op2->first, inst1);
365 inst2 = newinst(NOP);
367 inst1->right = op2->first;
368 op2->last->next = inst2;
369 pushand(inst1, inst2);
377 optimize(Inst *start)
381 for(inst=start; inst->type!=END; inst++){
383 while(target->type == NOP)
384 target = target->next;
405 if((c= *exprp++)=='n')
410 --exprp; /* In case we come here again */
450 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
451 regerror("malformed `[]'");
452 if(exprp[0] == '\\'){
458 return *exprp++|(Runemask+1);
469 classp = runemalloc(DCLASS);
472 /* we have already seen the '[' */
474 classp[n++] = '\n'; /* don't match newline in negate case */
479 while((c1 = nextrec()) != ']'){
483 regerror("malformed `[]'");
485 if(n+4 >= na){ /* 3 runes plus NUL */
487 classp = runerealloc(classp, na);
490 exprp++; /* eat '-' */
491 if((c2 = nextrec()) == ']')
493 classp[n+0] = Runemax;
494 classp[n+1] = c1 & Runemask;
495 classp[n+2] = c2 & Runemask;
498 classp[n++] = c1 & Runemask;
501 if(nclass == Nclass){
503 class = realloc(class, Nclass*sizeof(Rune*));
505 class[nclass++] = classp;
509 classmatch(int classno, int c, int negate)
516 if(p[1]<=c && c<=p[2])
526 * Note optimization in addinst:
527 * *l must be pending when addinst called; if *l has been looked
528 * at already, the optimization is a bug.
531 addinst(Ilist *l, Inst *inst, Rangeset *sep)
535 for(p = l; p->inst; p++){
537 if((sep)->r[0].q0 < p->se.r[0].q0)
538 p->se= *sep; /* this would be bug */
539 return 0; /* It's already there */
551 return startinst==nil || bstartinst==nil;
554 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
556 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
572 if(startinst->type<OPERATOR)
573 startchar = startinst->type;
574 list[0][0].inst = list[1][0].inst = nil;
580 /* Execute machine once for each character */
585 case 0: /* let loop run one more click */
588 case 1: /* expired; wrap to beginning */
589 if(sel.r[0].q0>=0 || eof!=Infinity)
591 list[0][0].inst = list[1][0].inst = nil;
599 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
606 /* fast check for first char */
607 if(startchar && nnl==0 && c!=startchar)
614 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
615 /* Add first instruction to this list */
617 if(addinst(tl, startinst, &sempty))
620 warning(nil, "regexp list overflow\n");
625 /* Execute machine until this list is empty */
626 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
629 default: /* regular character */
632 if(addinst(nl, inst->next, &tlp->se))
639 tlp->se.r[inst->subid].q0 = p;
644 tlp->se.r[inst->subid].q1 = p;
652 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
663 if(c>=0 && classmatch(inst->class, c, 0))
667 if(c>=0 && classmatch(inst->class, c, 1))
671 /* evaluate right choice later */
672 if(addinst(tlp, inst->right, &tlp->se))
675 /* efficiency: advance and re-evaluate */
678 case END: /* Match! */
687 return sel.r[0].q0 >= 0;
691 newmatch(Rangeset *sp)
693 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
694 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
699 rxbexecute(Text *t, uint startp, Rangeset *rp)
715 if(bstartinst->type<OPERATOR)
716 startchar = bstartinst->type;
717 list[0][0].inst = list[1][0].inst = nil;
719 /* Execute machine once for each character, including terminal NUL */
724 case 0: /* let loop run one more click */
727 case 1: /* expired; wrap to end */
730 list[0][0].inst = list[1][0].inst = nil;
739 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
741 c = textreadc(t, p-1);
743 /* fast check for first char */
744 if(startchar && nnl==0 && c!=startchar)
751 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
752 /* Add first instruction to this list */
753 /* the minus is so the optimizations in addinst work */
755 if(addinst(tl, bstartinst, &sempty))
758 warning(nil, "regexp list overflow\n");
763 /* Execute machine until this list is empty */
764 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
767 default: /* regular character */
770 if(addinst(nl, inst->next, &tlp->se))
777 tlp->se.r[inst->subid].q0 = p;
782 tlp->se.r[inst->subid].q1 = p;
797 if(p<t->file->nc && textreadc(t, p)=='\n')
801 if(c>0 && classmatch(inst->class, c, 0))
805 if(c>0 && classmatch(inst->class, c, 1))
809 /* evaluate right choice later */
810 if(addinst(tl, inst->right, &tlp->se))
813 /* efficiency: advance and re-evaluate */
816 case END: /* Match! */
817 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
826 return sel.r[0].q0 >= 0;
830 bnewmatch(Rangeset *sp)
834 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
835 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
836 sel.r[i].q0 = sp->r[i].q1;
837 sel.r[i].q1 = sp->r[i].q0;