X-Git-Url: https://git.lizzy.rs/?a=blobdiff_plain;f=sys%2Fsrc%2Flibregexp%2Fregcomp.c;h=c7b3b9b870385f52a24cca568267f36538faa7ab;hb=cea9e2267a70d17173c2b0fcdc421ee11b66ee79;hp=e48ef15acb58b69f089100ce2bad237316e1d544;hpb=a9060cc06bee66e12fe16644511f181a4b0cdbd3;p=plan9front.git diff --git a/sys/src/libregexp/regcomp.c b/sys/src/libregexp/regcomp.c index e48ef15ac..c7b3b9b87 100644 --- a/sys/src/libregexp/regcomp.c +++ b/sys/src/libregexp/regcomp.c @@ -1,567 +1,569 @@ #include #include -#include "regexp.h" -#include "regcomp.h" - -#define TRUE 1 -#define FALSE 0 - -/* - * Parser Information - */ -typedef -struct Node -{ - Reinst* first; - Reinst* last; -}Node; - -/* max character classes per program is nelem(reprog->class) */ -static Reprog *reprog; - -/* max rune ranges per character class is nelem(classp->spans)/2 */ -#define NCCRUNE nelem(classp->spans) - -#define NSTACK 20 -static Node andstack[NSTACK]; -static Node *andp; -static int atorstack[NSTACK]; -static int* atorp; -static int cursubid; /* id of current subexpression */ -static int subidstack[NSTACK]; /* parallel to atorstack */ -static int* subidp; -static int lastwasand; /* Last token was operand */ -static int nbra; -static char* exprp; /* pointer to next character in source expression */ -static int lexdone; -static int nclass; -static Reclass*classp; -static Reinst* freep; -static int errors; -static Rune yyrune; /* last lex'd rune */ -static Reclass*yyclassp; /* last lex'd class */ - -/* predeclared crap */ -static void operator(int); -static void pushand(Reinst*, Reinst*); -static void pushator(int); -static void evaluntil(int); -static int bldcclass(void); - -static jmp_buf regkaboom; - -static void -rcerror(char *s) +#include +#include "regimpl.h" + +int instrcnt[] = { + [TANY] 2, + [TBOL] 1, + [TCAT] 0, + [TCLASS] 1, + [TEOL] 1, + [TNOTNL] 1, + [TOR] 2, + [TPLUS] 1, + [TQUES] 1, + [TRUNE] 1, + [TSTAR] 2, + [TSUB] 2 +}; + +static Renode* +node(Parselex *plex, int op, Renode *l, Renode *r) { - errors++; - regerror(s); - longjmp(regkaboom, 1); + Renode *n; + + plex->instrs += instrcnt[op]; + n = plex->next++; + n->op = op; + n->left = l; + n->right = r; + return n; } -static Reinst* -newinst(int t) +static Renode* +e3(Parselex *plex) { - freep->type = t; - freep->left = 0; - freep->right = 0; - return freep++; + char error[128]; + Renode *n; + + switch(lex(plex)) { + case LANY: + return node(plex, TANY, nil, nil); + case LBOL: + return node(plex, TBOL, nil, nil); + case LEOL: + return node(plex, TEOL, nil, nil); + case LRUNE: + n = node(plex, TRUNE, nil, nil); + n->r = plex->rune; + return n; + case LCLASS: + if(plex->nc) + return buildclassn(plex); + return buildclass(plex); + case LLPAR: + n = e0(plex); + n = node(plex, TSUB, n, nil); + if(lex(plex) != LRPAR) { + snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig); + regerror(error); + longjmp(plex->exitenv, 1); + } + return n; + default: + if(plex->rune) + snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune); + else + snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig); + regerror(error); + longjmp(plex->exitenv, 1); + } + return nil; } -static void -operand(int t) +static Renode* +e2(Parselex *plex) { - Reinst *i; - - if(lastwasand) - operator(CAT); /* catenate is implicit */ - i = newinst(t); - - if(t == CCLASS || t == NCCLASS) - i->cp = yyclassp; - if(t == RUNE) - i->r = yyrune; + Renode *n; - pushand(i, i); - lastwasand = TRUE; + n = e3(plex); + while(lex(plex) == LREP) { + switch(plex->rune) { + case L'*': + n = node(plex, TSTAR, n, nil); + break; + case L'+': + n = node(plex, TPLUS, n, nil); + break; + case L'?': + n = node(plex, TQUES, n, nil); + break; + } + } + plex->peek = 1; + return n; } -static void -operator(int t) +static Renode* +invert(Renode *n) { - if(t==RBRA && --nbra<0) - rcerror("unmatched right paren"); - if(t==LBRA){ - if(++cursubid >= NSUBEXP) - rcerror ("too many subexpressions"); - nbra++; - if(lastwasand) - operator(CAT); - } else - evaluntil(t); - if(t != RBRA) - pushator(t); - lastwasand = FALSE; - if(t==STAR || t==QUEST || t==PLUS || t==RBRA) - lastwasand = TRUE; /* these look like operands */ + Renode *n1; + + if(n->op != TCAT) + return n; + while(n->left->op == TCAT) { + n1 = n->left; + n->left = n1->right; + n1->right = n; + n = n1; + } + return n; } -static void -regerr2(char *s, int c) +static Renode* +e1(Parselex *plex) { - char buf[100]; - char *cp = buf; - while(*s) - *cp++ = *s++; - *cp++ = c; - *cp = '\0'; - rcerror(buf); -} + Renode *n; + int sym; -static void -cant(char *s) -{ - char buf[100]; - strcpy(buf, "can't happen: "); - strcat(buf, s); - rcerror(buf); + n = e2(plex); + for(;;) { + sym = lex(plex); + if(sym == LEND || sym == LOR || sym == LRPAR) + break; + plex->peek = 1; + n = node(plex, TCAT, n, e2(plex)); + } + plex->peek = 1; + return invert(n); } -static void -pushand(Reinst *f, Reinst *l) +static Renode* +e0(Parselex *plex) { - if(andp >= &andstack[NSTACK]) - cant("operand stack overflow"); - andp->first = f; - andp->last = l; - andp++; + Renode *n; + + n = e1(plex); + for(;;) { + if(lex(plex) != LOR) + break; + n = node(plex, TOR, n, e1(plex)); + } + plex->peek = 1; + return n; } -static void -pushator(int t) +static Parselex* +initplex(Parselex *plex, char *regstr, int lit) { - if(atorp >= &atorstack[NSTACK]) - cant("operator stack overflow"); - *atorp++ = t; - *subidp++ = cursubid; + plex->getnextr = lit ? getnextrlit : getnextr; + plex->rawexp = plex->orig = regstr; + plex->sub = 0; + plex->instrs = 0; + plex->peek = 0; + plex->done = 0; + return plex; } -static Node* -popand(int op) +static Reprog* +regcomp1(char *regstr, int nl, int lit) { - Reinst *inst; - - if(andp <= &andstack[0]){ - regerr2("missing operand for ", op); - inst = newinst(NOP); - pushand(inst,inst); + Reprog *reprog; + Parselex plex; + Renode *parsetr; + int regstrlen, maxthr; + + regstrlen = utflen(regstr); + initplex(&plex, regstr, lit); + plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2); + if(plex.nodes == nil) + return nil; + plex.next = plex.nodes; + + if(setjmp(plex.exitenv) != 0) { + free(plex.nodes); + return nil; } - return --andp; + + maxthr = regstrlen + 1; + parsetr = node(&plex, TSUB, e0(&plex), nil); + +// prtree(parsetr, 0, 1); + reprog = malloc(sizeof(Reprog) + + sizeof(Reinst) * plex.instrs + + sizeof(Rethread) * maxthr); + reprog->len = plex.instrs; + reprog->nthr = maxthr; + reprog->startinst = compile(parsetr, reprog, nl); + reprog->threads = (Rethread*)(reprog->startinst + reprog->len); + reprog->regstr = regstr; + + free(plex.nodes); + return reprog; } -static int -popator(void) +Reprog* +regcomp(char *str) { - if(atorp <= &atorstack[0]) - cant("operator stack underflow"); - --subidp; - return *--atorp; + return regcomp1(str, 0, 0); } -static void -evaluntil(int pri) +Reprog* +regcomplit(char *str) { - Node *op1, *op2; - Reinst *inst1, *inst2; - - while(pri==RBRA || atorp[-1]>=pri){ - switch(popator()){ - default: - rcerror("unknown operator in evaluntil"); - break; - case LBRA: /* must have been RBRA */ - op1 = popand('('); - inst2 = newinst(RBRA); - inst2->subid = *subidp; - op1->last->next = inst2; - inst1 = newinst(LBRA); - inst1->subid = *subidp; - inst1->next = op1->first; - pushand(inst1, inst2); - return; - case OR: - op2 = popand('|'); - op1 = popand('|'); - inst2 = newinst(NOP); - op2->last->next = inst2; - op1->last->next = inst2; - inst1 = newinst(OR); - inst1->right = op1->first; - inst1->left = op2->first; - pushand(inst1, inst2); - break; - case CAT: - op2 = popand(0); - op1 = popand(0); - op1->last->next = op2->first; - pushand(op1->first, op2->last); - break; - case STAR: - op2 = popand('*'); - inst1 = newinst(OR); - op2->last->next = inst1; - inst1->right = op2->first; - pushand(inst1, inst1); - break; - case PLUS: - op2 = popand('+'); - inst1 = newinst(OR); - op2->last->next = inst1; - inst1->right = op2->first; - pushand(op2->first, inst1); - break; - case QUEST: - op2 = popand('?'); - inst1 = newinst(OR); - inst2 = newinst(NOP); - inst1->left = inst2; - inst1->right = op2->first; - op2->last->next = inst2; - pushand(inst1, inst2); - break; - } - } + return regcomp1(str, 0, 1); } -static Reprog* -optimize(Reprog *pp) +Reprog* +regcompnl(char *str) { - Reinst *inst, *target; - int size; - Reprog *npp; - Reclass *cl; - int diff; - - /* - * get rid of NOOP chains - */ - for(inst=pp->firstinst; inst->type!=END; inst++){ - target = inst->next; - while(target->type == NOP) - target = target->next; - inst->next = target; - } - - /* - * The original allocation is for an area larger than - * necessary. Reallocate to the actual space used - * and then relocate the code. - */ - size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); - npp = realloc(pp, size); - if(npp==0 || npp==pp) - return pp; - diff = (char *)npp - (char *)pp; - freep = (Reinst *)((char *)freep + diff); - for(inst=npp->firstinst; insttype){ - case OR: - case STAR: - case PLUS: - case QUEST: - *(char **)&inst->right += diff; - break; - case CCLASS: - case NCCLASS: - *(char **)&inst->right += diff; - cl = inst->cp; - *(char **)&cl->end += diff; - break; - } - *(char **)&inst->left += diff; - } - *(char **)&npp->startinst += diff; - return npp; + return regcomp1(str, 1, 0); } -#ifdef DEBUG -static void -dumpstack(void){ - Node *stk; - int *ip; - - print("operators\n"); - for(ip=atorstack; ipfirst->type, stk->last->type); +static Reinst* +compile1(Renode *renode, Reinst *reinst, int *sub, int nl) +{ + Reinst *i; + int s; + +Tailcall: + if(renode == nil) + return reinst; + switch(renode->op) { + case TCLASS: + reinst->op = OCLASS; + reinst->r = renode->r; + reinst->r1 = renode->r1; + reinst->a = reinst + 1 + renode->nclass; + renode = renode->left; + reinst++; + goto Tailcall; + case TCAT: + reinst = compile1(renode->left, reinst, sub, nl); + renode = renode->right; + goto Tailcall; + case TOR: + reinst->op = OSPLIT; + reinst->a = reinst + 1; + i = compile1(renode->left, reinst->a, sub, nl); + reinst->b = i + 1; + i->op = OJMP; + i->a = compile1(renode->right, reinst->b, sub, nl); + return i->a; + case TSTAR: + reinst->op = OSPLIT; + reinst->a = reinst + 1; + i = compile1(renode->left, reinst->a, sub, nl); + reinst->b = i + 1; + i->op = OJMP; + i->a = reinst; + return reinst->b; + case TPLUS: + i = reinst; + reinst = compile1(renode->left, reinst, sub, nl); + reinst->op = OSPLIT; + reinst->a = i; + reinst->b = reinst + 1; + return reinst->b; + case TQUES: + reinst->op = OSPLIT; + reinst->a = reinst + 1; + reinst->b = compile1(renode->left, reinst->a, sub, nl); + return reinst->b; + case TSUB: + reinst->op = OSAVE; + reinst->sub = s = (*sub)++; + reinst = compile1(renode->left, reinst+1, sub, nl); + reinst->op = OUNSAVE; + reinst->sub = s; + return reinst + 1; + case TANY: + if(nl == 0) + reinst++->op = ONOTNL; + reinst->op = OANY; + return reinst + 1; + case TRUNE: + reinst->op = ORUNE; + reinst->r = renode->r; + return reinst + 1; + case TNOTNL: + reinst->op = ONOTNL; + return reinst + 1; + case TEOL: + reinst->op = OEOL; + return reinst + 1; + case TBOL: + reinst->op = OBOL; + return reinst + 1; + } + return nil; } -static void -dump(Reprog *pp) +static Reinst* +compile(Renode *parsetr, Reprog *reprog, int nl) { - Reinst *l; - Rune *p; - - l = pp->firstinst; - do{ - print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, - l->left-pp->firstinst, l->right-pp->firstinst); - if(l->type == RUNE) - print("\t%C\n", l->r); - else if(l->type == CCLASS || l->type == NCCLASS){ - print("\t["); - if(l->type == NCCLASS) - print("^"); - for(p = l->cp->spans; p < l->cp->end; p += 2) - if(p[0] == p[1]) - print("%C", p[0]); - else - print("%C-%C", p[0], p[1]); - print("]\n"); - } else - print("\n"); - }while(l++->type); + Reinst *reinst, *end; + int sub; + + sub = 0; + reinst = (Reinst*)(reprog+1); + end = compile1(parsetr, reinst, &sub, nl); + assert(end <= reinst + reprog->len); + return reinst; } -#endif -static Reclass* -newclass(void) +static void +getnextr(Parselex *l) { - if(nclass >= nelem(reprog->class)) - rcerror("too many character classes; increase Reprog.class size"); - return &(classp[nclass++]); + l->literal = 0; + if(l->done) { + l->rune = L'\0'; + return; + } + l->rawexp += chartorune(&l->rune, l->rawexp); + if(*l->rawexp == 0) + l->done = 1; + if(l->rune == L'\\') + getnextrlit(l); } -static int -nextc(Rune *rp) +static void +getnextrlit(Parselex *l) { - if(lexdone){ - *rp = 0; - return 1; + l->literal = 1; + if(l->done) { + l->literal = 0; + l->rune = L'\0'; + return; } - exprp += chartorune(rp, exprp); - if(*rp == L'\\'){ - exprp += chartorune(rp, exprp); - return 1; - } - if(*rp == 0) - lexdone = 1; - return 0; + l->rawexp += chartorune(&l->rune, l->rawexp); + if(*l->rawexp == 0) + l->done = 1; } -static int -lex(int literal, int dot_type) +static int +lex(Parselex *l) { - int quoted; - - quoted = nextc(&yyrune); - if(literal || quoted){ - if(yyrune == 0) - return END; - return RUNE; + if(l->peek) { + l->peek = 0; + return l->peeklex; } - - switch(yyrune){ - case 0: - return END; + l->getnextr(l); + if(l->literal) + return l->peeklex = LRUNE; + switch(l->rune){ + case L'\0': + return l->peeklex = LEND; case L'*': - return STAR; case L'?': - return QUEST; case L'+': - return PLUS; + return l->peeklex = LREP; case L'|': - return OR; + return l->peeklex = LOR; case L'.': - return dot_type; + return l->peeklex = LANY; case L'(': - return LBRA; + return l->peeklex = LLPAR; case L')': - return RBRA; + return l->peeklex = LRPAR; case L'^': - return BOL; + return l->peeklex = LBOL; case L'$': - return EOL; + return l->peeklex = LEOL; case L'[': - return bldcclass(); + getclass(l); + return l->peeklex = LCLASS; } - return RUNE; + return l->peeklex = LRUNE; } static int -bldcclass(void) +pcmp(void *va, void *vb) { - int type; - Rune r[NCCRUNE]; - Rune *p, *ep, *np; - Rune rune; - int quoted; - - /* we have already seen the '[' */ - type = CCLASS; - yyclassp = newclass(); - - /* look ahead for negation */ - /* SPECIAL CASE!!! negated classes don't match \n */ - ep = r; - quoted = nextc(&rune); - if(!quoted && rune == L'^'){ - type = NCCLASS; - quoted = nextc(&rune); - *ep++ = L'\n'; - *ep++ = L'\n'; - } - - /* parse class into a set of spans */ - while(ep < &r[NCCRUNE-1]){ - if(rune == 0){ - rcerror("malformed '[]'"); - return 0; - } - if(!quoted && rune == L']') - break; - if(!quoted && rune == L'-'){ - if(ep == r){ - rcerror("malformed '[]'"); - return 0; - } - quoted = nextc(&rune); - if((!quoted && rune == L']') || rune == 0){ - rcerror("malformed '[]'"); - return 0; - } - *(ep-1) = rune; - } else { - *ep++ = rune; - *ep++ = rune; - } - quoted = nextc(&rune); - } - if(ep >= &r[NCCRUNE-1]) { - rcerror("char class too large; increase Reclass.spans size"); - return 0; - } + Rune *a, *b; - /* sort on span start */ - for(p = r; p < ep; p += 2){ - for(np = p; np < ep; np += 2) - if(*np < *p){ - rune = np[0]; - np[0] = p[0]; - p[0] = rune; - rune = np[1]; - np[1] = p[1]; - p[1] = rune; - } - } - - /* merge spans */ - np = yyclassp->spans; - p = r; - if(r == ep) - yyclassp->end = np; - else { - np[0] = *p++; - np[1] = *p++; - for(; p < ep; p += 2) - /* overlapping or adjacent ranges? */ - if(p[0] <= np[1] + 1){ - if(p[1] >= np[1]) - np[1] = p[1]; /* coalesce */ - } else { - np += 2; - np[0] = p[0]; - np[1] = p[1]; - } - yyclassp->end = np+2; - } + a = va; + b = vb; - return type; + if(a[0] < b[0]) + return 1; + if(a[0] > b[0]) + return -1; + if(a[1] < b[1]) + return 1; + if(a[1] > b[1]) + return -1; + return 0; } -static Reprog* -regcomp1(char *s, int literal, int dot_type) +static void +getclass(Parselex *l) { - int token; - Reprog *pp; - - /* get memory for the program */ - pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); - if(pp == 0){ - regerror("out of memory"); - return 0; + Rune *p, *q, t; + + l->nc = 0; + getnextrlit(l); + if(l->rune == L'^') { + l->nc = 1; + getnextrlit(l); } - freep = pp->firstinst; - classp = pp->class; - errors = 0; - - if(setjmp(regkaboom)) - goto out; - - /* go compile the sucker */ - lexdone = 0; - exprp = s; - nclass = 0; - nbra = 0; - atorp = atorstack; - andp = andstack; - subidp = subidstack; - lastwasand = FALSE; - cursubid = 0; - - /* Start with a low priority operator to prime parser */ - pushator(START-1); - while((token = lex(literal, dot_type)) != END){ - if((token&0300) == OPERATOR) - operator(token); - else - operand(token); + p = l->cpairs; + for(;;) { + *p = l->rune; + if(l->rune == L']') + break; + if(l->rune == L'-') { + regerror("Malformed class"); + longjmp(l->exitenv, 1); + } + if(l->rune == '\\') { + getnextrlit(l); + *p = l->rune; + } + if(l->rune == 0) { + regerror("No closing ] for class"); + longjmp(l->exitenv, 1); + } + getnextrlit(l); + if(l->rune == L'-') { + getnextrlit(l); + if(l->rune == L']') { + regerror("Malformed class"); + longjmp(l->exitenv, 1); + } + if(l->rune == L'-') { + regerror("Malformed class"); + longjmp(l->exitenv, 1); + } + if(l->rune == L'\\') + getnextrlit(l); + p[1] = l->rune; + if(p[0] > p[1]) { + t = p[0]; + p[0] = p[1]; + p[1] = t; + } + getnextrlit(l); + } else + p[1] = p[0]; + if(p >= l->cpairs + nelem(l->cpairs) - 2) { + regerror("Class too big\n"); + longjmp(l->exitenv, 1); + } + p += 2; } - - /* Close with a low priority operator */ - evaluntil(START); - - /* Force END */ - operand(END); - evaluntil(START); -#ifdef DEBUG - dumpstack(); -#endif - if(nbra) - rcerror("unmatched left paren"); - --andp; /* points to first and only operand */ - pp->startinst = andp->first; -#ifdef DEBUG - dump(pp); -#endif - pp = optimize(pp); -#ifdef DEBUG - print("start: %d\n", andp->first-pp->firstinst); - dump(pp); -#endif -out: - if(errors){ - free(pp); - pp = 0; + *p = L'\0'; + qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp); + q = l->cpairs; + for(p = l->cpairs+2; *p != 0; p += 2) { + if(p[1] < q[0] - 1) { + q[2] = p[0]; + q[3] = p[1]; + q += 2; + continue; + } + q[0] = p[0]; + if(p[1] > q[1]) + q[1] = p[1]; } - return pp; + q[2] = 0; } -extern Reprog* -regcomp(char *s) +/* classes are in descending order see pcmp */ +static Renode* +buildclassn(Parselex *l) { - return regcomp1(s, 0, ANY); + Renode *n; + Rune *p; + int i; + + i = 0; + p = l->cpairs; + n = node(l, TCLASS, nil, nil); + n->r = p[1] + 1; + n->r1 = Runemax; + n->nclass = i++; + + for(; *p != 0; p += 2) { + n = node(l, TCLASS, n, nil); + n->r = p[3] + 1; + n->r1 = p[0] - 1; + n->nclass = i++; + } + n->r = 0; + return node(l, TCAT, node(l, TNOTNL, nil, nil), n); } -extern Reprog* -regcomplit(char *s) +static Renode* +buildclass(Parselex *l) { - return regcomp1(s, 1, ANY); + Renode *n; + Rune *p; + int i; + + i = 0; + n = node(l, TCLASS, nil, nil); + n->r = Runemax + 1; + n->nclass = i++; + + for(p = l->cpairs; *p != 0; p += 2) { + n = node(l, TCLASS, n, nil); + n->r = p[0]; + n->r1 = p[1]; + n->nclass = i++; + } + return n; } -extern Reprog* -regcompnl(char *s) +static void +prtree(Renode *tree, int d, int f) { - return regcomp1(s, 0, ANYNL); + int i; + + if(tree == nil) + return; + if(f) + for(i = 0; i < d; i++) + print("\t"); + switch(tree->op) { + case TCAT: + prtree(tree->left, d, 0); + prtree(tree->right, d, 1); + break; + case TOR: + print("TOR\n"); + prtree(tree->left, d+1, 1); + for(i = 0; i < d; i++) + print("\t"); + print("|\n"); + prtree(tree->right, d+1, 1); + break; + case TSTAR: + print("*\n"); + prtree(tree->left, d+1, 1); + break; + case TPLUS: + print("+\n"); + prtree(tree->left, d+1, 1); + break; + case TQUES: + print("?\n"); + prtree(tree->left, d+1, 1); + break; + case TANY: + print(".\n"); + prtree(tree->left, d+1, 1); + break; + case TBOL: + print("^\n"); + break; + case TEOL: + print("$\n"); + break; + case TSUB: + print("TSUB\n"); + prtree(tree->left, d+1, 1); + break; + case TRUNE: + print("TRUNE: %C\n", tree->r); + break; + case TNOTNL: + print("TNOTNL: !\\n\n"); + break; + case TCLASS: + print("CLASS: %C-%C\n", tree->r, tree->r1); + prtree(tree->left, d, 1); + break; + } }