6 #include "/sys/src/libregexp/regcomp.h"
13 * Standard NFA determinization and DFA minimization.
15 typedef struct Deter Deter;
16 typedef struct Reiset Reiset;
20 /* state of determinization */
23 jmp_buf kaboom; /* jmp on error */
25 Bin *bin; /* bin for temporary allocations */
27 Reprog *p; /* program being determinized */
28 uint ninst; /* number of instructions in program */
30 Reiset *alloc; /* chain of all Reisets */
33 Reiset **hash; /* hash of all Reisets */
36 Reiset *tmp; /* temporaries for walk */
39 Rune *c; /* ``interesting'' characters */
43 /* set of Reinsts: perhaps we should use a bit list instead of the indices? */
46 uint *inst; /* indices of instructions in set */
47 uint ninst; /* size of set */
49 Reiset *next; /* d.alloc chain */
50 Reiset *hash; /* d.hash chain */
51 Reiset **delta; /* where to go on each interesting char */
52 uint id; /* assigned id during minimization */
53 uint isfinal; /* is an accepting (final) state */
57 ralloc(Deter *d, int ninst)
61 t = binalloc(&d->bin, sizeof(Reiset)+2*d->nc*sizeof(Reiset*)+sizeof(uint)*ninst, 0);
63 longjmp(d->kaboom, 1);
64 t->delta = (Reiset**)&t[1];
65 t->inst = (uint*)&t->delta[2*d->nc];
69 /* find the canonical form a given Reiset */
71 findreiset(Deter *d, Reiset *s)
78 for(i=0; i<s->ninst; i++)
79 h = h*1000003 + s->inst[i];
82 szinst = s->ninst*sizeof(s->inst[0]);
83 for(t=d->hash[h]; t; t=t->hash)
84 if(t->ninst==s->ninst && memcmp(t->inst, s->inst, szinst)==0)
87 t = ralloc(d, s->ninst);
96 memmove(t->inst, s->inst, szinst);
98 /* delta is filled in later */
103 /* convert bits to a real reiset */
105 bits2reiset(Deter *d, uchar *bits)
112 for(k=0; k<d->ninst; k++)
114 s->inst[s->ninst++] = k;
115 return findreiset(d, s);
118 /* add n to state set; if n < k, need to go around again */
120 add(int n, uchar *bits, int k)
128 /* update bits to follow all the empty (non-character-related) transitions possible */
130 followempty(Deter *d, uchar *bits, int bol, int eol)
137 for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
143 again |= add(i->next - d->p->firstinst, bits, k);
146 again |= add(i->left - d->p->firstinst, bits, k);
147 again |= add(i->right - d->p->firstinst, bits, k);
151 again |= add(i->next - d->p->firstinst, bits, k);
155 again |= add(i->next - d->p->firstinst, bits, k);
162 * Clear bits for useless transitions. We could do this during
163 * the switch above, but then we have no guarantee of termination
164 * if we get a loop in the regexp.
166 for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
182 * Where does s go if it sees rune r?
183 * Eol is true if a $ matches the string at the position just after r.
186 transition(Deter *d, Reiset *s, Rune r, uint eol)
194 memset(bits, 0, d->ninst);
196 inst0 = d->p->firstinst;
197 for(k=0; k < s->ninst; k++){
198 i = inst0 + s->inst[k];
201 werrstr("bad reprog: got type %d", i->type);
202 longjmp(d->kaboom, 1);
208 werrstr("internal error: got type %d", i->type);
209 longjmp(d->kaboom, 1);
213 bits[i->next - inst0] = 1;
217 bits[i->next - inst0] = 1;
220 bits[i->next - inst0] = 1;
228 for(rp = i->cp->spans; rp < ep; rp += 2)
229 if(rp[0] <= r && r <= rp[1])
231 if((rp < ep) ^! (i->type == CCLASS))
232 bits[i->next - inst0] = 1;
239 followempty(d, bits, r=='\n', eol);
240 return bits2reiset(d, bits);
244 countinst(Reprog *pp)
257 set(Deter *d, u32int **tab, Rune r)
261 if((u = tab[r/4096]) == nil){
262 u = binalloc(&d->bin, 4096/8, 1);
264 longjmp(d->kaboom, 1);
267 u[(r%4096)/32] |= 1<<(r%32);
271 * Compute the list of important characters.
272 * Other characters behave like the ones that surround them.
275 findchars(Deter *d, Reprog *p)
277 u32int *tab[65536/4096], *u, x;
282 memset(tab, 0, sizeof tab);
285 for(i=p->firstinst; i->type; i++){
288 set(d, tab, L'\n'-1);
290 set(d, tab, L'\n'+1);
298 set(d, tab, L'\n'-1);
300 set(d, tab, L'\n'+1);
304 for(rp = i->cp->spans; rp < ep; rp += 2){
305 set(d, tab, rp[0]-1);
308 set(d, tab, rp[1]+1);
315 for(k=0; k<nelem(tab); k++){
316 if((u = tab[k]) == nil)
318 for(m=0; m<4096/32; m++){
327 d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0);
329 longjmp(d->kaboom, 1);
333 for(k=0; k<nelem(tab); k++){
334 if((u = tab[k]) == nil)
336 for(m=0; m<4096/32; m++){
341 d->c[n++] = k*4096+m*32+a;
351 * convert the Deter and Reisets into a Dreprog.
352 * if dp and c are nil, just return the count of Drecases needed.
355 buildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c)
363 for(i=0; i<nid; i++){
367 di->isfinal = s->isfinal;
371 for(j=0; j<2*d->nc; j++){
372 if(s->delta[j]->id != id){
373 id = s->delta[j]->id;
375 c[n].start = ((j/d->nc)<<16) | d->c[j%d->nc];
376 c[n].next = &dp->inst[id];
382 if(n == 1 && c[0].next == di)
397 uint again, n, nid, id;
399 Reiset **id2set, *s, *t, *start[4];
403 memset(&d, 0, sizeof d);
405 if(setjmp(d.kaboom)){
411 d.ninst = countinst(p);
416 /* round up to power of two; this loop is the least of our efficiency problems */
420 d.hash = binalloc(&d.bin, d.nhash*sizeof(Reinst*), 1);
422 /* get list of important runes */
426 print("relevant chars are: «%S»\n", d.c+1);
429 d.bits = bits = binalloc(&d.bin, d.ninst, 0);
430 d.tmp = ralloc(&d, d.ninst);
436 /* 4 start states, depending on initial bol, eol */
438 memset(bits, 0, d.ninst);
439 bits[p->startinst - p->firstinst] = 1;
440 followempty(&d, bits, n&1, n&2);
441 start[n] = bits2reiset(&d, bits);
444 /* explore the reiset space */
445 for(s=d.alloc; s; s=s->next)
446 for(n=0; n<2*d.nc; n++)
447 s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc);
451 for(s=d.alloc; s; s=s->next)
460 /* first class division is final or not */
461 for(s=d.alloc; s; s=s->next){
463 for(n=0; n<s->ninst; n++)
464 if(p->firstinst[s->inst[n]].type == END)
469 /* divide states with different transition tables in id space */
473 for(s=d.alloc; s; s=s->next){
475 for(t=s->next; t; t=t->next){
478 for(n=0; n<2*d.nc; n++){
479 /* until we finish the for(t) loop, s->id and id are same */
480 if((s->delta[n]->id == t->delta[n]->id)
481 || (s->delta[n]->id == s->id && t->delta[n]->id == id)
482 || (s->delta[n]->id == id && t->delta[n]->id == s->id))
501 id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1);
503 longjmp(d.kaboom, 1);
504 for(s=d.alloc; s; s=s->next)
507 n = buildprog(&d, id2set, nid, nil, nil);
508 dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1);
510 longjmp(d.kaboom, 1);
511 c = (Drecase*)&dp->inst[nid];
512 buildprog(&d, id2set, nid, dp, c);
515 dp->start[n] = &dp->inst[start[n]->id];
523 dregexec(Dreprog *p, char *s, int bol)
532 i = p->start[(bol ? 1 : 0) | (s[1]=='\n' ? 2 : 0)];
544 if((*s&0xFF) < Runeself){
548 n = chartorune(&r, s);
552 if(s[n] == '\n' || s[n] == '\0')
576 for(s=d->alloc; s; s=s->next){
579 for(i=0; i<2*d->nc; i++){
580 if(id != s->delta[i]->id){
584 print(" [%C$", d->c[i%d->nc]);
586 print(" [%C", d->c[i%d->nc]);
587 print(" %d]", s->delta[i]->id);
588 id = s->delta[i]->id;
603 print("%ld:\t0%o\t%ld\t%ld", l-pp->firstinst, l->type,
604 l->left-pp->firstinst, l->right-pp->firstinst);
606 print("\t%C\n", l->r);
607 else if(l->type == CCLASS || l->type == NCCLASS){
609 if(l->type == NCCLASS)
611 for(p = l->cp->spans; p < l->cp->end; p += 2)
615 print("%C-%C", p[0], p[1]);
628 print("start %ld %ld %ld %ld\n",
629 pp->start[0]-pp->inst,
630 pp->start[1]-pp->inst,
631 pp->start[2]-pp->inst,
632 pp->start[3]-pp->inst);
634 for(i=0; i<pp->ninst; i++){
637 for(j=0; j<l->nc; j++){
640 if(l->c[j].start != 1)
643 print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : "");
646 print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : "");
647 print("] %ld", l->c[j].next - pp->inst);
659 main(int argc, char **argv)
666 p = regcomp(argv[i]);
668 print("=== %s: bad regexp\n", argv[i]);
670 // print("=== %s\n", argv[i]);
676 for(i=2; i<argc; i++)
677 print("match %d\n", dregexec(dp, argv[i], 0));
683 Bprintdfa(Biobuf *b, Dreprog *p)
687 Bprint(b, "# dreprog\n");
689 for(i=0; i<p->ninst; i++)
691 Bprint(b, "%d %d %zd %zd %zd %zd\n", p->ninst, nc,
692 p->start[0]-p->inst, p->start[1]-p->inst,
693 p->start[2]-p->inst, p->start[3]-p->inst);
694 for(i=0; i<p->ninst; i++){
695 Bprint(b, "%d %d %d", p->inst[i].isfinal, p->inst[i].isloop, p->inst[i].nc);
696 for(j=0; j<p->inst[i].nc; j++)
697 Bprint(b, " %d %zd", p->inst[i].c[j].start, p->inst[i].c[j].next-p->inst);
703 egetline(Biobuf *b, int c, jmp_buf jb)
710 p[Blinelen(b)-1] = '\0';
715 egetc(Biobuf *b, int c, jmp_buf jb)
722 egetnum(Biobuf *b, int want, jmp_buf jb)
729 while((c = Bgetc(b)) != Beof){
730 if(c < '0' || c > '9'){
735 if(first || c != want){
736 werrstr("format error");
744 werrstr("unexpected eof");
766 s = egetline(b, '\n', jb);
767 if(strcmp(s, "# dreprog") != 0){
768 werrstr("format error");
772 ninst = egetnum(b, ' ', jb);
773 nc = egetnum(b, ' ', jb);
775 p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1);
778 c = (Drecase*)&p->inst[ninst];
780 p->start[0] = &p->inst[egetnum(b, ' ', jb)];
781 p->start[1] = &p->inst[egetnum(b, ' ', jb)];
782 p->start[2] = &p->inst[egetnum(b, ' ', jb)];
783 p->start[3] = &p->inst[egetnum(b, '\n', jb)];
785 for(j=0; j<ninst; j++){
787 l->isfinal = egetnum(b, ' ', jb);
788 l->isloop = egetnum(b, ' ', jb);
789 l->nc = egetnum(b, 0, jb);
791 for(k=0; k<l->nc; k++){
793 c->start = egetnum(b, ' ', jb);
794 c->next = &p->inst[egetnum(b, 0, jb)];