]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/upas/bayes/dfa.c
merge
[plan9front.git] / sys / src / cmd / upas / bayes / dfa.c
1 #include <u.h>
2 #include <libc.h>
3 #include <bin.h>
4 #include <bio.h>
5 #include "regexp.h"
6 #include "regcomp.h"
7 #include "dfa.h"
8
9 void rdump(Reprog*);
10 void dump(Dreprog*);
11
12 /*
13  * Standard NFA determinization and DFA minimization.
14  */
15 typedef struct Deter Deter;
16 typedef struct Reiset Reiset;
17
18 void ddump(Deter*);
19
20 /* state of determinization */
21 struct Deter
22 {
23         jmp_buf kaboom; /* jmp on error */
24
25         Bin *bin;               /* bin for temporary allocations */
26
27         Reprog *p;      /* program being determinized */
28         uint ninst;             /* number of instructions in program */
29
30         Reiset *alloc;  /* chain of all Reisets */
31         Reiset **last;
32
33         Reiset **hash;  /* hash of all Reisets */
34         uint nhash;
35
36         Reiset *tmp;    /* temporaries for walk */
37         uchar *bits;
38
39         Rune *c;                /* ``interesting'' characters */
40         uint nc;
41 };
42
43 /* set of Reinsts: perhaps we should use a bit list instead of the indices? */
44 struct Reiset
45 {
46         uint *inst;             /* indices of instructions in set */
47         uint ninst;             /* size of set */
48
49         Reiset *next;   /* d.alloc chain */
50         Reiset *hash;   /* d.hash chain */
51         Reiset **delta; /* where to go on each interesting char */
52         uint id;                /* assigned id during minimization */
53         uint isfinal;   /* is an accepting (final) state */
54 };
55
56 static Reiset*
57 ralloc(Deter *d, int ninst)
58 {
59         Reiset *t;
60
61         t = binalloc(&d->bin, sizeof(Reiset)+2*d->nc*sizeof(Reiset*)+sizeof(uint)*ninst, 0);
62         if(t == nil)
63                 longjmp(d->kaboom, 1);
64         t->delta = (Reiset**)&t[1];
65         t->inst = (uint*)&t->delta[2*d->nc];
66         return t;
67 }
68
69 /* find the canonical form a given Reiset */
70 static Reiset*
71 findreiset(Deter *d, Reiset *s)
72 {
73         int i, szinst;
74         uint h;
75         Reiset *t;
76
77         h = 0;
78         for(i=0; i<s->ninst; i++)
79                 h = h*1000003 + s->inst[i];
80         h %= d->nhash;
81
82         szinst = s->ninst*sizeof(s->inst[0]);
83         for(t=d->hash[h]; t; t=t->hash)
84                 if(t->ninst==s->ninst && memcmp(t->inst, s->inst, szinst)==0)
85                         return t;
86
87         t = ralloc(d, s->ninst);
88         t->hash = d->hash[h];
89         d->hash[h] = t;
90
91         *d->last = t;
92         d->last = &t->next;
93         t->next = 0;
94
95         t->ninst = s->ninst;
96         memmove(t->inst, s->inst, szinst);
97
98         /* delta is filled in later */
99
100         return t;
101 }
102
103 /* convert bits to a real reiset */
104 static Reiset*
105 bits2reiset(Deter *d, uchar *bits)
106 {
107         int k;
108         Reiset *s;
109
110         s = d->tmp;
111         s->ninst = 0;
112         for(k=0; k<d->ninst; k++)
113                 if(bits[k])
114                         s->inst[s->ninst++] = k;
115         return findreiset(d, s);
116 }
117
118 /* add n to state set; if n < k, need to go around again */
119 static int
120 add(int n, uchar *bits, int k)
121 {
122         if(bits[n])
123                 return 0;
124         bits[n] = 1;
125         return n < k;
126 }
127
128 /* update bits to follow all the empty (non-character-related) transitions possible */
129 static void
130 followempty(Deter *d, uchar *bits, int bol, int eol)
131 {
132         int again, k;
133         Reinst *i;
134
135         do{
136                 again = 0;
137                 for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
138                         if(!bits[k])
139                                 continue;
140                         switch(i->type){
141                         case RBRA:
142                         case LBRA:
143                                 again |= add(i->next - d->p->firstinst, bits, k);
144                                 break;
145                         case OR:
146                                 again |= add(i->left - d->p->firstinst, bits, k);
147                                 again |= add(i->right - d->p->firstinst, bits, k);
148                                 break;
149                         case BOL:
150                                 if(bol)
151                                         again |= add(i->next - d->p->firstinst, bits, k);
152                                 break;
153                         case EOL:
154                                 if(eol)
155                                         again |= add(i->next - d->p->firstinst, bits, k);
156                                 break;
157                         }
158                 }
159         }while(again);
160
161         /*
162          * Clear bits for useless transitions.  We could do this during
163          * the switch above, but then we have no guarantee of termination
164          * if we get a loop in the regexp.
165          */
166         for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
167                 if(!bits[k])
168                         continue;
169                 switch(i->type){
170                 case RBRA:
171                 case LBRA:
172                 case OR:
173                 case BOL:
174                 case EOL:
175                         bits[k] = 0;
176                         break;
177                 }
178         }
179 }
180
181 /*
182  * Where does s go if it sees rune r?
183  * Eol is true if a $ matches the string at the position just after r.
184  */
185 static Reiset*
186 transition(Deter *d, Reiset *s, Rune r, uint eol)
187 {
188         int k;
189         uchar *bits;
190         Reinst *i, *inst0;
191         Rune *rp, *ep;
192
193         bits = d->bits;
194         memset(bits, 0, d->ninst);
195
196         inst0 = d->p->firstinst;
197         for(k=0; k < s->ninst; k++){
198                 i = inst0 + s->inst[k];
199                 switch(i->type){
200                 default:
201                         werrstr("bad reprog: got type %d", i->type);
202                         longjmp(d->kaboom, 1);
203                 case RBRA:
204                 case LBRA:
205                 case OR:
206                 case BOL:
207                 case EOL:
208                         werrstr("internal error: got type %d", i->type);
209                         longjmp(d->kaboom, 1);
210
211                 case RUNE:
212                         if(r == i->r)
213                                 bits[i->next - inst0] = 1;
214                         break;
215                 case ANY:
216                         if(r != L'\n')
217                                 bits[i->next - inst0] = 1;
218                         break;
219                 case ANYNL:
220                         bits[i->next - inst0] = 1;
221                         break;
222                 case NCCLASS:
223                         if(r == L'\n')
224                                 break;
225                         /* fall through */
226                 case CCLASS:
227                         ep = i->cp->end;
228                         for(rp = i->cp->spans; rp < ep; rp += 2)
229                                 if(rp[0] <= r && r <= rp[1])
230                                         break;
231                         if((rp < ep) ^! (i->type == CCLASS))
232                                 bits[i->next - inst0] = 1;
233                         break;
234                 case END:
235                         break;
236                 }
237         }
238
239         followempty(d, bits, r=='\n', eol);
240         return bits2reiset(d, bits);
241 }
242
243 static int
244 countinst(Reprog *pp)
245 {
246         int n;
247         Reinst *l;
248
249         n = 0;
250         l = pp->firstinst;
251         while(l++->type)
252                 n++;
253         return n;
254 }
255
256 static void
257 set(Deter *d, u32int **tab, Rune r)
258 {
259         u32int *u;
260
261         if((u = tab[r/4096]) == nil){
262                 u = binalloc(&d->bin, 4096/8, 1);
263                 if(u == nil)
264                         longjmp(d->kaboom, 1);
265                 tab[r/4096] = u;
266         }
267         u[(r%4096)/32] |= 1<<(r%32);
268 }
269
270 /*
271  * Compute the list of important characters. 
272  * Other characters behave like the ones that surround them.
273  */
274 static void
275 findchars(Deter *d, Reprog *p)
276 {
277         u32int *tab[65536/4096], *u, x;
278         Reinst *i;
279         Rune *rp, *ep;
280         int k, m, n, a;
281
282         memset(tab, 0, sizeof tab);
283         set(d, tab, 0);
284         set(d, tab, 0xFFFF);
285         for(i=p->firstinst; i->type; i++){
286                 switch(i->type){
287                 case ANY:
288                         set(d, tab, L'\n'-1);
289                         set(d, tab, L'\n');
290                         set(d, tab, L'\n'+1);
291                         break;
292                 case RUNE:
293                         set(d, tab, i->r-1);
294                         set(d, tab, i->r);
295                         set(d, tab, i->r+1);
296                         break;
297                 case NCCLASS:
298                         set(d, tab, L'\n'-1);
299                         set(d, tab, L'\n');
300                         set(d, tab, L'\n'+1);
301                         /* fall through */
302                 case CCLASS:
303                         ep = i->cp->end;
304                         for(rp = i->cp->spans; rp < ep; rp += 2){
305                                 set(d, tab, rp[0]-1);
306                                 set(d, tab, rp[0]);
307                                 set(d, tab, rp[1]);
308                                 set(d, tab, rp[1]+1);
309                         }
310                         break;
311                 }
312         }
313
314         n = 0;
315         for(k=0; k<nelem(tab); k++){
316                 if((u = tab[k]) == nil)
317                         continue;
318                 for(m=0; m<4096/32; m++){
319                         if((x = u[m]) == 0)
320                                 continue;
321                         for(a=0; a<32; a++)
322                                 if(x&(1<<a))
323                                         n++;
324                 }
325         }
326
327         d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0);
328         if(d->c == 0)
329                 longjmp(d->kaboom, 1);
330         d->nc = n;
331
332         n = 0;
333         for(k=0; k<nelem(tab); k++){
334                 if((u = tab[k]) == nil)
335                         continue;
336                 for(m=0; m<4096/32; m++){
337                         if((x = u[m]) == 0)
338                                 continue;
339                         for(a=0; a<32; a++)
340                                 if(x&(1<<a))
341                                         d->c[n++] = k*4096+m*32+a;
342                 }
343         }
344
345         d->c[n] = 0;
346         if(n != d->nc)
347                 abort();
348 }
349
350 /*
351  * convert the Deter and Reisets into a Dreprog.
352  * if dp and c are nil, just return the count of Drecases needed.
353  */
354 static int
355 buildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c)
356 {
357         int i, j, id, n, nn;
358         Dreinst *di;
359         Reiset *s;
360
361         nn = 0;
362         di = 0;
363         for(i=0; i<nid; i++){
364                 s = id2set[i];
365                 if(c){
366                         di = &dp->inst[i];
367                         di->isfinal = s->isfinal;
368                 }
369                 n = 0;
370                 id = -1;
371                 for(j=0; j<2*d->nc; j++){
372                         if(s->delta[j]->id != id){
373                                 id = s->delta[j]->id;
374                                 if(c){
375                                         c[n].start = ((j/d->nc)<<16) | d->c[j%d->nc];
376                                         c[n].next = &dp->inst[id];
377                                 }
378                                 n++;
379                         }
380                 }
381                 if(c){
382                         if(n == 1 && c[0].next == di)
383                                 di->isloop = 1;
384                         di->c = c;
385                         di->nc = n;
386                         c += n;
387                 }
388                 nn += n;
389         }
390         return nn;
391 }
392
393 Dreprog*
394 dregcvt(Reprog *p)
395 {
396         uchar *bits;
397         uint again, n, nid, id;
398         Deter d;
399         Reiset **id2set, *s, *t, *start[4];
400         Dreprog *dp;
401         Drecase *c;
402
403         memset(&d, 0, sizeof d);
404
405         if(setjmp(d.kaboom)){
406                 binfree(&d.bin);
407                 return nil;
408         }
409
410         d.p = p;
411         d.ninst = countinst(p);
412
413         d.last = &d.alloc;
414
415         n = d.ninst;
416         /* round up to power of two; this loop is the least of our efficiency problems */
417         while(n&(n-1))
418                 n++;
419         d.nhash = n;
420         d.hash = binalloc(&d.bin, d.nhash*sizeof(Reinst*), 1);
421
422         /* get list of important runes */
423         findchars(&d, p);
424
425 #ifdef DUMP
426         print("relevant chars are: Â«%S»\n", d.c+1);
427 #endif
428
429         d.bits = bits = binalloc(&d.bin, d.ninst, 0);
430         d.tmp = ralloc(&d, d.ninst);
431
432         /*
433          * Convert to DFA
434          */
435
436         /* 4 start states, depending on initial bol, eol */
437         for(n=0; n<4; n++){
438                 memset(bits, 0, d.ninst);
439                 bits[p->startinst - p->firstinst] = 1;
440                 followempty(&d, bits, n&1, n&2);
441                 start[n] = bits2reiset(&d, bits);
442         }
443
444         /* explore the reiset space */
445         for(s=d.alloc; s; s=s->next)
446                 for(n=0; n<2*d.nc; n++)
447                         s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc);
448
449 #ifdef DUMP
450         nid = 0;
451         for(s=d.alloc; s; s=s->next)
452                 s->id = nid++;
453         ddump(&d);
454 #endif
455
456         /*
457          * Minimize.
458          */
459
460         /* first class division is final or not */
461         for(s=d.alloc; s; s=s->next){
462                 s->isfinal = 0;
463                 for(n=0; n<s->ninst; n++)
464                         if(p->firstinst[s->inst[n]].type == END)
465                                 s->isfinal = 1;
466                 s->id = s->isfinal;
467         }
468
469         /* divide states with different transition tables in id space */
470         nid = 2;
471         do{
472                 again = 0;
473                 for(s=d.alloc; s; s=s->next){
474                         id = -1;
475                         for(t=s->next; t; t=t->next){
476                                 if(s->id != t->id)
477                                         continue;
478                                 for(n=0; n<2*d.nc; n++){
479                                         /* until we finish the for(t) loop, s->id and id are same */
480                                         if((s->delta[n]->id == t->delta[n]->id)
481                                         || (s->delta[n]->id == s->id && t->delta[n]->id == id)
482                                         || (s->delta[n]->id == id && t->delta[n]->id == s->id))
483                                                 continue;
484                                         break;
485                                 }
486                                 if(n == 2*d.nc)
487                                         continue;
488                                 if(id == -1)
489                                         id = nid++;
490                                 t->id = id;
491                                 again = 1;
492                         }
493                 }
494         }while(again);
495
496 #ifdef DUMP
497         ddump(&d);
498 #endif
499
500         /* build dreprog */
501         id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1);
502         if(id2set == nil)
503                 longjmp(d.kaboom, 1);
504         for(s=d.alloc; s; s=s->next)
505                 id2set[s->id] = s;
506
507         n = buildprog(&d, id2set, nid, nil, nil);
508         dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1);
509         if(dp == nil)
510                 longjmp(d.kaboom, 1);
511         c = (Drecase*)&dp->inst[nid];
512         buildprog(&d, id2set, nid, dp, c);
513
514         for(n=0; n<4; n++)
515                 dp->start[n] = &dp->inst[start[n]->id];
516         dp->ninst = nid;
517
518         binfree(&d.bin);
519         return dp;
520 }
521
522 int
523 dregexec(Dreprog *p, char *s, int bol)
524 {
525         Rune r;
526         ulong rr;
527         Dreinst *i;
528         Drecase *c, *ec;
529         int best, n;
530         char *os;
531
532         i = p->start[(bol ? 1 : 0) | (s[1]=='\n' ? 2 : 0)];
533         best = -1;
534         os = s;
535         for(; *s; s+=n){
536                 if(i->isfinal)
537                         best = s - os;
538                 if(i->isloop){
539                         if(i->isfinal)
540                                 return strlen(os);
541                         else
542                                 return best;
543                 }
544                 if((*s&0xFF) < Runeself){
545                         r = *s;
546                         n = 1;
547                 }else
548                         n = chartorune(&r, s);
549                 c = i->c;
550                 ec = c+i->nc;
551                 rr = r;
552                 if(s[n] == '\n' || s[n] == '\0')
553                         rr |= 0x10000;
554                 for(; c<ec; c++){
555                         if(c->start > rr){
556                                 i = c[-1].next;
557                                 goto Out;
558                         }
559                 }
560                 i = ec[-1].next;
561         Out:;
562         }
563         if(i->isfinal)
564                 best = s - os;
565         return best;
566 }
567
568
569 #ifdef DUMP
570 void
571 ddump(Deter *d)
572 {
573         int i, id;
574         Reiset *s;
575
576         for(s=d->alloc; s; s=s->next){
577                 print("%d ", s->id);
578                 id = -1;
579                 for(i=0; i<2*d->nc; i++){
580                         if(id != s->delta[i]->id){
581                                 if(i==0)
582                                         print(" [");
583                                 else if(i/d->nc)
584                                         print(" [%C$", d->c[i%d->nc]);
585                                 else
586                                         print(" [%C", d->c[i%d->nc]);
587                                 print(" %d]", s->delta[i]->id);
588                                 id = s->delta[i]->id;
589                         }
590                 }
591                 print("\n");
592         }
593 }
594
595 void
596 rdump(Reprog *pp)
597 {
598         Reinst *l;
599         Rune *p;
600
601         l = pp->firstinst;
602         do{
603                 print("%ld:\t0%o\t%ld\t%ld", l-pp->firstinst, l->type,
604                         l->left-pp->firstinst, l->right-pp->firstinst);
605                 if(l->type == RUNE)
606                         print("\t%C\n", l->r);
607                 else if(l->type == CCLASS || l->type == NCCLASS){
608                         print("\t[");
609                         if(l->type == NCCLASS)
610                                 print("^");
611                         for(p = l->cp->spans; p < l->cp->end; p += 2)
612                                 if(p[0] == p[1])
613                                         print("%C", p[0]);
614                                 else
615                                         print("%C-%C", p[0], p[1]);
616                         print("]\n");
617                 } else
618                         print("\n");
619         }while(l++->type);
620 }
621
622 void
623 dump(Dreprog *pp)
624 {
625         int i, j;
626         Dreinst *l;
627
628         print("start %ld %ld %ld %ld\n",
629                 pp->start[0]-pp->inst,
630                 pp->start[1]-pp->inst,
631                 pp->start[2]-pp->inst,
632                 pp->start[3]-pp->inst);
633
634         for(i=0; i<pp->ninst; i++){
635                 l = &pp->inst[i];
636                 print("%d:", i);
637                 for(j=0; j<l->nc; j++){
638                         print(" [");
639                         if(j == 0)
640                                 if(l->c[j].start != 1)
641                                         abort();
642                         if(j != 0)
643                                 print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : "");
644                         print("-");
645                         if(j != l->nc-1)
646                                 print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : "");
647                         print("] %ld", l->c[j].next - pp->inst);
648                 }
649                 if(l->isfinal)
650                         print(" final");
651                 if(l->isloop)
652                         print(" loop");
653                 print("\n");
654         }
655 }
656
657
658 void
659 main(int argc, char **argv)
660 {
661         int i;
662         Reprog *p;
663         Dreprog *dp;
664
665         i = 1;
666                 p = regcomp(argv[i]);
667                 if(p == 0){
668                         print("=== %s: bad regexp\n", argv[i]);
669                 }
670         //      print("=== %s\n", argv[i]);
671         //      rdump(p);
672                 dp = dregcvt(p);
673                 print("=== dfa\n");
674                 dump(dp);
675         
676         for(i=2; i<argc; i++)
677                 print("match %d\n", dregexec(dp, argv[i], 0));
678         exits(0);
679 }
680 #endif
681
682 void
683 Bprintdfa(Biobuf *b, Dreprog *p)
684 {
685         int i, j, nc;
686
687         Bprint(b, "# dreprog\n");
688         nc = 0;
689         for(i=0; i<p->ninst; i++)
690                 nc += p->inst[i].nc;
691         Bprint(b, "%d %d %zd %zd %zd %zd\n", p->ninst, nc,
692                 p->start[0]-p->inst, p->start[1]-p->inst,
693                 p->start[2]-p->inst, p->start[3]-p->inst);
694         for(i=0; i<p->ninst; i++){
695                 Bprint(b, "%d %d %d", p->inst[i].isfinal, p->inst[i].isloop, p->inst[i].nc);
696                 for(j=0; j<p->inst[i].nc; j++)
697                         Bprint(b, " %d %zd", p->inst[i].c[j].start, p->inst[i].c[j].next-p->inst);
698                 Bprint(b, "\n");
699         }
700 }
701
702 static char*
703 egetline(Biobuf *b, int c, jmp_buf jb)
704 {
705         char *p;
706
707         p = Brdline(b, c);
708         if(p == nil)
709                 longjmp(jb, 1);
710         p[Blinelen(b)-1] = '\0';
711         return p;
712 }
713
714 static void
715 egetc(Biobuf *b, int c, jmp_buf jb)
716 {
717         if(Bgetc(b) != c)
718                 longjmp(jb, 1);
719 }
720
721 static int
722 egetnum(Biobuf *b, int want, jmp_buf jb)
723 {
724         int c;
725         int n, first;
726
727         n = 0;
728         first = 1;
729         while((c = Bgetc(b)) != Beof){
730                 if(c < '0' || c > '9'){
731                         if(want == 0){
732                                 Bungetc(b);
733                                 c = 0;
734                         }
735                         if(first || c != want){
736                                 werrstr("format error");
737                                 longjmp(jb, 1);
738                         }
739                         return n;
740                 }
741                 n = n*10 + c - '0';
742                 first = 0;
743         }
744         werrstr("unexpected eof");
745         longjmp(jb, 1);
746         return -1;
747 }
748
749 Dreprog*
750 Breaddfa(Biobuf *b)
751 {
752         char *s;
753         int ninst, nc;
754         jmp_buf jb;
755         Dreprog *p;
756         Drecase *c;
757         Dreinst *l;
758         int j, k;
759
760         p = nil;
761         if(setjmp(jb)){
762                 free(p);
763                 return nil;
764         }
765
766         s = egetline(b, '\n', jb);
767         if(strcmp(s, "# dreprog") != 0){
768                 werrstr("format error");
769                 longjmp(jb, 1);
770         }
771
772         ninst = egetnum(b, ' ', jb);
773         nc = egetnum(b, ' ', jb);
774
775         p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1);
776         if(p == nil)
777                 longjmp(jb, 1);
778         c = (Drecase*)&p->inst[ninst];
779
780         p->start[0] = &p->inst[egetnum(b, ' ', jb)];
781         p->start[1] = &p->inst[egetnum(b, ' ', jb)];
782         p->start[2] = &p->inst[egetnum(b, ' ', jb)];
783         p->start[3] = &p->inst[egetnum(b, '\n', jb)];
784
785         for(j=0; j<ninst; j++){
786                 l = &p->inst[j];
787                 l->isfinal = egetnum(b, ' ', jb);
788                 l->isloop = egetnum(b, ' ', jb);
789                 l->nc = egetnum(b, 0, jb);
790                 l->c = c;
791                 for(k=0; k<l->nc; k++){
792                         egetc(b, ' ', jb);
793                         c->start = egetnum(b, ' ', jb);
794                         c->next = &p->inst[egetnum(b, 0, jb)];
795                         c++;
796                 }
797                 egetc(b, '\n', jb);
798         }
799         return p;
800 }