]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/acme/regx.c
Libflac: Tell it that we have stdint.h so it finds SIZE_MAX
[plan9front.git] / sys / src / cmd / acme / regx.c
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <thread.h>
5 #include <cursor.h>
6 #include <mouse.h>
7 #include <keyboard.h>
8 #include <frame.h>
9 #include <fcall.h>
10 #include <plumb.h>
11 #include "dat.h"
12 #include "fns.h"
13
14 Rangeset        sel;
15 Rune            *lastregexp;
16
17 /*
18  * Machine Information
19  */
20 typedef struct Inst Inst;
21 struct Inst
22 {
23         uint    type;   /* <= Runemax+1 ==> literal, otherwise action */
24         union {
25                 int sid;
26                 int subid;
27                 int class;
28                 Inst *other;
29                 Inst *right;
30         };
31         union{
32                 Inst *left;
33                 Inst *next;
34         };
35 };
36
37 #define NPROG   1024
38 Inst    program[NPROG];
39 Inst    *progp;
40 Inst    *startinst;     /* First inst. of program; might not be program[0] */
41 Inst    *bstartinst;    /* same for backwards machine */
42 Channel *rechan;        /* chan(Inst*) */
43
44 typedef struct Ilist Ilist;
45 struct Ilist
46 {
47         Inst    *inst;          /* Instruction of the thread */
48         Rangeset se;
49         uint    startp;         /* first char of match */
50 };
51
52 #define NLIST   127
53
54 Ilist   *tl, *nl;       /* This list, next list */
55 Ilist   list[2][NLIST+1];       /* +1 for trailing null */
56 static  Rangeset sempty;
57
58 /*
59  * Actions and Tokens
60  *
61  *      0x100xx are operators, value == precedence
62  *      0x200xx are tokens, i.e. operands for operators
63  */
64 enum {
65         OPERATOR = Runemask+1,  /* Bitmask of all operators */
66         START   = OPERATOR,     /* Start, used for marker on stack */
67         RBRA,                   /* Right bracket, ) */
68         LBRA,                   /* Left bracket, ( */
69         OR,                     /* Alternation, | */
70         CAT,                    /* Concatentation, implicit operator */
71         STAR,                   /* Closure, * */
72         PLUS,                   /* a+ == aa* */
73         QUEST,                  /* a? == a|nothing, i.e. 0 or 1 a's */
74
75         ANY     = OPERATOR<<1,  /* Any character but newline, . */
76         NOP,                    /* No operation, internal use only */
77         BOL,                    /* Beginning of line, ^ */
78         EOL,                    /* End of line, $ */
79         CCLASS,                 /* Character class, [] */
80         NCCLASS,                /* Negated character class, [^] */
81         END,                    /* Terminate: match found */
82
83         ISATOR  = OPERATOR,
84         ISAND   = OPERATOR<<1,
85 };
86
87 /*
88  * Parser Information
89  */
90 typedef struct Node Node;
91 struct Node
92 {
93         Inst    *first;
94         Inst    *last;
95 };
96
97 #define NSTACK  20
98 Node    andstack[NSTACK];
99 Node    *andp;
100 int     atorstack[NSTACK];
101 int     *atorp;
102 int     lastwasand;     /* Last token was operand */
103 int     cursubid;
104 int     subidstack[NSTACK];
105 int     *subidp;
106 int     backwards;
107 int     nbra;
108 Rune    *exprp;         /* pointer to next character in source expression */
109 #define DCLASS  10      /* allocation increment */
110 int     nclass;         /* number active */
111 int     Nclass;         /* high water mark */
112 Rune    **class;
113 int     negateclass;
114
115 int     addinst(Ilist *l, Inst *inst, Rangeset *sep);
116 void    newmatch(Rangeset*);
117 void    bnewmatch(Rangeset*);
118 void    pushand(Inst*, Inst*);
119 void    pushator(int);
120 Node    *popand(int);
121 int     popator(void);
122 void    startlex(Rune*);
123 int     lex(void);
124 void    operator(int);
125 void    operand(int);
126 void    evaluntil(int);
127 void    optimize(Inst*);
128 void    bldcclass(void);
129
130 void
131 rxinit(void)
132 {
133         rechan = chancreate(sizeof(Inst*), 0);
134         lastregexp = runemalloc(1);
135 }
136
137 void
138 regerror(char *e)
139 {
140         lastregexp[0] = 0;
141         warning(nil, "regexp: %s\n", e);
142         sendp(rechan, nil);
143         threadexits(nil);
144 }
145
146 Inst *
147 newinst(int t)
148 {
149         if(progp >= &program[NPROG])
150                 regerror("expression too long");
151         progp->type = t;
152         progp->left = nil;
153         progp->right = nil;
154         return progp++;
155 }
156
157 void
158 realcompile(void *arg)
159 {
160         int token;
161         Rune *s;
162
163         threadsetname("regcomp");
164         s = arg;
165         startlex(s);
166         atorp = atorstack;
167         andp = andstack;
168         subidp = subidstack;
169         cursubid = 0;
170         lastwasand = FALSE;
171         /* Start with a low priority operator to prime parser */
172         pushator(START-1);
173         while((token=lex()) != END){
174                 if((token&ISATOR) == OPERATOR)
175                         operator(token);
176                 else
177                         operand(token);
178         }
179         /* Close with a low priority operator */
180         evaluntil(START);
181         /* Force END */
182         operand(END);
183         evaluntil(START);
184         if(nbra)
185                 regerror("unmatched `('");
186         --andp; /* points to first and only operand */
187         sendp(rechan, andp->first);
188         threadexits(nil);
189 }
190
191 /* r is null terminated */
192 int
193 rxcompile(Rune *r)
194 {
195         int i, nr;
196         Inst *oprogp;
197
198         nr = runestrlen(r)+1;
199         if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
200                 return TRUE;
201         lastregexp[0] = 0;
202         for(i=0; i<nclass; i++)
203                 free(class[i]);
204         nclass = 0;
205         progp = program;
206         backwards = FALSE;
207         bstartinst = nil;
208         threadcreate(realcompile, r, STACK);
209         startinst = recvp(rechan);
210         if(startinst == nil)
211                 return FALSE;
212         optimize(program);
213         oprogp = progp;
214         backwards = TRUE;
215         threadcreate(realcompile, r, STACK);
216         bstartinst = recvp(rechan);
217         if(bstartinst == nil)
218                 return FALSE;
219         optimize(oprogp);
220         lastregexp = runerealloc(lastregexp, nr);
221         runemove(lastregexp, r, nr);
222         return TRUE;
223 }
224
225 void
226 operand(int t)
227 {
228         Inst *i;
229         if(lastwasand)
230                 operator(CAT);  /* catenate is implicit */
231         i = newinst(t);
232         if(t == CCLASS){
233                 if(negateclass)
234                         i->type = NCCLASS;      /* UGH */
235                 i->class = nclass-1;            /* UGH */
236         }
237         pushand(i, i);
238         lastwasand = TRUE;
239 }
240
241 void
242 operator(int t)
243 {
244         if(t==RBRA && --nbra<0)
245                 regerror("unmatched `)'");
246         if(t==LBRA){
247                 cursubid++;     /* silently ignored */
248                 nbra++;
249                 if(lastwasand)
250                         operator(CAT);
251         }else
252                 evaluntil(t);
253         if(t!=RBRA)
254                 pushator(t);
255         lastwasand = FALSE;
256         if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
257                 lastwasand = TRUE;      /* these look like operands */
258 }
259
260 void
261 pushand(Inst *f, Inst *l)
262 {
263         if(andp >= &andstack[NSTACK])
264                 error("operand stack overflow");
265         andp->first = f;
266         andp->last = l;
267         andp++;
268 }
269
270 void
271 pushator(int t)
272 {
273         if(atorp >= &atorstack[NSTACK])
274                 error("operator stack overflow");
275         *atorp++=t;
276         if(cursubid >= NRange)
277                 *subidp++= -1;
278         else
279                 *subidp++=cursubid;
280 }
281
282 Node *
283 popand(int op)
284 {
285         char buf[64];
286
287         if(andp <= &andstack[0])
288                 if(op){
289                         sprint(buf, "missing operand for %c", op);
290                         regerror(buf);
291                 }else
292                         regerror("malformed regexp");
293         return --andp;
294 }
295
296 int
297 popator()
298 {
299         if(atorp <= &atorstack[0])
300                 error("operator stack underflow");
301         --subidp;
302         return *--atorp;
303 }
304
305 void
306 evaluntil(int pri)
307 {
308         Node *op1, *op2, *t;
309         Inst *inst1, *inst2;
310
311         while(pri==RBRA || atorp[-1]>=pri){
312                 switch(popator()){
313                 case LBRA:
314                         op1 = popand('(');
315                         inst2 = newinst(RBRA);
316                         inst2->subid = *subidp;
317                         op1->last->next = inst2;
318                         inst1 = newinst(LBRA);
319                         inst1->subid = *subidp;
320                         inst1->next = op1->first;
321                         pushand(inst1, inst2);
322                         return;         /* must have been RBRA */
323                 default:
324                         error("unknown regexp operator");
325                         break;
326                 case OR:
327                         op2 = popand('|');
328                         op1 = popand('|');
329                         inst2 = newinst(NOP);
330                         op2->last->next = inst2;
331                         op1->last->next = inst2;
332                         inst1 = newinst(OR);
333                         inst1->right = op1->first;
334                         inst1->left = op2->first;
335                         pushand(inst1, inst2);
336                         break;
337                 case CAT:
338                         op2 = popand(0);
339                         op1 = popand(0);
340                         if(backwards && op2->first->type!=END){
341                                 t = op1;
342                                 op1 = op2;
343                                 op2 = t;
344                         }
345                         op1->last->next = op2->first;
346                         pushand(op1->first, op2->last);
347                         break;
348                 case STAR:
349                         op2 = popand('*');
350                         inst1 = newinst(OR);
351                         op2->last->next = inst1;
352                         inst1->right = op2->first;
353                         pushand(inst1, inst1);
354                         break;
355                 case PLUS:
356                         op2 = popand('+');
357                         inst1 = newinst(OR);
358                         op2->last->next = inst1;
359                         inst1->right = op2->first;
360                         pushand(op2->first, inst1);
361                         break;
362                 case QUEST:
363                         op2 = popand('?');
364                         inst1 = newinst(OR);
365                         inst2 = newinst(NOP);
366                         inst1->left = inst2;
367                         inst1->right = op2->first;
368                         op2->last->next = inst2;
369                         pushand(inst1, inst2);
370                         break;
371                 }
372         }
373 }
374
375
376 void
377 optimize(Inst *start)
378 {
379         Inst *inst, *target;
380
381         for(inst=start; inst->type!=END; inst++){
382                 target = inst->next;
383                 while(target->type == NOP)
384                         target = target->next;
385                 inst->next = target;
386         }
387 }
388
389 void
390 startlex(Rune *s)
391 {
392         exprp = s;
393         nbra = 0;
394 }
395
396
397 int
398 lex(void){
399         int c;
400
401         c = *exprp++;
402         switch(c){
403         case '\\':
404                 if(*exprp)
405                         if((c= *exprp++)=='n')
406                                 c='\n';
407                 break;
408         case 0:
409                 c = END;
410                 --exprp;        /* In case we come here again */
411                 break;
412         case '*':
413                 c = STAR;
414                 break;
415         case '?':
416                 c = QUEST;
417                 break;
418         case '+':
419                 c = PLUS;
420                 break;
421         case '|':
422                 c = OR;
423                 break;
424         case '.':
425                 c = ANY;
426                 break;
427         case '(':
428                 c = LBRA;
429                 break;
430         case ')':
431                 c = RBRA;
432                 break;
433         case '^':
434                 c = BOL;
435                 break;
436         case '$':
437                 c = EOL;
438                 break;
439         case '[':
440                 c = CCLASS;
441                 bldcclass();
442                 break;
443         }
444         return c;
445 }
446
447 int
448 nextrec(void)
449 {
450         if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
451                 regerror("malformed `[]'");
452         if(exprp[0] == '\\'){
453                 exprp++;
454                 if(*exprp=='n'){
455                         exprp++;
456                         return '\n';
457                 }
458                 return *exprp++|(Runemask+1);
459         }
460         return *exprp++;
461 }
462
463 void
464 bldcclass(void)
465 {
466         int c1, c2, n, na;
467         Rune *classp;
468
469         classp = runemalloc(DCLASS);
470         n = 0;
471         na = DCLASS;
472         /* we have already seen the '[' */
473         if(*exprp == '^'){
474                 classp[n++] = '\n';     /* don't match newline in negate case */
475                 negateclass = TRUE;
476                 exprp++;
477         }else
478                 negateclass = FALSE;
479         while((c1 = nextrec()) != ']'){
480                 if(c1 == '-'){
481     Error:
482                         free(classp);
483                         regerror("malformed `[]'");
484                 }
485                 if(n+4 >= na){          /* 3 runes plus NUL */
486                         na += DCLASS;
487                         classp = runerealloc(classp, na);
488                 }
489                 if(*exprp == '-'){
490                         exprp++;        /* eat '-' */
491                         if((c2 = nextrec()) == ']')
492                                 goto Error;
493                         classp[n+0] = Runemax;
494                         classp[n+1] = c1 & Runemask;
495                         classp[n+2] = c2 & Runemask;
496                         n += 3;
497                 }else
498                         classp[n++] = c1 & Runemask;
499         }
500         classp[n] = 0;
501         if(nclass == Nclass){
502                 Nclass += DCLASS;
503                 class = realloc(class, Nclass*sizeof(Rune*));
504         }
505         class[nclass++] = classp;
506 }
507
508 int
509 classmatch(int classno, int c, int negate)
510 {
511         Rune *p;
512
513         p = class[classno];
514         while(*p){
515                 if(*p == Runemax){
516                         if(p[1]<=c && c<=p[2])
517                                 return !negate;
518                         p += 3;
519                 }else if(*p++ == c)
520                         return !negate;
521         }
522         return negate;
523 }
524
525 /*
526  * Note optimization in addinst:
527  *      *l must be pending when addinst called; if *l has been looked
528  *              at already, the optimization is a bug.
529  */
530 int
531 addinst(Ilist *l, Inst *inst, Rangeset *sep)
532 {
533         Ilist *p;
534
535         for(p = l; p->inst; p++){
536                 if(p->inst==inst){
537                         if((sep)->r[0].q0 < p->se.r[0].q0)
538                                 p->se= *sep;    /* this would be bug */
539                         return 0;       /* It's already there */
540                 }
541         }
542         p->inst = inst;
543         p->se= *sep;
544         (p+1)->inst = nil;
545         return 1;
546 }
547
548 int
549 rxnull(void)
550 {
551         return startinst==nil || bstartinst==nil;
552 }
553
554 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
555 int
556 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
557 {
558         int flag;
559         Inst *inst;
560         Ilist *tlp;
561         uint p;
562         int nnl, ntl;
563         int nc, c;
564         int wrapped;
565         int startchar;
566
567         flag = 0;
568         p = startp;
569         startchar = 0;
570         wrapped = 0;
571         nnl = 0;
572         if(startinst->type<OPERATOR)
573                 startchar = startinst->type;
574         list[0][0].inst = list[1][0].inst = nil;
575         sel.r[0].q0 = -1;
576         if(t != nil)
577                 nc = t->file->nc;
578         else
579                 nc = runestrlen(r);
580         /* Execute machine once for each character */
581         for(;;p++){
582         doloop:
583                 if(p>=eof || p>=nc){
584                         switch(wrapped++){
585                         case 0:         /* let loop run one more click */
586                         case 2:
587                                 break;
588                         case 1:         /* expired; wrap to beginning */
589                                 if(sel.r[0].q0>=0 || eof!=Infinity)
590                                         goto Return;
591                                 list[0][0].inst = list[1][0].inst = nil;
592                                 p = 0;
593                                 goto doloop;
594                         default:
595                                 goto Return;
596                         }
597                         c = 0;
598                 }else{
599                         if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
600                                 break;
601                         if(t != nil)
602                                 c = textreadc(t, p);
603                         else
604                                 c = r[p];
605                 }
606                 /* fast check for first char */
607                 if(startchar && nnl==0 && c!=startchar)
608                         continue;
609                 tl = list[flag];
610                 nl = list[flag^=1];
611                 nl->inst = nil;
612                 ntl = nnl;
613                 nnl = 0;
614                 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
615                         /* Add first instruction to this list */
616                         sempty.r[0].q0 = p;
617                         if(addinst(tl, startinst, &sempty))
618                         if(++ntl >= NLIST){
619         Overflow:
620                                 warning(nil, "regexp list overflow\n");
621                                 sel.r[0].q0 = -1;
622                                 goto Return;
623                         }
624                 }
625                 /* Execute machine until this list is empty */
626                 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
627         Switchstmt:
628                         switch(inst->type){
629                         default:        /* regular character */
630                                 if(inst->type==c){
631         Addinst:
632                                         if(addinst(nl, inst->next, &tlp->se))
633                                         if(++nnl >= NLIST)
634                                                 goto Overflow;
635                                 }
636                                 break;
637                         case LBRA:
638                                 if(inst->subid>=0)
639                                         tlp->se.r[inst->subid].q0 = p;
640                                 inst = inst->next;
641                                 goto Switchstmt;
642                         case RBRA:
643                                 if(inst->subid>=0)
644                                         tlp->se.r[inst->subid].q1 = p;
645                                 inst = inst->next;
646                                 goto Switchstmt;
647                         case ANY:
648                                 if(c!='\n')
649                                         goto Addinst;
650                                 break;
651                         case BOL:
652                                 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
653         Step:
654                                         inst = inst->next;
655                                         goto Switchstmt;
656                                 }
657                                 break;
658                         case EOL:
659                                 if(c == '\n')
660                                         goto Step;
661                                 break;
662                         case CCLASS:
663                                 if(c>=0 && classmatch(inst->class, c, 0))
664                                         goto Addinst;
665                                 break;
666                         case NCCLASS:
667                                 if(c>=0 && classmatch(inst->class, c, 1))
668                                         goto Addinst;
669                                 break;
670                         case OR:
671                                 /* evaluate right choice later */
672                                 if(addinst(tlp, inst->right, &tlp->se))
673                                 if(++ntl >= NLIST)
674                                         goto Overflow;
675                                 /* efficiency: advance and re-evaluate */
676                                 inst = inst->left;
677                                 goto Switchstmt;
678                         case END:       /* Match! */
679                                 tlp->se.r[0].q1 = p;
680                                 newmatch(&tlp->se);
681                                 break;
682                         }
683                 }
684         }
685     Return:
686         *rp = sel;
687         return sel.r[0].q0 >= 0;
688 }
689
690 void
691 newmatch(Rangeset *sp)
692 {
693         if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
694            (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
695                 sel = *sp;
696 }
697
698 int
699 rxbexecute(Text *t, uint startp, Rangeset *rp)
700 {
701         int flag;
702         Inst *inst;
703         Ilist *tlp;
704         int p;
705         int nnl, ntl;
706         int c;
707         int wrapped;
708         int startchar;
709
710         flag = 0;
711         nnl = 0;
712         wrapped = 0;
713         p = startp;
714         startchar = 0;
715         if(bstartinst->type<OPERATOR)
716                 startchar = bstartinst->type;
717         list[0][0].inst = list[1][0].inst = nil;
718         sel.r[0].q0= -1;
719         /* Execute machine once for each character, including terminal NUL */
720         for(;;--p){
721         doloop:
722                 if(p <= 0){
723                         switch(wrapped++){
724                         case 0:         /* let loop run one more click */
725                         case 2:
726                                 break;
727                         case 1:         /* expired; wrap to end */
728                                 if(sel.r[0].q0>=0)
729                                         goto Return;
730                                 list[0][0].inst = list[1][0].inst = nil;
731                                 p = t->file->nc;
732                                 goto doloop;
733                         case 3:
734                         default:
735                                 goto Return;
736                         }
737                         c = 0;
738                 }else{
739                         if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
740                                 break;
741                         c = textreadc(t, p-1);
742                 }
743                 /* fast check for first char */
744                 if(startchar && nnl==0 && c!=startchar)
745                         continue;
746                 tl = list[flag];
747                 nl = list[flag^=1];
748                 nl->inst = nil;
749                 ntl = nnl;
750                 nnl = 0;
751                 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
752                         /* Add first instruction to this list */
753                         /* the minus is so the optimizations in addinst work */
754                         sempty.r[0].q0 = -p;
755                         if(addinst(tl, bstartinst, &sempty))
756                         if(++ntl >= NLIST){
757         Overflow:
758                                 warning(nil, "regexp list overflow\n");
759                                 sel.r[0].q0 = -1;
760                                 goto Return;
761                         }
762                 }
763                 /* Execute machine until this list is empty */
764                 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
765         Switchstmt:
766                         switch(inst->type){
767                         default:        /* regular character */
768                                 if(inst->type == c){
769         Addinst:
770                                         if(addinst(nl, inst->next, &tlp->se))
771                                         if(++nnl >= NLIST)
772                                                 goto Overflow;
773                                 }
774                                 break;
775                         case LBRA:
776                                 if(inst->subid>=0)
777                                         tlp->se.r[inst->subid].q0 = p;
778                                 inst = inst->next;
779                                 goto Switchstmt;
780                         case RBRA:
781                                 if(inst->subid >= 0)
782                                         tlp->se.r[inst->subid].q1 = p;
783                                 inst = inst->next;
784                                 goto Switchstmt;
785                         case ANY:
786                                 if(c != '\n')
787                                         goto Addinst;
788                                 break;
789                         case BOL:
790                                 if(c=='\n' || p==0){
791         Step:
792                                         inst = inst->next;
793                                         goto Switchstmt;
794                                 }
795                                 break;
796                         case EOL:
797                                 if(p<t->file->nc && textreadc(t, p)=='\n')
798                                         goto Step;
799                                 break;
800                         case CCLASS:
801                                 if(c>0 && classmatch(inst->class, c, 0))
802                                         goto Addinst;
803                                 break;
804                         case NCCLASS:
805                                 if(c>0 && classmatch(inst->class, c, 1))
806                                         goto Addinst;
807                                 break;
808                         case OR:
809                                 /* evaluate right choice later */
810                                 if(addinst(tl, inst->right, &tlp->se))
811                                 if(++ntl >= NLIST)
812                                         goto Overflow;
813                                 /* efficiency: advance and re-evaluate */
814                                 inst = inst->left;
815                                 goto Switchstmt;
816                         case END:       /* Match! */
817                                 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
818                                 tlp->se.r[0].q1 = p;
819                                 bnewmatch(&tlp->se);
820                                 break;
821                         }
822                 }
823         }
824     Return:
825         *rp = sel;
826         return sel.r[0].q0 >= 0;
827 }
828
829 void
830 bnewmatch(Rangeset *sp)
831 {
832         int  i;
833
834         if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
835                 for(i = 0; i<NRange; i++){       /* note the reversal; q0<=q1 */
836                         sel.r[i].q0 = sp->r[i].q1;
837                         sel.r[i].q1 = sp->r[i].q0;
838                 }
839 }