]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/spell/sprog.c
9bootfat: rename open() to fileinit and make it static as its really a internal funct...
[plan9front.git] / sys / src / cmd / spell / sprog.c
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include "code.h"
6
/*
 * Character-class wrappers: the argument is masked to 0..255 first,
 * so a possibly signed char can be passed safely.
 */
#define ISUPPER(c)	isupper((c) & 0xff)
#define ISLOWER(c)	islower((c) & 0xff)
#define ISALPHA(c)	isalpha((c) & 0xff)
#define ISDIGIT(c)	isdigit((c) & 0xff)
/* non-zero when c is marked in voweltab[] */
#define ISVOWEL(c)	voweltab[(c) & 0xff]
/* map 'A'..'Z' to 'a'..'z'; anything else is returned unchanged */
#define Tolower(c)	(ISUPPER(c) ? ((c) - 'A' + 'a') : (c))
/* pack two characters into one integer, usable as a case label */
#define pair(a,b)	(((a) << 8) | (b))
/* amount added to the derivation level by each trysuff() call */
#define DLEV		2
/* derivation levels below this are recorded in deriv[] */
#define DSIZ		40

/* set of word-class/affix flag bits */
typedef long	Bits;
/* the bits of f that are present in h */
#define Set(h, f)	((long)(h) & (f))
20
21 Bits    nop(char*, char*, char*, int, int);
22 Bits    strip(char*, char*, char*, int, int);
23 Bits    ize(char*, char*, char*, int, int);
24 Bits    i_to_y(char*, char*, char*, int, int);
25 Bits    ily(char*, char*, char*, int, int);
26 Bits    subst(char*, char*, char*, int, int);
27 Bits    CCe(char*, char*, char*, int, int);
28 Bits    tion(char*, char*, char*, int, int);
29 Bits    an(char*, char*, char*, int, int);
30 Bits    s(char*, char*, char*, int, int);
31 Bits    es(char*, char*, char*, int, int);
32 Bits    bility(char*, char*, char*, int, int);
33 Bits    y_to_e(char*, char*, char*, int, int);
34 Bits    VCe(char*, char*, char*, int, int);
35
36 Bits    trypref(char*, char*, int, int);
37 Bits    tryword(char*, char*, int, int);
38 Bits    trysuff(char*, int, int);
39 Bits    dict(char*, char*);
40 void    typeprint(Bits);
41 void    pcomma(char*);
42
43 void    ise(void);
44 int     ordinal(void);
45 char*   skipv(char*);
46 int     inun(char*, Bits);
47 char*   ztos(char*);
48 void    readdict(char*);
49
/*
 * One entry of a prefix table.  A table ends with an entry
 * whose s is 0.
 */
typedef struct Ptab
{
	char	*s;	/* prefix text */
	int	flag;	/* 0, or IN for prefixes that trypref() vets with inun() */
} Ptab;
56
57 typedef struct  Suftab  Suftab;
58 struct  Suftab
59 {
60         char    *suf;
61         Bits    (*p1)(char*, char*, char*, int, int);
62         int     n1;
63         char    *d1;
64         char    *a1;
65         int     flag;
66         int     affixable;
67         Bits    (*p2)(char*, char*, char*, int, int);
68         int     n2;
69         char    *d2;
70         char    *a2;
71 };
72
73 Suftab  staba[] = {
74         {"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
75         0
76 };
77
78 Suftab  stabc[] =
79 {
80         {"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
81         {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
82         {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
83         {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
84         {"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
85         {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
86         {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
87         {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
88         {"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
89         0
90 };
91 Suftab  stabd[] =
92 {
93         {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
94         {"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
95         0
96 };
97 Suftab  stabe[] =
98 {
99         /*
100          * V_affix for comment ->commence->commentment??
101          */
102         {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
103         {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
104         {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
105         {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
106         {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
107         {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
108         {"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
109         0
110 };
111 Suftab  stabg[] =
112 {
113         {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
114         {"gnikam",strip,6,"","+making",NOUN,NOUN},
115         {"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
116         {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
117         0
118 };
119 Suftab  stabl[] =
120 {
121         {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
122         {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
123         {"latnem",strip,2,"","+al",N_AFFIX,ADJ},
124         {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
125         {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
126         0
127 };
128 Suftab  stabm[] =
129 {
130                 /* congregational + ism */
131         {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
132         {"margo",subst,-1,"-ph+m","",NOUN,NOUN},
133         0
134 };
135 Suftab  stabn[] =
136 {
137         {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
138         {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
139         {"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
140         {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
141         {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
142         {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
143         {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
144         {"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
145         {"nem",strip,3,"","+man",MAN,PROP_COLLECT},
146         {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
147         0
148 };
149 Suftab  stabp[] =
150 {
151         {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
152         0
153 };
154 Suftab  stabr[] =
155 {
156         {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
157         {"reyhparg",nop,0,"","",0,NOUN},
158         {"reyl",nop,0,"","",0,NOUN},
159         {"rekam",strip,5,"","+maker",NOUN,NOUN},
160         {"repeek",strip,6,"","+keeper",NOUN,NOUN},
161         {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ,    i_to_y,2,"-y+ier","+er"},
162         {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
163         {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
164         {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
165         0
166 };
167 Suftab  stabs[] =
168 {
169         {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
170         {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
171         {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH ,   es,2,"-y+ies","+es"},
172         {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
173         {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH  },
174         0
175 };
176 Suftab  stabt[] =
177 {
178         {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
179         {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" },
180         {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
181         {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
182         0
183 };
184 Suftab  staby[] =
185 {
186         {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
187         {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
188         {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
189         {"ytisuo",nop,0,"","",NOUN},
190         {"ytilb",nop,0,"","",0,NOUN},
191         {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
192         {"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
193         {"ylc",nop,0,"","",0},
194         {"ylelb",nop,0,"","",0},
195         {"ylelp",nop,0,"","",0},
196         {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
197         {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
198         {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
199         0
200 };
201 Suftab  stabz[] =
202 {
203         0
204 };
205 Suftab* suftab[] =
206 {
207         staba,
208         stabz,
209         stabc,
210         stabd,
211         stabe,
212         stabz,
213         stabg,
214         stabz,
215         stabz,
216         stabz,
217         stabz,
218         stabl,
219         stabm,
220         stabn,
221         stabz,
222         stabp,
223         stabz,
224         stabr,
225         stabs,
226         stabt,
227         stabz,
228         stabz,
229         stabz,
230         stabz,
231         staby,
232         stabz,
233 };
234
235 Ptab    ptaba[] =
236 {
237         "anti", 0,
238         "auto", 0,
239         0
240 };
241 Ptab    ptabb[] =
242 {
243         "bio", 0,
244         0
245 };
246 Ptab    ptabc[] =
247 {
248         "counter", 0,
249         0
250 };
251 Ptab    ptabd[] =
252 {
253         "dis", 0,
254         0
255 };
256 Ptab    ptabe[] =
257 {
258         "electro", 0,
259         0
260 };
261 Ptab    ptabf[] =
262 {
263         "femto", 0,
264         0
265 };
266 Ptab    ptabg[] =
267 {
268         "geo", 0,
269         "giga", 0,
270         0
271 };
272 Ptab    ptabh[] =
273 {
274         "hyper", 0,
275         0
276 };
277 Ptab    ptabi[] =
278 {
279         "immuno", 0,
280         "im", IN,
281         "intra", 0,
282         "inter", 0,
283         "in", IN,
284         "ir", IN,
285         "iso", 0,
286         0
287 };
288 Ptab    ptabj[] =
289 {
290         0
291 };
292 Ptab    ptabk[] =
293 {
294         "kilo", 0,
295         0
296 };
297 Ptab    ptabl[] =
298 {
299         0
300 };
301 Ptab    ptabm[] =
302 {
303         "magneto", 0,
304         "mega", 0,
305         "meta", 0,
306         "micro", 0,
307         "mid", 0,
308         "milli", 0,
309         "mini", 0,
310         "mis", 0,
311         "mono", 0,
312         "multi", 0,
313         0
314 };
315 Ptab    ptabn[] =
316 {
317         "nano", 0,
318         "neuro", 0,
319         "non", 0,
320         0
321 };
322 Ptab    ptabo[] =
323 {
324         "out", 0,
325         "over", 0,
326         0
327 };
328 Ptab    ptabp[] =
329 {
330         "para", 0,
331         "photo", 0,
332         "pico", 0,
333         "poly", 0,
334         "pre", 0,
335         "pseudo", 0,
336         "psycho", 0,
337         0
338 };
339 Ptab    ptabq[] =
340 {
341         "quasi", 0,
342         0
343 };
344 Ptab    ptabr[] =
345 {
346         "radio", 0,
347         "re", 0,
348         0
349 };
350 Ptab    ptabs[] =
351 {
352         "semi", 0,
353         "stereo", 0,
354         "sub", 0,
355         "super", 0,
356         0
357 };
358 Ptab    ptabt[] =
359 {
360         "tele", 0,
361         "tera", 0,
362         "thermo", 0,
363         0
364 };
365 Ptab    ptabu[] =
366 {
367         "ultra", 0,
368         "under", 0,     /*must precede un*/
369         "un", IN,
370         0
371 };
372 Ptab    ptabv[] =
373 {
374         0
375 };
376 Ptab    ptabw[] =
377 {
378         0
379 };
380 Ptab    ptabx[] =
381 {
382         0
383 };
384 Ptab    ptaby[] =
385 {
386         0
387 };
388 Ptab    ptabz[] =
389 {
390         0
391 };
392
393 Ptab*   preftab[] =
394 {
395         ptaba,
396         ptabb,
397         ptabc,
398         ptabd,
399         ptabe,
400         ptabf,
401         ptabg,
402         ptabh,
403         ptabi,
404         ptabj,
405         ptabk,
406         ptabl,
407         ptabm,
408         ptabn,
409         ptabo,
410         ptabp,
411         ptabq,
412         ptabr,
413         ptabs,
414         ptabt,
415         ptabu,
416         ptabv,
417         ptabw,
418         ptabx,
419         ptaby,
420         ptabz,
421 };
422
423 typedef struct {
424         char *mesg;
425         enum { NONE, SUFF, PREF} type;
426 } Deriv;
427
428 int     aflag;
429 int     cflag;
430 int     fflag;
431 int     vflag;
432 int     xflag;
433 int     nflag;
434 char    word[500];
435 char*   original;
436 Deriv   emptyderiv;
437 Deriv   deriv[DSIZ+3];
438 char    affix[DSIZ*10]; /* 10 is longest affix message */
439 int     prefcount;
440 int     suffcount;
441 char*   acmeid;
442 char    space[300000];  /* must be as large as "words"+"space" in pcode run */
443 Bits    encode[2048];   /* must be as long as "codes" in pcode run */
444 int     nencode;
445 char    voweltab[256];
446 char*   spacep[128*128+1];      /* pointer to words starting with 'xx' */
447 Biobuf  bin;
448 Biobuf  bout;
449
450 char*   codefile = "/sys/lib/amspell";
451 char*   brfile = "/sys/lib/brspell";
452 char*   Usage = "usage";
453
454 void
455 main(int argc, char *argv[])
456 {
457         char *ep, *cp;
458         char *dp;
459         int j, i, c;
460         int low;
461         Bits h;
462
463         Binit(&bin, 0, OREAD);
464         Binit(&bout, 1, OWRITE);
465         for(i=0; c = "aeiouyAEIOUY"[i]; i++)
466                 voweltab[c] = 1;
467         while(argc > 1) {
468                 if(argv[1][0] != '-')
469                         break;
470                 for(i=1; c = argv[1][i]; i++)
471                 switch(c) {
472                 default:
473                         fprint(2, "usage: spell [-bcCvx] [-f file]\n");
474                         exits(Usage);
475
476                 case 'a':
477                         aflag++;
478                         continue;
479
480                 case 'b':
481                         ise();
482                         if(!fflag)
483                                 codefile = brfile;
484                         continue;
485
486                 case 'C':               /* for "correct" */
487                         vflag++;
488                 case 'c':               /* for ocr */
489                         cflag++;
490                         continue;
491
492                 case 'v':
493                         vflag++;
494                         continue;
495
496                 case 'x':
497                         xflag++;
498                         continue;
499
500                 case 'f':
501                         if(argc <= 2) {
502                                 fprint(2, "spell: -f requires another argument\n");
503                                 exits(Usage);
504                         }
505                         argv++;
506                         argc--;
507                         codefile = argv[1];
508                         fflag++;
509                         goto brk;
510                 }
511         brk:
512                 argv++;
513                 argc--;
514         }
515         readdict(codefile);
516         if(argc > 1) {
517                 fprint(2, "usage: spell [-bcCvx] [-f file]\n");
518                 exits(Usage);
519         }
520         if(aflag)
521                 cflag = vflag = 0;
522
523         for(;;) {
524                 affix[0] = 0;
525                 original = Brdline(&bin, '\n');
526                 if(original == 0)
527                         exits(0);
528                 original[Blinelen(&bin)-1] = 0;
529                 low = 0;
530
531                 if(aflag) {
532                         acmeid = original;
533                         while(*original != ':')
534                                 if(*original++ == 0)
535                                         exits(0);
536                         while(*++original != ':')
537                                 if(*original == 0)
538                                         exits(0);
539                         *original++ = 0;
540                 }
541                 for(ep=word,dp=original; j = *dp; ep++,dp++) {
542                         if(ISLOWER(j))
543                                 low++;
544                         if(ep >= word+sizeof(word)-1)
545                                 break;
546                         *ep = j;
547                 }
548                 *ep = 0;
549
550                 if(ISDIGIT(word[0]) && ordinal())
551                         continue;
552
553                 h = 0;
554                 if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
555                         for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
556                                 *dp = Tolower(*cp);
557                 if(!h)
558                 for(;;) {       /* at most twice */
559                         if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
560                                 break;
561                         if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
562                                 break;
563                         if(!ISUPPER(word[0]))
564                                 break;
565                         cp = original;
566                         dp = word;
567                         while(*dp = *cp++) {
568                                         if(!low)
569                                                 *dp = Tolower(*dp);
570                                 dp++;
571                         }
572                         word[0] = Tolower(word[0]);
573                 }
574
575                 if(cflag) {
576                         if(!h || Set(h,STOP))
577                                 print("-");
578                         else if(!vflag)
579                                 print("+");
580                         else 
581                                 print("%c",'0' + (suffcount>0) +
582                                    (prefcount>4? 8: 2*prefcount));
583                 } else if(!h || Set(h,STOP)) {
584                         if(aflag)
585                                 Bprint(&bout, "%s:%s\n", acmeid, original);
586                         else
587                                 Bprint(&bout, "%s\n", original);
588                 } else if(affix[0] != 0 && affix[0] != '.')
589                         print("%s\t%s\n", affix, original);
590         }
591         /* not reached */
592 }
593
594 /*      strip exactly one suffix and do
595  *      indicated routine(s), which may recursively
596  *      strip suffixes
597  */
598 Bits
599 trysuff(char* ep, int lev, int flag)
600 {
601         Suftab *t;
602         char *cp, *sp;
603         Bits h = 0;
604         int initchar = ep[-1];
605
606         flag &= ~MONO;
607         lev += DLEV;
608         if(lev < DSIZ) {
609                 deriv[lev]  = emptyderiv;
610                 deriv[lev-1] = emptyderiv;
611         }
612         if(!ISLOWER(initchar))
613                 return h;
614         for(t=suftab[initchar-'a']; sp=t->suf; t++) {
615                 cp = ep;
616                 while(*sp)
617                         if(*--cp != *sp++)
618                                 goto next;
619                 for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
620                         ;
621                 if(sp < word)
622                         continue;
623                 if(!(t->affixable & flag))
624                         return 0;
625                 h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
626                 if(!h && t->p2!=0) {
627                         if(lev < DSIZ) {
628                                 deriv[lev] = emptyderiv;
629                                 deriv[lev+1] = emptyderiv;
630                         }
631                         h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
632                 }
633                 break;
634         next:;
635         }
636         return h;
637 }
638
639 Bits
640 nop(char* ep, char* d, char* a, int lev, int flag)
641 {
642         USED(ep, d, a, lev, flag);
643         return 0;
644 }
645
646 Bits
647 cstrip(char* ep, char* d, char* a, int lev, int flag)
648 {
649         int temp = ep[0];
650
651         if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
652                 switch(pair(ep[-1],ep[0])) {
653                 case pair('a', 'a'):
654                 case pair('a', 'e'):
655                 case pair('a', 'i'):
656                 case pair('e', 'a'):
657                 case pair('e', 'e'):
658                 case pair('e', 'i'):
659                 case pair('i', 'i'):
660                 case pair('o', 'a'):
661                         return 0;
662                 }
663         } else
664         if(temp==ep[-1]&&temp==ep[-2])
665                 return 0;
666         return strip(ep,d,a,lev,flag);
667 }
668
669 Bits
670 strip(char* ep, char* d, char* a, int lev, int flag)
671 {
672         Bits h = trypref(ep, a, lev, flag);
673
674         USED(d);
675         if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
676                 h = 0;
677         if(h)
678                 return h;
679         if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
680                 h = trypref(ep-1,a,lev,flag|MONO);
681                 if(h)
682                         return h;
683         }
684         return trysuff(ep,lev,flag);
685 }
686
687 Bits
688 s(char* ep, char* d, char* a, int lev, int flag)
689 {
690         if(lev > DLEV+1)
691                 return 0;
692         if(*ep=='s') {
693                 switch(ep[-1]) {
694                 case 'y':
695                         if(ISVOWEL(ep[-2])||ISUPPER(*word))
696                                 break;  /*says Kennedys*/
697                 case 'x':
698                 case 'z':
699                 case 's':
700                         return 0;
701                 case 'h':
702                         switch(ep[-2]) {
703                         case 'c':
704                         case 's':
705                                 return 0;
706                         }
707                 }
708         }
709         return strip(ep,d,a,lev,flag);
710 }
711
712 Bits
713 an(char* ep, char* d, char* a, int lev, int flag)
714 {
715         USED(d);
716         if(!ISUPPER(*word))     /*must be proper name*/
717                 return 0;
718         return trypref(ep,a,lev,flag);
719 }
720
721 Bits
722 ize(char* ep, char* d, char* a, int lev, int flag)
723 {
724         int temp = ep[-1];
725         Bits h;
726
727         USED(a);
728         ep[-1] = 'e';
729         h = strip(ep,"",d,lev,flag);
730         ep[-1] = temp;
731         return h;
732 }
733
734 Bits
735 y_to_e(char* ep, char* d, char* a, int lev, int flag)
736 {
737         Bits h;
738         int  temp;
739
740         USED(a);
741         switch(ep[-1]) {
742         case 'a':
743         case 'e':
744         case 'i':
745                 return 0;
746         }
747         temp = *ep;
748         *ep++ = 'e';
749         h = strip(ep,"",d,lev,flag);
750         ep[-1] = temp;
751         return h;
752 }
753
754 Bits
755 ily(char* ep, char* d, char* a, int lev, int flag)
756 {
757         int temp = ep[0];
758         char *cp = ep;
759
760         if(temp==ep[-1]&&temp==ep[-2])          /* sillly */
761                 return 0;
762         if(*--cp=='y' && !ISVOWEL(*--cp))       /* happyly */
763                 while(cp>word)
764                         if(ISVOWEL(*--cp))      /* shyness */
765                                 return 0;
766         if(ep[-1]=='i')
767                 return i_to_y(ep,d,a,lev,flag);
768         return cstrip(ep,d,a,lev,flag);
769 }
770
771 Bits
772 bility(char* ep, char* d, char* a, int lev, int flag)
773 {
774         *ep++ = 'l';
775         return y_to_e(ep,d,a,lev,flag);
776 }
777
778 Bits
779 i_to_y(char* ep, char* d, char* a, int lev, int flag)
780 {
781         Bits h;
782         int temp;
783
784         if(ISUPPER(*word))
785                 return 0;
786         if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
787                 ep[-1] = 'y';
788                 a = d;
789         }
790         h = cstrip(ep,"",a,lev,flag);
791         ep[-1] = temp;
792         return h;
793 }
794
795 Bits
796 es(char* ep, char* d, char* a, int lev, int flag)
797 {
798         if(lev>DLEV)
799                 return 0;
800         switch(ep[-1]) {
801         default:
802                 return 0;
803         case 'i':
804                 return i_to_y(ep,d,a,lev,flag);
805         case 'h':
806                 switch(ep[-2]) {
807                 default:
808                         return 0;
809                 case 'c':
810                 case 's':
811                         break;
812                 }
813         case 's':
814         case 'z':
815         case 'x':
816                 return strip(ep,d,a,lev,flag);
817         }
818 }
819
820 Bits
821 subst(char* ep, char* d, char* a, int lev, int flag)
822 {
823         char *u,*t;
824         Bits h;
825
826         USED(a);
827         if(skipv(skipv(ep-1)) < word)
828                 return 0;
829         for(t=d; *t!='+'; t++)
830                 continue;
831         for(u=ep; *--t!='-';)
832                 *--u = *t;
833         h = strip(ep,"",d,lev,flag);
834         while(*++t != '+')
835                 continue;
836         while(*++t)
837                 *u++ = *t;
838         return h;
839 }
840
841 Bits
842 tion(char* ep, char* d, char* a, int lev, int flag)
843 {
844         switch(ep[-2]) {
845         default:
846                 return trypref(ep,a,lev,flag);
847         case 'a':
848         case 'e':
849         case 'i':
850         case 'o':
851         case 'u':
852                 return y_to_e(ep,d,a,lev,flag);
853         }
854 }
855
856 /*
857  * possible consonant-consonant-e ending
858  */
859 Bits
860 CCe(char* ep, char* d, char* a, int lev, int flag)
861 {
862         Bits h;
863
864         switch(ep[-1]) {
865         case 'l':
866                 if(ISVOWEL(ep[-2]))
867                         break;
868                 switch(ep[-2]) {
869                 case 'l':
870                 case 'r':
871                 case 'w':
872                         break;
873                 default:
874                         return y_to_e(ep,d,a,lev,flag);
875                 }
876                 break;
877         case 'c':
878         case 'g':
879                 if(*ep == 'a')  /* prevent -able for -eable */
880                         return 0;
881         case 's':
882         case 'v':
883         case 'z':
884                 if(ep[-2]==ep[-1])
885                         break;
886                 if(ISVOWEL(ep[-2]))
887                         break;
888         case 'u':
889                 if(h = y_to_e(ep,d,a,lev,flag))
890                         return h;
891                 if(!(ep[-2]=='n' && ep[-1]=='g'))
892                         return 0;
893         }
894         return VCe(ep,d,a,lev,flag);
895 }
896
897 /*
898  * possible consonant-vowel-consonant-e ending
899  */
900 Bits
901 VCe(char* ep, char* d, char* a, int lev, int flag)
902 {
903         int c;
904         Bits h;
905
906         c = ep[-1];
907         if(c=='e')
908                 return 0;
909         if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
910                 c = *ep;
911                 *ep++ = 'e';
912                 h = trypref(ep,d,lev,flag);
913                 if(!h)
914                         h = trysuff(ep,lev,flag);
915                 if(h)
916                         return h;
917                 ep--;
918                 *ep = c;
919         }
920         return cstrip(ep,d,a,lev,flag);
921 }
922
923 Ptab*
924 lookuppref(uchar** wp, char* ep)
925 {
926         Ptab *sp;
927         uchar *bp,*cp;
928         unsigned int initchar = Tolower(**wp);
929
930         if(!ISALPHA(initchar))
931                 return 0;
932         for(sp=preftab[initchar-'a'];sp->s;sp++) {
933                 bp = *wp;
934                 for(cp= (uchar*)sp->s;*cp; )
935                         if(*bp++!=*cp++)
936                                 goto next;
937                 for(cp=bp;cp<(uchar*)ep;cp++)
938                         if(ISVOWEL(*cp)) {
939                                 *wp = bp;
940                                 return sp;
941                         }
942         next:;
943         }
944         return 0;
945 }
946
947 /*      while word is not in dictionary try stripping
948  *      prefixes. Fail if no more prefixes.
949  */
950 Bits
951 trypref(char* ep, char* a, int lev, int flag)
952 {
953         Ptab *tp;
954         char *bp, *cp;
955         char *pp;
956         Bits h;
957         char space[20];
958
959         if(lev<DSIZ) {
960                 deriv[lev].mesg = a;
961                 deriv[lev].type = *a=='.'? NONE: SUFF;
962         }
963         if(h = tryword(word,ep,lev,flag)) {
964                 if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
965                         return h;
966                 h = 0;
967         }
968         bp = word;
969         pp = space;
970         if(lev<DSIZ) {
971                 deriv[lev+1].mesg = pp;
972                 deriv[lev+1].type = 0;
973         }
974         while(tp=lookuppref((uchar**)&bp,ep)) {
975                 *pp++ = '+';
976                 cp = tp->s;
977                 while(pp<space+sizeof(space) && (*pp = *cp++))
978                         pp++;
979                 deriv[lev+1].type += PREF;
980                 h = tryword(bp,ep,lev+1,flag);
981                 if(Set(h,NOPREF) ||
982                    ((tp->flag&IN) && inun(bp-2,h)==0)) {
983                         h = 0;
984                         break;
985                 }
986                 if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
987                         break;
988                 h = 0;
989         }
990         if(lev < DSIZ) {
991                 deriv[lev+1] = emptyderiv;
992                 deriv[lev+2] = emptyderiv;
993         }
994         return h;
995 }
996
997 Bits
998 tryword(char* bp, char* ep, int lev, int flag)
999 {
1000         int  j;
1001         Bits h = 0;
1002         char duple[3];
1003
1004         if(ep-bp <= 1)
1005                 return h;
1006         if(flag&MONO) {
1007                 if(lev<DSIZ) {
1008                         deriv[++lev].mesg = duple;
1009                         deriv[lev].type = SUFF;
1010                 }
1011                 duple[0] = '+';
1012                 duple[1] = *ep;
1013                 duple[2] = 0;
1014         }
1015         h = dict(bp, ep);
1016         if(vflag==0 || h==0)
1017                 return h;
1018         /*
1019          * when derivations are wanted, collect them
1020          * for printing
1021          */
1022         j = lev;
1023         prefcount = suffcount = 0;
1024         do {
1025                 if(j<DSIZ && deriv[j].type) {
1026                         strcat(affix, deriv[j].mesg);
1027                         if(deriv[j].type == SUFF)
1028                                 suffcount++;
1029                         else if(deriv[j].type != NONE)
1030                                 prefcount = deriv[j].type/PREF;
1031                 }
1032         } while(--j > 0);
1033         return h;
1034 }
1035
1036 int
1037 inun(char* bp, Bits h)
1038 {
1039         if(*bp == 'u')
1040                 return Set(h, IN) == 0;
1041         /* *bp == 'i' */
1042         if(Set(h, IN) == 0)
1043                 return 0;
1044         switch(bp[2]) {
1045         case 'r':
1046                 return bp[1] == 'r';
1047         case 'm':
1048         case 'p':
1049                 return bp[1] == 'm';
1050         }
1051         return bp[1] == 'n';
1052 }
1053
1054 char*
1055 skipv(char *s)
1056 {
1057         if(s >= word && ISVOWEL(*s))
1058                 s--;
1059         while(s >= word && !ISVOWEL(*s))
1060                 s--;
1061         return s;
1062 }
1063
1064 /*
1065  * crummy way to Britishise
1066  */
1067 void
1068 ise(void)
1069 {
1070         Suftab *p;
1071         int i;
1072
1073         for(i=0; i<26; i++)
1074                 for(p = suftab[i]; p->suf; p++) {
1075                         p->suf = ztos(p->suf);
1076                         p->d1 = ztos(p->d1);
1077                         p->a1 = ztos(p->a1);
1078                 }
1079 }
1080
/*
 * Return as with every 'z' replaced by 's'.
 * A string without 'z' is returned as is; otherwise the result is a
 * freshly allocated copy and as itself is left untouched.  If the
 * copy cannot be allocated, as is returned unchanged rather than
 * writing through a nil pointer.
 */
char*
ztos(char *as)
{
	char *s, *ds;

	for(s=as; *s; s++)
		if(*s == 'z')
			break;
	if(*s == 0)
		return as;

	ds = malloc(strlen(as)+1);
	if(ds == 0)
		return as;
	strcpy(ds, as);
	for(s=ds; *s; s++)
		if(*s == 'z')
			*s = 's';
	return ds;
}
1098
1099 Bits
1100 dict(char* bp, char* ep)
1101 {
1102         char *cp, *cp1, *w, *wp, *we;
1103         int n, f;
1104
1105         w = bp;
1106         we = ep;
1107         n = ep-bp;
1108         if(n <= 1)
1109                 return NOUN;
1110
1111         f = w[0] & 0x7f;
1112         f *= 128;
1113         f += w[1] & 0x7f;
1114         bp = spacep[f];
1115         ep = spacep[f+1];
1116
1117 loop:
1118         if(bp >= ep) {
1119                 if(xflag) 
1120                         fprint(2, "=%.*s\n", utfnlen(w, n), w);
1121                 return 0;
1122         }
1123         /*
1124          * find the beginning of some word in the middle
1125          */
1126         cp = bp + (ep-bp)/2;
1127
1128         while(cp > bp && !(*cp & 0x80))
1129                 cp--;
1130         while(cp > bp && (cp[-1] & 0x80))
1131                 cp--;
1132
1133         wp = w + 2;     /* skip two letters */
1134         cp1 = cp + 2;   /* skip affix code */
1135         for(;;) {
1136                 if(wp >= we) {
1137                         if(*cp1 & 0x80)
1138                                 goto found;
1139                         else
1140                                 f = 1;
1141                         break;
1142                 }
1143                 if(*cp1 & 0x80) {
1144                         f = -1;
1145                         break;
1146                 }
1147                 f = *cp1++ - *wp++;
1148                 if(f != 0)
1149                         break;
1150         }
1151
1152         if(f < 0) {
1153                 while(!(*cp1 & 0x80))
1154                         cp1++;
1155                 bp = cp1;
1156                 goto loop;
1157         }
1158         ep = cp;
1159         goto loop;
1160
1161 found:
1162         f = ((cp[0] & 0x7) << 8) |
1163                 (cp[1] & 0xff);
1164         if(xflag) {
1165                 fprint(2, "=%.*s ", utfnlen(w, n), w);
1166                 typeprint(encode[f]);
1167         }
1168         return encode[f];
1169 }
1170
1171 void
1172 typeprint(Bits h)
1173 {
1174
1175         pcomma("");
1176         if(h & NOUN)
1177                 pcomma("n");
1178         if(h & PROP_COLLECT)
1179                 pcomma("pc");
1180         if(h & VERB) {
1181                 if((h & VERB) == VERB)
1182                         pcomma("v");
1183                 else
1184                 if((h & VERB) == V_IRREG)
1185                         pcomma("vi");
1186                 else
1187                 if(h & ED)
1188                         pcomma("ed");
1189         }
1190         if(h & ADJ)
1191                 pcomma("a");
1192         if(h & COMP) {
1193                 if((h & COMP) == ACTOR)
1194                         pcomma("er");
1195                 else
1196                         pcomma("comp");
1197         }
1198         if(h & DONT_TOUCH)
1199                 pcomma("d");
1200         if(h & N_AFFIX)
1201                 pcomma("na");
1202         if(h & ADV)
1203                 pcomma("adv");
1204         if(h & ION)
1205                 pcomma("ion");
1206         if(h & V_AFFIX)
1207                 pcomma("va");
1208         if(h & MAN)
1209                 pcomma("man");
1210         if(h & NOPREF)
1211                 pcomma("nopref");
1212         if(h & MONO)
1213                 pcomma("ms");
1214         if(h & IN)
1215                 pcomma("in");
1216         if(h & _Y)
1217                 pcomma("y");
1218         if(h & STOP)
1219                 pcomma("s");
1220         fprint(2, "\n");
1221 }
1222
/*
 * print s on fd 2 as the next item of a comma-separated list.
 * an empty string prints nothing and resets the list, so the
 * next item is printed without a leading comma.
 * the list state is kept in a static, so only one list can be
 * in progress at a time.
 */
void
pcomma(char *s)
{
	static int flag;	/* non-zero once an item has been printed */

	if(*s == 0) {
		flag = 0;
		return;
	}
	if(!flag) {
		fprint(2, "%s", s);
		flag = 1;
	} else
		fprint(2, ",%s", s);
}
1238
1239 /*
1240  * is the word on of the following
1241  *      12th    teen
1242  *      21st    end in 1
1243  *      23rd    end in 3
1244  *      77th    default
1245  * called knowing word[0] is a digit
1246  */
1247 int
1248 ordinal(void)
1249 {
1250         char *cp = word;
1251         static char sp[4];
1252
1253         while(ISDIGIT(*cp))
1254                 cp++;
1255         strncpy(sp,cp,3);
1256         if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
1257                 sp[0] = Tolower(cp[0]);
1258                 sp[1] = Tolower(cp[1]);
1259         }
1260         return 0 == strncmp(sp,
1261                 cp[-2]=='1'? "th":      /* out of bounds if 1 digit */
1262                 *--cp=='1'? "st":       /* harmless */
1263                 *cp=='2'? "nd":
1264                 *cp=='3'? "rd":
1265                 "th", 3);
1266 }
1267
1268 /*
1269  * read in the dictionary.
1270  * format is
1271  * {
1272  *      short   nencode;
1273  *      long    encode[nencode];
1274  *      char    space[*];
1275  * };
1276  *
1277  * the encodings are a table all different
1278  * affixes.
1279  * the dictionary proper has 2 bytes
1280  * that demark and then the rest of the
1281  * word. the 2 bytes have the following
1282  *      0x80 0x00       flag
1283  *      0x78 0x00       count of prefix bytes
1284  *                      common with prev word
1285  *      0x07 0xff       affix code
1286  *
1287  * all ints are big endians in the file.
1288  */
1289 void
1290 readdict(char *file)
1291 {
1292         char *s, *is, *lasts, *ls;
1293         int c, i, sp, p;
1294         int f;
1295         long l;
1296
1297         lasts = 0;
1298         f = open(file, 0);
1299         if(f == -1) {
1300                 fprint(2, "cannot open %s\n", file);
1301                 exits("open");
1302         }
1303         if(read(f, space, 2) != 2)
1304                 goto bad;
1305         nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
1306         if(read(f, space, 4*nencode) != 4*nencode)
1307                 goto bad;
1308         s = space;
1309         for(i=0; i<nencode; i++) {
1310                 l = (long)(s[0] & 0xff) << 24;
1311                 l |= (s[1] & 0xff) << 16;
1312                 l |= (s[2] & 0xff) << 8;
1313                 l |= s[3] & 0xff;
1314                 encode[i] = (Bits)l;
1315                 s += 4;
1316         }
1317         l = read(f, space, sizeof(space));
1318         if(l == sizeof(space))
1319                 goto noroom;
1320         is = space + (sizeof(space) - l);
1321         memmove(is, space, l);
1322
1323         s = space;
1324         c = *is++ & 0xff;
1325         sp = -1;
1326         i = 0;
1327
1328 loop:
1329         if(s > is)
1330                 goto noroom;
1331         if(c < 0) {
1332                 close(f);
1333                 while(sp < 128*128)
1334                         spacep[++sp] = s;
1335                 *s = 0x80;              /* fence */
1336                 return;
1337         }
1338         p = (c>>3) & 0xf;
1339         *s++ = c;
1340         *s++ = *is++ & 0xff;
1341         if(p <= 0)
1342                 i = (*is++ & 0xff)*128;
1343         if(p <= 1) {
1344                 if(!(*is & 0x80))
1345                         i = i/128*128 + (*is++ & 0xff);
1346                 if(i <= sp) {
1347                         fprint(2, "the dict isnt sorted or \n");
1348                         fprint(2, "memmove didn't work\n");
1349                         goto bad;
1350                 }
1351                 while(sp < i)
1352                         spacep[++sp] = s-2;
1353         }
1354         ls = lasts;
1355         lasts = s;
1356         for(p-=2; p>0; p--)
1357                 *s++ = *ls++;
1358         for(;;) {
1359                 if(is >= space+sizeof(space)) {
1360                         c = -1;
1361                         break;
1362                 }
1363                 c = *is++ & 0xff;
1364                 if(c & 0x80)
1365                         break;
1366                 *s++ = c;
1367         }
1368         *s = 0;
1369         goto loop;
1370
1371 bad:
1372         fprint(2, "trouble reading %s\n", file);
1373         exits("read");
1374 noroom:
1375         fprint(2, "not enough space for dictionary\n");
1376         exits("space");
1377 }