]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/spell/pcode.c
grep: error if sbrk fails
[plan9front.git] / sys / src / cmd / spell / pcode.c
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include "code.h"
6
/* read an annotated spelling list in the form
        word <tab> affixcode [ , affixcode ] ...
   and write a re-encoded binary dictionary on standard output:
   a table of affix code sets followed by the sorted,
   prefix-compressed words (see pdict)
 */
12
/* one dictionary entry, filled in by readinput */
typedef struct Dict
{
        char*   word;           /* NUL-terminated word, stored in space[] */
        int     encode;         /* value returned by typecode for the word's affix list */
} Dict;
19
Dict    words[200000];          /* one entry per input line; filled by readinput, sorted in main */
char    space[500000];          /* backing store for the word strings that words[] points into */
long    encodes[4094];          /* distinct affix bit sets; typecode returns an index into this */
long    nspace;                 /* bytes of space[] in use */
long    nwords;                 /* entries of words[] in use */
int     ncodes;                 /* entries of encodes[] in use */
Biobuf  bout;                   /* output buffer; main attaches it to fd 1 */

void    readinput(int f);
long    typecode(char *str);
int     wcmp(void*, void*);
void    pdict(void);
void    sput(int);
33
34 void
35 main(int argc, char *argv[])
36 {
37         int f;
38
39         Binit(&bout, 1, OWRITE);
40         nwords = 0;
41         nspace = 0;
42         ncodes = 0;
43         if(argc <= 1)
44                 readinput(0);
45         while(argc > 1) {
46                 f = open(argv[1], 0);
47                 if(f < 0) {
48                         fprint(2, "Cannot open %s\n", argv[1]);
49                         exits("open");
50                 }
51                 readinput(f);
52                 argc--;
53                 argv++;
54         }
55         fprint(2, "words = %ld; space = %ld; codes = %d\n",
56                 nwords, nspace, ncodes);
57         qsort(words, nwords, sizeof(words[0]), wcmp);
58         pdict();
59         exits(0);
60 }
61
62 wcmp(void *a, void *b)
63 {
64
65         return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
66 }
67
68 void
69 readinput(int f)
70 {
71         long i;
72         char *code, *line, *bword;
73         Biobuf buf;
74         long lineno = 0;
75
76         Binit(&buf, f, OREAD);
77         while(line = Brdline(&buf, '\n')) {
78                 line[Blinelen(&buf)-1] = 0;
79                 lineno++;
80                 code = line;
81                 while(isspace(*code))
82                         code++;
83                 bword = code;
84                 while(*code && !isspace(*code))
85                         code++;
86
87                 i = code-bword;
88                 memmove(space+nspace, bword, i);
89                 words[nwords].word = space+nspace;
90                 nspace += i;
91                 space[nspace] = 0;
92                 nspace++;
93
94                 if(*code) {
95                         *code++ = 0;
96                         while(isspace(*code))
97                                 code++;
98                 }
99                 words[nwords].encode = typecode(code);
100                 nwords++;
101                 if(nwords >= sizeof(words)/sizeof(words[0])) {
102                         fprint(2, "words array too small\n");
103                         exits("words");
104                 }
105                 if(nspace >= sizeof(space)/sizeof(space[0])) {
106                         fprint(2, "space array too small\n");
107                         exits("space");
108                 }
109         }
110         Bterm(&buf);
111 }
112
113
/* one affix code: its name as written in the input and its flag bits */
typedef struct Class
{
        char*   codename;       /* code name; 0 in the entry that ends a table */
        long    bits;           /* value typecode ORs into a word's code set */
} Class;
/*
 * Affix code tables, one per initial letter.  Each table ends with an
 * entry whose codename is 0 (the bare 0 initializer), which is what
 * stops the search loop in typecode.
 * typecode accepts the first entry whose name starts with the name
 * read from the input, so within a table a shorter name must come
 * before any longer name it is a prefix of ("n" before "na").
 * The bit constants are not defined in this file; presumably they
 * come from code.h.
 */
Class   codea[]  =
{
        { "a", ADJ },
        { "adv", ADV },
        0
};
Class   codec[] =
{
        { "comp", COMP },
        0
};
Class   coded[] =
{
        { "d", DONT_TOUCH},
        0
};

Class   codee[] =
{
        { "ed", ED },
        { "er", ACTOR },
        0
};

Class   codei[] =
{
        { "in", IN },
        { "ion", ION },
        0
};

Class   codem[] =
{
        { "man", MAN },
        { "ms", MONO },
        0
};

Class   coden[] =
{
        { "n", NOUN },
        { "na", N_AFFIX },
        { "nopref", NOPREF },
        0
};

Class   codep[] =
{
        { "pc", PROP_COLLECT },
        0
};
Class   codes[] =
{
        { "s", STOP },
        0
};

Class   codev[] =
{
        { "v", VERB },
        { "va", V_AFFIX },
        { "vi", V_IRREG },
        0
};

Class   codey[] =
{
        { "y", _Y },
        0
};

/* empty table, used for letters that start no affix code */
Class   codez[] =
{
        0
};
/*
 * codetab[c-'a'] is the table of codes starting with letter c;
 * there is one slot for each of the 26 letters a through z.
 */
Class*  codetab[] =
{
        codea,          /* a */
        codez,          /* b */
        codec,          /* c */
        coded,          /* d */
        codee,          /* e */
        codez,          /* f */
        codez,          /* g */
        codez,          /* h */
        codei,          /* i */
        codez,          /* j */
        codez,          /* k */
        codez,          /* l */
        codem,          /* m */
        coden,          /* n */
        codez,          /* o */
        codep,          /* p */
        codez,          /* q */
        codez,          /* r */
        codes,          /* s */
        codez,          /* t */
        codez,          /* u */
        codev,          /* v */
        codez,          /* w */
        codez,          /* x */
        codey,          /* y */
        codez,          /* z */
};
224
225 long
226 typecode(char *str)
227 {
228         Class *p;
229         long code;
230         int n, i;
231         char *s, *sp, *st;
232
233         code = 0;
234
235 loop:
236         for(s=str; *s != 0 && *s != ','; s++)
237                 ;
238         for(p = codetab[*str-'a']; sp = p->codename; p++) {
239                 st = str;
240                 for(n=s-str;; st++,sp++) {
241                         if(*st != *sp)
242                                 goto cont;
243                         n--;
244                         if(n == 0)
245                                 break;
246                 }
247                 code |= p->bits;
248                 if(*s == 0)
249                         goto out;
250                 str = s+1;
251                 goto loop;
252         cont:;
253         }
254         fprint(2, "Unknown affix code \"%s\"\n", str);
255         return 0;
256 out:
257         for(i=0; i<ncodes; i++)
258                 if(encodes[i] == code)
259                         return i;
260         encodes[i] = code;
261         ncodes++;
262         return i;
263 }
264
265 void
266 sput(int s)
267 {
268
269         Bputc(&bout, s>>8);
270         Bputc(&bout, s);
271 }
272
273 void
274 lput(long l)
275 {
276         Bputc(&bout, l>>24);
277         Bputc(&bout, l>>16);
278         Bputc(&bout, l>>8);
279         Bputc(&bout, l);
280 }
281
282 /*
283  * spit out the encoded dictionary
284  * all numbers are encoded big-endian.
285  *      struct
286  *      {
287  *              short   ncodes;
288  *              long    encodes[ncodes];
289  *              struct
290  *              {
291  *                      short   encode;
292  *                      char    word[*];
293  *              } words[*];
294  *      };
295  * 0x8000 flag for code word
296  * 0x7800 count of number of common bytes with previous word
297  * 0x07ff index into codes array for affixes
298  */
299 void
300 pdict(void)
301 {
302         long i, count;
303         int encode, j, c;
304         char *lastword, *thisword, *word;
305
306         sput(ncodes);
307         for(i=0; i<ncodes; i++)
308                 lput(encodes[i]);
309
310         count = ncodes*4 + 2;
311         lastword = "";
312         for(i=0; i<nwords; i++) {
313                 word = words[i].word;
314                 thisword = word;
315                 for(j=0; *thisword == *lastword; j++) {
316                         if(*thisword == 0) {
317                                 fprint(2, "identical words: %s\n", word);
318                                 break;
319                         }
320                         thisword++;
321                         lastword++;
322                 }
323                 if(j > 15)
324                         j = 15;
325                 encode = words[i].encode;
326                 c = (1<<15) | (j<<11) | encode;
327                 sput(c);
328                 count += 2;
329                 for(thisword=word+j; c = *thisword; thisword++) {
330                         Bputc(&bout, c);
331                         count++;
332                 }
333                 lastword = word;
334         }
335         fprint(2, "output bytes = %ld\n", count);
336 }