2 * RFC822 message tokenizer (really feature generator) for spam filter.
4 * See Paul Graham's musings on spam filtering for theory.
14 void buildre(Dreprog*[3]);
16 char *refile = "/mail/lib/classify.re";
23 fprint(2, "usage: msgtok [-D] [-r /mail/lib/classify.re] [file]\n");
28 main(int argc, char **argv)
30 int i, hdr, n, eof, off;
44 maxtoklen = atoi(EARGF(usage()));
47 refile = EARGF(usage());
57 if(open(argv[0], OREAD) < 0)
58 sysfatal("open %s: %r", argv[0]);
62 Binit(&bin, 0, OREAD);
63 Binit(&bout, 1, OWRITE);
70 /* replenish buffer */
71 if(ep - p < 512 && !eof){
74 memmove(msg, p-1, ep-(p-1));
79 n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
81 sysfatal("read error: %r");
95 if(hdr && p[-1]=='\n'){
98 else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
100 else if(cistrncmp(p-1, "\nto:", 4) == 0)
102 else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
104 else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
105 tag = "Return-Path*";
109 m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
110 m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
111 m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
119 fprint(2, "«%s» %.2ux", p, p[0]);
120 sysfatal("no regexps matched at %zd", off + (p-msg));
123 if(m[0] >= m[1] && m[0] >= m[2]){
124 /* "From " marks start of new message */
125 Bprint(&bout, "*From*\n");
131 }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
133 /* ASCII-only lowercasing; should be UTF-aware, but too much bother */
136 if('A' <= p[i] && p[i] <= 'Z')
141 memmove(buf, tag, i);
142 memmove(buf+i, p, m[1]);
145 memmove(buf, p, m[1]);
148 Bprint(&bout, "%s\n", buf);
149 while(trim(buf) >= 0)
150 Bprint(&bout, "stem*%s\n", buf);
155 fprint(2, "%.*s¦", utfnlen(p, n), p);
163 buildre(Dreprog *re[3])
167 if((b = Bopen(refile, OREAD)) == nil)
168 sysfatal("open %s: %r", refile);
174 if(re[0]==nil || re[1]==nil || re[2]==nil)
175 sysfatal("Breaddfa: %r");
179 /* perhaps this belongs in the tokenizer */
189 /* strip leading punctuation */
193 while(*p && !isalpha(*p))
199 memmove(s, p, strlen(p)+1);
201 /* strip trailing punctuation */
204 while(p > s && (uchar)p[-1]<0x80 && !isalpha(p[-1]))
207 /* chop punctuation */
209 /* free!!! -> free! */
230 /* turn FREE into Free */
238 /* turn Free into free */