]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/upas/bayes/msgtok.c
rune(2): add Runeerror reencoding considerations in BUGS section (thanks aiju)
[plan9front.git] / sys / src / cmd / upas / bayes / msgtok.c
1 /*
2  * RFC822 message tokenizer (really feature generator) for spam filter.
3  * 
4  * See Paul Graham's musings on spam filtering for theory.
5  */
6
7 #include <u.h>
8 #include <libc.h>
9 #include <bio.h>
10 #include <regexp.h>
11 #include <ctype.h>
12 #include "dfa.h"
13
14 void buildre(Dreprog*[3]);	/* fill the array with the three regexps read from refile */
15 int debug;	/* set by -D: main traces every consumed span on fd 2 */
16 char *refile = "/mail/lib/classify.re";	/* regexp file opened by buildre; -r replaces it */
17 int maxtoklen = 20;	/* longest keyword match, in bytes, that main emits; -n replaces it */
18 int trim(char*);	/* one in-place stemming step: 0 if the token was reduced, -1 otherwise */
19
/*
 * Print the command synopsis on standard error and exit with
 * status "usage".  Every option main accepts is listed, -n included.
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
26
27 void
28 main(int argc, char **argv)
29 {
30         int i, hdr, n, eof, off;
31         Dreprog *re[3];
32         int m[3];
33         char *p, *ep, *tag;
34         Biobuf bout, bin;
35         char msg[1024+1];
36         char buf[1024];
37
38         buildre(re);
39         ARGBEGIN{
40         case 'D':
41                 debug = 1;
42                 break;
43         case 'n':
44                 maxtoklen = atoi(EARGF(usage()));
45                 break;
46         case 'r':
47                 refile = EARGF(usage());
48                 break;
49         default:
50                 usage();
51         }ARGEND;
52
53         if(argc > 1)
54                 usage();
55         if(argc == 1){
56                 close(0);
57                 if(open(argv[0], OREAD) < 0)
58                         sysfatal("open %s: %r", argv[0]);
59         }
60
61         tag = nil;
62         Binit(&bin, 0, OREAD);
63         Binit(&bout, 1, OWRITE);
64         ep = msg;
65         p = msg;
66         eof = 0;
67         off = 0;
68         hdr = 1;
69         for(;;){
70                 /* replenish buffer */
71                 if(ep - p < 512 && !eof){
72                         if(p > msg + 1){
73                                 n = ep - p;
74                                 memmove(msg, p-1, ep-(p-1));
75                                 off += (p-1) - msg;
76                                 p = msg+1;
77                                 ep = p + n;
78                         }
79                         n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
80                         if(n < 0)
81                                 sysfatal("read error: %r");
82                         if(n == 0)
83                                 eof = 1;
84                         ep += n;
85                         *ep = 0;
86                 }
87                 if(p >= ep)
88                         break;
89
90                 if(*p == 0){
91                         p++;
92                         continue;
93                 }
94
95                 if(hdr && p[-1]=='\n'){
96                         if(p[0]=='\n')
97                                 hdr = 0;
98                         else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
99                                 tag = "From*";
100                         else if(cistrncmp(p-1, "\nto:", 4) == 0)
101                                 tag = "To*";
102                         else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
103                                 tag = "Subject*";
104                         else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
105                                 tag = "Return-Path*";
106                         else
107                                 tag = nil;
108                 }
109                 m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
110                 m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
111                 m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
112
113                 n = m[0];
114                 if(n < m[1])
115                         n = m[1];
116                 if(n < m[2])
117                         n = m[2];
118                 if(n <= 0){
119 fprint(2, "«%s» %.2ux", p, p[0]);
120                         sysfatal("no regexps matched at %zd", off + (p-msg));
121                 }
122
123                 if(m[0] >= m[1] && m[0] >= m[2]){
124                         /* "From " marks start of new message */
125                         Bprint(&bout, "*From*\n");
126                         n = m[0];
127                         hdr = 1;
128                 }else if(m[2] > 1){
129                         /* ignore */
130                         n = m[2];
131                 }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
132                         /* keyword */
133                         /* should do UTF-aware lowercasing, too much bother */
134 /*
135                         for(i=0; i<n; i++)
136                                 if('A' <= p[i] && p[i] <= 'Z')
137                                         p[i] += 'a' - 'A';
138 */
139                         if(tag){
140                                 i = strlen(tag);        
141                                 memmove(buf, tag, i);
142                                 memmove(buf+i, p, m[1]);
143                                 buf[i+m[1]] = 0;
144                         }else{
145                                 memmove(buf, p, m[1]);
146                                 buf[m[1]] = 0;
147                         }
148                         Bprint(&bout, "%s\n", buf);
149                         while(trim(buf) >= 0)
150                                 Bprint(&bout, "stem*%s\n", buf);
151                         n = m[1];
152                 }else
153                         n = m[2];
154                 if(debug)
155                         fprint(2, "%.*s¦", utfnlen(p, n), p);
156                 p += n;
157         }
158         Bterm(&bout);
159         exits(0);
160 }
161
162 void
163 buildre(Dreprog *re[3])
164 {
165         Biobuf *b;
166
167         if((b = Bopen(refile, OREAD)) == nil)
168                 sysfatal("open %s: %r", refile);
169
170         re[0] = Breaddfa(b);
171         re[1] = Breaddfa(b);
172         re[2] = Breaddfa(b);
173
174         if(re[0]==nil || re[1]==nil || re[2]==nil)
175                 sysfatal("Breaddfa: %r");
176         Bterm(b);
177 }
178
/*
 * Reduce the token in s, in place, by one step toward its stem.
 * (Perhaps this belongs in the tokenizer.)
 *
 * The steps, tried in this order, one per call:
 *	drop a leading tag (everything through the first '*') and
 *		leading ASCII punctuation; this alone does not count
 *		as a reduction
 *	free!!! -> free!
 *	free!   -> free
 *	FREE    -> Free
 *	Free    -> free
 *
 * Bytes >= 0x80 are parts of UTF-8 sequences: they are never treated
 * as punctuation, at either end of the token, and are never case-folded.
 *
 * Returns 0 if s now holds a reduced form, -1 if there is nothing
 * (more) to do.  s may have lost its tag and leading punctuation
 * even when -1 is returned.
 */
int
trim(char *s)
{
	char *p, *op;
	int mix, mix1;

	if(*s == '*')
		return -1;

	/* strip tag and leading ASCII punctuation */
	p = strchr(s, '*');
	if(!p)
		p = s;
	while(*p && (unsigned char)*p < 0x80 && !isalpha((unsigned char)*p))
		p++;
	if(strlen(p) < 2)
		return -1;
	memmove(s, p, strlen(p)+1);

	/* find the start of the trailing run of ASCII punctuation */
	op = s+strlen(s);
	p = op;
	while(p > s && (unsigned char)p[-1] < 0x80 && !isalpha((unsigned char)p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < op){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < op){
			p[0] = 0;
			return 0;
		}
	}

	/* mix: first byte is upper case; mix1: some later byte is */
	mix = isupper((unsigned char)s[0]) != 0;
	mix1 = 0;
	for(p=s+1; *p; p++)
		if(isupper((unsigned char)*p)){
			mix1 = 1;
			break;
		}

	/* turn FREE into Free */
	if(mix1){
		for(p=s+1; *p; p++)
			if(isupper((unsigned char)*p))
				*p += 'a'-'A';
		return 0;
	}

	/* turn Free into free */
	if(mix){
		*s += 'a'-'A';
		return 0;
	}
	return -1;
}
245