/*
 * sys/src/cmd/html2ms.c, from plan9front.git (browsed via git.lizzy.rs).
 * Revision: "uhtml: add html to unicode converter, used by mothra and page/html2ms"
 */
1 #include <u.h>
2 #include <libc.h>
3 #include <ctype.h>
4 #include <bio.h>
5
/* Short names for the structures defined below, which refer to one another. */
typedef struct Attr Attr;
typedef struct Tag Tag;
typedef struct Text Text;
9
/* One attribute parsed from inside a tag; filled in by parseattr. */
struct Attr {
	char	attr[64];	/* attribute name, NUL terminated */
	char	val[192];	/* attribute value (256-64 bytes); empty string when none was given */
};
14
15 struct Tag {
16         Tag     *up;
17         char    tag[32];
18         Attr    attr[16];
19         int     nattr;
20         int     opening;
21         int     closing;
22
23         void    (*close)(Text *, Tag *);
24         union {
25                 void    *aux;
26                 int     restore;
27         };
28 };
29
/* Output state shared by all tag handlers and by parsetext. */
struct Text {
	char	font;	/* current troff font letter ('R', 'I' or 'B'); 0 until onfont first runs */
	int	pre;	/* non-zero while inside <pre>: whitespace is copied literally */
	int	pos;	/* characters written on the current output line; 0 at line start */
	int	space;	/* whitespace was seen and is still owed before the next character */
	int	output;	/* zero while text is being discarded (head, style, script) */
};
37
38 void eatwhite(void);
39 Tag *parsetext(Text *, Tag *);
40 int parsetag(Tag *);
41 int parseattr(Attr *);
42
43 Biobuf in, out;
44
45 void
46 emit(Text *text, char *fmt, ...)
47 {
48         va_list a;
49
50         if(text->pos > 0){
51                 text->pos = 0;
52                 Bputc(&out, '\n');
53         }
54         va_start(a, fmt);
55         Bvprint(&out, fmt, a);
56         va_end(a);
57 }
58
59 void
60 restoreoutput(Text *text, Tag *tag)
61 {
62         text->output = tag->restore;
63 }
64
65 void
66 ongarbage(Text *text, Tag *tag)
67 {
68         tag->restore = text->output;
69         tag->close = restoreoutput;
70         text->output = 0;
71 }
72
73 void
74 onp(Text *text, Tag *)
75 {
76         emit(text, ".LP\n");
77 }
78
79 void
80 restorepre(Text *text, Tag *tag)
81 {
82         text->pre = tag->restore;
83         emit(text, ".DE\n");
84 }
85
86 void
87 onpre(Text *text, Tag *tag)
88 {
89         tag->restore = text->pre;
90         tag->close = restorepre;
91         text->pre = 1;
92         emit(text, ".DS L\n");
93 }
94
95 void
96 onli(Text *text, Tag *tag)
97 {
98         if(tag->up && cistrcmp(tag->up->tag, "ol") == 0)
99                 emit(text, ".IP\n");
100         else
101                 emit(text, ".IP \\(bu\n");
102         if(tag->up)
103                 tag->up->close = onp;
104 }
105
106 void
107 onh(Text *text, Tag *tag)
108 {
109         emit(text, ".SH %c\n", tag->tag[1]);
110         tag->close = onp;
111 }
112
113 void
114 onbr(Text *text, Tag *tag)
115 {
116         tag->closing = 1;
117         emit(text, ".br\n");
118         if(cistrcmp(tag->tag, "hr") == 0)
119                 emit(text, "\\l'5i'\n.br\n");
120 }
121
122 void
123 restorefont(Text *text, Tag *tag)
124 {
125         text->font = tag->restore;
126         text->pos += Bprint(&out, "\\f%c", text->font);
127 }
128
129 void
130 onfont(Text *text, Tag *tag)
131 {
132         if(text->font == 0)
133                 text->font = 'R';
134         tag->restore = text->font;
135         tag->close = restorefont;
136         if(cistrcmp(tag->tag, "i") == 0)
137                 text->font = 'I';
138         else if(cistrcmp(tag->tag, "b") == 0)
139                 text->font = 'B';
140         text->pos += Bprint(&out, "\\f%c", text->font);
141 }
142
143 struct {
144         char    *tag;
145         void    (*open)(Text *, Tag *);
146 } ontag[] = {
147         "br",           onbr,
148         "hr",           onbr,
149         "b",            onfont,
150         "i",            onfont,
151         "p",            onp,
152         "h1",           onh,
153         "h2",           onh,
154         "h3",           onh,
155         "h4",           onh,
156         "h5",           onh,
157         "li",           onli,
158         "pre",          onpre,
159         "head",         ongarbage,
160         "style",        ongarbage,
161         "script",       ongarbage,
162 };
163
164 void
165 eatwhite(void)
166 {
167         int c;
168
169         while((c = Bgetc(&in)) > 0){
170                 if(strchr("\n\r\t ", c) == nil){
171                         Bungetc(&in);
172                         return;
173                 }
174         }
175 }
176
177 void
178 parsecomment(void)
179 {
180         char buf[64];
181         int n, c;
182
183         n = 0;
184         eatwhite();
185         while((c = Bgetc(&in)) > 0){
186                 if(c == '>')
187                         return;
188                 if(n == 0 && c == '-'){
189                         while((c = Bgetc(&in)) > 0){
190                                 if(c == '-')
191                                         if(Bgetc(&in) == '-')
192                                                 if(Bgetc(&in) == '>')
193                                                         return;
194                         }
195                 }
196                 if(n+1 < sizeof(buf)){
197                         buf[n++] = c;
198                         if(n != 7 || cistrncmp(buf, "[CDATA[", 7))
199                                 continue;
200                         while((c = Bgetc(&in)) > 0){
201                                 if(c == ']')
202                                         if(Bgetc(&in) == ']')
203                                                 if(Bgetc(&in) == '>')
204                                                         return;
205                         }
206                 }
207         }
208 }
209
210 int
211 parseattr(Attr *a)
212 {
213         int q, c, n;
214
215         n = 0;
216         eatwhite();
217         while((c = Bgetc(&in)) > 0){
218                 if(strchr("</>=?!", c)){
219                         Bungetc(&in);
220                         break;
221                 }
222                 if(strchr("\n\r\t ", c))
223                         break;
224                 if(n < sizeof(a->attr)-1)
225                         a->attr[n++] = c;
226         }
227         if(n == 0)
228                 return 0;
229         a->attr[n] = 0;
230         n = 0;
231         eatwhite();
232         if(Bgetc(&in) == '='){
233                 eatwhite();
234                 c = Bgetc(&in);
235                 if(strchr("'\"", c)){
236                         q = c;
237                         while((c = Bgetc(&in)) > 0){
238                                 if(c == q)
239                                         break;
240                                 if(n < sizeof(a->val)-1)
241                                         a->val[n++] = c;
242                         }
243                 } else {
244                         Bungetc(&in);
245                         while((c = Bgetc(&in)) > 0){
246                                 if(strchr("\n\r\t </>?!", c)){
247                                         Bungetc(&in);
248                                         break;
249                                 }
250                                 if(n < sizeof(a->val)-1)
251                                         a->val[n++] = c;
252                         }
253                 }
254         } else
255                 Bungetc(&in);
256         a->val[n] = 0;
257         return 1;
258 }
259
260 int
261 parsetag(Tag *t)
262 {
263         int n, c;
264
265         t->nattr = 0;
266         t->opening = 1;
267         t->closing = 0;
268
269         n = 0;
270         eatwhite();
271         while((c = Bgetc(&in)) > 0){
272                 if(c == '>')
273                         break;
274                 if(strchr("\n\r\t ", c)){
275                         if(parseattr(t->attr + t->nattr))
276                                 if(t->nattr < nelem(t->attr)-1)
277                                         t->nattr++;
278                         continue;
279                 }
280                 if(n == 0 && strchr("?!", c)){
281                         parsecomment();
282                         return 0;
283                 }
284                 if(c == '/'){
285                         if(n == 0){
286                                 t->opening = 0;
287                                 t->closing = 1;
288                         } else
289                                 t->closing = 1;
290                         continue;
291                 }
292                 if(n < sizeof(t->tag)-1)
293                         t->tag[n++] = c;
294         }
295         t->tag[n] = 0;
296         return n > 0;
297 }
298
299 Rune
300 parserune(int c)
301 {
302         char buf[10];
303         int i, n;
304         Rune r;
305
306         n = 0;
307         if(c == '&'){
308                 while((c = Bgetc(&in)) > 0){
309                         if(strchr(";&</>\n\r\t ", c)){
310                                 if(c != ';')
311                                         Bungetc(&in);
312                                 if(n == 0)
313                                         return '&';
314                                 break;
315                         }
316                         if(n == sizeof(buf)-1)
317                                 break;
318                         buf[n++] = c;
319                 }
320                 buf[n] = 0;
321                 if(strcmp(buf, "lt") == 0)
322                         return '<';
323                 if(strcmp(buf, "gt") == 0)
324                         return '>';
325                 if(strcmp(buf, "quot") == 0)
326                         return '"';
327                 if(strcmp(buf, "amp") == 0)
328                         return '&';
329                 /* use tcs -f html to handle the rest. */
330         } else {
331                 do {
332                         buf[n++] = c;
333                         if(fullrune(buf, n)){
334                                 chartorune(&r, buf);
335                                 return r;
336                         }
337                         if(n >= UTFmax)
338                                 break;
339                 } while((c = Bgetc(&in)) > 0);
340         }
341         return 0xFFFD;
342 }
343
344 Rune
345 substrune(Rune r)
346 {
347         switch(r){
348         case 0x2019:
349         case 0x2018:
350                 return '\'';
351         case 0x201c:
352         case 0x201d:
353                 return '"';
354         default:
355                 return r;
356         }
357 }
358
359 void
360 debugtag(Tag *tag, char *dbg)
361 {
362         if(1) return;
363
364         if(tag == nil)
365                 return;
366         debugtag(tag->up, nil);
367         fprint(2, "%s %s%s", tag->tag, dbg ? dbg : " > ", dbg ? "\n" : "");
368 }
369
370
371 Tag*
372 parsetext(Text *text, Tag *tag)
373 {
374         Tag *rtag;
375         Rune r;
376         int c;
377
378         rtag = tag;
379         debugtag(tag, "open");
380         if(tag == nil || tag->closing == 0){
381                 while((c = Bgetc(&in)) > 0){
382                         if(c == '<'){
383                                 Tag t;
384
385                                 memset(&t, 0, sizeof(t));
386                                 if(parsetag(&t)){
387                                         if(t.opening){
388                                                 t.up = tag;
389                                                 for(c = 0; c < nelem(ontag); c++){
390                                                         if(cistrcmp(t.tag, ontag[c].tag) == 0){
391                                                                 ontag[c].open(text, &t);
392                                                                 break;
393                                                         }
394                                                 }
395                                                 rtag = parsetext(text, &t);
396                                                 if(rtag == &t)
397                                                         rtag = tag;
398                                                 else
399                                                         break;
400                                         } else if(t.closing){
401                                                 while(rtag && cistrcmp(rtag->tag, t.tag))
402                                                         rtag = rtag->up;
403                                                 if(rtag == nil)
404                                                         rtag = tag;
405                                                 else
406                                                         break;
407                                         }
408                                 }
409                                 continue;
410                         }
411                         if(!text->output)
412                                 continue;
413                         r = substrune(parserune(c));
414                         switch(r){
415                         case '\n':
416                         case '\r':
417                         case ' ':
418                         case '\t':
419                                 if(text->pre == 0){
420                                         text->space = 1;
421                                         continue;
422                                 }
423                         default:
424                                 if(r == '\n' || r == '\r')
425                                         text->pos = 0;
426                                 if(text->space){
427                                         text->space = 0;
428                                         if(text->pos >= 70){
429                                                 text->pos = 0;
430                                                 Bputc(&out, '\n');
431                                         } else if(text->pos > 0){
432                                                 text->pos++;
433                                                 Bputc(&out, ' ');
434                                         }
435                                 }
436                                 if(text->pos == 0 && r == '.'){
437                                         text->pos++;
438                                         Bputc(&out, ' ');
439                                 }
440                                 text->pos++;
441                                 if(r == 0xA0){
442                                         r = ' ';
443                                         Bputc(&out, '\\');
444                                 }
445                                 Bprint(&out, "%C", r);
446                         }
447                 }
448         }
449         debugtag(tag, "close");
450         if(tag && tag->close)
451                 tag->close(text, tag);
452         return rtag;
453 }
454
455 void
456 main(void)
457 {
458         Text text;
459
460         Binit(&in, 0, OREAD);
461         Binit(&out, 1, OWRITE);
462
463         memset(&text, 0, sizeof(text));
464         text.output = 1;
465         parsetext(&text, nil);
466         emit(&text, "\n");
467 }