git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/html2ms.c
html2ms: underline links
[plan9front.git] / sys / src / cmd / html2ms.c
1 #include <u.h>
2 #include <libc.h>
3 #include <ctype.h>
4 #include <bio.h>
5
/*
 * Short names for the structures defined below.  They are declared
 * up front because struct Tag mentions Text before Text is defined.
 */
typedef struct Attr Attr;
typedef struct Tag Tag;
typedef struct Text Text;
9
/*
 * One attribute of a tag as filled in by parseattr(): a name and
 * its (possibly empty) value.  Both are NUL terminated; input that
 * does not fit the buffer is dropped.
 */
struct Attr {
	char	attr[64];	/* attribute name */
	char	val[192];	/* attribute value (256-64 bytes) */
};
14
15 struct Tag {
16         Tag     *up;
17         char    tag[32];
18         Attr    attr[16];
19         int     nattr;
20         int     opening;
21         int     closing;
22
23         void    (*close)(Text *, Tag *);
24         union {
25                 void    *aux;
26                 int     restore;
27         };
28 };
29
/*
 * Output state shared by parsetext() and the tag handlers.
 */
struct Text {
	char	*font;		/* current font name ("R", "I", "B"); nil until first set */
	int	pre;		/* nonzero inside <pre>: whitespace is copied verbatim */
	int	pos;		/* characters written on the current output line; 0 at line start */
	int	space;		/* whitespace was seen and is still to be output */
	int	output;		/* zero while text is being discarded (head, style, script) */
	int	underline;	/* nonzero while words are to be written as .UL requests */
};
38
39 void eatwhite(void);
40 Tag *parsetext(Text *, Tag *);
41 int parsetag(Tag *);
42 int parseattr(Attr *);
43
44 Biobuf in, out;
45
46 void
47 emit(Text *text, char *fmt, ...)
48 {
49         va_list a;
50
51         if(text->pos > 0){
52                 text->pos = 0;
53                 Bputc(&out, '\n');
54         }
55         va_start(a, fmt);
56         Bvprint(&out, fmt, a);
57         va_end(a);
58 }
59
60 void
61 restoreoutput(Text *text, Tag *tag)
62 {
63         text->output = tag->restore;
64 }
65
66 void
67 ongarbage(Text *text, Tag *tag)
68 {
69         tag->restore = text->output;
70         tag->close = restoreoutput;
71         text->output = 0;
72 }
73
74 void
75 onp(Text *text, Tag *)
76 {
77         emit(text, ".LP\n");
78 }
79
80 void
81 restorepre(Text *text, Tag *tag)
82 {
83         text->pre = tag->restore;
84         emit(text, ".DE\n");
85 }
86
87 void
88 onpre(Text *text, Tag *tag)
89 {
90         tag->restore = text->pre;
91         tag->close = restorepre;
92         text->pre = 1;
93         emit(text, ".DS L\n");
94 }
95
96 void
97 onli(Text *text, Tag *tag)
98 {
99         if(tag->up && cistrcmp(tag->up->tag, "ol") == 0)
100                 emit(text, ".IP\n");
101         else
102                 emit(text, ".IP \\(bu\n");
103         if(tag->up)
104                 tag->up->close = onp;
105 }
106
107 void
108 onh(Text *text, Tag *tag)
109 {
110         emit(text, ".SH %c\n", tag->tag[1]);
111         tag->close = onp;
112 }
113
114 void
115 onbr(Text *text, Tag *tag)
116 {
117         tag->closing = 1;
118         emit(text, ".br\n");
119         if(cistrcmp(tag->tag, "hr") == 0)
120                 emit(text, "\\l'5i'\n.br\n");
121 }
122
123 void
124 restorefont(Text *text, Tag *tag)
125 {
126         text->font = tag->aux;
127         text->pos += Bprint(&out, "\\f%s", text->font);
128 }
129
130 void
131 onfont(Text *text, Tag *tag)
132 {
133         if(text->font == 0)
134                 text->font = "R";
135         tag->aux = text->font;
136         tag->close = restorefont;
137         if(cistrcmp(tag->tag, "i") == 0)
138                 text->font = "I";
139         else if(cistrcmp(tag->tag, "b") == 0)
140                 text->font = "B";
141         text->pos += Bprint(&out, "\\f%s", text->font);
142 }
143
144 void
145 ona(Text *text, Tag *)
146 {
147         text->underline = 1;
148 }
149
150 struct {
151         char    *tag;
152         void    (*open)(Text *, Tag *);
153 } ontag[] = {
154         "a",            ona,
155         "br",           onbr,
156         "hr",           onbr,
157         "b",            onfont,
158         "i",            onfont,
159         "p",            onp,
160         "h1",           onh,
161         "h2",           onh,
162         "h3",           onh,
163         "h4",           onh,
164         "h5",           onh,
165         "h6",           onh,
166         "li",           onli,
167         "pre",          onpre,
168         "head",         ongarbage,
169         "style",        ongarbage,
170         "script",       ongarbage,
171 };
172
173 void
174 eatwhite(void)
175 {
176         int c;
177
178         while((c = Bgetc(&in)) > 0){
179                 if(strchr("\n\r\t ", c) == nil){
180                         Bungetc(&in);
181                         return;
182                 }
183         }
184 }
185
186 void
187 parsecomment(void)
188 {
189         char buf[64];
190         int n, c;
191
192         n = 0;
193         eatwhite();
194         while((c = Bgetc(&in)) > 0){
195                 if(c == '>')
196                         return;
197                 if(n == 0 && c == '-'){
198                         while((c = Bgetc(&in)) > 0){
199                                 if(c == '-')
200                                         if(Bgetc(&in) == '-')
201                                                 if(Bgetc(&in) == '>')
202                                                         return;
203                         }
204                 }
205                 if(n+1 < sizeof(buf)){
206                         buf[n++] = c;
207                         if(n != 7 || cistrncmp(buf, "[CDATA[", 7))
208                                 continue;
209                         while((c = Bgetc(&in)) > 0){
210                                 if(c == ']')
211                                         if(Bgetc(&in) == ']')
212                                                 if(Bgetc(&in) == '>')
213                                                         return;
214                         }
215                 }
216         }
217 }
218
219 int
220 parseattr(Attr *a)
221 {
222         int q, c, n;
223
224         n = 0;
225         eatwhite();
226         while((c = Bgetc(&in)) > 0){
227                 if(strchr("</>=?!", c)){
228                         Bungetc(&in);
229                         break;
230                 }
231                 if(strchr("\n\r\t ", c))
232                         break;
233                 if(n < sizeof(a->attr)-1)
234                         a->attr[n++] = c;
235         }
236         if(n == 0)
237                 return 0;
238         a->attr[n] = 0;
239         n = 0;
240         eatwhite();
241         if(Bgetc(&in) == '='){
242                 eatwhite();
243                 c = Bgetc(&in);
244                 if(strchr("'\"", c)){
245                         q = c;
246                         while((c = Bgetc(&in)) > 0){
247                                 if(c == q)
248                                         break;
249                                 if(n < sizeof(a->val)-1)
250                                         a->val[n++] = c;
251                         }
252                 } else {
253                         Bungetc(&in);
254                         while((c = Bgetc(&in)) > 0){
255                                 if(strchr("\n\r\t </>?!", c)){
256                                         Bungetc(&in);
257                                         break;
258                                 }
259                                 if(n < sizeof(a->val)-1)
260                                         a->val[n++] = c;
261                         }
262                 }
263         } else
264                 Bungetc(&in);
265         a->val[n] = 0;
266         return 1;
267 }
268
269 int
270 parsetag(Tag *t)
271 {
272         int n, c;
273
274         t->nattr = 0;
275         t->opening = 1;
276         t->closing = 0;
277
278         n = 0;
279         eatwhite();
280         while((c = Bgetc(&in)) > 0){
281                 if(c == '>')
282                         break;
283                 if(strchr("\n\r\t ", c)){
284                         if(parseattr(t->attr + t->nattr))
285                                 if(t->nattr < nelem(t->attr)-1)
286                                         t->nattr++;
287                         continue;
288                 }
289                 if(n == 0 && strchr("?!", c)){
290                         parsecomment();
291                         return 0;
292                 }
293                 if(c == '/'){
294                         if(n == 0){
295                                 t->opening = 0;
296                                 t->closing = 1;
297                         } else
298                                 t->closing = 1;
299                         continue;
300                 }
301                 if(n < sizeof(t->tag)-1)
302                         t->tag[n++] = c;
303         }
304         t->tag[n] = 0;
305         return n > 0;
306 }
307
308 Rune
309 parserune(int c)
310 {
311         char buf[10];
312         int n;
313         Rune r;
314
315         n = 0;
316         if(c == '&'){
317                 while((c = Bgetc(&in)) > 0){
318                         if(strchr(";&</>\n\r\t ", c)){
319                                 if(c != ';')
320                                         Bungetc(&in);
321                                 if(n == 0)
322                                         return '&';
323                                 break;
324                         }
325                         if(n == sizeof(buf)-1)
326                                 break;
327                         buf[n++] = c;
328                 }
329                 buf[n] = 0;
330                 if(strcmp(buf, "lt") == 0)
331                         return '<';
332                 if(strcmp(buf, "gt") == 0)
333                         return '>';
334                 if(strcmp(buf, "quot") == 0)
335                         return '"';
336                 if(strcmp(buf, "amp") == 0)
337                         return '&';
338                 /* use tcs -f html to handle the rest. */
339         } else {
340                 do {
341                         buf[n++] = c;
342                         if(fullrune(buf, n)){
343                                 chartorune(&r, buf);
344                                 return r;
345                         }
346                         if(n >= UTFmax)
347                                 break;
348                 } while((c = Bgetc(&in)) > 0);
349         }
350         return 0xFFFD;
351 }
352
353 Rune
354 substrune(Rune r)
355 {
356         switch(r){
357         case 0x2019:
358         case 0x2018:
359                 return '\'';
360         case 0x201c:
361         case 0x201d:
362                 return '"';
363         default:
364                 return r;
365         }
366 }
367
368 void
369 debugtag(Tag *tag, char *dbg)
370 {
371         if(1) return;
372
373         if(tag == nil)
374                 return;
375         debugtag(tag->up, nil);
376         fprint(2, "%s %s%s", tag->tag, dbg ? dbg : " > ", dbg ? "\n" : "");
377 }
378
379
380 Tag*
381 parsetext(Text *text, Tag *tag)
382 {
383         Tag *rtag;
384         Rune r;
385         int c;
386
387         rtag = tag;
388         debugtag(tag, "open");
389         if(tag == nil || tag->closing == 0){
390                 while((c = Bgetc(&in)) > 0){
391                         if(c == '<'){
392                                 Tag t;
393
394                                 memset(&t, 0, sizeof(t));
395                                 if(parsetag(&t)){
396                                         if(t.opening){
397                                                 t.up = tag;
398                                                 for(c = 0; c < nelem(ontag); c++){
399                                                         if(cistrcmp(t.tag, ontag[c].tag) == 0){
400                                                                 ontag[c].open(text, &t);
401                                                                 break;
402                                                         }
403                                                 }
404                                                 rtag = parsetext(text, &t);
405                                                 if(rtag == &t)
406                                                         rtag = tag;
407                                                 else
408                                                         break;
409                                         } else if(t.closing){
410                                                 while(rtag && cistrcmp(rtag->tag, t.tag))
411                                                         rtag = rtag->up;
412                                                 if(rtag == nil)
413                                                         rtag = tag;
414                                                 else
415                                                         break;
416                                         }
417                                 }
418                                 continue;
419                         }
420                         if(!text->output)
421                                 continue;
422                         r = substrune(parserune(c));
423                         switch(r){
424                         case '\n':
425                         case '\r':
426                         case ' ':
427                         case '\t':
428                                 if(text->pre == 0){
429                                         text->space = 1;
430                                         continue;
431                                 }
432                         default:
433                                 if(r == '\n' || r == '\r')
434                                         text->pos = 0;
435                                 if(text->space){
436                                         text->space = 0;
437                                         if(text->underline){
438                                                 emit(text, "");
439                                                 text->pos = Bprint(&out, ".UL ");
440                                         } else if(text->pos >= 70){
441                                                 text->pos = 0;
442                                                 Bputc(&out, '\n');
443                                         } else if(text->pos > 0){
444                                                 text->pos++;
445                                                 Bputc(&out, ' ');
446                                         }
447                                 }
448                                 if(text->pos == 0 && r == '.'){
449                                         text->pos++;
450                                         Bputc(&out, ' ');
451                                 }
452                                 text->pos++;
453                                 if(r == 0xA0){
454                                         r = ' ';
455                                         Bputc(&out, '\\');
456                                 }
457                                 Bprint(&out, "%C", r);
458                         }
459                 }
460         }
461         debugtag(tag, "close");
462         if(tag && tag->close)
463                 tag->close(text, tag);
464         return rtag;
465 }
466
467 void
468 main(void)
469 {
470         Text text;
471
472         Binit(&in, 0, OREAD);
473         Binit(&out, 1, OWRITE);
474
475         memset(&text, 0, sizeof(text));
476         text.output = 1;
477         parsetext(&text, nil);
478         emit(&text, "\n");
479 }