]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/html2ms.c
mothra: fix alt display resizing, filter control characters in panel entries, use...
[plan9front.git] / sys / src / cmd / html2ms.c
1 #include <u.h>
2 #include <libc.h>
3 #include <ctype.h>
4 #include <bio.h>
5
/* short type names, declared early so the structures can refer to each other */
typedef struct Tag	Tag;
typedef struct Attr	Attr;
typedef struct Text	Text;
9
/*
 * One attribute of a tag as a name/value pair.
 * parseattr truncates over-long input and NUL terminates both strings.
 */
struct Attr {
	char	attr[64];	/* attribute name */
	char	val[256-64];	/* attribute value, empty when none was given */
};
14
15 struct Tag {
16         Tag     *up;
17         char    tag[32];
18         Attr    attr[16];
19         int     nattr;
20         int     opening;
21         int     closing;
22
23         void    (*close)(Text *, Tag *);
24         union {
25                 void    *aux;
26         };
27 };
28
/*
 * Output state: current formatting plus the buffer
 * that collects the generated troff text.
 */
struct Text {
	char*	fontstyle;	/* current font name as used with \f */
	char*	fontsize;	/* current size request name */
	int	pre;		/* non-zero while inside preformatted text */
	int	pos;		/* runes emitted since the last newline */
	int	space;		/* white space seen but not yet emitted */
	int	output;		/* zero while text has to be discarded */
	int	aftertag;	/* set after a tag was parsed, cleared when text is emitted */

	char	*bp;		/* start of the output buffer */
	char	*wp;		/* where the next byte will be written */
	int	nb;		/* allocated size of the buffer */
};
42
43 void eatwhite(void);
44 void parsetext(Text *, Tag *);
45 int parsetag(Tag *);
46 int parseattr(Attr *);
47 void flushtext(Text *);
48 char* getattr(Tag *, char *);
49 int gotattr(Tag *, char *, char *);
50 int gotstyle(Tag *, char *, char *);
51 void reparent(Text *, Tag *, Tag *);
52 void debugtag(Tag *, char *);
53
54 Biobuf in;
55
56 void
57 emitbuf(Text *text, char *buf, int nbuf)
58 {
59         int nw;
60
61         nw = text->wp - text->bp;
62         if((text->nb - nw) < nbuf){
63                 if(nbuf < 4096)
64                         text->nb = nw + 4096;
65                 else
66                         text->nb = nw + nbuf;
67                 text->bp = realloc(text->bp, text->nb);
68                 text->wp = text->bp + nw;
69         }
70         memmove(text->wp, buf, nbuf);
71         text->wp += nbuf;
72 }
73
74 void
75 emitrune(Text *text, Rune r)
76 {
77         char buf[UTFmax+1];
78
79         if(r == '\r' || r =='\n'){
80                 text->pos = 0;
81                 text->space = 0;
82         }else
83                 text->pos++;
84         emitbuf(text, buf, runetochar(buf, &r));
85 }
86
87 void
88 emit(Text *text, char *fmt, ...)
89 {
90         Rune buf[64];
91         va_list a;
92         int i;
93
94         if(fmt[0] == '.' && text->pos)
95                 emitrune(text, '\n');
96         va_start(a, fmt);
97         runevsnprint(buf, nelem(buf), fmt, a);
98         va_end(a);
99         for(i=0; buf[i]; i++)
100                 emitrune(text, buf[i]);
101 }
102
103 void
104 restoreoutput(Text *text, Tag *)
105 {
106         text->output = 1;
107 }
108
109 void
110 ongarbage(Text *text, Tag *tag)
111 {
112         if(text->output == 0)
113                 return;
114         tag->close = restoreoutput;
115         text->output = 0;
116 }
117
118 void
119 onmeta(Text *, Tag *tag)
120 {
121         tag->closing = 1;
122 }
123
124 void
125 onp(Text *text, Tag *)
126 {
127         emit(text, ".LP\n");
128 }
129
130 void
131 restorepre(Text *text, Tag *)
132 {
133         text->pre = 0;
134         emit(text, ".DE\n");
135 }
136
137 void
138 onpre(Text *text, Tag *tag)
139 {
140         if(text->pre)
141                 return;
142         tag->close = restorepre;
143         text->pre = 1;
144         emit(text, ".DS L\n");
145 }
146
147 void
148 onli(Text *text, Tag *tag)
149 {
150         if(tag->up && cistrcmp(tag->up->tag, "ol") == 0)
151                 emit(text, ".IP\n");
152         else
153                 emit(text, ".IP \\(bu\n");
154         if(tag->up)
155                 tag->up->close = onp;
156 }
157
158 void
159 onh(Text *text, Tag *tag)
160 {
161         emit(text, ".SH\n");
162         tag->close = onp;
163 }
164
165 void
166 onbr(Text *text, Tag *tag)
167 {
168         tag->closing = 1;
169         emit(text, ".br\n");
170         if(cistrcmp(tag->tag, "hr") == 0)
171                 emit(text, "\\l'5i'\n.br\n");
172 }
173
174 void
175 fontstyle(Text *text, char *style)
176 {
177         if(strcmp(text->fontstyle, style) == 0)
178                 return;
179         text->fontstyle = style;
180         emit(text, "\\f%s", style);
181 }
182
183 void
184 fontsize(Text *text, char *size)
185 {
186         if(strcmp(text->fontsize, size) == 0)
187                 return;
188         text->fontsize = size;
189         emit(text, ".%s\n", size);
190 }
191
192 void
193 restorefontstyle(Text *text, Tag *tag)
194 {
195         fontstyle(text, tag->aux);
196 }
197
198 void
199 restorefontsize(Text *text, Tag *tag)
200 {
201         fontsize(text, tag->aux);
202 }
203
204 void
205 oni(Text *text, Tag *tag)
206 {
207         tag->aux = text->fontstyle;
208         tag->close = restorefontstyle;
209         fontstyle(text, "I");
210 }
211
212 void
213 onb(Text *text, Tag *tag)
214 {
215         tag->aux = text->fontstyle;
216         tag->close = restorefontstyle;
217         fontstyle(text, "B");
218 }
219
220 void
221 ontt(Text *text, Tag *tag)
222 {
223         tag->aux = text->fontstyle;
224         tag->close = restorefontstyle;
225         fontstyle(text, "C");
226 }
227
228 void
229 onsmall(Text *text, Tag *tag)
230 {
231         tag->aux = text->fontsize;
232         tag->close = restorefontsize;
233         fontsize(text, "SM");
234 }
235
236 void
237 onbig(Text *text, Tag *tag)
238 {
239         tag->aux = text->fontsize;
240         tag->close = restorefontsize;
241         fontsize(text, "LG");
242 }
243
244 void
245 endquote(Text *text, Tag *tag)
246 {
247         if(cistrcmp(tag->tag, "q") == 0)
248                 emitrune(text, '"');
249         emit(text, ".QE\n");
250 }
251
252 void
253 onquote(Text *text, Tag *tag)
254 {
255         tag->close = endquote;
256         if(cistrcmp(tag->tag, "q") == 0)
257                 emit(text, ".QS\n\"");
258         else
259                 emit(text, ".QP\n");
260 }
261
262 typedef struct Table Table;
263 struct Table
264 {
265         char    *fmt;
266
267         char    *bp;
268         int     nb;
269
270         Table   *next;
271         Table   *prev;
272         int     enclose;
273         int     brk;
274
275         Text    save;
276 };
277
278 Tag*
279 tabletag(Tag *tag)
280 {
281         if(tag == nil)
282                 return nil;
283         if(cistrcmp(tag->tag, "table") == 0)
284                 return tag;
285         return tabletag(tag->up);
286 }
287
288 void
289 dumprows(Text *text, Table *s, Table *e)
290 {
291         
292         for(; s != e; s = s->next){
293                 if(s->enclose)
294                         emit(text, "T{\n");
295                 if(s->nb <= 0)
296                         emit(text, "\\ ");
297                 else
298                         emitbuf(text, s->bp, s->nb);
299                 if(s->enclose)
300                         emit(text, "\nT}");
301                 emitrune(text, s->brk ? '\n' : '\t');
302         }
303 }
304
305 void
306 endtable(Text *text, Tag *tag)
307 {
308         int i, cols, rows;
309         Table *t, *h, *s;
310         Tag *tt;
311
312         /* reverse list */
313         h = nil;
314         t = tag->aux;
315         for(; t; t = t->prev){
316                 t->next = h;
317                 h = t;
318         }
319
320         /*
321          * nested table case, add our cells to the next table up.
322          * this is the best we can do, tbl doesnt support nesting
323          */
324         if(tt = tabletag(tag->up)){
325                 while(t = h){
326                         h = h->next;
327                         t->next = nil;
328                         t->prev = tt->aux;
329                         tt->aux = t;
330                 }
331                 return;
332         }
333
334         cols = 0;
335         rows = 0;
336         for(i = 0, t = h; t; t = t->next){
337                 i++;
338                 if(t->brk){
339                         rows++;
340                         if(i > cols)
341                                 cols = i;
342                         i = 0;
343                 }
344         }
345
346         i = 0;
347         for(t = h; t; t = t->next){
348                 i++;
349                 if(t->brk){
350                         while(i < cols){
351                                 s = mallocz(sizeof(Table), 1);
352                                 s->fmt = "L";
353                                 s->brk = t->brk;
354                                 t->brk = 0;
355                                 s->next = t->next;
356                                 t->next = s;
357                                 i++;
358                         }
359                         break;
360                 }
361         }
362
363         s = h;
364         while(s){
365                 emit(text, ".TS\n");
366                 if(gotattr(tag, "align", "center"))
367                         emit(text, "center ;\n");
368                 i = 0;
369                 for(t = s; t; t = t->next){
370                         emit(text, "%s", t->fmt);
371                         if(t->brk){
372                                 emitrune(text, '\n');
373                                 if(++i > 30){
374                                         t = t->next;
375                                         break;
376                                 }
377                         }else
378                                 emitrune(text, ' ');
379                 }
380                 emit(text, ".\n");
381                 dumprows(text, s, t);
382                 emit(text, ".TE\n");
383                 s = t;
384         }
385
386         while(t = h){
387                 h = t->next;
388                 free(t->bp);
389                 free(t);
390         }
391 }
392
393 void
394 ontable(Text *, Tag *tag)
395 {
396         tag->aux = nil;
397         tag->close = endtable;
398 }
399
400 void
401 endcell(Text *text, Tag *tag)
402 {
403         Table *t;
404         Tag *tt;
405         int i;
406
407         if((tt = tabletag(tag)) == nil)
408                 return;
409         if(cistrcmp(tag->tag, "tr") == 0){
410                 if(t = tt->aux)
411                         t->brk = 1;
412         } else {
413                 t = tag->aux;
414                 t->bp = text->bp;
415                 t->nb = text->wp - text->bp;
416
417                 for(i=0; i<t->nb; i++)
418                         if(strchr(" \t\r\n", t->bp[i]) == nil)
419                                 break;
420                 if(i > 0){
421                         memmove(t->bp, t->bp+i, t->nb - i);
422                         t->nb -= i;
423                 }
424                 while(t->nb > 0 && strchr(" \t\r\n", t->bp[t->nb-1]))
425                         t->nb--;
426                 if(t->nb < 32){
427                         for(i=0; i<t->nb; i++)
428                                 if(strchr("\t\r\n", t->bp[i]))
429                                         break;
430                         t->enclose = i < t->nb;
431                 } else {
432                         t->enclose = 1;
433                 }
434                 if(gotstyle(tag, "text-align", "center") || gotstyle(tt, "text-align", "center"))
435                         t->fmt = "c";
436                 else
437                         t->fmt = "L";
438                 t->prev = tt->aux;
439                 tt->aux = t;
440                 *text = t->save;
441         }
442 }
443
444 void
445 oncell(Text *text, Tag *tag)
446 {
447         Tag *tt;
448
449         if((tt = tabletag(tag)) == nil)
450                 return;
451         if(cistrcmp(tag->tag, "tr")){
452                 Table *t;
453
454                 tt = tag->up;
455                 while(tt && cistrcmp(tt->tag, "tr"))
456                         tt = tt->up;
457                 if(tt == nil)
458                         return;
459                 reparent(text, tag, tt);
460
461                 t = mallocz(sizeof(*t), 1);
462                 t->save = *text;
463                 tag->aux = t;
464
465                 text->bp = nil;
466                 text->wp = nil;
467                 text->nb = 0;
468                 text->pos = 0;
469                 text->space = 0;
470         } else
471                 reparent(text, tag, tt);
472         tag->close = endcell;
473 }
474
475 struct {
476         char    *tag;
477         void    (*open)(Text *, Tag *);
478 } ontag[] = {
479         "b",            onb,
480         "big",          onbig,
481         "blockquote",   onquote,
482         "br",           onbr,
483         "cite",         oni,
484         "code",         ontt,
485         "dfn",          oni,
486         "em",           oni,
487         "h1",           onh,
488         "h2",           onh,
489         "h3",           onh,
490         "h4",           onh,
491         "h5",           onh,
492         "h6",           onh,
493         "head",         ongarbage,
494         "hr",           onbr,
495         "i",            oni,
496         "img",          onmeta,
497         "kbd",          ontt,
498         "li",           onli,
499         "link",         onmeta,
500         "meta",         onmeta,
501         "p",            onp,
502         "pre",          onpre,
503         "q",            onquote,
504         "samp",         ontt,
505         "script",       ongarbage,
506         "small",        onsmall,
507         "strong",       onb,
508         "style",        ongarbage,
509         "table",        ontable,
510         "td",           oncell,
511         "th",           oncell,
512         "tr",           oncell,
513         "tt",           ontt,
514         "var",          oni,
515 };
516
517 void
518 eatwhite(void)
519 {
520         int c;
521
522         while((c = Bgetc(&in)) > 0){
523                 if(strchr("\n\r\t ", c) == nil){
524                         Bungetc(&in);
525                         return;
526                 }
527         }
528 }
529
530 void
531 parsecomment(void)
532 {
533         char buf[64];
534         int n, c;
535
536         n = 0;
537         eatwhite();
538         while((c = Bgetc(&in)) > 0){
539                 if(c == '>')
540                         return;
541                 if(n == 0 && c == '-'){
542                         while((c = Bgetc(&in)) > 0){
543                                 if(c == '-')
544                                         if(Bgetc(&in) == '-')
545                                                 if(Bgetc(&in) == '>')
546                                                         return;
547                         }
548                 }
549                 if(n+1 < sizeof(buf)){
550                         buf[n++] = c;
551                         if(n != 7 || cistrncmp(buf, "[CDATA[", 7))
552                                 continue;
553                         while((c = Bgetc(&in)) > 0){
554                                 if(c == ']'){
555                                         if(Bgetc(&in) == ']'){
556                                                 if(Bgetc(&in) != '>')
557                                                         Bungetc(&in);
558                                                 return;
559                                         }
560                                 }
561                         }
562                 }
563         }
564 }
565
566 int
567 parseattr(Attr *a)
568 {
569         int q, c, n;
570
571         n = 0;
572         eatwhite();
573         while((c = Bgetc(&in)) > 0){
574                 if(strchr("</>=?!", c)){
575                         Bungetc(&in);
576                         break;
577                 }
578                 if(strchr("\n\r\t ", c))
579                         break;
580                 if(n < sizeof(a->attr)-1)
581                         a->attr[n++] = c;
582         }
583         if(n == 0)
584                 return 0;
585         a->attr[n] = 0;
586         n = 0;
587         eatwhite();
588         if(Bgetc(&in) == '='){
589                 eatwhite();
590                 c = Bgetc(&in);
591                 if(strchr("'\"", c)){
592                         q = c;
593                         while((c = Bgetc(&in)) > 0){
594                                 if(c == q)
595                                         break;
596                                 if(n < sizeof(a->val)-1)
597                                         a->val[n++] = c;
598                         }
599                 } else {
600                         Bungetc(&in);
601                         while((c = Bgetc(&in)) > 0){
602                                 if(strchr("\n\r\t </>?!", c)){
603                                         Bungetc(&in);
604                                         break;
605                                 }
606                                 if(n < sizeof(a->val)-1)
607                                         a->val[n++] = c;
608                         }
609                 }
610         } else
611                 Bungetc(&in);
612         a->val[n] = 0;
613         return 1;
614 }
615
616 int
617 parsetag(Tag *t)
618 {
619         int n, c;
620
621         t->nattr = 0;
622         t->opening = 1;
623         t->closing = 0;
624
625         n = 0;
626         eatwhite();
627         while((c = Bgetc(&in)) > 0){
628                 if(c == '>')
629                         break;
630                 if(strchr("\n\r\t ", c)){
631                         if(parseattr(t->attr + t->nattr))
632                                 if(t->nattr < nelem(t->attr)-1)
633                                         t->nattr++;
634                         continue;
635                 }
636                 if(n == 0 && strchr("?!", c)){
637                         parsecomment();
638                         return 0;
639                 }
640                 if(c == '/'){
641                         if(n == 0){
642                                 t->opening = 0;
643                                 t->closing = 1;
644                         } else
645                                 t->closing = 1;
646                         continue;
647                 }
648                 if(n < sizeof(t->tag)-1)
649                         t->tag[n++] = c;
650         }
651         t->tag[n] = 0;
652         return n > 0;
653 }
654
655 Rune
656 parserune(int c)
657 {
658         char buf[10];
659         int n;
660         Rune r;
661
662         n = 0;
663         if(c == '&'){
664                 while((c = Bgetc(&in)) > 0){
665                         if(strchr(";&</>\n\r\t ", c)){
666                                 if(c != ';')
667                                         Bungetc(&in);
668                                 if(n == 0)
669                                         return '&';
670                                 break;
671                         }
672                         if(n == sizeof(buf)-1)
673                                 break;
674                         buf[n++] = c;
675                 }
676                 buf[n] = 0;
677                 if(strcmp(buf, "lt") == 0)
678                         return '<';
679                 if(strcmp(buf, "gt") == 0)
680                         return '>';
681                 if(strcmp(buf, "quot") == 0)
682                         return '"';
683                 if(strcmp(buf, "apos") == 0)
684                         return '\'';
685                 if(strcmp(buf, "amp") == 0)
686                         return '&';
687                 /* use tcs -f html to handle the rest. */
688         } else {
689                 do {
690                         buf[n++] = c;
691                         if(fullrune(buf, n)){
692                                 chartorune(&r, buf);
693                                 return r;
694                         }
695                         if(n >= UTFmax)
696                                 break;
697                 } while((c = Bgetc(&in)) > 0);
698         }
699         return 0xFFFD;
700 }
701
702 Rune
703 substrune(Rune r)
704 {
705         switch(r){
706         case 0x2019:
707         case 0x2018:
708                 return '\'';
709         case 0x201c:
710         case 0x201d:
711                 return '"';
712         default:
713                 return r;
714         }
715 }
716
717 void
718 debugtag(Tag *tag, char *dbg)
719 {
720         if(1) return;
721
722         if(tag == nil)
723                 return;
724         debugtag(tag->up, nil);
725         fprint(2, "%s %s%s", tag->tag, dbg ? dbg : " > ", dbg ? "\n" : "");
726 }
727
728 char*
729 getattr(Tag *tag, char *attr)
730 {
731         int i;
732
733         for(i=0; i<tag->nattr; i++)
734                 if(cistrcmp(tag->attr[i].attr, attr) == 0)
735                         return tag->attr[i].val;
736         return nil;
737 }
738
739 int
740 gotattr(Tag *tag, char *attr, char *val)
741 {
742         char *v;
743
744         if((v = getattr(tag, attr)) == nil)
745                 return 0;
746         return cistrstr(v, val) != 0;
747 }
748
749 int
750 gotstyle(Tag *tag, char *style, char *val)
751 {
752         char *v;
753
754         if((v = getattr(tag, "style")) == nil)
755                 return 0;
756         if((v = cistrstr(v, style)) == nil)
757                 return 0;
758         v += strlen(style);
759         while(*v && *v != ':')
760                 v++;
761         if(*v != ':')
762                 return 0;
763         v++;
764         while(*v && strchr("\t ", *v))
765                 v++;
766         if(cistrncmp(v, val, strlen(val)))
767                 return 0;
768         return 1;
769 }
770
771 void
772 reparent(Text *text, Tag *tag, Tag *up)
773 {
774         Tag *old;
775
776         old = tag->up;
777         while(old != up){
778                 debugtag(old, "reparent");
779                 if(old->close){
780                         old->close(text, old);
781                         old->close = nil;
782                 }
783                 old = old->up;
784         }
785         tag->up = up;
786 }
787
788
789 void
790 parsetext(Text *text, Tag *tag)
791 {
792         int hidden, c;
793         Tag t, *up;
794         Rune r;
795
796         if(tag){
797                 up = tag->up;
798                 debugtag(tag, "open");
799                 for(c = 0; c < nelem(ontag); c++){
800                         if(cistrcmp(tag->tag, ontag[c].tag) == 0){
801                                 ontag[c].open(text, tag);
802                                 break;
803                         }
804                 }
805                 hidden = getattr(tag, "hidden") || gotstyle(tag, "display", "none");
806         } else {
807                 up = nil;
808                 hidden = 0;
809         }
810         if(tag == nil || tag->closing == 0){
811                 while((c = Bgetc(&in)) > 0){
812                         if(c == '<'){
813                                 memset(&t, 0, sizeof(t));
814                                 if(parsetag(&t)){
815                                         text->aftertag = 1;
816                                         if(t.opening){
817                                                 t.up = tag;
818                                                 parsetext(text, &t);
819                                                 if(t.up != tag){
820                                                         debugtag(tag, "skip");
821                                                         up = t.up;
822                                                         break;
823                                                 }
824                                                 debugtag(tag, "back");
825                                         } else if(t.closing){
826                                                 up = tag;
827                                                 while(up && cistrcmp(up->tag, t.tag))
828                                                         up = up->up;
829                                                 if(up){
830                                                         up = up->up;
831                                                         break;
832                                                 }
833                                         }
834                                 }
835                                 continue;
836                         }
837                         if(hidden || !text->output)
838                                 continue;
839                         r = substrune(parserune(c));
840                         switch(r){
841                         case '\n':
842                         case '\r':
843                                 if(text->pre == 0 && text->aftertag)
844                                         break;
845                         case ' ':
846                         case '\t':
847                                 if(text->pre == 0){
848                                         text->space = 1;
849                                         break;
850                                 }
851                         default:
852                                 if(text->space){
853                                         if(text->pos >= 70)
854                                                 emitrune(text, '\n');
855                                         else if(text->pos > 0)
856                                                 emitrune(text, ' ');
857                                 }
858                                 if((text->pos == 0 && r == '.') || r == '\\')
859                                         emit(text, "\\&");
860                                 if(r == '\\' || r == 0xA0)
861                                         emitrune(text, '\\');
862                                 if(r == 0xA0)
863                                         r = ' ';
864                                 emitrune(text, r);
865                                 text->aftertag = 0;
866                                 text->space = 0;
867                         }
868                 }
869         }
870         if(tag){
871                 debugtag(tag, "close");
872                 if(tag->close){
873                         tag->close(text, tag);
874                         tag->close = nil;
875                 }
876                 if(up)
877                         tag->up = up;
878         }
879 }
880
881 void
882 inittext(Text *text)
883 {
884         memset(text, 0, sizeof(Text));
885         text->fontstyle = "R";
886         text->fontsize = "NL";
887         text->output = 1;
888 }
889
890 void
891 main(void)
892 {
893         Text text;
894         Binit(&in, 0, OREAD);
895         inittext(&text);
896         parsetext(&text, nil);
897         emit(&text, "\n");
898         write(1, text.bp, text.wp - text.bp);
899 }