]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/html2ms.c
6ecbeef372f681d82f9556a713b168423acc9244
[plan9front.git] / sys / src / cmd / html2ms.c
1 #include <u.h>
2 #include <libc.h>
3 #include <ctype.h>
4 #include <bio.h>
5
/* Forward type names so the structures below can refer to each other. */
typedef struct Attr Attr;
typedef struct Tag Tag;
typedef struct Text Text;
9
/*
 * One name/value pair found inside a tag. Both members are fixed size,
 * NUL terminated strings; parseattr silently drops characters that do
 * not fit.
 */
struct Attr {
        char    attr[64];       /* attribute name */
        char    val[256-64];    /* attribute value, "" when none was given */
};
14
15 struct Tag {
16         Tag     *up;
17         char    tag[32];
18         Attr    attr[16];
19         int     nattr;
20         int     opening;
21         int     closing;
22
23         void    (*close)(Text *, Tag *);
24         union {
25                 void    *aux;
26         };
27 };
28
/*
 * Output state: current troff settings, whitespace bookkeeping and the
 * growing buffer that emitbuf appends to.
 */
struct Text {
        char    *fontstyle;     /* current font name, see fontstyle() */
        char    *fontsize;      /* current size macro name, see fontsize() */
        int     pre;            /* non-zero between onpre and restorepre */
        int     pos;            /* runes emitted since the last newline */
        int     space;          /* whitespace seen but not yet emitted */
        int     output;         /* zero while text is being discarded */
        int     aftertag;       /* set after a tag, cleared when text is emitted */

        char    *bp;            /* start of the output buffer */
        char    *wp;            /* next byte to write in the buffer */
        int     nb;             /* allocated size of the buffer */
};
42
43 void eatwhite(void);
44 void parsetext(Text *, Tag *);
45 int parsetag(Tag *);
46 int parseattr(Attr *);
47 void flushtext(Text *);
48 char* getattr(Tag *, char *);
49 int gotattr(Tag *, char *, char *);
50 int gotstyle(Tag *, char *, char *);
51 void reparent(Text *, Tag *, Tag *);
52 void debugtag(Tag *, char *);
53
54 Biobuf in;
55
56 void
57 emitbuf(Text *text, char *buf, int nbuf)
58 {
59         int nw;
60
61         nw = text->wp - text->bp;
62         if((text->nb - nw) < nbuf){
63                 if(nbuf < 4096)
64                         text->nb = nw + 4096;
65                 else
66                         text->nb = nw + nbuf;
67                 text->bp = realloc(text->bp, text->nb);
68                 text->wp = text->bp + nw;
69         }
70         memmove(text->wp, buf, nbuf);
71         text->wp += nbuf;
72 }
73
74 void
75 emitrune(Text *text, Rune r)
76 {
77         char buf[UTFmax+1];
78
79         if(r == '\r' || r =='\n'){
80                 text->pos = 0;
81                 text->space = 0;
82         }else
83                 text->pos++;
84         emitbuf(text, buf, runetochar(buf, &r));
85 }
86
87 void
88 emit(Text *text, char *fmt, ...)
89 {
90         Rune buf[64];
91         va_list a;
92         int i;
93
94         if(fmt[0] == '.' && text->pos)
95                 emitrune(text, '\n');
96         va_start(a, fmt);
97         runevsnprint(buf, nelem(buf), fmt, a);
98         va_end(a);
99         for(i=0; buf[i]; i++)
100                 emitrune(text, buf[i]);
101 }
102
103 void
104 restoreoutput(Text *text, Tag *)
105 {
106         text->output = 1;
107 }
108
109 void
110 ongarbage(Text *text, Tag *tag)
111 {
112         if(text->output == 0)
113                 return;
114         tag->close = restoreoutput;
115         text->output = 0;
116 }
117
118 void
119 onmeta(Text *, Tag *tag)
120 {
121         tag->closing = 1;
122 }
123
124 void
125 onp(Text *text, Tag *)
126 {
127         emit(text, ".LP\n");
128 }
129
130 void
131 restorepre(Text *text, Tag *)
132 {
133         text->pre = 0;
134         emit(text, ".DE\n");
135 }
136
137 void
138 onpre(Text *text, Tag *tag)
139 {
140         if(text->pre)
141                 return;
142         tag->close = restorepre;
143         text->pre = 1;
144         emit(text, ".DS L\n");
145 }
146
147 void
148 onli(Text *text, Tag *tag)
149 {
150         if(tag->up && cistrcmp(tag->up->tag, "ol") == 0)
151                 emit(text, ".IP\n");
152         else
153                 emit(text, ".IP \\(bu\n");
154         if(tag->up)
155                 tag->up->close = onp;
156 }
157
158 void
159 onh(Text *text, Tag *tag)
160 {
161         emit(text, ".SH\n");
162         tag->close = onp;
163 }
164
165 void
166 onbr(Text *text, Tag *tag)
167 {
168         tag->closing = 1;
169         emit(text, ".br\n");
170         if(cistrcmp(tag->tag, "hr") == 0)
171                 emit(text, "\\l'5i'\n.br\n");
172 }
173
174 void
175 fontstyle(Text *text, char *style)
176 {
177         if(strcmp(text->fontstyle, style) == 0)
178                 return;
179         text->fontstyle = style;
180         emit(text, "\\f%s", style);
181 }
182
183 void
184 fontsize(Text *text, char *size)
185 {
186         if(strcmp(text->fontsize, size) == 0)
187                 return;
188         text->fontsize = size;
189         emit(text, ".%s\n", size);
190 }
191
192 void
193 restorefontstyle(Text *text, Tag *tag)
194 {
195         fontstyle(text, tag->aux);
196 }
197
198 void
199 restorefontsize(Text *text, Tag *tag)
200 {
201         fontsize(text, tag->aux);
202 }
203
204 void
205 oni(Text *text, Tag *tag)
206 {
207         tag->aux = text->fontstyle;
208         tag->close = restorefontstyle;
209         fontstyle(text, "I");
210 }
211
212 void
213 onb(Text *text, Tag *tag)
214 {
215         tag->aux = text->fontstyle;
216         tag->close = restorefontstyle;
217         fontstyle(text, "B");
218 }
219
220 void
221 ontt(Text *text, Tag *tag)
222 {
223         tag->aux = text->fontstyle;
224         tag->close = restorefontstyle;
225         fontstyle(text, "C");
226 }
227
228 void
229 onsmall(Text *text, Tag *tag)
230 {
231         tag->aux = text->fontsize;
232         tag->close = restorefontsize;
233         fontsize(text, "SM");
234 }
235
236 void
237 onbig(Text *text, Tag *tag)
238 {
239         tag->aux = text->fontsize;
240         tag->close = restorefontsize;
241         fontsize(text, "LG");
242 }
243
244 void
245 endquote(Text *text, Tag *tag)
246 {
247         if(cistrcmp(tag->tag, "q") == 0)
248                 emitrune(text, '"');
249         emit(text, ".QE\n");
250 }
251
252 void
253 onquote(Text *text, Tag *tag)
254 {
255         tag->close = endquote;
256         if(cistrcmp(tag->tag, "q") == 0)
257                 emit(text, ".QS\n\"");
258         else
259                 emit(text, ".QP\n");
260 }
261
262 typedef struct Table Table;
263 struct Table
264 {
265         char    *fmt;
266
267         char    *bp;
268         int     nb;
269
270         Table   *next;
271         Table   *prev;
272         int     enclose;
273         int     brk;
274
275         Text    save;
276 };
277
278 Tag*
279 tabletag(Tag *tag)
280 {
281         if(tag == nil)
282                 return nil;
283         if(cistrcmp(tag->tag, "table") == 0)
284                 return tag;
285         return tabletag(tag->up);
286 }
287
288 void
289 dumprows(Text *text, Table *s, Table *e)
290 {
291         
292         for(; s != e; s = s->next){
293                 if(s->enclose)
294                         emit(text, "T{\n");
295                 if(s->nb <= 0)
296                         emit(text, "\\ ");
297                 else
298                         emitbuf(text, s->bp, s->nb);
299                 if(s->enclose)
300                         emit(text, "\nT}");
301                 emitrune(text, s->brk ? '\n' : '\t');
302         }
303 }
304
305 void
306 endtable(Text *text, Tag *tag)
307 {
308         int i, cols, rows;
309         Table *t, *h, *s;
310         Tag *tt;
311
312         /* reverse list */
313         h = nil;
314         t = tag->aux;
315         for(; t; t = t->prev){
316                 t->next = h;
317                 h = t;
318         }
319
320         /*
321          * nested table case, add our cells to the next table up.
322          * this is the best we can do, tbl doesnt support nesting
323          */
324         if(tt = tabletag(tag->up)){
325                 while(t = h){
326                         h = h->next;
327                         t->next = nil;
328                         t->prev = tt->aux;
329                         tt->aux = t;
330                 }
331                 return;
332         }
333
334         cols = 0;
335         rows = 0;
336         for(i = 0, t = h; t; t = t->next){
337                 i++;
338                 if(t->brk){
339                         rows++;
340                         if(i > cols)
341                                 cols = i;
342                         i = 0;
343                 }
344         }
345
346         i = 0;
347         for(t = h; t; t = t->next){
348                 i++;
349                 if(t->brk){
350                         while(i < cols){
351                                 s = mallocz(sizeof(Table), 1);
352                                 s->fmt = "L";
353                                 s->brk = t->brk;
354                                 t->brk = 0;
355                                 s->next = t->next;
356                                 t->next = s;
357                                 i++;
358                         }
359                         break;
360                 }
361         }
362
363         s = h;
364         while(s){
365                 emit(text, ".TS\n");
366                 if(gotattr(tag, "align", "center"))
367                         emit(text, "center ;\n");
368                 i = 0;
369                 for(t = s; t; t = t->next){
370                         emit(text, "%s", t->fmt);
371                         if(t->brk){
372                                 emitrune(text, '\n');
373                                 if(++i > 30){
374                                         t = t->next;
375                                         break;
376                                 }
377                         }else
378                                 emitrune(text, ' ');
379                 }
380                 emit(text, ".\n");
381                 dumprows(text, s, t);
382                 emit(text, ".TE\n");
383                 s = t;
384         }
385
386         while(t = h){
387                 h = t->next;
388                 free(t->bp);
389                 free(t);
390         }
391 }
392
393 void
394 ontable(Text *, Tag *tag)
395 {
396         tag->aux = nil;
397         tag->close = endtable;
398 }
399
400 void
401 endcell(Text *text, Tag *tag)
402 {
403         Table *t;
404         Tag *tt;
405         int i;
406
407         if((tt = tabletag(tag)) == nil)
408                 return;
409         if(cistrcmp(tag->tag, "tr") == 0){
410                 if(t = tt->aux)
411                         t->brk = 1;
412         } else {
413                 t = tag->aux;
414                 t->bp = text->bp;
415                 t->nb = text->wp - text->bp;
416
417                 for(i=0; i<t->nb; i++)
418                         if(strchr(" \t\r\n", t->bp[i]) == nil)
419                                 break;
420                 if(i > 0){
421                         memmove(t->bp, t->bp+i, t->nb - i);
422                         t->nb -= i;
423                 }
424                 while(t->nb > 0 && strchr(" \t\r\n", t->bp[t->nb-1]))
425                         t->nb--;
426                 if(t->nb < 32){
427                         for(i=0; i<t->nb; i++)
428                                 if(strchr("\t\r\n", t->bp[i]))
429                                         break;
430                         t->enclose = i < t->nb;
431                 } else {
432                         t->enclose = 1;
433                 }
434                 if(gotstyle(tag, "text-align", "center") || gotstyle(tt, "text-align", "center"))
435                         t->fmt = "c";
436                 else
437                         t->fmt = "L";
438                 t->prev = tt->aux;
439                 tt->aux = t;
440                 *text = t->save;
441         }
442 }
443
444 void
445 oncell(Text *text, Tag *tag)
446 {
447         Tag *tt;
448
449         if((tt = tabletag(tag)) == nil)
450                 return;
451         if(cistrcmp(tag->tag, "tr")){
452                 Table *t;
453
454                 tt = tag->up;
455                 while(tt && cistrcmp(tt->tag, "tr"))
456                         tt = tt->up;
457                 if(tt == nil)
458                         return;
459                 reparent(text, tag, tt);
460
461                 t = mallocz(sizeof(*t), 1);
462                 t->save = *text;
463                 tag->aux = t;
464
465                 text->bp = nil;
466                 text->wp = nil;
467                 text->nb = 0;
468                 text->pos = 0;
469                 text->space = 0;
470         } else
471                 reparent(text, tag, tt);
472         tag->close = endcell;
473 }
474
475 struct {
476         char    *tag;
477         void    (*open)(Text *, Tag *);
478 } ontag[] = {
479         "b",            onb,
480         "big",          onbig,
481         "blockquote",   onquote,
482         "br",           onbr,
483         "cite",         oni,
484         "code",         ontt,
485         "dfn",          oni,
486         "em",           oni,
487         "h1",           onh,
488         "h2",           onh,
489         "h3",           onh,
490         "h4",           onh,
491         "h5",           onh,
492         "h6",           onh,
493         "head",         ongarbage,
494         "hr",           onbr,
495         "i",            oni,
496         "img",          onmeta,
497         "kbd",          ontt,
498         "li",           onli,
499         "link",         onmeta,
500         "meta",         onmeta,
501         "p",            onp,
502         "pre",          onpre,
503         "q",            onquote,
504         "samp",         ontt,
505         "script",       ongarbage,
506         "small",        onsmall,
507         "strong",       onb,
508         "style",        ongarbage,
509         "table",        ontable,
510         "td",           oncell,
511         "th",           oncell,
512         "tr",           oncell,
513         "tt",           ontt,
514         "var",          oni,
515 };
516
517 void
518 eatwhite(void)
519 {
520         int c;
521
522         while((c = Bgetc(&in)) > 0){
523                 if(strchr("\n\r\t ", c) == nil){
524                         Bungetc(&in);
525                         return;
526                 }
527         }
528 }
529
530 void
531 parsecomment(void)
532 {
533         char buf[64];
534         int n, c;
535
536         n = 0;
537         eatwhite();
538         while((c = Bgetc(&in)) > 0){
539                 if(c == '>')
540                         return;
541                 if(n == 0 && c == '-'){
542                         while((c = Bgetc(&in)) > 0){
543                                 if(c == '-')
544                                         if(Bgetc(&in) == '-')
545                                                 if(Bgetc(&in) == '>')
546                                                         return;
547                         }
548                 }
549                 if(n+1 < sizeof(buf)){
550                         buf[n++] = c;
551                         if(n != 7 || cistrncmp(buf, "[CDATA[", 7))
552                                 continue;
553                         while((c = Bgetc(&in)) > 0){
554                                 if(c == ']'){
555                                         if(Bgetc(&in) == ']'){
556                                                 if(Bgetc(&in) != '>')
557                                                         Bungetc(&in);
558                                                 return;
559                                         }
560                                 }
561                         }
562                 }
563         }
564 }
565
566 int
567 parseattr(Attr *a)
568 {
569         int q, c, n;
570
571         n = 0;
572         eatwhite();
573         while((c = Bgetc(&in)) > 0){
574                 if(strchr("</>=?!", c)){
575                         Bungetc(&in);
576                         break;
577                 }
578                 if(strchr("\n\r\t ", c))
579                         break;
580                 if(n < sizeof(a->attr)-1)
581                         a->attr[n++] = c;
582         }
583         if(n == 0)
584                 return 0;
585         a->attr[n] = 0;
586         n = 0;
587         eatwhite();
588         if(Bgetc(&in) == '='){
589                 eatwhite();
590                 c = Bgetc(&in);
591                 if(strchr("'\"", c)){
592                         q = c;
593                         while((c = Bgetc(&in)) > 0){
594                                 if(c == q)
595                                         break;
596                                 if(n < sizeof(a->val)-1)
597                                         a->val[n++] = c;
598                         }
599                 } else {
600                         Bungetc(&in);
601                         while((c = Bgetc(&in)) > 0){
602                                 if(strchr("\n\r\t </>?!", c)){
603                                         Bungetc(&in);
604                                         break;
605                                 }
606                                 if(n < sizeof(a->val)-1)
607                                         a->val[n++] = c;
608                         }
609                 }
610         } else
611                 Bungetc(&in);
612         a->val[n] = 0;
613         return 1;
614 }
615
616 int
617 parsetag(Tag *t)
618 {
619         int n, c;
620
621         t->nattr = 0;
622         t->opening = 1;
623         t->closing = 0;
624
625         n = 0;
626         eatwhite();
627         while((c = Bgetc(&in)) > 0){
628                 if(c == '>')
629                         break;
630                 if(strchr("\n\r\t ", c)){
631                         if(parseattr(t->attr + t->nattr))
632                                 if(t->nattr < nelem(t->attr)-1)
633                                         t->nattr++;
634                         continue;
635                 }
636                 if(n == 0 && strchr("?!", c)){
637                         parsecomment();
638                         return 0;
639                 }
640                 if(c == '/'){
641                         if(n == 0){
642                                 t->opening = 0;
643                                 t->closing = 1;
644                         } else
645                                 t->closing = 1;
646                         continue;
647                 }
648                 if(n < sizeof(t->tag)-1)
649                         t->tag[n++] = c;
650         }
651         t->tag[n] = 0;
652         return n > 0;
653 }
654
655 Rune
656 parserune(int c)
657 {
658         char buf[10];
659         int n;
660         Rune r;
661
662         n = 0;
663         if(c == '&'){
664                 while((c = Bgetc(&in)) > 0){
665                         if(strchr(";&</>\n\r\t ", c)){
666                                 if(c != ';')
667                                         Bungetc(&in);
668                                 if(n == 0)
669                                         return '&';
670                                 break;
671                         }
672                         if(n == sizeof(buf)-1)
673                                 break;
674                         buf[n++] = c;
675                 }
676                 buf[n] = 0;
677                 if(strcmp(buf, "lt") == 0)
678                         return '<';
679                 if(strcmp(buf, "gt") == 0)
680                         return '>';
681                 if(strcmp(buf, "quot") == 0)
682                         return '"';
683                 if(strcmp(buf, "apos") == 0)
684                         return '\'';
685                 if(strcmp(buf, "amp") == 0)
686                         return '&';
687                 /* use tcs -f html to handle the rest. */
688         } else {
689                 do {
690                         buf[n++] = c;
691                         if(fullrune(buf, n)){
692                                 chartorune(&r, buf);
693                                 return r;
694                         }
695                         if(n >= UTFmax)
696                                 break;
697                 } while((c = Bgetc(&in)) > 0);
698         }
699         return 0xFFFD;
700 }
701
702 Rune
703 substrune(Rune r)
704 {
705         switch(r){
706         case 0x2019:
707         case 0x2018:
708                 return '\'';
709         case 0x201c:
710         case 0x201d:
711                 return '"';
712         default:
713                 return r;
714         }
715 }
716
717 void
718 debugtag(Tag *tag, char *dbg)
719 {
720         if(1){
721                 USED(tag);
722                 USED(dbg);
723                 return;
724         }
725
726         if(tag == nil)
727                 return;
728         debugtag(tag->up, nil);
729         fprint(2, "%s %s%s", tag->tag, dbg ? dbg : " > ", dbg ? "\n" : "");
730 }
731
732 char*
733 getattr(Tag *tag, char *attr)
734 {
735         int i;
736
737         for(i=0; i<tag->nattr; i++)
738                 if(cistrcmp(tag->attr[i].attr, attr) == 0)
739                         return tag->attr[i].val;
740         return nil;
741 }
742
743 int
744 gotattr(Tag *tag, char *attr, char *val)
745 {
746         char *v;
747
748         if((v = getattr(tag, attr)) == nil)
749                 return 0;
750         return cistrstr(v, val) != 0;
751 }
752
753 int
754 gotstyle(Tag *tag, char *style, char *val)
755 {
756         char *v;
757
758         if((v = getattr(tag, "style")) == nil)
759                 return 0;
760         if((v = cistrstr(v, style)) == nil)
761                 return 0;
762         v += strlen(style);
763         while(*v && *v != ':')
764                 v++;
765         if(*v != ':')
766                 return 0;
767         v++;
768         while(*v && strchr("\t ", *v))
769                 v++;
770         if(cistrncmp(v, val, strlen(val)))
771                 return 0;
772         return 1;
773 }
774
775 void
776 reparent(Text *text, Tag *tag, Tag *up)
777 {
778         Tag *old;
779
780         old = tag->up;
781         while(old != up){
782                 debugtag(old, "reparent");
783                 if(old->close){
784                         old->close(text, old);
785                         old->close = nil;
786                 }
787                 old = old->up;
788         }
789         tag->up = up;
790 }
791
792
793 void
794 parsetext(Text *text, Tag *tag)
795 {
796         int hidden, c;
797         Tag t, *up;
798         Rune r;
799
800         if(tag){
801                 up = tag->up;
802                 debugtag(tag, "open");
803                 for(c = 0; c < nelem(ontag); c++){
804                         if(cistrcmp(tag->tag, ontag[c].tag) == 0){
805                                 ontag[c].open(text, tag);
806                                 break;
807                         }
808                 }
809                 hidden = getattr(tag, "hidden") || gotstyle(tag, "display", "none");
810         } else {
811                 up = nil;
812                 hidden = 0;
813         }
814         if(tag == nil || tag->closing == 0){
815                 while((c = Bgetc(&in)) > 0){
816                         if(c == '<'){
817                                 memset(&t, 0, sizeof(t));
818                                 if(parsetag(&t)){
819                                         text->aftertag = 1;
820                                         if(t.opening){
821                                                 t.up = tag;
822                                                 parsetext(text, &t);
823                                                 if(t.up != tag){
824                                                         debugtag(tag, "skip");
825                                                         up = t.up;
826                                                         break;
827                                                 }
828                                                 debugtag(tag, "back");
829                                         } else if(t.closing){
830                                                 up = tag;
831                                                 while(up && cistrcmp(up->tag, t.tag))
832                                                         up = up->up;
833                                                 if(up){
834                                                         up = up->up;
835                                                         break;
836                                                 }
837                                         }
838                                 }
839                                 continue;
840                         }
841                         if(hidden || !text->output)
842                                 continue;
843                         r = substrune(parserune(c));
844                         switch(r){
845                         case '\n':
846                         case '\r':
847                                 if(text->pre == 0 && text->aftertag)
848                                         break;
849                         case ' ':
850                         case '\t':
851                                 if(text->pre == 0){
852                                         text->space = 1;
853                                         break;
854                                 }
855                         default:
856                                 if(text->space){
857                                         if(text->pos >= 70)
858                                                 emitrune(text, '\n');
859                                         else if(text->pos > 0)
860                                                 emitrune(text, ' ');
861                                 }
862                                 if((text->pos == 0 && r == '.') || r == '\\')
863                                         emit(text, "\\&");
864                                 if(r == '\\' || r == 0xA0)
865                                         emitrune(text, '\\');
866                                 if(r == 0xA0)
867                                         r = ' ';
868                                 emitrune(text, r);
869                                 text->aftertag = 0;
870                                 text->space = 0;
871                         }
872                 }
873         }
874         if(tag){
875                 debugtag(tag, "close");
876                 if(tag->close){
877                         tag->close(text, tag);
878                         tag->close = nil;
879                 }
880                 if(up)
881                         tag->up = up;
882         }
883 }
884
885 void
886 inittext(Text *text)
887 {
888         memset(text, 0, sizeof(Text));
889         text->fontstyle = "R";
890         text->fontsize = "NL";
891         text->output = 1;
892 }
893
894 void
895 main(void)
896 {
897         Text text;
898         Binit(&in, 0, OREAD);
899         inittext(&text);
900         parsetext(&text, nil);
901         emit(&text, "\n");
902         write(1, text.bp, text.wp - text.bp);
903 }