]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/troff2html/troff2html.c
devproc: can't wait for ourselfs to stop (thanks Shamar)
[plan9front.git] / sys / src / cmd / troff2html / troff2html.c
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4
/* compile-time limits */
enum{
	Nfont = 11,	/* number of font positions; font[] has this many slots and positions outside 0..Nfont-1 are rejected */
	Wid = 20,	/* tmac.anhtml sets page width to 20" so we can recognize .nf text */
};
9
/* one queued output item: a rune with attribute bits, a magic emission, or a string pointer */
typedef uintptr Char;
typedef struct Troffchar Troffchar;
typedef struct Htmlchar Htmlchar;
typedef struct Font Font;
typedef struct HTMLfont HTMLfont;
15
/*
 * a Char is >= 32 bits. low 16 bits are the rune. higher are attributes.
 * must be able to hold a pointer.
 *
 * The constants below are bit numbers, not masks: code tests and sets
 * them as (1<<Italic) etc., and onattr[]/offattr[] are indexed by them.
 */
enum
{
	Italic	=	16,
	Bold,
	CW,
	Indent1,
	Indent2,
	Indent3,
	Heading =	25,
	Anchor =	26,	/* must be last */
};
31
/*
 * Values queued with emitchar that flush treats specially instead of
 * printing as a rune.
 */
enum	/* magic emissions */
{
	Estring = 0,	/* the following Char is a char* to print verbatim */
	Epp = 1<<16,	/* paragraph break: flush prints a small spacer table */
};
37
/* order in which setattr opens attributes that have just been turned on */
int attrorder[] = { Indent1, Indent2, Indent3, Heading, Anchor, Italic, Bold, CW };

/* stack of attribute bit numbers currently open in the output, innermost last */
int nest[10];
int nnest;	/* number of valid entries in nest */
42
/* maps a troff special-character name to the text emitted for it (see troffchar) */
struct Troffchar
{
	char *name;
	char *value;
};
48
/* maps a character to the HTML text emitted in its place (see emithtmlchar) */
struct Htmlchar
{
	char *utf;	/* UTF-8 text; main decodes its first rune into value */
	char *name;	/* replacement string queued for output */
	int value;	/* rune value, filled in by main and used as the sort/search key */
};
55
56 #include "chars.h"
57
/* a mounted troff font (see mountfont) */
struct Font{
	char		*name;		/* private copy of the troff font name */
	HTMLfont	*htmlfont;	/* matching htmlfonts[] entry; never nil after mountfont */
};
62
/* how a troff font is represented in HTML */
struct HTMLfont{
	char	*name;		/* troff font name matched by htmlfont() */
	char	*htmlname;	/* HTML tag name; not referenced by the code visible in this file */
	int	bit;		/* attribute bit number set while the font is current; 0 means none */
};
68
/*
 * R must be first; it's the default representation for fonts we don't recognize.
 * The table ends at the entry whose name is nil.
 */
HTMLfont htmlfonts[] =
{
	"R",		nil,	0,
	"LucidaSans",	nil,	0,
	"I",		"i",	Italic,
	"LucidaSansI",	"i",	Italic,
	"CW",		"tt",	CW,
	"LucidaCW",	"tt",	CW,
	nil,	nil,
};
80
/* opening tag shared by the tables this program prints; used by flush for the paragraph spacer */
#define TABLE "<table border=0 cellpadding=0 cellspacing=0>"
82
/*
 * Text that opens each attribute, indexed by attribute bit number.
 * A leading "<+" is an instruction to iputs: print the tag on its own
 * line and increase the output indent level.
 * The Anchor slot is overwritten by setattr with the next saved anchor.
 */
char*
onattr[8*sizeof(int)] =
{
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	"<i>",			/* italic */
	"<b>",			/* bold */
	"<tt><font size=+1>",	/* cw */
	"<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n",		/* indent1 */
	"<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n",		/* indent2 */
	"<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n",		/* indent3 */
	0,
	0,
	0,
	"<p><font size=+1><b>",	/* heading 25 */
	"<unused>",		/* anchor 26 */
};
100
/*
 * Text that closes each attribute, indexed by attribute bit number.
 * A leading "<-" is an instruction to iputs: decrease the output indent
 * level and print the tag on its own line.
 */
char*
offattr[8*sizeof(int)] =
{
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	"</i>",			/* italic */
	"</b>",			/* bold */
	"</font></tt>",		/* cw */
	"<-/table>",		/* indent1 */
	"<-/table>",		/* indent2 */
	"<-/table>",		/* indent3 */
	0,
	0,
	0,
	"</b></font>",		/* heading 25 */
	"</a>",			/* anchor 26 */
};
118
Font	*font[Nfont];	/* mounted fonts by position; a slot is nil until mountfont fills it */

Biobuf	bout;		/* buffered standard output, initialized in main */
int	debug = 0;	/* incremented once per -d flag */

/* troff state */
int	page = 1;
int	ft = 1;		/* current font position */
int	vp = 0;		/* vertical position; tracked but not otherwise used here */
int	hp = 0;		/* horizontal position in device units */
int	ps = 1;
int	res = 720;	/* device resolution the input must declare (checked in xcmd) */

int	didP = 0;	/* set when a paragraph break was just emitted; suppresses the <br> at end of line */
int	atnewline = 1;	/* set at end of line, cleared by indent on the next character */
int	prevlineH = 0;	/* hp at which the previous line started; used by indent */
Char	attr = 0;	/* or'ed into each Char */

Char	*chars;		/* queue of everything to output; written by emitchar, drained by flush */
int	nchars;		/* entries used in chars */
int	nalloc;		/* entries allocated in chars */
char**	anchors;	/* allocated in order */
int	nanchors;	/* count while reading input; reset and reused as a cursor by flush/setattr */

char	*filename;	/* name of the input being processed, for diagnostics */
int	cno;		/* position in the current input, for diagnostics */
char	buf[8192];	/* scratch buffer returned by getline and getstr */
char	*title = "Plan 9 man page";
147
/* forward declarations for functions used before their definitions */
void	process(Biobuf*, char*);
void	mountfont(int, char*);
void	switchfont(int);
void	header(char*);
void	flush(void);
void	trailer(void);
154
155 void*
156 emalloc(ulong n)
157 {
158         void *p;
159
160         p = malloc(n);
161         if(p == nil)
162                 sysfatal("malloc failed: %r");
163         return p;
164 }
165
166 void*
167 erealloc(void *p, ulong n)
168 {
169
170         p = realloc(p, n);
171         if(p == nil)
172                 sysfatal("realloc failed: %r");
173         return p;
174 }
175
176 char*
177 estrdup(char *s)
178 {
179         char *t;
180
181         t = strdup(s);
182         if(t == nil)
183                 sysfatal("strdup failed: %r");
184         return t;
185 }
186
/* Print the command synopsis on standard error and exit with status "usage". */
void
usage(void)
{
	static char synopsis[] = "usage: troff2html [-d] [-t title] [file ...]\n";

	fprint(2, "%s", synopsis);
	exits("usage");
}
193
194 int
195 hccmp(const void *va, const void *vb)
196 {
197         Htmlchar *a, *b;
198
199         a = (Htmlchar*)va;
200         b = (Htmlchar*)vb;
201         return a->value - b->value;
202 }
203
204 void
205 main(int argc, char *argv[])
206 {
207         int i;
208         Biobuf in, *inp;
209         Rune r;
210
211         for(i=0; i<nelem(htmlchars); i++){
212                 chartorune(&r, htmlchars[i].utf);
213                 htmlchars[i].value = r;
214         }
215         qsort(htmlchars, nelem(htmlchars), sizeof(htmlchars[0]), hccmp);
216
217         ARGBEGIN{
218         case 't':
219                 title = ARGF();
220                 if(title == nil)
221                         usage();
222                 break;
223         case 'd':
224                 debug++;
225                 break;
226         default:
227                 usage();
228         }ARGEND
229
230         Binit(&bout, 1, OWRITE);
231         if(argc == 0){
232                 header(title);
233                 Binit(&in, 0, OREAD);
234                 process(&in, "<stdin>");
235         }else{
236                 header(title);
237                 for(i=0; i<argc; i++){
238                         inp = Bopen(argv[i], OREAD);
239                         if(inp == nil)
240                                 sysfatal("can't open %s: %r", argv[i]);
241                         process(inp, argv[i]);
242                         Bterm(inp);
243                 }
244         }
245         flush();
246         trailer();
247         exits(nil);
248 }
249
250 void
251 emitchar(Char c)
252 {
253         if(nalloc == nchars){
254                 nalloc += 10000;
255                 chars = realloc(chars, nalloc*sizeof(chars[0]));
256                 if(chars == nil)
257                         sysfatal("malloc failed: %r");
258         }
259         chars[nchars++] = c;
260 }
261
262 void
263 emit(Rune r)
264 {
265         emitchar(r | attr);
266         /*
267          * Close man page references early, so that 
268          * .IR proof (1),
269          * doesn't make the comma part of the link.
270          */
271         if(r == ')')
272                 attr &= ~(1<<Anchor);
273 }
274
275 void
276 emitstr(char *s)
277 {
278         emitchar(Estring);
279         emitchar((Char)s);
280 }
281
int indentlevel;	/* current output indent depth; iputrune writes four spaces per level after each newline */
int linelen;		/* runes written since the last newline; used by iputrune to wrap long lines */
284
285 void
286 iputrune(Biobuf *b, Rune r)
287 {
288         int i;
289
290         if(linelen++ > 60 && r == ' ')
291                 r = '\n';
292         Bputrune(b, r);
293         if(r == '\n'){
294                 for(i=0; i<indentlevel; i++)
295                         Bprint(b, "    ");
296                 linelen = 0;
297         }
298 }
299
300 void
301 iputs(Biobuf *b, char *s)
302 {
303         if(s[0]=='<' && s[1]=='+'){
304                 iputrune(b, '\n');
305                 Bprint(b, "<%s", s+2);
306                 indentlevel++;
307                 iputrune(b, '\n');
308         }else if(s[0]=='<' && s[1]=='-'){
309                 indentlevel--;
310                 iputrune(b, '\n');
311                 Bprint(b, "<%s", s+2);
312                 iputrune(b, '\n');
313         }else
314                 Bprint(b, "%s", s);
315 }
316
317 void
318 setattr(Char a)
319 {
320         Char on, off;
321         int i, j;
322
323         on = a & ~attr;
324         off = attr & ~a;
325
326         /* walk up the nest stack until we reach something we need to turn off. */
327         for(i=0; i<nnest; i++)
328                 if(off&(1<<nest[i]))
329                         break;
330
331         /* turn off everything above that */
332         for(j=nnest-1; j>=i; j--)
333                 iputs(&bout, offattr[nest[j]]);
334
335         /* turn on everything we just turned off but didn't want to */
336         for(j=i; j<nnest; j++)
337                 if(a&(1<<nest[j]))
338                         iputs(&bout, onattr[nest[j]]);
339                 else
340                         nest[j] = 0;
341
342         /* shift the zeros (turned off things) up */
343         for(i=j=0; i<nnest; i++)
344                 if(nest[i] != 0)
345                         nest[j++] = nest[i];
346         nnest = j;
347
348         /* now turn on the new attributes */
349         for(i=0; i<nelem(attrorder); i++){
350                 j = attrorder[i];
351                 if(on&(1<<j)){
352                         if(j == Anchor)
353                                 onattr[j] = anchors[nanchors++];
354                         iputs(&bout, onattr[j]);
355                         if(nnest >= nelem(nest))
356                                 sysfatal("nesting too deep");
357                         nest[nnest++] = j;
358                 }
359         }
360         attr = a;
361 }
362
363 void
364 flush(void)
365 {
366         int i;
367         Char c, a;
368
369         nanchors = 0;
370         for(i=0; i<nchars; i++){
371                 c = chars[i];
372                 if(c == Estring){
373                         /* next word is string to print */
374                         iputs(&bout, (char*)chars[++i]);
375                         continue;
376                 }
377                 if(c == Epp){
378                         iputrune(&bout, '\n');
379                         iputs(&bout, TABLE "<tr height=5><td></table>");
380                         iputrune(&bout, '\n');
381                         continue;
382                 }
383                 a = c & ~0xFFFF;
384                 c &= 0xFFFF;
385                 /*
386                  * If we're going to something off after a space,
387                  * let's just turn it off before.
388                  */
389                 if(c == ' ' && i<nchars-1 && (chars[i+1]&0xFFFF) >= 32)
390                         a ^= a & ~chars[i+1];
391                 setattr(a);
392                 iputrune(&bout, c & 0xFFFF);
393         }
394 }
395
396 void
397 header(char *s)
398 {
399         Bprint(&bout, "<head>\n");
400         Bprint(&bout, "<title>%s</title>\n", s);
401         Bprint(&bout, "<meta content=\"text/html; charset=utf-8\" http-equiv=Content-Type>\n");
402         Bprint(&bout, "</head>\n");
403         Bprint(&bout, "<body bgcolor=#ffffff>\n");
404 }
405
406 void
407 trailer(void)
408 {
409         Bprint(&bout, "</body></html>\n");
410 }
411
412 int
413 getc(Biobuf *b)
414 {
415         cno++;
416         return Bgetrune(b);
417 }
418
419 void
420 ungetc(Biobuf *b)
421 {
422         cno--;
423         Bungetrune(b);
424 }
425
426 char*
427 getline(Biobuf *b)
428 {
429         int i, c;
430
431         for(i=0; i<sizeof buf; i++){
432                 c = getc(b);
433                 if(c == Beof)
434                         return nil;
435                 buf[i] = c;
436                 if(c == '\n'){
437                         buf[i] = '\0';
438                         break;
439                 }
440         }
441         return buf;
442 }
443
444 int
445 getnum(Biobuf *b)
446 {
447         int i, c;
448
449         i = 0;
450         for(;;){
451                 c = getc(b);
452                 if(c<'0' || '9'<c){
453                         ungetc(b);
454                         break;
455                 }
456                 i = i*10 + (c-'0');
457         }
458         return i;
459 }
460
461 char*
462 getstr(Biobuf *b)
463 {
464         int i, c;
465
466         for(i=0; i<sizeof buf; i++){
467                 /* must get bytes not runes */
468                 cno++;
469                 c = Bgetc(b);
470                 if(c == Beof)
471                         return nil;
472                 buf[i] = c;
473                 if(c == '\n' || c==' ' || c=='\t'){
474                         ungetc(b);
475                         buf[i] = '\0';
476                         break;
477                 }
478         }
479         return buf;
480 }
481
482 int
483 setnum(Biobuf *b, char *name, int min, int max)
484 {
485         int i;
486
487         i = getnum(b);
488         if(debug > 2)
489                 fprint(2, "set %s = %d\n", name, i);
490         if(min<=i && i<max)
491                 return i;
492         sysfatal("value of %s is %d; min %d max %d at %s:#%d", name, i, min, max, filename, cno);
493         return i;
494 }
495
496 void
497 xcmd(Biobuf *b)
498 {
499         char *p, *fld[16], buf[1024];
500
501         int i, nfld;
502
503         p = getline(b);
504         if(p == nil)
505                 sysfatal("xcmd error: %r");
506         if(debug)
507                 fprint(2, "x command '%s'\n", p);
508         nfld = tokenize(p, fld, nelem(fld));
509         if(nfld == 0)
510                 return;
511         switch(fld[0][0]){
512         case 'f':
513                 /* mount font */
514                 if(nfld != 3)
515                         break;
516                 i = atoi(fld[1]);
517                 if(i<0 || Nfont<=i)
518                         sysfatal("font %d out of range at %s:#%d", i, filename, cno);
519                 mountfont(i, fld[2]);
520                 return;
521         case 'i':
522                 /* init */
523                 return;
524         case 'r':
525                 if(nfld<2 || atoi(fld[1])!=res)
526                         sysfatal("typesetter has unexpected resolution %s", fld[1]? fld[1] : "<unspecified>");
527                 return;
528         case 's':
529                 /* stop */
530                 return;
531         case 't':
532                 /* trailer */
533                 return;
534         case 'T':
535                 if(nfld!=2 || strcmp(fld[1], "utf")!=0)
536                         sysfatal("output for unknown typesetter type %s", fld[1]);
537                 return;
538         case 'X':
539                 if(nfld<3 || strcmp(fld[1], "html")!=0)
540                         break;
541                 /* is it a man reference of the form cp(1)? */
542                 /* X manref start/end cp (1) */
543                 if(nfld==6 && strcmp(fld[2], "manref")==0){
544                         /* was the right macro; is it the right form? */
545                         if(strlen(fld[5])>=3 &&
546                            fld[5][0]=='(' && fld[5][2]==')' &&
547                            '0'<=fld[5][1] && fld[5][1]<='9'){
548                                 if(strcmp(fld[3], "start") == 0){
549                                         /* set anchor attribute and remember string */
550                                         attr |= (1<<Anchor);
551                                         snprint(buf, sizeof buf,
552                                                 "<a href=\"/magic/man2html/%c/%s\">",
553                                                 fld[5][1], fld[4]);
554                                         nanchors++;
555                                         anchors = erealloc(anchors, nanchors*sizeof(char*));
556                                         anchors[nanchors-1] = estrdup(buf);
557                                 }else if(strcmp(fld[3], "end") == 0)
558                                         attr &= ~(1<<Anchor);
559                         }
560                 }else if(strcmp(fld[2], "manPP") == 0){
561                         didP = 1;
562                         emitchar(Epp);
563                 }else if(nfld<4 || strcmp(fld[2], "manref")!=0){
564                         if(nfld>2 && strcmp(fld[2], "<P>")==0){ /* avoid triggering extra <br> */
565                                 didP = 1;
566                                 /* clear all font attributes before paragraph */
567                                 emitchar(' ' | (attr & ~(0xFFFF|((1<<Italic)|(1<<Bold)|(1<<CW)))));
568                                 emitstr("<P>");
569                                 /* next emittec char will turn font attributes back on */
570                         }else if(nfld>2 && strcmp(fld[2], "<H4>")==0)
571                                 attr |= (1<<Heading);
572                         else if(nfld>2 && strcmp(fld[2], "</H4>")==0)
573                                 attr &= ~(1<<Heading);
574                         else if(debug)
575                                 fprint(2, "unknown in-line html %s... at %s:%#d\n",
576                                         fld[2], filename, cno);
577                 }
578                 return;
579         }
580         if(debug)
581                 fprint(2, "unknown or badly formatted x command %s\n", fld[0]);
582 }
583
584 int
585 lookup(int c, Htmlchar tab[], int ntab)
586 {
587         int low, high, mid;
588
589         low = 0;
590         high = ntab - 1;
591         while(low <= high){
592                 mid = (low+high)/2;
593                 if(c < tab[mid].value)
594                         high = mid - 1;
595                 else if(c > tab[mid].value)
596                         low = mid + 1;
597                 else
598                         return mid;
599         }
600         return -1;      /* no match */
601 }
602
603 void
604 emithtmlchar(int r)
605 {
606         static char buf[10];
607         int i;
608
609         i = lookup(r, htmlchars, nelem(htmlchars));
610         if(i >= 0)
611                 emitstr(htmlchars[i].name);
612         else
613                 emit(r);
614 }
615
616 char*
617 troffchar(char *s)
618 {
619         int i;
620
621         for(i=0; troffchars[i].name!=nil; i++)
622                 if(strcmp(s, troffchars[i].name) == 0)
623                         return troffchars[i].value;
624         return "??";
625 }
626
627 void
628 indent(void)
629 {
630         int nind;
631
632         didP = 0;
633         if(atnewline){
634                 if(hp != prevlineH){
635                         prevlineH = hp;
636                         /* these most peculiar numbers appear in the troff -man output */
637                         nind = ((prevlineH-1*res)+323)/324;
638                         attr &= ~((1<<Indent1)|(1<<Indent2)|(1<<Indent3));
639                         if(nind >= 1)
640                                 attr |= (1<<Indent1);
641                         if(nind >= 2)
642                                 attr |= (1<<Indent2);
643                         if(nind >= 3)
644                                 attr |= (1<<Indent3);
645                 }
646                 atnewline = 0;
647         }
648 }
649
650 void
651 process(Biobuf *b, char *name)
652 {
653         int c, r, v, i;
654         char *p;
655
656         cno = 0;
657         prevlineH = res;
658         filename = name;
659         for(;;){
660                 c = getc(b);
661                 switch(c){
662                 case Beof:
663                         /* go to ground state */
664                         attr = 0;
665                         emit('\n');
666                         return;
667                 case '\n':
668                         break;
669                 case '0': case '1': case '2': case '3': case '4':
670                 case '5': case '6': case '7': case '8': case '9':
671                         v = c-'0';
672                         c = getc(b);
673                         if(c<'0' || '9'<c)
674                                 sysfatal("illegal character motion at %s:#%d", filename, cno);
675                         v = v*10 + (c-'0');
676                         hp += v;
677                         /* fall through to character case */
678                 case 'c':
679                         indent();
680                         r = getc(b);
681                         emithtmlchar(r);
682                         break;
683                 case 'D':
684                         /* draw line; ignore */
685                         do
686                                 c = getc(b);
687                         while(c!='\n' && c!= Beof);
688                         break;
689                 case 'f':
690                         v = setnum(b, "font", 0, Nfont);
691                         switchfont(v);
692                         break;
693                 case 'h':
694                         v = setnum(b, "hpos", -20000, 20000);
695                         /* generate spaces if motion is large and within a line */
696                         if(!atnewline && v>2*72)
697                                 for(i=0; i<v; i+=72)
698                                         emitstr("&nbsp;");
699                         hp += v;
700                         break;
701                 case 'n':
702                         setnum(b, "n1", -10000, 10000);
703                         //Bprint(&bout, " N1=%d", v);
704                         getc(b);        /* space separates */
705                         setnum(b, "n2", -10000, 10000);
706                         atnewline = 1;
707                         if(!didP && hp < (Wid-1)*res)   /* if line is less than 19" long, probably need a line break */
708                                 emitstr("<br>");
709                         emit('\n');
710                         break;
711                 case 'p':
712                         page = setnum(b, "ps", -10000, 10000);
713                         break;
714                 case 's':
715                         ps = setnum(b, "ps", 1, 1000);
716                         break;
717                 case 'v':
718                         vp += setnum(b, "vpos", -10000, 10000);
719                         /* BUG: ignore motion */
720                         break;
721                 case 'x':
722                         xcmd(b);
723                         break;
724                 case 'w':
725                         emit(' ');
726                         break;
727                 case 'C':
728                         indent();
729                         p = getstr(b);
730                         emitstr(troffchar(p));
731                         break;
732                 case 'H':
733                         hp = setnum(b, "hpos", 0, 20000);
734                         //Bprint(&bout, " H=%d ", hp);
735                         break;
736                 case 'V':
737                         vp = setnum(b, "vpos", 0, 10000);
738                         break;
739                 default:
740                         fprint(2, "dhtml: unknown directive %c(0x%.2ux) at %s:#%d\n", c, c, filename, cno);
741                         return;
742                 }
743         }
744 }
745
746 HTMLfont*
747 htmlfont(char *name)
748 {
749         int i;
750
751         for(i=0; htmlfonts[i].name!=nil; i++)
752                 if(strcmp(name, htmlfonts[i].name) == 0)
753                         return &htmlfonts[i];
754         return &htmlfonts[0];
755 }
756
757 void
758 mountfont(int pos, char *name)
759 {
760         if(debug)
761                 fprint(2, "mount font %s on %d\n", name, pos);
762         if(font[pos] != nil){
763                 free(font[pos]->name);
764                 free(font[pos]);
765         }
766         font[pos] = emalloc(sizeof(Font));
767         font[pos]->name = estrdup(name);
768         font[pos]->htmlfont = htmlfont(name);
769 }
770
771 void
772 switchfont(int pos)
773 {
774         HTMLfont *hf;
775
776         if(debug)
777                 fprint(2, "font change from %d (%s) to %d (%s)\n", ft, font[ft]->name, pos, font[pos]->name);
778         if(pos == ft)
779                 return;
780         hf = font[ft]->htmlfont;
781         if(hf->bit != 0)
782                 attr &= ~(1<<hf->bit);
783         ft = pos;
784         hf = font[ft]->htmlfont;
785         if(hf->bit != 0)
786                 attr |= (1<<hf->bit);
787 }