]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/htmlfmt/html.c
merge
[plan9front.git] / sys / src / cmd / htmlfmt / html.c
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <draw.h>
5 #include <regexp.h>
6 #include <html.h>
7 #include <ctype.h>
8 #include "dat.h"
9
10 char urlexpr[] =
11         "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)"
12         "://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
13 Reprog  *urlprog;
14
15 int inword = 0;
16 int col = 0;
17 int wordi = 0;
18
19 char*
20 loadhtml(int fd)
21 {
22         URLwin *u;
23         Bytes *b;
24         int n;
25         char buf[4096];
26
27         u = emalloc(sizeof(URLwin));
28         u->infd = fd;
29         u->outfd = 1;
30         u->url = estrdup(url);
31         u->type = TextHtml;
32
33         b = emalloc(sizeof(Bytes));
34         while((n = read(fd, buf, sizeof buf)) > 0)
35                 growbytes(b, buf, n);
36         if(b->b == nil)
37                 return nil;     /* empty file */
38         rendertext(u, b);
39         freeurlwin(u);
40         return nil;
41 }
42
43 char*
44 runetobyte(Rune *r, int n)
45 {
46         char *s;
47
48         if(n == 0)
49                 return emalloc(1);
50         s = smprint("%.*S", n, r);
51         if(s == nil)
52                 error("malloc failed");
53         return s;
54 }
55
/*
 * Report whether c is a closing punctuation character, i.e. one that
 * emitword should attach to the preceding word without a space.
 * Returns 1 if so, 0 otherwise.
 */
int
closingpunct(char c)
{
	/*
	 * strchr treats the terminating NUL as part of the string, so
	 * a NUL argument would otherwise be reported as punctuation.
	 */
	if(c == '\0')
		return 0;
	return strchr(".,:;'\")]}>!?", c) != 0;
}
61
62 void
63 emitword(Bytes *b, Rune *r, int nr)
64 {
65         char *s;
66         int space;
67
68         if(nr == 0)
69                 return;
70         s = smprint("%.*S", nr, r);
71         space = b->n > 0 && !isspace(b->b[b->n-1]) && !closingpunct(*s);
72         if(col > 0 && col+space+nr > width){
73                 growbytes(b, "\n", 1);
74                 space = 0;
75                 col = 0;
76         }
77         if(space && col > 0){
78                 growbytes(b, " ", 1);
79                 col++;
80         }
81         growbytes(b, s, strlen(s));
82         col += nr;
83         free(s);
84         inword = 0;
85 }
86
87 void
88 renderrunes(Bytes *b, Rune *r)
89 {
90         int i, n;
91
92         n = runestrlen(r);
93         for(i=0; i<n; i++){
94                 switch(r[i]){
95                 case '\n':
96                         if(inword)
97                                 emitword(b, r+wordi, i-wordi);
98                         col = 0;
99                         if(b->n == 0)
100                                 break;  /* don't start with blank lines */
101                         if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
102                                 growbytes(b, "\n", 1);
103                         break;
104                 case ' ':
105                         if(inword)
106                                 emitword(b, r+wordi, i-wordi);
107                         break;
108                 default:
109                         if(!inword)
110                                 wordi = i;
111                         inword = 1;
112                         break;
113                 }
114         }
115         if(inword)
116                 emitword(b, r+wordi, i-wordi);
117 }
118
119 void
120 renderbytes(Bytes *b, char *fmt, ...)
121 {
122         Rune *r;
123         va_list arg;
124
125         va_start(arg, fmt);
126         r = runevsmprint(fmt, arg);
127         va_end(arg);
128         renderrunes(b, r);
129         free(r);
130 }
131
132 char*
133 baseurl(char *url)
134 {
135         char *base, *slash;
136         Resub rs[10];
137
138         if(url == nil)
139                 return nil;
140         if(urlprog == nil){
141                 urlprog = regcomp(urlexpr);
142                 if(urlprog == nil)
143                         error("can't compile URL regexp");
144         }
145         memset(rs, 0, sizeof rs);
146         if(regexec(urlprog, url, rs, nelem(rs)) == 0)
147                 return nil;
148         base = estrdup(url);
149         slash = strrchr(base, '/');
150         if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
151                 *slash = '\0';
152         else
153                 base[rs[0].ep-rs[0].sp] = '\0';
154         return base;
155 }
156
157 char*
158 fullurl(URLwin *u, Rune *rhref)
159 {
160         char *base, *href, *hrefbase;
161         char *result;
162
163         if(rhref == nil)
164                 return estrdup("NULL URL");
165         href = runetobyte(rhref, runestrlen(rhref));
166         hrefbase = baseurl(href);
167         result = nil;
168         if(hrefbase==nil && (base = baseurl(u->url))!=nil){
169                 result = estrdup(base);
170                 if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
171                         result = eappend(result, "/", "");
172                 free(base);
173         }
174         if(href){
175                 if(result)
176                         result = eappend(result, "", href);
177                 else
178                         result = estrdup(href);
179         }
180         free(hrefbase);
181         if(result == nil)
182                 return estrdup("***unknown***");
183         return result;
184 }
185
186 void
187 render(URLwin *u, Bytes *t, Item *items, int curanchor)
188 {
189         Item *il;
190         Itext *it;
191         Ifloat *ifl;
192         Ispacer *is;
193         Itable *ita;
194         Iimage *im;
195         Anchor *a;
196         Table *tab;
197         Tablecell *cell;
198         char *href;
199
200         inword = 0;
201         col = 0;
202         wordi = 0;
203
204         for(il=items; il!=nil; il=il->next){
205                 if(il->state & IFbrk)
206                         renderbytes(t, "\n");
207                 if(il->state & IFbrksp)
208                         renderbytes(t, "\n");
209
210                 switch(il->tag){
211                 case Itexttag:
212                         it = (Itext*)il;
213                         if(it->state & IFwrap)
214                                 renderrunes(t, it->s);
215                         else
216                                 emitword(t, it->s, runestrlen(it->s));
217                         break;
218                 case Iruletag:
219                         if(t->n>0 && t->b[t->n-1]!='\n')
220                                 renderbytes(t, "\n");
221                         renderbytes(t, "=======\n");
222                         break;
223                 case Iimagetag:
224                         if(!aflag)
225                                 break;
226                         im = (Iimage*)il;
227                         if(im->imsrc){
228                                 href = fullurl(u, im->imsrc);
229                                 renderbytes(t, "[image %s]", href);
230                                 free(href);
231                         }
232                         break;
233                 case Iformfieldtag:
234                         if(aflag)
235                                 renderbytes(t, "[formfield]");
236                         break;
237                 case Itabletag:
238                         ita = (Itable*)il;
239                         tab = ita->table;
240                         for(cell=tab->cells; cell!=nil; cell=cell->next){
241                                 render(u, t, cell->content, curanchor);
242                         }
243                         if(t->n>0 && t->b[t->n-1]!='\n')
244                                 renderbytes(t, "\n");
245                         break;
246                 case Ifloattag:
247                         ifl = (Ifloat*)il;
248                         render(u, t, ifl->item, curanchor);
249                         break;
250                 case Ispacertag:
251                         is = (Ispacer*)il;
252                         if(is->spkind != ISPnull)
253                                 renderbytes(t, " ");
254                         break;
255                 default:
256                         error("unknown item tag %d\n", il->tag);
257                 }
258                 if(il->anchorid != 0 && il->anchorid!=curanchor){
259                         for(a=u->docinfo->anchors; a!=nil; a=a->next)
260                                 if(aflag && a->index == il->anchorid){
261                                         href = fullurl(u, a->href);
262                                         renderbytes(t, "[%s]", href);
263                                         free(href);
264                                         break;
265                                 }
266                         curanchor = il->anchorid;
267                 }
268         }
269         if(t->n>0 && t->b[t->n-1]!='\n')
270                 renderbytes(t, "\n");
271 }
272
273 void
274 rerender(URLwin *u)
275 {
276         Bytes *t;
277
278         t = emalloc(sizeof(Bytes));
279
280         render(u, t, u->items, 0);
281
282         if(t->n)
283                 write(u->outfd, (char*)t->b, t->n);
284         free(t->b);
285         free(t);
286 }
287
288 void
289 rendertext(URLwin *u, Bytes *b)
290 {
291         Rune *rurl;
292
293         rurl = toStr((uchar*)u->url, strlen(u->url), UTF_8);
294         u->items = parsehtml(b->b, b->n, rurl, u->type, UTF_8, &u->docinfo);
295 //      free(rurl);
296
297         rerender(u);
298 }
299
300
301 void
302 freeurlwin(URLwin *u)
303 {
304         freeitems(u->items);
305         u->items = nil;
306         freedocinfo(u->docinfo);
307         u->docinfo = nil;
308         free(u);
309 }