]> git.lizzy.rs Git - plan9front.git/blob - sys/src/libhtml/lex.c
ndb/dns: lookup *all* entries in dblookup(), v4 and v6 queries in parallel, remove...
[plan9front.git] / sys / src / libhtml / lex.c
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
7
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11         int                     i;              // index of next byte to use
12         uchar*          data;           // all the data
13         int                     edata;  // data[0:edata] is valid
14         int                     chset;  // one of US_Ascii, etc.
15         int                     mtype;  // TextHtml or TextPlain
16 };
17
// Negative sentinel values; the names suggest end-of-file and
// end-of-buffer.  Code in this file treats any c < 0 as "no more input".
enum {
	EOB = -1,
	EOF = -2
};
22
// True when c may appear inside a tag or attribute name: a letter,
// a digit, '-' or '.'.  The (c)<256 guard keeps wide rune values away
// from the ctype macros.  Note that c is evaluated more than once.
#define ISNAMCHAR(c)	((c) < 256 && (isalnum(c) || (c) == '-' || (c) == '.'))

// Sizes (in Runes) of the on-stack scratch buffers used while lexing.
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
27
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
30 Rune* tagnames[] = {
31         L" ",
32         L"!",
33         L"a", 
34         L"abbr",
35         L"acronym",
36         L"address",
37         L"applet", 
38         L"area",
39         L"b",
40         L"base",
41         L"basefont",
42         L"bdo",
43         L"big",
44         L"blink",
45         L"blockquote",
46         L"body",
47         L"bq",
48         L"br",
49         L"button",
50         L"caption",
51         L"center",
52         L"cite",
53         L"code",
54         L"col",
55         L"colgroup",
56         L"dd",
57         L"del",
58         L"dfn",
59         L"dir",
60         L"div",
61         L"dl",
62         L"dt",
63         L"em",
64         L"fieldset",
65         L"font",
66         L"form",
67         L"frame",
68         L"frameset",
69         L"h1",
70         L"h2",
71         L"h3",
72         L"h4",
73         L"h5",
74         L"h6",
75         L"head",
76         L"hr",
77         L"html",
78         L"i",
79         L"iframe",
80         L"img",
81         L"input",
82         L"ins",
83         L"isindex",
84         L"kbd",
85         L"label",
86         L"legend",
87         L"li",
88         L"link",
89         L"map",
90         L"menu",
91         L"meta",
92         L"nobr",
93         L"noframes",
94         L"noscript",
95         L"object",
96         L"ol",
97         L"optgroup",
98         L"option",
99         L"p",
100         L"param",
101         L"pre",
102         L"q",
103         L"s",
104         L"samp",
105         L"script",
106         L"select",
107         L"small",
108         L"span",
109         L"strike",
110         L"strong",
111         L"style",
112         L"sub",
113         L"sup",
114         L"table",
115         L"tbody",
116         L"td",
117         L"textarea",
118         L"tfoot",
119         L"th",
120         L"thead",
121         L"title",
122         L"tr",
123         L"tt",
124         L"u",
125         L"ul",
126         L"var"
127 };
128
129 // HTML 4.0 attribute names.
130 // Keep sorted, and in correspondence with enum in impl.h.
131 Rune* attrnames[] = {
132         L"abbr",
133         L"accept-charset",
134         L"access-key",
135         L"action",
136         L"align",
137         L"alink",
138         L"alt",
139         L"archive",
140         L"axis",
141         L"background",
142         L"bgcolor",
143         L"border",
144         L"cellpadding",
145         L"cellspacing",
146         L"char",
147         L"charoff",
148         L"charset",
149         L"checked",
150         L"cite",
151         L"class",
152         L"classid",
153         L"clear",
154         L"code",
155         L"codebase",
156         L"codetype",
157         L"color",
158         L"cols",
159         L"colspan",
160         L"compact",
161         L"content",
162         L"coords",
163         L"data",
164         L"datetime",
165         L"declare",
166         L"defer",
167         L"dir",
168         L"disabled",
169         L"enctype",
170         L"face",
171         L"for",
172         L"frame",
173         L"frameborder",
174         L"headers",
175         L"height",
176         L"href",
177         L"hreflang",
178         L"hspace",
179         L"http-equiv",
180         L"id",
181         L"ismap",
182         L"label",
183         L"lang",
184         L"link",
185         L"longdesc",
186         L"marginheight",
187         L"marginwidth",
188         L"maxlength",
189         L"media",
190         L"method",
191         L"multiple",
192         L"name",
193         L"nohref",
194         L"noresize",
195         L"noshade",
196         L"nowrap",
197         L"object",
198         L"onblur",
199         L"onchange",
200         L"onclick",
201         L"ondblclick",
202         L"onfocus",
203         L"onkeypress",
204         L"onkeyup",
205         L"onload",
206         L"onmousedown",
207         L"onmousemove",
208         L"onmouseout",
209         L"onmouseover",
210         L"onmouseup",
211         L"onreset",
212         L"onselect",
213         L"onsubmit",
214         L"onunload",
215         L"profile",
216         L"prompt",
217         L"readonly",
218         L"rel",
219         L"rev",
220         L"rows",
221         L"rowspan",
222         L"rules",
223         L"scheme",
224         L"scope",
225         L"scrolling",
226         L"selected",
227         L"shape",
228         L"size",
229         L"span",
230         L"src",
231         L"standby",
232         L"start",
233         L"style",
234         L"summary",
235         L"tabindex",
236         L"target",
237         L"text",
238         L"title",
239         L"type",
240         L"usemap",
241         L"valign",
242         L"value",
243         L"valuetype",
244         L"version",
245         L"vlink",
246         L"vspace",
247         L"width"
248 };
249
250
251 // Character entity to unicode character number map.
252 // Keep sorted by name.
253 StringInt       chartab[]= {
254         {L"AElig", 198},
255         {L"Aacute", 193},
256         {L"Acirc", 194},
257         {L"Agrave", 192},
258         {L"Alpha", 913},
259         {L"Aring", 197},
260         {L"Atilde", 195},
261         {L"Auml", 196},
262         {L"Beta", 914},
263         {L"Ccedil", 199},
264         {L"Chi", 935},
265         {L"Dagger", 8225},
266         {L"Delta", 916},
267         {L"ETH", 208},
268         {L"Eacute", 201},
269         {L"Ecirc", 202},
270         {L"Egrave", 200},
271         {L"Epsilon", 917},
272         {L"Eta", 919},
273         {L"Euml", 203},
274         {L"Gamma", 915},
275         {L"Iacute", 205},
276         {L"Icirc", 206},
277         {L"Igrave", 204},
278         {L"Iota", 921},
279         {L"Iuml", 207},
280         {L"Kappa", 922},
281         {L"Lambda", 923},
282         {L"Mu", 924},
283         {L"Ntilde", 209},
284         {L"Nu", 925},
285         {L"OElig", 338},
286         {L"Oacute", 211},
287         {L"Ocirc", 212},
288         {L"Ograve", 210},
289         {L"Omega", 937},
290         {L"Omicron", 927},
291         {L"Oslash", 216},
292         {L"Otilde", 213},
293         {L"Ouml", 214},
294         {L"Phi", 934},
295         {L"Pi", 928},
296         {L"Prime", 8243},
297         {L"Psi", 936},
298         {L"Rho", 929},
299         {L"Scaron", 352},
300         {L"Sigma", 931},
301         {L"THORN", 222},
302         {L"Tau", 932},
303         {L"Theta", 920},
304         {L"Uacute", 218},
305         {L"Ucirc", 219},
306         {L"Ugrave", 217},
307         {L"Upsilon", 933},
308         {L"Uuml", 220},
309         {L"Xi", 926},
310         {L"Yacute", 221},
311         {L"Yuml", 376},
312         {L"Zeta", 918},
313         {L"aacute", 225},
314         {L"acirc", 226},
315         {L"acute", 180},
316         {L"aelig", 230},
317         {L"agrave", 224},
318         {L"alefsym", 8501},
319         {L"alpha", 945},
320         {L"amp", 38},
321         {L"and", 8743},
322         {L"ang", 8736},
323         {L"apos", 39},
324         {L"aring", 229},
325         {L"asymp", 8776},
326         {L"atilde", 227},
327         {L"auml", 228},
328         {L"bdquo", 8222},
329         {L"beta", 946},
330         {L"brvbar", 166},
331         {L"bull", 8226},
332         {L"cap", 8745},
333         {L"ccedil", 231},
334         {L"cdots", 8943},
335         {L"cedil", 184},
336         {L"cent", 162},
337         {L"chi", 967},
338         {L"circ", 710},
339         {L"clubs", 9827},
340         {L"cong", 8773},
341         {L"copy", 169},
342         {L"crarr", 8629},
343         {L"cup", 8746},
344         {L"curren", 164},
345         {L"dArr", 8659},
346         {L"dagger", 8224},
347         {L"darr", 8595},
348         {L"ddots", 8945},
349         {L"deg", 176},
350         {L"delta", 948},
351         {L"diams", 9830},
352         {L"divide", 247},
353         {L"eacute", 233},
354         {L"ecirc", 234},
355         {L"egrave", 232},
356         {L"emdash", 8212},      /* non-standard but commonly used */
357         {L"empty", 8709},
358         {L"emsp", 8195},
359         {L"endash", 8211},      /* non-standard but commonly used */
360         {L"ensp", 8194},
361         {L"epsilon", 949},
362         {L"equiv", 8801},
363         {L"eta", 951},
364         {L"eth", 240},
365         {L"euml", 235},
366         {L"euro", 8364},
367         {L"exist", 8707},
368         {L"fnof", 402},
369         {L"forall", 8704},
370         {L"frac12", 189},
371         {L"frac14", 188},
372         {L"frac34", 190},
373         {L"frasl", 8260},
374         {L"gamma", 947},
375         {L"ge", 8805},
376         {L"gt", 62},
377         {L"hArr", 8660},
378         {L"harr", 8596},
379         {L"hearts", 9829},
380         {L"hellip", 8230},
381         {L"iacute", 237},
382         {L"icirc", 238},
383         {L"iexcl", 161},
384         {L"igrave", 236},
385         {L"image", 8465},
386         {L"infin", 8734},
387         {L"int", 8747},
388         {L"iota", 953},
389         {L"iquest", 191},
390         {L"isin", 8712},
391         {L"iuml", 239},
392         {L"kappa", 954},
393         {L"lArr", 8656},
394         {L"lambda", 955},
395         {L"lang", 9001},
396         {L"laquo", 171},
397         {L"larr", 8592},
398         {L"lceil", 8968},
399         {L"ldots", 8230},
400         {L"ldquo", 8220},
401         {L"le", 8804},
402         {L"lfloor", 8970},
403         {L"lowast", 8727},
404         {L"loz", 9674},
405         {L"lrm", 8206},
406         {L"lsaquo", 8249},
407         {L"lsquo", 8216},
408         {L"lt", 60},
409         {L"macr", 175},
410         {L"mdash", 8212},
411         {L"micro", 181},
412         {L"middot", 183},
413         {L"minus", 8722},
414         {L"mu", 956},
415         {L"nabla", 8711},
416         {L"nbsp", 160},
417         {L"ndash", 8211},
418         {L"ne", 8800},
419         {L"ni", 8715},
420         {L"not", 172},
421         {L"notin", 8713},
422         {L"nsub", 8836},
423         {L"ntilde", 241},
424         {L"nu", 957},
425         {L"oacute", 243},
426         {L"ocirc", 244},
427         {L"oelig", 339},
428         {L"ograve", 242},
429         {L"oline", 8254},
430         {L"omega", 969},
431         {L"omicron", 959},
432         {L"oplus", 8853},
433         {L"or", 8744},
434         {L"ordf", 170},
435         {L"ordm", 186},
436         {L"oslash", 248},
437         {L"otilde", 245},
438         {L"otimes", 8855},
439         {L"ouml", 246},
440         {L"para", 182},
441         {L"part", 8706},
442         {L"permil", 8240},
443         {L"perp", 8869},
444         {L"phi", 966},
445         {L"pi", 960},
446         {L"piv", 982},
447         {L"plusmn", 177},
448         {L"pound", 163},
449         {L"prime", 8242},
450         {L"prod", 8719},
451         {L"prop", 8733},
452         {L"psi", 968},
453         {L"quad", 8193},
454         {L"quot", 34},
455         {L"rArr", 8658},
456         {L"radic", 8730},
457         {L"rang", 9002},
458         {L"raquo", 187},
459         {L"rarr", 8594},
460         {L"rceil", 8969},
461         {L"rdquo", 8221},
462         {L"real", 8476},
463         {L"reg", 174},
464         {L"rfloor", 8971},
465         {L"rho", 961},
466         {L"rlm", 8207},
467         {L"rsaquo", 8250},
468         {L"rsquo", 8217},
469         {L"sbquo", 8218},
470         {L"scaron", 353},
471         {L"sdot", 8901},
472         {L"sect", 167},
473         {L"shy", 173},
474         {L"sigma", 963},
475         {L"sigmaf", 962},
476         {L"sim", 8764},
477         {L"sp", 8194},
478         {L"spades", 9824},
479         {L"sub", 8834},
480         {L"sube", 8838},
481         {L"sum", 8721},
482         {L"sup", 8835},
483         {L"sup1", 185},
484         {L"sup2", 178},
485         {L"sup3", 179},
486         {L"supe", 8839},
487         {L"szlig", 223},
488         {L"tau", 964},
489         {L"there4", 8756},
490         {L"theta", 952},
491         {L"thetasym", 977},
492         {L"thinsp", 8201},
493         {L"thorn", 254},
494         {L"tilde", 732},
495         {L"times", 215},
496         {L"trade", 8482},
497         {L"uArr", 8657},
498         {L"uacute", 250},
499         {L"uarr", 8593},
500         {L"ucirc", 251},
501         {L"ugrave", 249},
502         {L"uml", 168},
503         {L"upsih", 978},
504         {L"upsilon", 965},
505         {L"uuml", 252},
506         {L"varepsilon", 8712},
507         {L"varphi", 981},
508         {L"varpi", 982},
509         {L"varrho", 1009},
510         {L"vdots", 8942},
511         {L"vsigma", 962},
512         {L"vtheta", 977},
513         {L"weierp", 8472},
514         {L"xi", 958},
515         {L"yacute", 253},
516         {L"yen", 165},
517         {L"yuml", 255},
518         {L"zeta", 950},
519         {L"zwj", 8205},
520         {L"zwnj", 8204}
521 };
522 #define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
523
// Characters Winstart..Winend are those that Windows
// uses interpolated into the Latin1 set.
// They aren't supposed to appear in HTML, but they do....
enum {
	Winstart = 127,
	Winend = 159
};

// winchars[c - Winstart] is the Unicode code point to use for byte
// value c, following the Windows-1252 code page.  Positions that
// Windows-1252 leaves unassigned (and 127, DEL) map to 8226, a bullet.
// 128 (euro sign), 142 (Z caron) and 158 (z caron) were added to
// Windows-1252 after this table was first written and used to fall
// back to the bullet as well.
static int	winchars[]= {
	8226,						// 127
	8364, 8226, 8218,  402, 8222, 8230, 8224, 8225,	// 128-135
	 710, 8240,  352, 8249,  338, 8226,  381, 8226,	// 136-143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	// 144-151
	 732, 8482,  353, 8250,  339, 8226,  382,  376	// 152-159
};
537
538 static StringInt*       tagtable;               // initialized from tagnames
539 static StringInt*       attrtable;              // initialized from attrnames
540
541 static void     lexinit(void);
542 static int              getplaindata(TokenSource* ts, Token* a, int* pai);
543 static int              getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
544 static int              getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
545 static int              gettag(TokenSource* ts, int starti, Token* a, int* pai);
546 static Rune*    buftostr(Rune* s, Rune* buf, int j);
547 static int              comment(TokenSource* ts);
548 static int              findstr(TokenSource* ts, Rune* s);
549 static int              ampersand(TokenSource* ts);
550 static int              lowerc(int c);
551 static int              getchar(TokenSource* ts);
552 static void             ungetchar(TokenSource* ts, int c);
553 static void             backup(TokenSource* ts, int savei);
554 static void             freeinsidetoken(Token* t);
555 static void             freeattrs(Attr* ahead);
556 static Attr*    newattr(int attid, Rune* value, Attr* link);
557 static int              Tconv(Fmt* f);
558
559 int     dbglex = 0;
560 static int lexinited = 0;
561
562 static void
563 lexinit(void)
564 {
565         tagtable = _makestrinttab(tagnames, Numtags);
566         attrtable = _makestrinttab(attrnames, Numattrs);
567         fmtinstall('T', Tconv);
568         lexinited = 1;
569 }
570
571 static TokenSource*
572 newtokensource(uchar* data, int edata, int chset, int mtype)
573 {
574         TokenSource*    ans;
575
576         assert(chset == US_Ascii || chset == ISO_8859_1 ||
577                         chset == UTF_8 || chset == Unicode);
578         ans = (TokenSource*)emalloc(sizeof(TokenSource));
579         ans->i = 0;
580         ans->data = data;
581         ans->edata = edata;
582         ans->chset = chset;
583         ans->mtype = mtype;
584         return ans;
585 }
586
587 enum {
588         ToksChunk = 500,
589 };
590
591 // Call this to get the tokens.
592 //  The number of returned tokens is returned in *plen.
593 Token*
594 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
595 {
596         TokenSource*    ts;
597         Token*          a;
598         int     alen;
599         int     ai;
600         int     starti;
601         int     c;
602         int     tag;
603
604         if(!lexinited)
605                 lexinit();
606         ts = newtokensource(data, datalen, chset, mtype);
607         if(dbglex)
608                 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
609         alen = 0;
610         ai = 0;
611         a = 0;
612         if(ts->mtype == TextHtml) {
613                 for(;;) {
614                         assert(ai <= alen);
615                         if(alen - ai < ToksChunk/32) {
616                                 alen += ToksChunk;
617                                 a = erealloc(a, alen*sizeof *a);
618                         }
619                         starti = ts->i;
620                         c = getchar(ts);
621                         if(c < 0)
622                                 break;
623                         if(c == '<') {
624                                 tag = gettag(ts, starti, a, &ai);
625                                 if(tag == Tscript || tag == Tstyle) {
626                                         // special rules for getting Data after....
627                                         starti = ts->i;
628                                         c = getchar(ts);
629                                         tag = getscriptdata(ts, c, starti, a, &ai, tag);
630                                 }
631                         }
632                         else
633                                 tag = getdata(ts, c, starti, a, &ai);
634                         if(tag == -1)
635                                 break;
636                         else if(dbglex > 1 && tag != Comment)
637                                 fprint(2, "lex: got token %T\n", &a[ai-1]);
638                 }
639         }
640         else {
641                 // plain text (non-html) tokens
642                 for(;;) {
643                         assert(ai <= alen);
644                         if(alen - ai < ToksChunk/32) {
645                                 alen += ToksChunk;
646                                 a = erealloc(a, alen*sizeof *a);
647                         }
648                         tag = getplaindata(ts, a, &ai);
649                         if(tag == -1)
650                                 break;
651                         if(dbglex > 1)
652                                 fprint(2, "lex: got token %T\n", &a[ai]);
653                 }
654         }
655         free(ts);
656         if(dbglex)
657                 fprint(2, "lex: returning %d tokens\n", ai);
658         *plen = ai;
659         if(ai == 0){
660                 free(a);
661                 a = 0;
662         }
663         return a;
664 }
665
666 // For case where source isn't HTML.
667 // Just make data tokens, one per line (or partial line,
668 // at end of buffer), ignoring non-whitespace control
669 // characters and dumping \r's.
670 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
671 // Otherwise return -1;
672 static int
673 getplaindata(TokenSource* ts, Token* a, int* pai)
674 {
675         Rune*   s;
676         int     j;
677         int     starti;
678         int     c;
679         Token*  tok;
680         Rune    buf[BIGBUFSIZE];
681
682         s = nil;
683         j = 0;
684         starti = ts->i;
685         for(c = getchar(ts); c >= 0; c = getchar(ts)) {
686                 if(c < ' ') {
687                         if(isspace(c)) {
688                                 if(c == '\r') {
689                                         // ignore it unless no following '\n',
690                                         // in which case treat it like '\n'
691                                         c = getchar(ts);
692                                         if(c != '\n') {
693                                                 if(c >= 0)
694                                                         ungetchar(ts, c);
695                                                 c = '\n';
696                                         }
697                                 }
698                         }
699                         else
700                                 c = 0;
701                 }
702                 if(c != 0) {
703                         buf[j++] = c;
704                         if(j == nelem(buf)-1) {
705                                 s = buftostr(s, buf, j);
706                                 j = 0;
707                         }
708                 }
709                 if(c == '\n')
710                         break;
711         }
712         s = buftostr(s, buf, j);
713         if(s == nil)
714                 return -1;
715         tok = &a[(*pai)++];
716         tok->tag = Data;
717         tok->text = s;
718         tok->attr = nil;
719         tok->starti = starti;
720         return Data;
721 }
722
723 // Return concatenation of s and buf[0:j]
724 static Rune*
725 buftostr(Rune* s, Rune* buf, int j)
726 {
727         int i;
728
729         if(s == nil)
730                 s = _Strndup(buf, j);
731         else {
732                 i = _Strlen(s);
733                 s = realloc(s, ( i+j+1)*sizeof *s);
734                 memcpy(&s[i], buf, j*sizeof *s);
735                 s[i+j] = 0;
736         }
737         setmalloctag(s, getcallerpc(&s));
738         return s;
739 }
740
741 // Gather data up to next start-of-tag or end-of-buffer.
742 // Translate entity references (&amp;).
743 // Ignore non-whitespace control characters and get rid of \r's.
744 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
745 // Otherwise return -1;
746 static int
747 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
748 {
749         Rune*   s;
750         int     j;
751         int     c;
752         Token*  tok;
753         Rune    buf[SMALLBUFSIZE];
754
755         s = nil;
756         j = 0;
757         for(c = firstc; c >= 0; c = getchar(ts)){
758                 if(c == '&') {
759                         c = ampersand(ts);
760                         if(c < 0)
761                                 break;
762                 }
763                 else if(c < ' ') {
764                         if(isspace(c)) {
765                                 if(c == '\r') {
766                                         // ignore it unless no following '\n',
767                                         // in which case treat it like '\n'
768                                         c = getchar(ts);
769                                         if(c != '\n') {
770                                                 if(c >= 0)
771                                                         ungetchar(ts, c);
772                                                 c = '\n';
773                                         }
774                                 }
775                         }
776                         else {
777                                 if(warn)
778                                         fprint(2, "warning: non-whitespace control character %d ignored\n", c);
779                                 c = 0;
780                         }
781                 }
782                 else if(c == '<') {
783                         ungetchar(ts, c);
784                         break;
785                 }
786                 if(c != 0) {
787                         buf[j++] = c;
788                         if(j == nelem(buf)-1) {
789                                 s = buftostr(s, buf, j);
790                                 j = 0;
791                         }
792                 }
793         }
794         s = buftostr(s, buf, j);
795         if(s == nil)
796                 return -1;
797         tok = &a[(*pai)++];
798         tok->tag = Data;
799         tok->text = s;
800         tok->attr = nil;
801         tok->starti = starti;
802         return Data;
803 }
804
805 // The rules for lexing scripts are different (ugh).
806 // Gather up everything until see an "</" tagnames[tok] ">"
807 static int
808 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
809 {
810         Rune*   s;
811         int     j;
812         int     tstarti;
813         int     savei;
814         int     c;
815         int     tag;
816         int     done;
817         Token*  tok;
818         Rune    buf[BIGBUFSIZE];
819
820         s = nil;
821         j = 0;
822         tstarti = starti;
823         c = firstc;
824         done = 0;
825         while(c >= 0) {
826                 if(c == '<') {
827                         // other browsers ignore stuff to end of line after <!
828                         savei = ts->i;
829                         c = getchar(ts);
830                         if(c == '!') {
831                                 if(comment(ts) == -1)
832                                         break;
833                                 if(c == '\r')
834                                         c = getchar(ts);
835                                 if(c == '\n')
836                                         c = getchar(ts);
837                         }
838                         else if(c >= 0) {
839                                 backup(ts, savei);
840                                 tag = gettag(ts, tstarti, a, pai);
841                                 if(tag == -1)
842                                         break;
843                                 if(tag != Comment)
844                                         (*pai)--;
845                                 backup(ts, tstarti);
846                                 if(tag == findtag + RBRA) {
847                                         done = 1;
848                                         break;
849                                 }
850                                 // here tag was not the one we were looking for, so take as regular data
851                                 c = getchar(ts);
852                         }
853                 }
854                 if(c < 0)
855                         break;
856                 if(c != 0) {
857                         buf[j++] = c;
858                         if(j == nelem(buf)-1) {
859                                 s = buftostr(s, buf, j);
860                                 j = 0;
861                         }
862                 }
863                 tstarti = ts->i;
864                 c = getchar(ts);
865         }
866         if(done || ts->i == ts->edata) {
867                 s = buftostr(s, buf, j);
868                 tok = &a[(*pai)++];
869                 tok->tag = Data;
870                 tok->text = s;
871                 tok->attr = nil;
872                 tok->starti = starti;
873                 return Data;
874         }
875         free(s);
876         backup(ts, starti);
877         return -1;
878 }
879
880 // We've just seen a '<'.  Gather up stuff to closing '>' (if buffer
881 // ends before then, return -1).
882 // If it's a tag, look up the name, gather the attributes, and return
883 // the appropriate token.
884 // Else it's either just plain data or some kind of ignorable stuff:
885 // return Data or Comment as appropriate.
886 // If it's not a Comment, put it in a[*pai] and bump *pai.
887 static int
888 gettag(TokenSource* ts, int starti, Token* a, int* pai)
889 {
890         int     rbra;
891         int     ans;
892         Attr*   al;
893         int     nexti;
894         int     c;
895         int     ti;
896         int     afnd;
897         int     attid;
898         int     quote;
899         Rune*   val;
900         int     nv;
901         int     i;
902         int     tag;
903         Token*  tok;
904         Rune    buf[BIGBUFSIZE];
905
906         al = nil;
907         rbra = 0;
908         nexti = ts->i;
909         tok = &a[*pai];
910         tok->tag = Notfound;
911         tok->text = nil;
912         tok->attr = nil;
913         tok->starti = starti;
914         c = getchar(ts);
915         if(c == '/') {
916                 rbra = RBRA;
917                 c = getchar(ts);
918         }
919         if(c < 0)
920                 goto eob_done;
921         if(c >= 256 || !isalpha(c)) {
922                 // not a tag
923                 if(c == '!') {
924                         ans = comment(ts);
925                         if(ans != -1)
926                                 return ans;
927                         goto eob_done;
928                 }
929                 else {
930                         backup(ts, nexti);
931                         tok->tag = Data;
932                         tok->text = _Strdup(L"<");
933                         (*pai)++;
934                         return Data;
935                 }
936         }
937         // c starts a tagname
938         buf[0] = c;
939         i = 1;
940         while(1) {
941                 c = getchar(ts);
942                 if(c < 0)
943                         goto eob_done;
944                 if(!ISNAMCHAR(c))
945                         break;
946                 // if name is bigger than buf it won't be found anyway...
947                 if(i < BIGBUFSIZE)
948                         buf[i++] = c;
949         }
950         if(_lookup(tagtable, Numtags, buf, i, &tag))
951                 tok->tag = tag + rbra;
952         else
953                 tok->text = _Strndup(buf, i);   // for warning print, in build
954         // attribute gathering loop
955         while(1) {
956                 // look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
957                 // skip whitespace
958 attrloop_continue:
959                 while(c < 256 && isspace(c)) {
960                         c = getchar(ts);
961                         if(c < 0)
962                                 goto eob_done;
963                 }
964                 if(c == '>')
965                         goto attrloop_done;
966                 if(c == '<') {
967                         if(warn)
968                                 fprint(2, "warning: unclosed tag\n");
969                         ungetchar(ts, c);
970                         goto attrloop_done;
971                 }
972                 if(c >= 256 || !isalpha(c)) {
973                         if(warn)
974                                 fprint(2, "warning: expected attribute name\n");
975                         // skipt to next attribute name
976                         while(1) {
977                                 c = getchar(ts);
978                                 if(c < 0)
979                                         goto eob_done;
980                                 if(c < 256 && isalpha(c))
981                                         goto attrloop_continue;
982                                 if(c == '<') {
983                                         if(warn)
984                                                 fprint(2, "warning: unclosed tag\n");
985                                         ungetchar(ts, 60);
986                                         goto attrloop_done;
987                                 }
988                                 if(c == '>')
989                                         goto attrloop_done;
990                         }
991                 }
992                 // gather attribute name
993                 buf[0] = c;
994                 i = 1;
995                 while(1) {
996                         c = getchar(ts);
997                         if(c < 0)
998                                 goto eob_done;
999                         if(!ISNAMCHAR(c))
1000                                 break;
1001                         if(i < BIGBUFSIZE-1)
1002                                 buf[i++] = c;
1003                 }
1004                 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
1005                 if(warn && !afnd) {
1006                         buf[i] = 0;
1007                         fprint(2, "warning: unknown attribute name %S\n", buf);
1008                 }
1009                 // skip whitespace
1010                 while(c < 256 && isspace(c)) {
1011                         c = getchar(ts);
1012                         if(c < 0)
1013                                 goto eob_done;
1014                 }
1015                 if(c != '=') {
1016                         if(afnd)
1017                                 al = newattr(attid, nil, al);
1018                         goto attrloop_continue;
1019                 }
1020                 //# c is '=' here;  skip whitespace
1021                 while(1) {
1022                         c = getchar(ts);
1023                         if(c < 0)
1024                                 goto eob_done;
1025                         if(c >= 256 || !isspace(c))
1026                                 break;
1027                 }
1028                 quote = 0;
1029                 if(c == '\'' || c == '"') {
1030                         quote = c;
1031                         c = getchar(ts);
1032                         if(c < 0)
1033                                 goto eob_done;
1034                 }
1035                 val = nil;
1036                 nv = 0;
1037                 while(1) {
1038 valloop_continue:
1039                         if(c < 0)
1040                                 goto eob_done;
1041                         if(c == '>') {
1042                                 if(quote) {
1043                                         // c might be part of string (though not good style)
1044                                         // but if line ends before close quote, assume
1045                                         // there was an unmatched quote
1046                                         ti = ts->i;
1047                                         while(1) {
1048                                                 c = getchar(ts);
1049                                                 if(c < 0)
1050                                                         goto eob_done;
1051                                                 if(c == quote) {
1052                                                         backup(ts, ti);
1053                                                         buf[nv++] = '>';
1054                                                         if(nv == BIGBUFSIZE-1) {
1055                                                                 val = buftostr(val, buf, nv);
1056                                                                 nv = 0;
1057                                                         }
1058                                                         c = getchar(ts);
1059                                                         goto valloop_continue;
1060                                                 }
1061                                                 if(c == '\n') {
1062                                                         if(warn)
1063                                                                 fprint(2, "warning: apparent unmatched quote\n");
1064                                                         backup(ts, ti);
1065                                                         c = '>';
1066                                                         goto valloop_done;
1067                                                 }
1068                                         }
1069                                 }
1070                                 else
1071                                         goto valloop_done;
1072                         }
1073                         if(quote) {
1074                                 if(c == quote) {
1075                                         c = getchar(ts);
1076                                         if(c < 0)
1077                                                 goto eob_done;
1078                                         goto valloop_done;
1079                                 }
1080                                 if(c == '\r') {
1081                                         c = getchar(ts);
1082                                         goto valloop_continue;
1083                                 }
1084                                 if(c == '\t' || c == '\n')
1085                                         c = ' ';
1086                         }
1087                         else {
1088                                 if(c < 256 && isspace(c))
1089                                         goto valloop_done;
1090                         }
1091                         if(c == '&') {
1092                                 c = ampersand(ts);
1093                                 if(c == -1)
1094                                         goto eob_done;
1095                         }
1096                         buf[nv++] = c;
1097                         if(nv == BIGBUFSIZE-1) {
1098                                 val = buftostr(val, buf, nv);
1099                                 nv = 0;
1100                         }
1101                         c = getchar(ts);
1102                 }
1103 valloop_done:
1104                 if(afnd) {
1105                         val = buftostr(val, buf, nv);
1106                         al = newattr(attid, val, al);
1107                 }
1108         }
1109
1110 attrloop_done:
1111         tok->attr = al;
1112         (*pai)++;
1113         return tok->tag;
1114
1115 eob_done:
1116         if(warn)
1117                 fprint(2, "warning: incomplete tag at end of page\n");
1118         backup(ts, nexti);
1119         freeattrs(al);
1120         tok->tag = Data;
1121         tok->text = _Strdup(L"<");
1122         return Data;
1123 }
1124
1125 // We've just read a '<!' at position starti,
1126 // so this may be a comment or other ignored section, or it may
1127 // be just a literal string if there is no close before end of file
1128 // (other browsers do that).
1129 // The accepted practice seems to be (note: contrary to SGML spec!):
1130 // If see <!--, look for --> to close, or if none, > to close.
1131 // If see <!(not --), look for > to close.
1132 // If no close before end of file, leave original characters in as literal data.
1133 //
1134 // If we see ignorable stuff, return Comment.
1135 // Else return nil (caller should back up and try again when more data arrives,
1136 // unless at end of file, in which case caller should just make '<' a data token).
1137 static int
1138 comment(TokenSource* ts)
1139 {
1140         int     nexti;
1141         int     havecomment;
1142         int     c;
1143
1144         nexti = ts->i;
1145         havecomment = 0;
1146         c = getchar(ts);
1147         if(c == '-') {
1148                 c = getchar(ts);
1149                 if(c == '-') {
1150                         if(findstr(ts, L"-->"))
1151                                 havecomment = 1;
1152                         else
1153                                 backup(ts, nexti);
1154                 }
1155         }
1156         if(!havecomment) {
1157                 if(c == '>')
1158                         havecomment = 1;
1159                 else if(c >= 0) {
1160                         if(findstr(ts, L">"))
1161                                 havecomment = 1;
1162                 }
1163         }
1164         if(havecomment)
1165                 return Comment;
1166         return -1;
1167 }
1168
1169 // Look for string s in token source.
1170 // If found, return 1, with buffer at next char after s,
1171 // else return 0 (caller should back up).
1172 static int
1173 findstr(TokenSource* ts, Rune* s)
1174 {
1175         int     c0;
1176         int     n;
1177         int     nexti;
1178         int     i;
1179         int     c;
1180
1181         c0 = s[0];
1182         n = runestrlen(s);
1183         while(1) {
1184                 c = getchar(ts);
1185                 if(c < 0)
1186                         break;
1187                 if(c == c0) {
1188                         if(n == 1)
1189                                 return 1;
1190                         nexti = ts->i;
1191                         for(i = 1; i < n; i++) {
1192                                 c = getchar(ts);
1193                                 if(c < 0)
1194                                         goto mainloop_done;
1195                                 if(c != s[i])
1196                                         break;
1197                         }
1198                         if(i == n)
1199                                 return 1;
1200                         backup(ts, nexti);
1201                 }
1202         }
1203 mainloop_done:
1204         return 0;
1205 }
1206
1207 // We've just read an '&'; look for an entity reference
1208 // name, and if found, return translated char.
1209 // if there is a complete entity name but it isn't known,
1210 // back up to just past the '&' and return '&'.
1211 // If the entity can't be completed in the current buffer, back up
1212 // to the '&' and return -1.
1213 static int
1214 ampersand(TokenSource* ts)
1215 {
1216         int     savei;
1217         int     c;
1218         int     fnd;
1219         int     ans;
1220         int     v;
1221         int     k;
1222         Rune    buf[25];
1223
1224         savei = ts->i;
1225         c = getchar(ts);
1226         fnd = 0;
1227         ans = -1;
1228         if(c == '#') {
1229                 c = getchar(ts);
1230                 v = 0;
1231                 if(c == 'X' || c == 'x')
1232                         for(c = getchar(ts); c < 256; c = getchar(ts))
1233                                 if(c >= '0' && c <= '9')
1234                                         v = v*16+c-'0';
1235                                 else if(c >= 'A' && c<= 'F')
1236                                         v = v*16+c-'A'+10;
1237                                 else if(c >= 'a' && c <= 'f')
1238                                         v = v*16+c-'a'+10;
1239                                 else
1240                                         break;
1241                 else
1242                         while(c >= 0) {
1243                                 if(!(c < 256 && isdigit(c)))
1244                                         break;
1245                                 v = v*10 + c - 48;
1246                                 c = getchar(ts);
1247                         }
1248                 if(c >= 0) {
1249                         if(!(c == ';' || c == '\n' || c == '\r'))
1250                                 ungetchar(ts, c);
1251                         c = v;
1252                         if(c == 160)
1253                                 c = 160;
1254                         if(c >= Winstart && c <= Winend) {
1255                                 c = winchars[c - Winstart];
1256                         }
1257                         ans = c;
1258                         fnd = 1;
1259                 }
1260         }
1261         else if(c < 256 && isalpha(c)) {
1262                 buf[0] = c;
1263                 k = 1;
1264                 while(1) {
1265                         c = getchar(ts);
1266                         if(c < 0)
1267                                 break;
1268                         if(c < 256 && (isalpha(c) || isdigit(c))) {
1269                                 if(k < nelem(buf)-1)
1270                                         buf[k++] = c;
1271                         }
1272                         else {
1273                                 if(!(c == ';' || c == '\n' || c == '\r'))
1274                                         ungetchar(ts, c);
1275                                 break;
1276                         }
1277                 }
1278                 if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
1279                         fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1280         }
1281         if(!fnd) {
1282                 backup(ts, savei);
1283                 ans = '&';
1284         }
1285         return ans;
1286 }
1287
1288 // Get next char, obeying ts.chset.
1289 // Returns -1 if no complete character left before current end of data.
1290 static int
1291 getchar(TokenSource* ts)
1292 {
1293         uchar*  buf;
1294         int     c;
1295         int     n;
1296         int     ok;
1297         Rune    r;
1298
1299         if(ts->i >= ts->edata)
1300                 return -1;
1301         buf = ts->data;
1302         c = buf[ts->i];
1303         switch(ts->chset) {
1304         case ISO_8859_1:
1305                 if(c >= Winstart && c <= Winend)
1306                         c = winchars[c - Winstart];
1307                 ts->i++;
1308                 break;
1309         case US_Ascii:
1310                 if(c > 127) {
1311                         if(warn)
1312                                 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1313                 }
1314                 ts->i++;
1315                 break;
1316         case UTF_8:
1317                 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1318                 if(ok) {
1319                         n = chartorune(&r, (char*)(buf+ts->i));
1320                         if(warn && c == Runeerror)
1321                                 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1322                         ts->i += n;
1323                         c = r;
1324                 }
1325                 else {
1326                         // not enough bytes in buf to complete utf-8 char
1327                         ts->i = ts->edata;      // mark "all used"
1328                         c = -1;
1329                 }
1330                 break;
1331         case Unicode:
1332                 if(ts->i < ts->edata - 1) {
1333                         //standards say most-significant byte first
1334                         c = (c << 8)|(buf[ts->i + 1]);
1335                         ts->i += 2;
1336                 }
1337                 else {
1338                         ts->i = ts->edata;      // mark "all used"
1339                         c = -1;
1340                 }
1341                 break;
1342         default:
1343                 return -1;
1344         }
1345         return c;
1346 }
1347
1348 // Assuming c was the last character returned by getchar, set
1349 // things up so that next getchar will get that same character
1350 // followed by the current 'next character', etc.
1351 static void
1352 ungetchar(TokenSource* ts, int c)
1353 {
1354         int     n;
1355         Rune    r;
1356         char    a[UTFmax];
1357
1358         n = 1;
1359         switch(ts->chset) {
1360         case UTF_8:
1361                 if(c >= 128) {
1362                         r = c;
1363                         n = runetochar(a, &r);
1364                 }
1365                 break;
1366         case Unicode:
1367                 n = 2;
1368                 break;
1369         }
1370         ts->i -= n;
1371 }
1372
1373 // Restore ts so that it is at the state where the index was savei.
1374 static void
1375 backup(TokenSource* ts, int savei)
1376 {
1377         if(dbglex)
1378                 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1379         ts->i = savei;
1380 }
1381
1382
1383 // Look for value associated with attribute attid in token t.
1384 // If there is one, return 1 and put the value in *pans,
1385 // else return 0.
1386 // If xfer is true, transfer ownership of the string to the caller
1387 // (nil it out here); otherwise, caller must duplicate the answer
1388 // if it needs to save it.
1389 // OK to have pans==0, in which case this is just looking
1390 // to see if token is present.
1391 int
1392 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1393 {
1394         Attr*   attr;
1395
1396         attr = t->attr;
1397         while(attr != nil) {
1398                 if(attr->attid == attid) {
1399                         if(pans != nil)
1400                                 *pans = attr->value;
1401                         if(xfer)
1402                                 attr->value = nil;
1403                         return 1;
1404                 }
1405                 attr = attr->next;
1406         }
1407         if(pans != nil)
1408                 *pans = nil;
1409         return 0;
1410 }
1411
1412 static int
1413 Tconv(Fmt *f)
1414 {
1415         Token*  t;
1416         int     i;
1417         int     tag;
1418         char*   srbra;
1419         Rune*   aname;
1420         Rune*   tname;
1421         Attr*   a;
1422         char    buf[BIGBUFSIZE];
1423
1424         t = va_arg(f->args, Token*);
1425         if(t == nil)
1426                 sprint(buf, "<null>");
1427         else {
1428                 i = 0;
1429                 if(dbglex > 1)
1430                         i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1431                 tag = t->tag;
1432                 if(tag == Data) {
1433                         i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1434                 }
1435                 else {
1436                         srbra = "";
1437                         if(tag >= RBRA) {
1438                                 tag -= RBRA;
1439                                 srbra = "/";
1440                         }
1441                         tname = tagnames[tag];
1442                         if(tag == Notfound)
1443                                 tname = L"?";
1444                         i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1445                         for(a = t->attr; a != nil; a = a->next) {
1446                                 aname = attrnames[a->attid];
1447                                 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1448                                 if(a->value != nil)
1449                                         i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1450                         }
1451                         i += snprint(buf+i, sizeof(buf)-i-1, ">");
1452                 }
1453                 buf[i] = 0;
1454         }
1455         return fmtstrcpy(f, buf);
1456 }
1457
1458 // Attrs own their constituent strings, but build may eventually
1459 // transfer some values to its items and nil them out in the Attr.
1460 static Attr*
1461 newattr(int attid, Rune* value, Attr* link)
1462 {
1463         Attr* ans;
1464
1465         ans = (Attr*)emalloc(sizeof(Attr));
1466         ans->attid = attid;
1467         ans->value = value;
1468         ans->next = link;
1469         setmalloctag(ans, getcallerpc(&attid));
1470         return ans;
1471 }
1472
1473 // Free list of Attrs linked through next field
1474 static void
1475 freeattrs(Attr* ahead)
1476 {
1477         Attr* a;
1478         Attr* nexta;
1479
1480         a = ahead;
1481         while(a != nil) {
1482                 nexta = a->next;
1483                 free(a->value);
1484                 free(a);
1485                 a = nexta;
1486         }
1487 }
1488
1489 // Free array of Tokens.
1490 // Allocated space might have room for more than n tokens,
1491 // but only n of them are initialized.
1492 // If caller has transferred ownership of constitutent strings
1493 // or attributes, it must have nil'd out the pointers in the Tokens.
1494 void
1495 _freetokens(Token* tarray, int n)
1496 {
1497         int i;
1498         Token* t;
1499
1500         if(tarray == nil)
1501                 return;
1502         for(i = 0; i < n; i++) {
1503                 t = &tarray[i];
1504                 free(t->text);
1505                 freeattrs(t->attr);
1506         }
1507         free(tarray);
1508 }