]> git.lizzy.rs Git - plan9front.git/blob - sys/src/libhtml/lex.c
libhtml: handle ' character reference
[plan9front.git] / sys / src / libhtml / lex.c
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
7
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11         int                     i;              // index of next byte to use
12         uchar*          data;           // all the data
13         int                     edata;  // data[0:edata] is valid
14         int                     chset;  // one of US_Ascii, etc.
15         int                     mtype;  // TextHtml or TextPlain
16 };
17
// Negative sentinel values, distinct from any character code.
// NOTE(review): presumably end-of-buffer and end-of-file results
// from the character reader -- confirm against getchar().
enum {
	EOB = -1,
	EOF = -2
};
22
// True if c may appear inside a tag or attribute name:
// an 8-bit letter or digit, '-' or '.'.
// Negative values (the EOF/EOB sentinels) and runes >= 256 are
// rejected before they reach the ctype classifiers, whose domain
// is limited to unsigned char values.
// Note: c is evaluated more than once; pass a side-effect-free expression.
#define ISNAMCHAR(c)	((c) >= 0 && (c) < 256 && \
	(isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

// Sizes (in Runes) of the on-stack scratch buffers used while lexing.
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000
27
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
30 Rune* tagnames[] = {
31         L" ",
32         L"!",
33         L"a", 
34         L"abbr",
35         L"acronym",
36         L"address",
37         L"applet", 
38         L"area",
39         L"b",
40         L"base",
41         L"basefont",
42         L"bdo",
43         L"big",
44         L"blink",
45         L"blockquote",
46         L"body",
47         L"bq",
48         L"br",
49         L"button",
50         L"caption",
51         L"center",
52         L"cite",
53         L"code",
54         L"col",
55         L"colgroup",
56         L"dd",
57         L"del",
58         L"dfn",
59         L"dir",
60         L"div",
61         L"dl",
62         L"dt",
63         L"em",
64         L"fieldset",
65         L"font",
66         L"form",
67         L"frame",
68         L"frameset",
69         L"h1",
70         L"h2",
71         L"h3",
72         L"h4",
73         L"h5",
74         L"h6",
75         L"head",
76         L"hr",
77         L"html",
78         L"i",
79         L"iframe",
80         L"img",
81         L"input",
82         L"ins",
83         L"isindex",
84         L"kbd",
85         L"label",
86         L"legend",
87         L"li",
88         L"link",
89         L"map",
90         L"menu",
91         L"meta",
92         L"nobr",
93         L"noframes",
94         L"noscript",
95         L"object",
96         L"ol",
97         L"optgroup",
98         L"option",
99         L"p",
100         L"param",
101         L"pre",
102         L"q",
103         L"s",
104         L"samp",
105         L"script",
106         L"select",
107         L"small",
108         L"span",
109         L"strike",
110         L"strong",
111         L"style",
112         L"sub",
113         L"sup",
114         L"table",
115         L"tbody",
116         L"td",
117         L"textarea",
118         L"tfoot",
119         L"th",
120         L"thead",
121         L"title",
122         L"tr",
123         L"tt",
124         L"u",
125         L"ul",
126         L"var"
127 };
128
129 // HTML 4.0 attribute names.
130 // Keep sorted, and in correspondence with enum in impl.h.
131 Rune* attrnames[] = {
132         L"abbr",
133         L"accept-charset",
134         L"access-key",
135         L"action",
136         L"align",
137         L"alink",
138         L"alt",
139         L"archive",
140         L"axis",
141         L"background",
142         L"bgcolor",
143         L"border",
144         L"cellpadding",
145         L"cellspacing",
146         L"char",
147         L"charoff",
148         L"charset",
149         L"checked",
150         L"cite",
151         L"class",
152         L"classid",
153         L"clear",
154         L"code",
155         L"codebase",
156         L"codetype",
157         L"color",
158         L"cols",
159         L"colspan",
160         L"compact",
161         L"content",
162         L"coords",
163         L"data",
164         L"datetime",
165         L"declare",
166         L"defer",
167         L"dir",
168         L"disabled",
169         L"enctype",
170         L"face",
171         L"for",
172         L"frame",
173         L"frameborder",
174         L"headers",
175         L"height",
176         L"href",
177         L"hreflang",
178         L"hspace",
179         L"http-equiv",
180         L"id",
181         L"ismap",
182         L"label",
183         L"lang",
184         L"link",
185         L"longdesc",
186         L"marginheight",
187         L"marginwidth",
188         L"maxlength",
189         L"media",
190         L"method",
191         L"multiple",
192         L"name",
193         L"nohref",
194         L"noresize",
195         L"noshade",
196         L"nowrap",
197         L"object",
198         L"onblur",
199         L"onchange",
200         L"onclick",
201         L"ondblclick",
202         L"onfocus",
203         L"onkeypress",
204         L"onkeyup",
205         L"onload",
206         L"onmousedown",
207         L"onmousemove",
208         L"onmouseout",
209         L"onmouseover",
210         L"onmouseup",
211         L"onreset",
212         L"onselect",
213         L"onsubmit",
214         L"onunload",
215         L"profile",
216         L"prompt",
217         L"readonly",
218         L"rel",
219         L"rev",
220         L"rows",
221         L"rowspan",
222         L"rules",
223         L"scheme",
224         L"scope",
225         L"scrolling",
226         L"selected",
227         L"shape",
228         L"size",
229         L"span",
230         L"src",
231         L"standby",
232         L"start",
233         L"style",
234         L"summary",
235         L"tabindex",
236         L"target",
237         L"text",
238         L"title",
239         L"type",
240         L"usemap",
241         L"valign",
242         L"value",
243         L"valuetype",
244         L"version",
245         L"vlink",
246         L"vspace",
247         L"width"
248 };
249
250
251 // Character entity to unicode character number map.
252 // Keep sorted by name.
253 StringInt       chartab[]= {
254         {L"AElig", 198},
255         {L"Aacute", 193},
256         {L"Acirc", 194},
257         {L"Agrave", 192},
258         {L"Alpha", 913},
259         {L"Aring", 197},
260         {L"Atilde", 195},
261         {L"Auml", 196},
262         {L"Beta", 914},
263         {L"Ccedil", 199},
264         {L"Chi", 935},
265         {L"Dagger", 8225},
266         {L"Delta", 916},
267         {L"ETH", 208},
268         {L"Eacute", 201},
269         {L"Ecirc", 202},
270         {L"Egrave", 200},
271         {L"Epsilon", 917},
272         {L"Eta", 919},
273         {L"Euml", 203},
274         {L"Gamma", 915},
275         {L"Iacute", 205},
276         {L"Icirc", 206},
277         {L"Igrave", 204},
278         {L"Iota", 921},
279         {L"Iuml", 207},
280         {L"Kappa", 922},
281         {L"Lambda", 923},
282         {L"Mu", 924},
283         {L"Ntilde", 209},
284         {L"Nu", 925},
285         {L"OElig", 338},
286         {L"Oacute", 211},
287         {L"Ocirc", 212},
288         {L"Ograve", 210},
289         {L"Omega", 937},
290         {L"Omicron", 927},
291         {L"Oslash", 216},
292         {L"Otilde", 213},
293         {L"Ouml", 214},
294         {L"Phi", 934},
295         {L"Pi", 928},
296         {L"Prime", 8243},
297         {L"Psi", 936},
298         {L"Rho", 929},
299         {L"Scaron", 352},
300         {L"Sigma", 931},
301         {L"THORN", 222},
302         {L"Tau", 932},
303         {L"Theta", 920},
304         {L"Uacute", 218},
305         {L"Ucirc", 219},
306         {L"Ugrave", 217},
307         {L"Upsilon", 933},
308         {L"Uuml", 220},
309         {L"Xi", 926},
310         {L"Yacute", 221},
311         {L"Yuml", 376},
312         {L"Zeta", 918},
313         {L"aacute", 225},
314         {L"acirc", 226},
315         {L"acute", 180},
316         {L"aelig", 230},
317         {L"agrave", 224},
318         {L"alefsym", 8501},
319         {L"alpha", 945},
320         {L"amp", 38},
321         {L"and", 8743},
322         {L"ang", 8736},
323         {L"apos", 39},
324         {L"aring", 229},
325         {L"asymp", 8776},
326         {L"atilde", 227},
327         {L"auml", 228},
328         {L"bdquo", 8222},
329         {L"beta", 946},
330         {L"brvbar", 166},
331         {L"bull", 8226},
332         {L"cap", 8745},
333         {L"ccedil", 231},
334         {L"cdots", 8943},
335         {L"cedil", 184},
336         {L"cent", 162},
337         {L"chi", 967},
338         {L"circ", 710},
339         {L"clubs", 9827},
340         {L"cong", 8773},
341         {L"copy", 169},
342         {L"crarr", 8629},
343         {L"cup", 8746},
344         {L"curren", 164},
345         {L"dArr", 8659},
346         {L"dagger", 8224},
347         {L"darr", 8595},
348         {L"ddots", 8945},
349         {L"deg", 176},
350         {L"delta", 948},
351         {L"diams", 9830},
352         {L"divide", 247},
353         {L"eacute", 233},
354         {L"ecirc", 234},
355         {L"egrave", 232},
356         {L"emdash", 8212},      /* non-standard but commonly used */
357         {L"empty", 8709},
358         {L"emsp", 8195},
359         {L"endash", 8211},      /* non-standard but commonly used */
360         {L"ensp", 8194},
361         {L"epsilon", 949},
362         {L"equiv", 8801},
363         {L"eta", 951},
364         {L"eth", 240},
365         {L"euml", 235},
366         {L"euro", 8364},
367         {L"exist", 8707},
368         {L"fnof", 402},
369         {L"forall", 8704},
370         {L"frac12", 189},
371         {L"frac14", 188},
372         {L"frac34", 190},
373         {L"frasl", 8260},
374         {L"gamma", 947},
375         {L"ge", 8805},
376         {L"gt", 62},
377         {L"hArr", 8660},
378         {L"harr", 8596},
379         {L"hearts", 9829},
380         {L"hellip", 8230},
381         {L"iacute", 237},
382         {L"icirc", 238},
383         {L"iexcl", 161},
384         {L"igrave", 236},
385         {L"image", 8465},
386         {L"infin", 8734},
387         {L"int", 8747},
388         {L"iota", 953},
389         {L"iquest", 191},
390         {L"isin", 8712},
391         {L"iuml", 239},
392         {L"kappa", 954},
393         {L"lArr", 8656},
394         {L"lambda", 955},
395         {L"lang", 9001},
396         {L"laquo", 171},
397         {L"larr", 8592},
398         {L"lceil", 8968},
399         {L"ldots", 8230},
400         {L"ldquo", 8220},
401         {L"le", 8804},
402         {L"lfloor", 8970},
403         {L"lowast", 8727},
404         {L"loz", 9674},
405         {L"lrm", 8206},
406         {L"lsaquo", 8249},
407         {L"lsquo", 8216},
408         {L"lt", 60},
409         {L"macr", 175},
410         {L"mdash", 8212},
411         {L"micro", 181},
412         {L"middot", 183},
413         {L"minus", 8722},
414         {L"mu", 956},
415         {L"nabla", 8711},
416         {L"nbsp", 160},
417         {L"ndash", 8211},
418         {L"ne", 8800},
419         {L"ni", 8715},
420         {L"not", 172},
421         {L"notin", 8713},
422         {L"nsub", 8836},
423         {L"ntilde", 241},
424         {L"nu", 957},
425         {L"oacute", 243},
426         {L"ocirc", 244},
427         {L"oelig", 339},
428         {L"ograve", 242},
429         {L"oline", 8254},
430         {L"omega", 969},
431         {L"omicron", 959},
432         {L"oplus", 8853},
433         {L"or", 8744},
434         {L"ordf", 170},
435         {L"ordm", 186},
436         {L"oslash", 248},
437         {L"otilde", 245},
438         {L"otimes", 8855},
439         {L"ouml", 246},
440         {L"para", 182},
441         {L"part", 8706},
442         {L"permil", 8240},
443         {L"perp", 8869},
444         {L"phi", 966},
445         {L"pi", 960},
446         {L"piv", 982},
447         {L"plusmn", 177},
448         {L"pound", 163},
449         {L"prime", 8242},
450         {L"prod", 8719},
451         {L"prop", 8733},
452         {L"psi", 968},
453         {L"quad", 8193},
454         {L"quot", 34},
455         {L"rArr", 8658},
456         {L"radic", 8730},
457         {L"rang", 9002},
458         {L"raquo", 187},
459         {L"rarr", 8594},
460         {L"rceil", 8969},
461         {L"rdquo", 8221},
462         {L"real", 8476},
463         {L"reg", 174},
464         {L"rfloor", 8971},
465         {L"rho", 961},
466         {L"rlm", 8207},
467         {L"rsaquo", 8250},
468         {L"rsquo", 8217},
469         {L"sbquo", 8218},
470         {L"scaron", 353},
471         {L"sdot", 8901},
472         {L"sect", 167},
473         {L"shy", 173},
474         {L"sigma", 963},
475         {L"sigmaf", 962},
476         {L"sim", 8764},
477         {L"sp", 8194},
478         {L"spades", 9824},
479         {L"sub", 8834},
480         {L"sube", 8838},
481         {L"sum", 8721},
482         {L"sup", 8835},
483         {L"sup1", 185},
484         {L"sup2", 178},
485         {L"sup3", 179},
486         {L"supe", 8839},
487         {L"szlig", 223},
488         {L"tau", 964},
489         {L"there4", 8756},
490         {L"theta", 952},
491         {L"thetasym", 977},
492         {L"thinsp", 8201},
493         {L"thorn", 254},
494         {L"tilde", 732},
495         {L"times", 215},
496         {L"trade", 8482},
497         {L"uArr", 8657},
498         {L"uacute", 250},
499         {L"uarr", 8593},
500         {L"ucirc", 251},
501         {L"ugrave", 249},
502         {L"uml", 168},
503         {L"upsih", 978},
504         {L"upsilon", 965},
505         {L"uuml", 252},
506         {L"varepsilon", 8712},
507         {L"varphi", 981},
508         {L"varpi", 982},
509         {L"varrho", 1009},
510         {L"vdots", 8942},
511         {L"vsigma", 962},
512         {L"vtheta", 977},
513         {L"weierp", 8472},
514         {L"xi", 958},
515         {L"yacute", 253},
516         {L"yen", 165},
517         {L"yuml", 255},
518         {L"zeta", 950},
519         {L"zwj", 8205},
520         {L"zwnj", 8204}
521 };
522 #define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
523
// Code points Winstart..Winend are where Windows interpolates
// extra characters into the Latin1 set.  HTML is not supposed
// to contain them, but real pages do.
enum {
	Winstart = 127,
	Winend = 159
};

// Unicode replacements for that range, one entry per code point
// (Winend-Winstart+1 entries).  8226 is a bullet, used as filler.
// NOTE(review): presumably indexed by c - Winstart; the lookup
// itself is outside this view.
static int	winchars[]= {
	8226,
	8226, 8226, 8218, 402,
	8222, 8230, 8224, 8225,
	710, 8240, 352, 8249,
	338, 8226, 8226, 8226,
	8226, 8216, 8217, 8220,
	8221, 8226, 8211, 8212,
	732, 8482, 353, 8250,
	339, 8226, 8226, 376
};
537
538 static StringInt*       tagtable;               // initialized from tagnames
539 static StringInt*       attrtable;              // initialized from attrnames
540
541 static void     lexinit(void);
542 static int              getplaindata(TokenSource* ts, Token* a, int* pai);
543 static int              getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
544 static int              getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
545 static int              gettag(TokenSource* ts, int starti, Token* a, int* pai);
546 static Rune*    buftostr(Rune* s, Rune* buf, int j);
547 static int              comment(TokenSource* ts);
548 static int              findstr(TokenSource* ts, Rune* s);
549 static int              ampersand(TokenSource* ts);
550 static int              lowerc(int c);
551 static int              getchar(TokenSource* ts);
552 static void             ungetchar(TokenSource* ts, int c);
553 static void             backup(TokenSource* ts, int savei);
554 static void             freeinsidetoken(Token* t);
555 static void             freeattrs(Attr* ahead);
556 static Attr*    newattr(int attid, Rune* value, Attr* link);
557 static int              Tconv(Fmt* f);
558
559 int     dbglex = 0;
560 static int lexinited = 0;
561
562 static void
563 lexinit(void)
564 {
565         tagtable = _makestrinttab(tagnames, Numtags);
566         attrtable = _makestrinttab(attrnames, Numattrs);
567         fmtinstall('T', Tconv);
568         lexinited = 1;
569 }
570
571 static TokenSource*
572 newtokensource(uchar* data, int edata, int chset, int mtype)
573 {
574         TokenSource*    ans;
575
576         assert(chset == US_Ascii || chset == ISO_8859_1 ||
577                         chset == UTF_8 || chset == Unicode);
578         ans = (TokenSource*)emalloc(sizeof(TokenSource));
579         ans->i = 0;
580         ans->data = data;
581         ans->edata = edata;
582         ans->chset = chset;
583         ans->mtype = mtype;
584         return ans;
585 }
586
587 enum {
588         ToksChunk = 500,
589 };
590
591 // Call this to get the tokens.
592 //  The number of returned tokens is returned in *plen.
593 Token*
594 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
595 {
596         TokenSource*    ts;
597         Token*          a;
598         int     alen;
599         int     ai;
600         int     starti;
601         int     c;
602         int     tag;
603
604         if(!lexinited)
605                 lexinit();
606         ts = newtokensource(data, datalen, chset, mtype);
607         if(dbglex)
608                 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
609         alen = 0;
610         ai = 0;
611         a = 0;
612         if(ts->mtype == TextHtml) {
613                 for(;;) {
614                         if(alen - ai < ToksChunk/32) {
615                                 alen += ToksChunk;
616                                 a = erealloc(a, alen*sizeof *a);
617                         }
618                         starti = ts->i;
619                         c = getchar(ts);
620                         if(c < 0)
621                                 break;
622                         if(c == '<') {
623                                 tag = gettag(ts, starti, a, &ai);
624                                 if(tag == Tscript || tag == Tstyle) {
625                                         // special rules for getting Data after....
626                                         starti = ts->i;
627                                         c = getchar(ts);
628                                         tag = getscriptdata(ts, c, starti, a, &ai, tag);
629                                 }
630                         }
631                         else
632                                 tag = getdata(ts, c, starti, a, &ai);
633                         if(tag == -1)
634                                 break;
635                         else if(dbglex > 1 && tag != Comment)
636                                 fprint(2, "lex: got token %T\n", &a[ai-1]);
637                 }
638         }
639         else {
640                 // plain text (non-html) tokens
641                 for(;;) {
642                         if(alen - ai < ToksChunk/32) {
643                                 alen += ToksChunk;
644                                 a = erealloc(a, alen*sizeof *a);
645                         }
646                         tag = getplaindata(ts, a, &ai);
647                         if(tag == -1)
648                                 break;
649                         if(dbglex > 1)
650                                 fprint(2, "lex: got token %T\n", &a[ai]);
651                 }
652         }
653         free(ts);
654         if(dbglex)
655                 fprint(2, "lex: returning %d tokens\n", ai);
656         *plen = ai;
657         if(ai == 0){
658                 free(a);
659                 a = 0;
660         }
661         return a;
662 }
663
664 // For case where source isn't HTML.
665 // Just make data tokens, one per line (or partial line,
666 // at end of buffer), ignoring non-whitespace control
667 // characters and dumping \r's.
668 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
669 // Otherwise return -1;
670 static int
671 getplaindata(TokenSource* ts, Token* a, int* pai)
672 {
673         Rune*   s;
674         int     j;
675         int     starti;
676         int     c;
677         Token*  tok;
678         Rune    buf[BIGBUFSIZE];
679
680         s = nil;
681         j = 0;
682         starti = ts->i;
683         for(c = getchar(ts); c >= 0; c = getchar(ts)) {
684                 if(c < ' ') {
685                         if(isspace(c)) {
686                                 if(c == '\r') {
687                                         // ignore it unless no following '\n',
688                                         // in which case treat it like '\n'
689                                         c = getchar(ts);
690                                         if(c != '\n') {
691                                                 if(c >= 0)
692                                                         ungetchar(ts, c);
693                                                 c = '\n';
694                                         }
695                                 }
696                         }
697                         else
698                                 c = 0;
699                 }
700                 if(c != 0) {
701                         buf[j++] = c;
702                         if(j == nelem(buf)-1) {
703                                 s = buftostr(s, buf, j);
704                                 j = 0;
705                         }
706                 }
707                 if(c == '\n')
708                         break;
709         }
710         s = buftostr(s, buf, j);
711         if(s == nil)
712                 return -1;
713         tok = &a[(*pai)++];
714         tok->tag = Data;
715         tok->text = s;
716         tok->attr = nil;
717         tok->starti = starti;
718         return Data;
719 }
720
721 // Return concatenation of s and buf[0:j]
722 static Rune*
723 buftostr(Rune* s, Rune* buf, int j)
724 {
725         int i;
726
727         if(s == nil)
728                 s = _Strndup(buf, j);
729         else {
730                 i = _Strlen(s);
731                 s = realloc(s, ( i+j+1)*sizeof *s);
732                 memcpy(&s[i], buf, j*sizeof *s);
733                 s[i+j] = 0;
734         }
735         return s;
736 }
737
738 // Gather data up to next start-of-tag or end-of-buffer.
739 // Translate entity references (&amp;).
740 // Ignore non-whitespace control characters and get rid of \r's.
741 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
742 // Otherwise return -1;
743 static int
744 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
745 {
746         Rune*   s;
747         int     j;
748         int     c;
749         Token*  tok;
750         Rune    buf[SMALLBUFSIZE];
751
752         s = nil;
753         j = 0;
754         for(c = firstc; c >= 0; c = getchar(ts)){
755                 if(c == '&') {
756                         c = ampersand(ts);
757                         if(c < 0)
758                                 break;
759                 }
760                 else if(c < ' ') {
761                         if(isspace(c)) {
762                                 if(c == '\r') {
763                                         // ignore it unless no following '\n',
764                                         // in which case treat it like '\n'
765                                         c = getchar(ts);
766                                         if(c != '\n') {
767                                                 if(c >= 0)
768                                                         ungetchar(ts, c);
769                                                 c = '\n';
770                                         }
771                                 }
772                         }
773                         else {
774                                 if(warn)
775                                         fprint(2, "warning: non-whitespace control character %d ignored\n", c);
776                                 c = 0;
777                         }
778                 }
779                 else if(c == '<') {
780                         ungetchar(ts, c);
781                         break;
782                 }
783                 if(c != 0) {
784                         buf[j++] = c;
785                         if(j == nelem(buf)-1) {
786                                 s = buftostr(s, buf, j);
787                                 j = 0;
788                         }
789                 }
790         }
791         s = buftostr(s, buf, j);
792         if(s == nil)
793                 return -1;
794         tok = &a[(*pai)++];
795         tok->tag = Data;
796         tok->text = s;
797         tok->attr = nil;
798         tok->starti = starti;
799         return Data;
800 }
801
802 // The rules for lexing scripts are different (ugh).
803 // Gather up everything until see an "</" tagnames[tok] ">"
804 static int
805 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
806 {
807         Rune*   s;
808         int     j;
809         int     tstarti;
810         int     savei;
811         int     c;
812         int     tag;
813         int     done;
814         Token*  tok;
815         Rune    buf[BIGBUFSIZE];
816
817         s = nil;
818         j = 0;
819         tstarti = starti;
820         c = firstc;
821         done = 0;
822         while(c >= 0) {
823                 if(c == '<') {
824                         // other browsers ignore stuff to end of line after <!
825                         savei = ts->i;
826                         c = getchar(ts);
827                         if(c == '!') {
828                                 if(comment(ts) == -1)
829                                         break;
830                                 if(c == '\r')
831                                         c = getchar(ts);
832                                 if(c == '\n')
833                                         c = getchar(ts);
834                         }
835                         else if(c >= 0) {
836                                 backup(ts, savei);
837                                 tag = gettag(ts, tstarti, a, pai);
838                                 if(tag == -1)
839                                         break;
840                                 if(tag != Comment)
841                                         (*pai)--;
842                                 backup(ts, tstarti);
843                                 if(tag == findtag + RBRA) {
844                                         done = 1;
845                                         break;
846                                 }
847                                 // here tag was not the one we were looking for, so take as regular data
848                                 c = getchar(ts);
849                         }
850                 }
851                 if(c < 0)
852                         break;
853                 if(c != 0) {
854                         buf[j++] = c;
855                         if(j == nelem(buf)-1) {
856                                 s = buftostr(s, buf, j);
857                                 j = 0;
858                         }
859                 }
860                 tstarti = ts->i;
861                 c = getchar(ts);
862         }
863         if(done || ts->i == ts->edata) {
864                 s = buftostr(s, buf, j);
865                 tok = &a[(*pai)++];
866                 tok->tag = Data;
867                 tok->text = s;
868                 tok->attr = nil;
869                 tok->starti = starti;
870                 return Data;
871         }
872         free(s);
873         backup(ts, starti);
874         return -1;
875 }
876
877 // We've just seen a '<'.  Gather up stuff to closing '>' (if buffer
878 // ends before then, return -1).
879 // If it's a tag, look up the name, gather the attributes, and return
880 // the appropriate token.
881 // Else it's either just plain data or some kind of ignorable stuff:
882 // return Data or Comment as appropriate.
883 // If it's not a Comment, put it in a[*pai] and bump *pai.
884 static int
885 gettag(TokenSource* ts, int starti, Token* a, int* pai)
886 {
887         int     rbra;
888         int     ans;
889         Attr*   al;
890         int     nexti;
891         int     c;
892         int     ti;
893         int     afnd;
894         int     attid;
895         int     quote;
896         Rune*   val;
897         int     nv;
898         int     i;
899         int     tag;
900         Token*  tok;
901         Rune    buf[BIGBUFSIZE];
902
903         rbra = 0;
904         nexti = ts->i;
905         tok = &a[*pai];
906         tok->tag = Notfound;
907         tok->text = nil;
908         tok->attr = nil;
909         tok->starti = starti;
910         c = getchar(ts);
911         if(c == '/') {
912                 rbra = RBRA;
913                 c = getchar(ts);
914         }
915         if(c < 0)
916                 goto eob_done;
917         if(c >= 256 || !isalpha(c)) {
918                 // not a tag
919                 if(c == '!') {
920                         ans = comment(ts);
921                         if(ans != -1)
922                                 return ans;
923                         goto eob_done;
924                 }
925                 else {
926                         backup(ts, nexti);
927                         tok->tag = Data;
928                         tok->text = _Strdup(L"<");
929                         (*pai)++;
930                         return Data;
931                 }
932         }
933         // c starts a tagname
934         buf[0] = c;
935         i = 1;
936         while(1) {
937                 c = getchar(ts);
938                 if(c < 0)
939                         goto eob_done;
940                 if(!ISNAMCHAR(c))
941                         break;
942                 // if name is bigger than buf it won't be found anyway...
943                 if(i < BIGBUFSIZE)
944                         buf[i++] = c;
945         }
946         if(_lookup(tagtable, Numtags, buf, i, &tag))
947                 tok->tag = tag + rbra;
948         else
949                 tok->text = _Strndup(buf, i);   // for warning print, in build
950         // attribute gathering loop
951         al = nil;
952         while(1) {
953                 // look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
954                 // skip whitespace
955 attrloop_continue:
956                 while(c < 256 && isspace(c)) {
957                         c = getchar(ts);
958                         if(c < 0)
959                                 goto eob_done;
960                 }
961                 if(c == '>')
962                         goto attrloop_done;
963                 if(c == '<') {
964                         if(warn)
965                                 fprint(2, "warning: unclosed tag\n");
966                         ungetchar(ts, c);
967                         goto attrloop_done;
968                 }
969                 if(c >= 256 || !isalpha(c)) {
970                         if(warn)
971                                 fprint(2, "warning: expected attribute name\n");
972                         // skipt to next attribute name
973                         while(1) {
974                                 c = getchar(ts);
975                                 if(c < 0)
976                                         goto eob_done;
977                                 if(c < 256 && isalpha(c))
978                                         goto attrloop_continue;
979                                 if(c == '<') {
980                                         if(warn)
981                                                 fprint(2, "warning: unclosed tag\n");
982                                         ungetchar(ts, 60);
983                                         goto attrloop_done;
984                                 }
985                                 if(c == '>')
986                                         goto attrloop_done;
987                         }
988                 }
989                 // gather attribute name
990                 buf[0] = c;
991                 i = 1;
992                 while(1) {
993                         c = getchar(ts);
994                         if(c < 0)
995                                 goto eob_done;
996                         if(!ISNAMCHAR(c))
997                                 break;
998                         if(i < BIGBUFSIZE-1)
999                                 buf[i++] = c;
1000                 }
1001                 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
1002                 if(warn && !afnd) {
1003                         buf[i] = 0;
1004                         fprint(2, "warning: unknown attribute name %S\n", buf);
1005                 }
1006                 // skip whitespace
1007                 while(c < 256 && isspace(c)) {
1008                         c = getchar(ts);
1009                         if(c < 0)
1010                                 goto eob_done;
1011                 }
1012                 if(c != '=') {
1013                         if(afnd)
1014                                 al = newattr(attid, nil, al);
1015                         goto attrloop_continue;
1016                 }
1017                 //# c is '=' here;  skip whitespace
1018                 while(1) {
1019                         c = getchar(ts);
1020                         if(c < 0)
1021                                 goto eob_done;
1022                         if(c >= 256 || !isspace(c))
1023                                 break;
1024                 }
1025                 quote = 0;
1026                 if(c == '\'' || c == '"') {
1027                         quote = c;
1028                         c = getchar(ts);
1029                         if(c < 0)
1030                                 goto eob_done;
1031                 }
1032                 val = nil;
1033                 nv = 0;
1034                 while(1) {
1035 valloop_continue:
1036                         if(c < 0)
1037                                 goto eob_done;
1038                         if(c == '>') {
1039                                 if(quote) {
1040                                         // c might be part of string (though not good style)
1041                                         // but if line ends before close quote, assume
1042                                         // there was an unmatched quote
1043                                         ti = ts->i;
1044                                         while(1) {
1045                                                 c = getchar(ts);
1046                                                 if(c < 0)
1047                                                         goto eob_done;
1048                                                 if(c == quote) {
1049                                                         backup(ts, ti);
1050                                                         buf[nv++] = '>';
1051                                                         if(nv == BIGBUFSIZE-1) {
1052                                                                 val = buftostr(val, buf, nv);
1053                                                                 nv = 0;
1054                                                         }
1055                                                         c = getchar(ts);
1056                                                         goto valloop_continue;
1057                                                 }
1058                                                 if(c == '\n') {
1059                                                         if(warn)
1060                                                                 fprint(2, "warning: apparent unmatched quote\n");
1061                                                         backup(ts, ti);
1062                                                         c = '>';
1063                                                         goto valloop_done;
1064                                                 }
1065                                         }
1066                                 }
1067                                 else
1068                                         goto valloop_done;
1069                         }
1070                         if(quote) {
1071                                 if(c == quote) {
1072                                         c = getchar(ts);
1073                                         if(c < 0)
1074                                                 goto eob_done;
1075                                         goto valloop_done;
1076                                 }
1077                                 if(c == '\r') {
1078                                         c = getchar(ts);
1079                                         goto valloop_continue;
1080                                 }
1081                                 if(c == '\t' || c == '\n')
1082                                         c = ' ';
1083                         }
1084                         else {
1085                                 if(c < 256 && isspace(c))
1086                                         goto valloop_done;
1087                         }
1088                         if(c == '&') {
1089                                 c = ampersand(ts);
1090                                 if(c == -1)
1091                                         goto eob_done;
1092                         }
1093                         buf[nv++] = c;
1094                         if(nv == BIGBUFSIZE-1) {
1095                                 val = buftostr(val, buf, nv);
1096                                 nv = 0;
1097                         }
1098                         c = getchar(ts);
1099                 }
1100 valloop_done:
1101                 if(afnd) {
1102                         val = buftostr(val, buf, nv);
1103                         al = newattr(attid, val, al);
1104                 }
1105         }
1106
1107 attrloop_done:
1108         tok->attr = al;
1109         (*pai)++;
1110         return tok->tag;
1111
1112 eob_done:
1113         if(warn)
1114                 fprint(2, "warning: incomplete tag at end of page\n");
1115         backup(ts, nexti);
1116         tok->tag = Data;
1117         tok->text = _Strdup(L"<");
1118         return Data;
1119 }
1120
1121 // We've just read a '<!' at position starti,
1122 // so this may be a comment or other ignored section, or it may
1123 // be just a literal string if there is no close before end of file
1124 // (other browsers do that).
1125 // The accepted practice seems to be (note: contrary to SGML spec!):
1126 // If see <!--, look for --> to close, or if none, > to close.
1127 // If see <!(not --), look for > to close.
1128 // If no close before end of file, leave original characters in as literal data.
1129 //
1130 // If we see ignorable stuff, return Comment.
1131 // Else return nil (caller should back up and try again when more data arrives,
1132 // unless at end of file, in which case caller should just make '<' a data token).
1133 static int
1134 comment(TokenSource* ts)
1135 {
1136         int     nexti;
1137         int     havecomment;
1138         int     c;
1139
1140         nexti = ts->i;
1141         havecomment = 0;
1142         c = getchar(ts);
1143         if(c == '-') {
1144                 c = getchar(ts);
1145                 if(c == '-') {
1146                         if(findstr(ts, L"-->"))
1147                                 havecomment = 1;
1148                         else
1149                                 backup(ts, nexti);
1150                 }
1151         }
1152         if(!havecomment) {
1153                 if(c == '>')
1154                         havecomment = 1;
1155                 else if(c >= 0) {
1156                         if(findstr(ts, L">"))
1157                                 havecomment = 1;
1158                 }
1159         }
1160         if(havecomment)
1161                 return Comment;
1162         return -1;
1163 }
1164
1165 // Look for string s in token source.
1166 // If found, return 1, with buffer at next char after s,
1167 // else return 0 (caller should back up).
1168 static int
1169 findstr(TokenSource* ts, Rune* s)
1170 {
1171         int     c0;
1172         int     n;
1173         int     nexti;
1174         int     i;
1175         int     c;
1176
1177         c0 = s[0];
1178         n = runestrlen(s);
1179         while(1) {
1180                 c = getchar(ts);
1181                 if(c < 0)
1182                         break;
1183                 if(c == c0) {
1184                         if(n == 1)
1185                                 return 1;
1186                         nexti = ts->i;
1187                         for(i = 1; i < n; i++) {
1188                                 c = getchar(ts);
1189                                 if(c < 0)
1190                                         goto mainloop_done;
1191                                 if(c != s[i])
1192                                         break;
1193                         }
1194                         if(i == n)
1195                                 return 1;
1196                         backup(ts, nexti);
1197                 }
1198         }
1199 mainloop_done:
1200         return 0;
1201 }
1202
1203 // We've just read an '&'; look for an entity reference
1204 // name, and if found, return translated char.
1205 // if there is a complete entity name but it isn't known,
1206 // back up to just past the '&' and return '&'.
1207 // If the entity can't be completed in the current buffer, back up
1208 // to the '&' and return -1.
1209 static int
1210 ampersand(TokenSource* ts)
1211 {
1212         int     savei;
1213         int     c;
1214         int     fnd;
1215         int     ans;
1216         int     v;
1217         int     k;
1218         Rune    buf[25];
1219
1220         savei = ts->i;
1221         c = getchar(ts);
1222         fnd = 0;
1223         ans = -1;
1224         if(c == '#') {
1225                 c = getchar(ts);
1226                 v = 0;
1227                 if(c == 'X' || c == 'x')
1228                         for(c = getchar(ts); c < 256; c = getchar(ts))
1229                                 if(c >= '0' && c <= '9')
1230                                         v = v*16+c-'0';
1231                                 else if(c >= 'A' && c<= 'F')
1232                                         v = v*16+c-'A'+10;
1233                                 else if(c >= 'a' && c <= 'f')
1234                                         v = v*16+c-'a'+10;
1235                                 else
1236                                         break;
1237                 else
1238                         while(c >= 0) {
1239                                 if(!(c < 256 && isdigit(c)))
1240                                         break;
1241                                 v = v*10 + c - 48;
1242                                 c = getchar(ts);
1243                         }
1244                 if(c >= 0) {
1245                         if(!(c == ';' || c == '\n' || c == '\r'))
1246                                 ungetchar(ts, c);
1247                         c = v;
1248                         if(c == 160)
1249                                 c = 160;
1250                         if(c >= Winstart && c <= Winend) {
1251                                 c = winchars[c - Winstart];
1252                         }
1253                         ans = c;
1254                         fnd = 1;
1255                 }
1256         }
1257         else if(c < 256 && isalpha(c)) {
1258                 buf[0] = c;
1259                 k = 1;
1260                 while(1) {
1261                         c = getchar(ts);
1262                         if(c < 0)
1263                                 break;
1264                         if(c < 256 && (isalpha(c) || isdigit(c))) {
1265                                 if(k < nelem(buf)-1)
1266                                         buf[k++] = c;
1267                         }
1268                         else {
1269                                 if(!(c == ';' || c == '\n' || c == '\r'))
1270                                         ungetchar(ts, c);
1271                                 break;
1272                         }
1273                 }
1274                 if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
1275                         fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1276         }
1277         if(!fnd) {
1278                 backup(ts, savei);
1279                 ans = '&';
1280         }
1281         return ans;
1282 }
1283
1284 // Get next char, obeying ts.chset.
1285 // Returns -1 if no complete character left before current end of data.
1286 static int
1287 getchar(TokenSource* ts)
1288 {
1289         uchar*  buf;
1290         int     c;
1291         int     n;
1292         int     ok;
1293         Rune    r;
1294
1295         if(ts->i >= ts->edata)
1296                 return -1;
1297         buf = ts->data;
1298         c = buf[ts->i];
1299         switch(ts->chset) {
1300         case ISO_8859_1:
1301                 if(c >= Winstart && c <= Winend)
1302                         c = winchars[c - Winstart];
1303                 ts->i++;
1304                 break;
1305         case US_Ascii:
1306                 if(c > 127) {
1307                         if(warn)
1308                                 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1309                 }
1310                 ts->i++;
1311                 break;
1312         case UTF_8:
1313                 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1314                 if(ok) {
1315                         n = chartorune(&r, (char*)(buf+ts->i));
1316                         if(warn && c == Runeerror)
1317                                 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1318                         ts->i += n;
1319                         c = r;
1320                 }
1321                 else {
1322                         // not enough bytes in buf to complete utf-8 char
1323                         ts->i = ts->edata;      // mark "all used"
1324                         c = -1;
1325                 }
1326                 break;
1327         case Unicode:
1328                 if(ts->i < ts->edata - 1) {
1329                         //standards say most-significant byte first
1330                         c = (c << 8)|(buf[ts->i + 1]);
1331                         ts->i += 2;
1332                 }
1333                 else {
1334                         ts->i = ts->edata;      // mark "all used"
1335                         c = -1;
1336                 }
1337                 break;
1338         default:
1339                 return -1;
1340         }
1341         return c;
1342 }
1343
1344 // Assuming c was the last character returned by getchar, set
1345 // things up so that next getchar will get that same character
1346 // followed by the current 'next character', etc.
1347 static void
1348 ungetchar(TokenSource* ts, int c)
1349 {
1350         int     n;
1351         Rune    r;
1352         char    a[UTFmax];
1353
1354         n = 1;
1355         switch(ts->chset) {
1356         case UTF_8:
1357                 if(c >= 128) {
1358                         r = c;
1359                         n = runetochar(a, &r);
1360                 }
1361                 break;
1362         case Unicode:
1363                 n = 2;
1364                 break;
1365         }
1366         ts->i -= n;
1367 }
1368
1369 // Restore ts so that it is at the state where the index was savei.
1370 static void
1371 backup(TokenSource* ts, int savei)
1372 {
1373         if(dbglex)
1374                 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1375         ts->i = savei;
1376 }
1377
1378
1379 // Look for value associated with attribute attid in token t.
1380 // If there is one, return 1 and put the value in *pans,
1381 // else return 0.
1382 // If xfer is true, transfer ownership of the string to the caller
1383 // (nil it out here); otherwise, caller must duplicate the answer
1384 // if it needs to save it.
1385 // OK to have pans==0, in which case this is just looking
1386 // to see if token is present.
1387 int
1388 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1389 {
1390         Attr*   attr;
1391
1392         attr = t->attr;
1393         while(attr != nil) {
1394                 if(attr->attid == attid) {
1395                         if(pans != nil)
1396                                 *pans = attr->value;
1397                         if(xfer)
1398                                 attr->value = nil;
1399                         return 1;
1400                 }
1401                 attr = attr->next;
1402         }
1403         if(pans != nil)
1404                 *pans = nil;
1405         return 0;
1406 }
1407
1408 static int
1409 Tconv(Fmt *f)
1410 {
1411         Token*  t;
1412         int     i;
1413         int     tag;
1414         char*   srbra;
1415         Rune*   aname;
1416         Rune*   tname;
1417         Attr*   a;
1418         char    buf[BIGBUFSIZE];
1419
1420         t = va_arg(f->args, Token*);
1421         if(t == nil)
1422                 sprint(buf, "<null>");
1423         else {
1424                 i = 0;
1425                 if(dbglex > 1)
1426                         i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1427                 tag = t->tag;
1428                 if(tag == Data) {
1429                         i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1430                 }
1431                 else {
1432                         srbra = "";
1433                         if(tag >= RBRA) {
1434                                 tag -= RBRA;
1435                                 srbra = "/";
1436                         }
1437                         tname = tagnames[tag];
1438                         if(tag == Notfound)
1439                                 tname = L"?";
1440                         i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1441                         for(a = t->attr; a != nil; a = a->next) {
1442                                 aname = attrnames[a->attid];
1443                                 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1444                                 if(a->value != nil)
1445                                         i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1446                         }
1447                         i += snprint(buf+i, sizeof(buf)-i-1, ">");
1448                 }
1449                 buf[i] = 0;
1450         }
1451         return fmtstrcpy(f, buf);
1452 }
1453
1454 // Attrs own their constituent strings, but build may eventually
1455 // transfer some values to its items and nil them out in the Attr.
1456 static Attr*
1457 newattr(int attid, Rune* value, Attr* link)
1458 {
1459         Attr* ans;
1460
1461         ans = (Attr*)emalloc(sizeof(Attr));
1462         ans->attid = attid;
1463         ans->value = value;
1464         ans->next = link;
1465         return ans;
1466 }
1467
1468 // Free list of Attrs linked through next field
1469 static void
1470 freeattrs(Attr* ahead)
1471 {
1472         Attr* a;
1473         Attr* nexta;
1474
1475         a = ahead;
1476         while(a != nil) {
1477                 nexta = a->next;
1478                 free(a->value);
1479                 free(a);
1480                 a = nexta;
1481         }
1482 }
1483
1484 // Free array of Tokens.
1485 // Allocated space might have room for more than n tokens,
1486 // but only n of them are initialized.
1487 // If caller has transferred ownership of constitutent strings
1488 // or attributes, it must have nil'd out the pointers in the Tokens.
1489 void
1490 _freetokens(Token* tarray, int n)
1491 {
1492         int i;
1493         Token* t;
1494
1495         if(tarray == nil)
1496                 return;
1497         for(i = 0; i < n; i++) {
1498                 t = &tarray[i];
1499                 free(t->text);
1500                 freeattrs(t->attr);
1501         }
1502         free(tarray);
1503 }