8 typedef struct TokenSource TokenSource;
11 int i; // index of next byte to use
12 uchar* data; // all the data
13 int edata; // data[0:edata] is valid
14 int chset; // one of US_Ascii, etc.
15 int mtype; // TextHtml or TextPlain
23 #define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))
25 #define SMALLBUFSIZE 240
26 #define BIGBUFSIZE 2000
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
129 // HTML 4.0 attribute names.
130 // Keep sorted, and in correspondence with enum in impl.h.
131 Rune* attrnames[] = {
251 // Character entity to unicode character number map.
252 // Keep sorted by name.
253 StringInt chartab[]= {
356 {L"emdash", 8212}, /* non-standard but commonly used */
359 {L"endash", 8211}, /* non-standard but commonly used */
506 {L"varepsilon", 8712},
522 #define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
524 // Characters Winstart..Winend are those that Windows
525 // uses interpolated into the Latin1 set.
526 // They aren't supposed to appear in HTML, but they do....
532 static int winchars[]= { 8226, // 8226 is a bullet
533 8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
534 710, 8240, 352, 8249, 338, 8226, 8226, 8226,
535 8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
536 732, 8482, 353, 8250, 339, 8226, 8226, 376};
538 static StringInt* tagtable; // initialized from tagnames
539 static StringInt* attrtable; // initialized from attrnames
541 static void lexinit(void);
542 static int getplaindata(TokenSource* ts, Token* a, int* pai);
543 static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
544 static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
545 static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
546 static Rune* buftostr(Rune* s, Rune* buf, int j);
547 static int comment(TokenSource* ts);
548 static int findstr(TokenSource* ts, Rune* s);
549 static int ampersand(TokenSource* ts);
550 static int lowerc(int c);
551 static int getchar(TokenSource* ts);
552 static void ungetchar(TokenSource* ts, int c);
553 static void backup(TokenSource* ts, int savei);
554 static void freeinsidetoken(Token* t);
555 static void freeattrs(Attr* ahead);
556 static Attr* newattr(int attid, Rune* value, Attr* link);
557 static int Tconv(Fmt* f);
560 static int lexinited = 0;
565 tagtable = _makestrinttab(tagnames, Numtags);
566 attrtable = _makestrinttab(attrnames, Numattrs);
567 fmtinstall('T', Tconv);
572 newtokensource(uchar* data, int edata, int chset, int mtype)
576 assert(chset == US_Ascii || chset == ISO_8859_1 ||
577 chset == UTF_8 || chset == Unicode);
578 ans = (TokenSource*)emalloc(sizeof(TokenSource));
591 // Call this to get the tokens.
592 // The number of returned tokens is returned in *plen.
594 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
606 ts = newtokensource(data, datalen, chset, mtype);
608 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
612 if(ts->mtype == TextHtml) {
615 if(alen - ai < ToksChunk/32) {
617 a = erealloc(a, alen*sizeof *a);
624 tag = gettag(ts, starti, a, &ai);
625 if(tag == Tscript || tag == Tstyle) {
626 // special rules for getting Data after....
629 tag = getscriptdata(ts, c, starti, a, &ai, tag);
633 tag = getdata(ts, c, starti, a, &ai);
636 else if(dbglex > 1 && tag != Comment)
637 fprint(2, "lex: got token %T\n", &a[ai-1]);
641 // plain text (non-html) tokens
644 if(alen - ai < ToksChunk/32) {
646 a = erealloc(a, alen*sizeof *a);
648 tag = getplaindata(ts, a, &ai);
652 fprint(2, "lex: got token %T\n", &a[ai]);
657 fprint(2, "lex: returning %d tokens\n", ai);
666 // For case where source isn't HTML.
667 // Just make data tokens, one per line (or partial line,
668 // at end of buffer), ignoring non-whitespace control
669 // characters and dumping \r's.
670 // If a non-empty token is found, fill in a[*pai], bump *pai, and return Data.
671 // Otherwise return -1;
673 getplaindata(TokenSource* ts, Token* a, int* pai)
680 Rune buf[BIGBUFSIZE];
685 for(c = getchar(ts); c >= 0; c = getchar(ts)) {
689 // ignore it unless no following '\n',
690 // in which case treat it like '\n'
704 if(j == nelem(buf)-1) {
705 s = buftostr(s, buf, j);
712 s = buftostr(s, buf, j);
719 tok->starti = starti;
723 // Return concatenation of s and buf[0:j]
725 buftostr(Rune* s, Rune* buf, int j)
730 s = _Strndup(buf, j);
733 s = realloc(s, ( i+j+1)*sizeof *s);
734 memcpy(&s[i], buf, j*sizeof *s);
737 setmalloctag(s, getcallerpc(&s));
741 // Gather data up to next start-of-tag or end-of-buffer.
742 // Translate entity references (&).
743 // Ignore non-whitespace control characters and get rid of \r's.
744 // If a non-empty token is found, fill in a[*pai], bump *pai, and return Data.
745 // Otherwise return -1;
747 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
753 Rune buf[SMALLBUFSIZE];
757 for(c = firstc; c >= 0; c = getchar(ts)){
766 // ignore it unless no following '\n',
767 // in which case treat it like '\n'
778 fprint(2, "warning: non-whitespace control character %d ignored\n", c);
788 if(j == nelem(buf)-1) {
789 s = buftostr(s, buf, j);
794 s = buftostr(s, buf, j);
801 tok->starti = starti;
805 // The rules for lexing scripts are different (ugh).
806 // Gather up everything until we see an "</" tagnames[findtag] ">"
808 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
818 Rune buf[BIGBUFSIZE];
827 // other browsers ignore stuff to end of line after <!
831 if(comment(ts) == -1)
840 tag = gettag(ts, tstarti, a, pai);
846 if(tag == findtag + RBRA) {
850 // here tag was not the one we were looking for, so take as regular data
858 if(j == nelem(buf)-1) {
859 s = buftostr(s, buf, j);
866 if(done || ts->i == ts->edata) {
867 s = buftostr(s, buf, j);
872 tok->starti = starti;
880 // We've just seen a '<'. Gather up stuff to closing '>' (if buffer
881 // ends before then, return -1).
882 // If it's a tag, look up the name, gather the attributes, and return
883 // the appropriate token.
884 // Else it's either just plain data or some kind of ignorable stuff:
885 // return Data or Comment as appropriate.
886 // If it's not a Comment, put it in a[*pai] and bump *pai.
888 gettag(TokenSource* ts, int starti, Token* a, int* pai)
904 Rune buf[BIGBUFSIZE];
913 tok->starti = starti;
921 if(c >= 256 || !isalpha(c)) {
932 tok->text = _Strdup(L"<");
937 // c starts a tagname
946 // if name is bigger than buf it won't be found anyway...
950 if(_lookup(tagtable, Numtags, buf, i, &tag))
951 tok->tag = tag + rbra;
953 tok->text = _Strndup(buf, i); // for warning print, in build
954 // attribute gathering loop
956 // look for "ws name" or "ws name ws = ws val" (ws=whitespace)
959 while(c < 256 && isspace(c)) {
968 fprint(2, "warning: unclosed tag\n");
972 if(c >= 256 || !isalpha(c)) {
974 fprint(2, "warning: expected attribute name\n");
975 // skip to next attribute name
980 if(c < 256 && isalpha(c))
981 goto attrloop_continue;
984 fprint(2, "warning: unclosed tag\n");
992 // gather attribute name
1001 if(i < BIGBUFSIZE-1)
1004 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
1007 fprint(2, "warning: unknown attribute name %S\n", buf);
1010 while(c < 256 && isspace(c)) {
1017 al = newattr(attid, nil, al);
1018 goto attrloop_continue;
1020 // c is '=' here; skip whitespace
1025 if(c >= 256 || !isspace(c))
1029 if(c == '\'' || c == '"') {
1043 // c might be part of string (though not good style)
1044 // but if line ends before close quote, assume
1045 // there was an unmatched quote
1054 if(nv == BIGBUFSIZE-1) {
1055 val = buftostr(val, buf, nv);
1059 goto valloop_continue;
1063 fprint(2, "warning: apparent unmatched quote\n");
1082 goto valloop_continue;
1084 if(c == '\t' || c == '\n')
1088 if(c < 256 && isspace(c))
1097 if(nv == BIGBUFSIZE-1) {
1098 val = buftostr(val, buf, nv);
1105 val = buftostr(val, buf, nv);
1106 al = newattr(attid, val, al);
1117 fprint(2, "warning: incomplete tag at end of page\n");
1121 tok->text = _Strdup(L"<");
1125 // We've just read a '<!' at position starti,
1126 // so this may be a comment or other ignored section, or it may
1127 // be just a literal string if there is no close before end of file
1128 // (other browsers do that).
1129 // The accepted practice seems to be (note: contrary to SGML spec!):
1130 // If see <!--, look for --> to close, or if none, > to close.
1131 // If see <!(not --), look for > to close.
1132 // If no close before end of file, leave original characters in as literal data.
1134 // If we see ignorable stuff, return Comment.
1135 // Else return -1 (caller should back up and try again when more data arrives,
1136 // unless at end of file, in which case caller should just make '<' a data token).
1138 comment(TokenSource* ts)
1150 if(findstr(ts, L"-->"))
1160 if(findstr(ts, L">"))
1169 // Look for string s in token source.
1170 // If found, return 1, with buffer at next char after s,
1171 // else return 0 (caller should back up).
1173 findstr(TokenSource* ts, Rune* s)
1191 for(i = 1; i < n; i++) {
1207 // We've just read an '&'; look for an entity reference
1208 // name, and if found, return translated char.
1209 // if there is a complete entity name but it isn't known,
1210 // back up to just past the '&' and return '&'.
1211 // If the entity can't be completed in the current buffer, back up
1212 // to the '&' and return -1.
1214 ampersand(TokenSource* ts)
1231 if(c == 'X' || c == 'x')
1232 for(c = getchar(ts); c < 256; c = getchar(ts))
1233 if(c >= '0' && c <= '9')
1235 else if(c >= 'A' && c<= 'F')
1237 else if(c >= 'a' && c <= 'f')
1243 if(!(c < 256 && isdigit(c)))
1249 if(!(c == ';' || c == '\n' || c == '\r'))
1254 if(c >= Winstart && c <= Winend) {
1255 c = winchars[c - Winstart];
1261 else if(c < 256 && isalpha(c)) {
1268 if(c < 256 && (isalpha(c) || isdigit(c))) {
1269 if(k < nelem(buf)-1)
1273 if(!(c == ';' || c == '\n' || c == '\r'))
1278 if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
1279 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1288 // Get next char, obeying ts.chset.
1289 // Returns -1 if no complete character left before current end of data.
1291 getchar(TokenSource* ts)
1299 if(ts->i >= ts->edata)
1305 if(c >= Winstart && c <= Winend)
1306 c = winchars[c - Winstart];
1312 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1317 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1319 n = chartorune(&r, (char*)(buf+ts->i));
1320 if(warn && c == Runeerror)
1321 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1326 // not enough bytes in buf to complete utf-8 char
1327 ts->i = ts->edata; // mark "all used"
1332 if(ts->i < ts->edata - 1) {
1333 //standards say most-significant byte first
1334 c = (c << 8)|(buf[ts->i + 1]);
1338 ts->i = ts->edata; // mark "all used"
1348 // Assuming c was the last character returned by getchar, set
1349 // things up so that next getchar will get that same character
1350 // followed by the current 'next character', etc.
1352 ungetchar(TokenSource* ts, int c)
1363 n = runetochar(a, &r);
1373 // Restore ts so that it is at the state where the index was savei.
1375 backup(TokenSource* ts, int savei)
1378 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1383 // Look for value associated with attribute attid in token t.
1384 // If there is one, return 1 and put the value in *pans,
1386 // If xfer is true, transfer ownership of the string to the caller
1387 // (nil it out here); otherwise, caller must duplicate the answer
1388 // if it needs to save it.
1389 // OK to have pans==0, in which case this is just looking
1390 // to see if the attribute is present.
1392 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1397 while(attr != nil) {
1398 if(attr->attid == attid) {
1400 *pans = attr->value;
1422 char buf[BIGBUFSIZE];
1424 t = va_arg(f->args, Token*);
1426 sprint(buf, "<null>");
1430 i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1433 i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1441 tname = tagnames[tag];
1444 i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1445 for(a = t->attr; a != nil; a = a->next) {
1446 aname = attrnames[a->attid];
1447 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1449 i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1451 i += snprint(buf+i, sizeof(buf)-i-1, ">");
1455 return fmtstrcpy(f, buf);
1458 // Attrs own their constituent strings, but build may eventually
1459 // transfer some values to its items and nil them out in the Attr.
1461 newattr(int attid, Rune* value, Attr* link)
1465 ans = (Attr*)emalloc(sizeof(Attr));
1469 setmalloctag(ans, getcallerpc(&attid));
1473 // Free list of Attrs linked through next field
1475 freeattrs(Attr* ahead)
1489 // Free array of Tokens.
1490 // Allocated space might have room for more than n tokens,
1491 // but only n of them are initialized.
1492 // If caller has transferred ownership of constituent strings
1493 // or attributes, it must have nil'd out the pointers in the Tokens.
1495 _freetokens(Token* tarray, int n)
1502 for(i = 0; i < n; i++) {