8 typedef struct TokenSource TokenSource;
11 int i; // index of next byte to use
12 uchar* data; // all the data
13 int edata; // data[0:edata] is valid
14 int chset; // one of US_Ascii, etc.
15 int mtype; // TextHtml or TextPlain
23 #define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.')) /* true if c is below 256 and is a letter, a digit, '-' or '.'; c is evaluated several times, so pass no side effects */
25 #define SMALLBUFSIZE 240
26 #define BIGBUFSIZE 2000
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
129 // HTML 4.0 attribute names.
130 // Keep sorted, and in correspondence with enum in impl.h.
131 Rune* attrnames[] = {
251 // Character entity to unicode character number map.
252 // Keep sorted by name.
253 StringInt chartab[]= {
356 {L"emdash", 8212}, /* non-standard but commonly used */
359 {L"endash", 8211}, /* non-standard but commonly used */
506 {L"varepsilon", 8712},
522 #define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
524 // Characters Winstart..Winend are those that Windows
525 // uses interpolated into the Latin1 set.
526 // They aren't supposed to appear in HTML, but they do....
532 static int winchars[]= { 8226, // entry [c - Winstart] is the replacement for c; 8226 is a bullet
533 8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
534 710, 8240, 352, 8249, 338, 8226, 8226, 8226,
535 8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
536 732, 8482, 353, 8250, 339, 8226, 8226, 376};
538 static StringInt* tagtable; // built from tagnames by _makestrinttab
539 static StringInt* attrtable; // built from attrnames by _makestrinttab
541 static void lexinit(void);
542 static int getplaindata(TokenSource* ts, Token* a, int* pai); // non-HTML source: one Data token per line, or -1
543 static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); // data up to next start-of-tag or end-of-buffer, or -1
544 static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag); // gathers everything up to the close tag for findtag
545 static int gettag(TokenSource* ts, int starti, Token* a, int* pai); // called after '<'; returns a tag, Data or Comment, or -1 if buffer ends first
546 static Rune* buftostr(Rune* s, Rune* buf, int j); // returns concatenation of s and buf[0:j]
547 static int comment(TokenSource* ts); // called after "<!"; returns Comment if ignorable stuff was seen
548 static int findstr(TokenSource* ts, Rune* s); // 1 if s is found (buffer left just past s), else 0
549 static int ampersand(TokenSource* ts); // called after '&'; returns the translated entity char
550 static int lowerc(int c);
551 static int getchar(TokenSource* ts); // next char obeying ts->chset, or -1 if no complete char is left
552 static void ungetchar(TokenSource* ts, int c); // makes the next getchar return c again; c must be the last char returned
553 static void backup(TokenSource* ts, int savei); // restores ts to the state where the index was savei
554 static void freeinsidetoken(Token* t);
555 static void freeattrs(Attr* ahead); // frees a list of Attrs linked through the next field
556 static Attr* newattr(int attid, Rune* value, Attr* link); // allocates a new Attr with emalloc
557 static int Tconv(Fmt* f); // installed with fmtinstall as the 'T' verb; formats a Token*
560 static int lexinited = 0;
565 tagtable = _makestrinttab(tagnames, Numtags);
566 attrtable = _makestrinttab(attrnames, Numattrs);
567 fmtinstall('T', Tconv);
572 newtokensource(uchar* data, int edata, int chset, int mtype)
576 assert(chset == US_Ascii || chset == ISO_8859_1 ||
577 chset == UTF_8 || chset == Unicode);
578 ans = (TokenSource*)emalloc(sizeof(TokenSource));
591 // Call this to get the tokens.
592 // The number of returned tokens is returned in *plen.
594 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
606 ts = newtokensource(data, datalen, chset, mtype);
608 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
612 if(ts->mtype == TextHtml) {
614 if(alen - ai < ToksChunk/32) {
616 a = erealloc(a, alen*sizeof *a);
623 tag = gettag(ts, starti, a, &ai);
624 if(tag == Tscript || tag == Tstyle) {
625 // special rules for getting Data after....
628 tag = getscriptdata(ts, c, starti, a, &ai, tag);
632 tag = getdata(ts, c, starti, a, &ai);
635 else if(dbglex > 1 && tag != Comment)
636 fprint(2, "lex: got token %T\n", &a[ai-1]);
640 // plain text (non-html) tokens
642 if(alen - ai < ToksChunk/32) {
644 a = erealloc(a, alen*sizeof *a);
646 tag = getplaindata(ts, a, &ai);
650 fprint(2, "lex: got token %T\n", &a[ai]);
655 fprint(2, "lex: returning %d tokens\n", ai);
664 // For case where source isn't HTML.
665 // Just make data tokens, one per line (or partial line,
666 // at end of buffer), ignoring non-whitespace control
667 // characters and dumping \r's.
668 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
669 // Otherwise return -1;
671 getplaindata(TokenSource* ts, Token* a, int* pai)
678 Rune buf[BIGBUFSIZE];
683 for(c = getchar(ts); c >= 0; c = getchar(ts)) {
687 // ignore it unless no following '\n',
688 // in which case treat it like '\n'
702 if(j == nelem(buf)-1) {
703 s = buftostr(s, buf, j);
710 s = buftostr(s, buf, j);
717 tok->starti = starti;
721 // Return concatenation of s and buf[0:j]
723 buftostr(Rune* s, Rune* buf, int j)
728 s = _Strndup(buf, j);
731 s = realloc(s, ( i+j+1)*sizeof *s);
732 memcpy(&s[i], buf, j*sizeof *s);
738 // Gather data up to next start-of-tag or end-of-buffer.
739 // Translate entity references (&).
740 // Ignore non-whitespace control characters and get rid of \r's.
741 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
742 // Otherwise return -1;
744 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
750 Rune buf[SMALLBUFSIZE];
754 for(c = firstc; c >= 0; c = getchar(ts)){
763 // ignore it unless no following '\n',
764 // in which case treat it like '\n'
775 fprint(2, "warning: non-whitespace control character %d ignored\n", c);
785 if(j == nelem(buf)-1) {
786 s = buftostr(s, buf, j);
791 s = buftostr(s, buf, j);
798 tok->starti = starti;
802 // The rules for lexing scripts are different (ugh).
803 // Gather up everything until we see an "</" tagnames[findtag] ">"
805 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
815 Rune buf[BIGBUFSIZE];
824 // other browsers ignore stuff to end of line after <!
828 if(comment(ts) == -1)
837 tag = gettag(ts, tstarti, a, pai);
843 if(tag == findtag + RBRA) {
847 // here tag was not the one we were looking for, so take as regular data
855 if(j == nelem(buf)-1) {
856 s = buftostr(s, buf, j);
863 if(done || ts->i == ts->edata) {
864 s = buftostr(s, buf, j);
869 tok->starti = starti;
877 // We've just seen a '<'. Gather up stuff to closing '>' (if buffer
878 // ends before then, return -1).
879 // If it's a tag, look up the name, gather the attributes, and return
880 // the appropriate token.
881 // Else it's either just plain data or some kind of ignorable stuff:
882 // return Data or Comment as appropriate.
883 // If it's not a Comment, put it in a[*pai] and bump *pai.
885 gettag(TokenSource* ts, int starti, Token* a, int* pai)
901 Rune buf[BIGBUFSIZE];
909 tok->starti = starti;
917 if(c >= 256 || !isalpha(c)) {
928 tok->text = _Strdup(L"<");
933 // c starts a tagname
942 // if name is bigger than buf it won't be found anyway...
946 if(_lookup(tagtable, Numtags, buf, i, &tag))
947 tok->tag = tag + rbra;
949 tok->text = _Strndup(buf, i); // for warning print, in build
950 // attribute gathering loop
953 // look for "ws name" or "ws name ws = ws val" (ws=whitespace)
956 while(c < 256 && isspace(c)) {
965 fprint(2, "warning: unclosed tag\n");
969 if(c >= 256 || !isalpha(c)) {
971 fprint(2, "warning: expected attribute name\n");
972 // skip to next attribute name
977 if(c < 256 && isalpha(c))
978 goto attrloop_continue;
981 fprint(2, "warning: unclosed tag\n");
989 // gather attribute name
1001 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
1004 fprint(2, "warning: unknown attribute name %S\n", buf);
1007 while(c < 256 && isspace(c)) {
1014 al = newattr(attid, nil, al);
1015 goto attrloop_continue;
1017 // c is '=' here; skip whitespace
1022 if(c >= 256 || !isspace(c))
1026 if(c == '\'' || c == '"') {
1040 // c might be part of string (though not good style)
1041 // but if line ends before close quote, assume
1042 // there was an unmatched quote
1051 if(nv == BIGBUFSIZE-1) {
1052 val = buftostr(val, buf, nv);
1056 goto valloop_continue;
1060 fprint(2, "warning: apparent unmatched quote\n");
1079 goto valloop_continue;
1081 if(c == '\t' || c == '\n')
1085 if(c < 256 && isspace(c))
1094 if(nv == BIGBUFSIZE-1) {
1095 val = buftostr(val, buf, nv);
1102 val = buftostr(val, buf, nv);
1103 al = newattr(attid, val, al);
1114 fprint(2, "warning: incomplete tag at end of page\n");
1117 tok->text = _Strdup(L"<");
1121 // We've just read a '<!' at position starti,
1122 // so this may be a comment or other ignored section, or it may
1123 // be just a literal string if there is no close before end of file
1124 // (other browsers do that).
1125 // The accepted practice seems to be (note: contrary to SGML spec!):
1126 // If see <!--, look for --> to close, or if none, > to close.
1127 // If see <!(not --), look for > to close.
1128 // If no close before end of file, leave original characters in as literal data.
1130 // If we see ignorable stuff, return Comment.
1131 // Else return -1 (caller should back up and try again when more data arrives,
1132 // unless at end of file, in which case caller should just make '<' a data token).
1134 comment(TokenSource* ts)
1146 if(findstr(ts, L"-->"))
1156 if(findstr(ts, L">"))
1165 // Look for string s in token source.
1166 // If found, return 1, with buffer at next char after s,
1167 // else return 0 (caller should back up).
1169 findstr(TokenSource* ts, Rune* s)
1187 for(i = 1; i < n; i++) {
1203 // We've just read an '&'; look for an entity reference
1204 // name, and if found, return translated char.
1205 // if there is a complete entity name but it isn't known,
1206 // back up to just past the '&' and return '&'.
1207 // If the entity can't be completed in the current buffer, back up
1208 // to the '&' and return -1.
1210 ampersand(TokenSource* ts)
1227 if(c == 'X' || c == 'x')
1228 for(c = getchar(ts); c < 256; c = getchar(ts))
1229 if(c >= '0' && c <= '9')
1231 else if(c >= 'A' && c<= 'F')
1233 else if(c >= 'a' && c <= 'f')
1239 if(!(c < 256 && isdigit(c)))
1245 if(!(c == ';' || c == '\n' || c == '\r'))
1250 if(c >= Winstart && c <= Winend) {
1251 c = winchars[c - Winstart];
1257 else if(c < 256 && isalpha(c)) {
1264 if(c < 256 && (isalpha(c) || isdigit(c))) {
1265 if(k < nelem(buf)-1)
1269 if(!(c == ';' || c == '\n' || c == '\r'))
1274 if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
1275 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1284 // Get next char, obeying ts.chset.
1285 // Returns -1 if no complete character left before current end of data.
1287 getchar(TokenSource* ts)
1295 if(ts->i >= ts->edata)
1301 if(c >= Winstart && c <= Winend)
1302 c = winchars[c - Winstart];
1308 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1313 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1315 n = chartorune(&r, (char*)(buf+ts->i));
1316 if(warn && c == Runeerror)
1317 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1322 // not enough bytes in buf to complete utf-8 char
1323 ts->i = ts->edata; // mark "all used"
1328 if(ts->i < ts->edata - 1) {
1329 //standards say most-significant byte first
1330 c = (c << 8)|(buf[ts->i + 1]);
1334 ts->i = ts->edata; // mark "all used"
1344 // Assuming c was the last character returned by getchar, set
1345 // things up so that next getchar will get that same character
1346 // followed by the current 'next character', etc.
1348 ungetchar(TokenSource* ts, int c)
1359 n = runetochar(a, &r);
1369 // Restore ts so that it is at the state where the index was savei.
1371 backup(TokenSource* ts, int savei)
1374 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1379 // Look for value associated with attribute attid in token t.
1380 // If there is one, return 1 and put the value in *pans,
1382 // If xfer is true, transfer ownership of the string to the caller
1383 // (nil it out here); otherwise, caller must duplicate the answer
1384 // if it needs to save it.
1385 // OK to have pans==0, in which case this is just looking
1386 // to see if the attribute is present.
1388 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1393 while(attr != nil) {
1394 if(attr->attid == attid) {
1396 *pans = attr->value;
1418 char buf[BIGBUFSIZE];
1420 t = va_arg(f->args, Token*);
1422 sprint(buf, "<null>");
1426 i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1429 i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1437 tname = tagnames[tag];
1440 i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1441 for(a = t->attr; a != nil; a = a->next) {
1442 aname = attrnames[a->attid];
1443 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1445 i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1447 i += snprint(buf+i, sizeof(buf)-i-1, ">");
1451 return fmtstrcpy(f, buf);
1454 // Attrs own their constituent strings, but build may eventually
1455 // transfer some values to its items and nil them out in the Attr.
1457 newattr(int attid, Rune* value, Attr* link)
1461 ans = (Attr*)emalloc(sizeof(Attr));
1468 // Free list of Attrs linked through next field
1470 freeattrs(Attr* ahead)
1484 // Free array of Tokens.
1485 // Allocated space might have room for more than n tokens,
1486 // but only n of them are initialized.
1487 // If caller has transferred ownership of constituent strings
1488 // or attributes, it must have nil'd out the pointers in the Tokens.
1490 _freetokens(Token* tarray, int n)
1497 for(i = 0; i < n; i++) {