git.lizzy.rs Git - plan9front.git/blobdiff - sys/src/cmd/uhtml.c
devproc: can't wait for ourselves to stop (thanks Shamar)
[plan9front.git] / sys / src / cmd / uhtml.c
index f2160450388c2619aa9a79e5288eaeec411d831d..bef236f43e2e94c40f945fce461f9b77946026a1 100644 (file)
@@ -3,22 +3,30 @@
 #include <ctype.h>
 
 int nbuf;
-char buf[4096+1];
-char *cset = "utf";
+char buf[64*1024+1];
+char *cset = nil;
+char *whitespace = " \t\r\n";
 
 void
 usage(void)
 {
-       fprint(2, "%s [ -h ] [ -c charset ] [ file ]\n", argv0);
+       fprint(2, "%s [ -p ] [ -c charset ] [ file ]\n", argv0);
        exits("usage");
 }
 
 char*
-strval(char *s)
+attr(char *s, char *a)
 {
        char *e, q;
 
-       while(strchr("\t ", *s))
+       if((s = cistrstr(s, a)) == nil)
+               return nil;
+       s += strlen(a);
+       while(*s && strchr(whitespace, *s))
+               s++;
+       if(*s++ != '=')
+               return nil;
+       while(*s && strchr(whitespace, *s))
                s++;
        q = 0;
        if(*s == '"' || *s == '\'')
@@ -32,67 +40,110 @@ strval(char *s)
                        continue;
                break;
        }
-       if(e - s > 1)
-               return smprint("%.*s", (int)(e-s), s);
+       if((e - s) > 1)
+               return smprint("%.*s", (int)(e - s), s);
        return nil;
 }
 
 void
 main(int argc, char *argv[])
 {
-       int pfd[2], pflag = 0;
-       char *arg[4], *s, *p;
+       int n, q, pfd[2], pflag = 0;
+       char *arg[4], *s, *g, *e, *p, *a, t;
+       Rune r;
 
        ARGBEGIN {
-       case 'h':
-               usage();
        case 'c':
                cset = EARGF(usage());
                break;
        case 'p':
                pflag = 1;
                break;
+       default:
+               usage();
        } ARGEND;
 
        if(*argv){
                close(0);
-               if(open(*argv, OREAD) != 1)
+               if(open(*argv, OREAD) != 0)
                        sysfatal("open: %r");
        }
-       if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0)
-               sysfatal("read: %r");
-       buf[nbuf] = 0;
+       nbuf = 0;
+       while(nbuf < sizeof(buf)-1){
+               if((n = read(0, buf + nbuf, sizeof(buf)-1-nbuf)) <= 0)
+                       break;
+               nbuf += n;
+               buf[nbuf] = 0;
+       }
+
        p = buf;
-       while(nbuf > 0){
-               if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
-                       p += 3;
-                       cset = "utf";
+       if(nbuf >= 3 && memcmp(p, "\xEF\xBB\xBF", 3)==0){
+               p += 3;
+               nbuf -= 3;
+               cset = "utf";
+               goto Found;
+       }
+       if(nbuf >= 2 && memcmp(p, "\xFE\xFF", 2) == 0){
+               p += 2;
+               nbuf -= 2;
+               cset = "unicode-be";
+               goto Found;
+       }
+       if(nbuf >= 2 && memcmp(p, "\xFF\xFE", 2) == 0){
+               p += 2;
+               nbuf -= 2;
+               cset = "unicode-le";
+               goto Found;
+       }
+
+       s = p;
+       do {
+               if((s = strchr(s, '<')) == nil)
                        break;
+               q = 0;
+               g = ++s;
+               e = buf+nbuf;
+               while(s < e){
+                       if(*s == '=' && q == 0)
+                               q = '=';
+                       else if(*s == '\'' || *s == '"'){
+                               if(q == '=')
+                                       q = *s;
+                               else if(q == *s)
+                                       q = 0;
+                       }
+                       else if(*s == '>' && q != '\'' && q != '"'){
+                               e = s;
+                               break;
+                       }
+                       else if(q == '=' && strchr(whitespace, *s) == nil)
+                               q = 0;
+                       s++;
                }
-               if(memcmp(p, "\xFE\xFF", 2) == 0){
-                       p += 2;
-                       cset = "unicode-be";
+               t = *e;
+               *e = 0;
+               if((a = attr(g, "encoding")) != nil || (a = attr(g, "charset")) != nil)
+               if(cistrcmp(a, "utf") != 0 && cistrcmp(a, "utf-8") != 0){
+                       cset = a;
+                       *e = t;
                        break;
                }
-               if(memcmp(p, "\xFF\xFE", 2) == 0){
-                       p += 2;
-                       cset = "unicode-le";
-                       break;
+               *e = t;
+               s = ++e;
+       } while(t);
+
+       s = p;
+       while(s+UTFmax < p+nbuf){
+               s += chartorune(&r, s);
+               if(r == Runeerror){
+                       if(cset == nil)
+                               cset = "latin1";
+                       goto Found;
                }
-               if(s = cistrstr(p, "encoding="))
-                       if(s = strval(s+9)){
-                               cset = s;
-                               break;
-                       }
-               if(s = cistrstr(p, "charset="))
-                       if(s = strval(s+8)){
-                               cset = s;
-                               break;
-                       }
-               break;
        }
-       nbuf -= p - buf;
+       cset = "utf";
 
+Found:
        if(pflag){
                print("%s\n", cset);
                exits(0);
@@ -116,7 +167,7 @@ main(int argc, char *argv[])
 
                arg[0] = "rc";
                arg[1] = "-c";
-               arg[2] = smprint("{tcs -f %s | tcs -f html} || cat", cset);
+               arg[2] = smprint("{tcs -f %s || cat} | tcs -f html", cset);
                arg[3] = nil;
                exec("/bin/rc", arg);
        }