#include <ctype.h>
/*
 * File-scope state used by main(). This view is a diff: a '-' line is
 * the old definition and a '+' line is its replacement.
 */
/* number of bytes main() currently holds in buf */
int nbuf;
/*
 * Input buffer, grown from 4096 to 64*1024 bytes. The extra byte leaves
 * room for the 0 that main() stores at buf[nbuf].
 */
-char buf[4096+1];
-char *cset = "utf";
+char buf[64*1024+1];
/* charset name; the old default was "utf", the new one is nil until -c or detection in main() sets it */
+char *cset = nil;
/* bytes skipped as blanks by attr() and by the tag scanner in main() */
+char *whitespace = " \t\r\n";
/*
 * Print the command synopsis on file descriptor 2 and exit with the
 * status string "usage". The old synopsis (-) advertised -h; the new
 * one (+) advertises -p, which main() handles together with -c.
 * argv0 is defined outside this view.
 */
void
usage(void)
{
- fprint(2, "%s [ -h ] [ -c charset ] [ file ]\n", argv0);
+ fprint(2, "%s [ -p ] [ -c charset ] [ file ]\n", argv0);
exits("usage");
}
/*
 * Old form (-): strval(s) parsed a value starting at s.
 * New form (+): attr(s, a) first looks for the attribute name a inside
 * s, then parses the value that follows its '='.
 *
 * Returns a string built by smprint from the e-s bytes at s (presumably
 * allocated, so the caller owns it -- confirm), or nil when a is not
 * found, is not followed by '=' after optional blanks, or the value is
 * shorter than two bytes.
 *
 * NOTE(review): the lines between the quote test and the end of the
 * value loop are missing from this view (diff hunk gap), so how e is
 * computed and how q is used cannot be confirmed here.
 */
char*
-strval(char *s)
+attr(char *s, char *a)
{
char *e, q;
/* old code (-) skipped only tabs and spaces and had no test for the terminating 0 */
- while(strchr("\t ", *s))
/*
 * Only the first match of a is considered: if it is not followed by
 * '=', nil is returned without trying later occurrences.
 */
+ if((s = cistrstr(s, a)) == nil)
+ return nil;
+ s += strlen(a);
/* the *s test stops at the terminating 0, which strchr() alone would treat as a member of the set */
+ while(*s && strchr(whitespace, *s))
+ s++;
+ if(*s++ != '=')
+ return nil;
+ while(*s && strchr(whitespace, *s))
s++;
q = 0;
if(*s == '"' || *s == '\'')
continue;
break;
}
/* NOTE(review): "> 1" rejects one-byte values as well as empty ones -- confirm this is intended */
- if(e - s > 1)
- return smprint("%.*s", (int)(e-s), s);
+ if((e - s) > 1)
+ return smprint("%.*s", (int)(e - s), s);
return nil;
}
/*
 * Choose a character set for the data on file descriptor 0 (or for the
 * file named by the first argument), then either print the choice (-p)
 * or start an rc child that runs tcs and copy the data to file
 * descriptor 1.
 *
 * This view is a diff of main(): '-' lines are the old implementation,
 * '+' lines are the new one, unprefixed lines are shared. Some shared
 * lines appear to be missing between hunks (see the notes below).
 */
void
main(int argc, char *argv[])
{
- int pfd[2], pflag = 0;
- char *arg[4], *s;
+ int n, q, pfd[2], pflag = 0;
+ char *arg[4], *s, *g, *e, *p, *a, t;
+ Rune r;
/* -c charset presets cset, -p asks for print-only; the new code (+) sends any other flag to usage() */
ARGBEGIN {
- case 'h':
- usage();
case 'c':
cset = EARGF(usage());
break;
case 'p':
pflag = 1;
break;
+ default:
+ usage();
} ARGEND;
/* with a file argument, reopen that file in place of file descriptor 0 */
if(*argv){
close(0);
/* the old test (-) demanded descriptor 1; the new test (+) demands descriptor 0, the one just closed */
- if(open(*argv, OREAD) != 1)
+ if(open(*argv, OREAD) != 0)
sysfatal("open: %r");
}
/* old code (-): a single read(), removal of a UTF-8 BOM, then a search of buf for "encoding=" or "charset=" */
- if((nbuf = read(0, buf, sizeof(buf)-1)) < 0)
- sysfatal("read: %r");
- buf[nbuf] = 0;
-
- /* useless BOM marker */
- if(memcmp(buf, "\xEF\xBB\xBF", 3)==0)
- memmove(buf, buf+3, nbuf-3);
-
- for(;;){
- if(s = cistrstr(buf, "encoding="))
- if(s = strval(s+9)){
- cset = s;
- break;
/*
 * New code (+): keep reading until buf is full or read() returns <= 0,
 * storing a terminating 0 after every successful read.
 * NOTE(review): a negative read() result ends the loop exactly like
 * end of file; no error is reported here.
 */
+ nbuf = 0;
+ while(nbuf < sizeof(buf)-1){
+ if((n = read(0, buf + nbuf, sizeof(buf)-1-nbuf)) <= 0)
+ break;
+ nbuf += n;
+ buf[nbuf] = 0;
+ }
+
/*
 * A byte-order mark decides the charset outright, replacing any -c
 * value; p and nbuf are adjusted so the mark itself is skipped.
 */
+ p = buf;
+ if(nbuf >= 3 && memcmp(p, "\xEF\xBB\xBF", 3)==0){
+ p += 3;
+ nbuf -= 3;
+ cset = "utf";
+ goto Found;
+ }
+ if(nbuf >= 2 && memcmp(p, "\xFE\xFF", 2) == 0){
+ p += 2;
+ nbuf -= 2;
+ cset = "unicode-be";
+ goto Found;
+ }
+ if(nbuf >= 2 && memcmp(p, "\xFF\xFE", 2) == 0){
+ p += 2;
+ nbuf -= 2;
+ cset = "unicode-le";
+ goto Found;
+ }
+
/*
 * Scan each span from a '<' up to the next unquoted '>' for an
 * encoding or charset attribute. q tracks quoting: 0 outside a value,
 * '=' right after an '=', or the opening quote character while inside
 * a quoted value, so a '>' inside quotes does not end the span.
 */
+ s = p;
+ do {
+ if((s = strchr(s, '<')) == nil)
+ break;
+ q = 0;
+ g = ++s;
+ e = buf+nbuf;
+ while(s < e){
+ if(*s == '=' && q == 0)
+ q = '=';
+ else if(*s == '\'' || *s == '"'){
+ if(q == '=')
+ q = *s;
+ else if(q == *s)
+ q = 0;
}
- if(s = cistrstr(buf, "charset="))
- if(s = strval(s+8)){
- cset = s;
+ else if(*s == '>' && q != '\'' && q != '"'){
+ e = s;
break;
}
- break;
/* a non-blank byte after '=' starts an unquoted value, so the '=' state is dropped */
+ else if(q == '=' && strchr(whitespace, *s) == nil)
+ q = 0;
+ s++;
+ }
/* temporarily store 0 at e so attr() cannot look past the span; t keeps the replaced byte */
+ t = *e;
+ *e = 0;
/* a value other than utf / utf-8 (compared with cistrcmp) becomes the charset and ends the scan */
+ if((a = attr(g, "encoding")) != nil || (a = attr(g, "charset")) != nil)
+ if(cistrcmp(a, "utf") != 0 && cistrcmp(a, "utf-8") != 0){
+ cset = a;
+ *e = t;
+ break;
+ }
+ *e = t;
+ s = ++e;
/* t is 0 only when e was the terminator at buf+nbuf, so nothing is left to scan */
+ } while(t);
+
/*
 * Decode the buffered bytes with chartorune, stopping UTFmax bytes
 * short of the end. On the first Runeerror an already chosen cset is
 * kept, otherwise "latin1" is used. If no error is seen, cset becomes
 * "utf" unconditionally.
 * NOTE(review): that last assignment also replaces a -c value or a
 * charset taken from the document above -- confirm this is intended.
 */
+ s = p;
+ while(s+UTFmax < p+nbuf){
+ s += chartorune(&r, s);
+ if(r == Runeerror){
+ if(cset == nil)
+ cset = "latin1";
+ goto Found;
+ }
}
+ cset = "utf";
+Found:
/* -p: report the chosen charset and stop */
if(pflag){
print("%s\n", cset);
exits(0);
}
/* the new code (+) creates the pipe only after the empty-input check below */
- if(pipe(pfd) < 0)
- sysfatal("pipe: %r");
-
/* nothing buffered: issue a zero-length write and exit */
if(nbuf == 0){
- write(1, buf, 0);
+ write(1, p, 0);
exits(0);
}
/* the new flags (+) drop RFNOWAIT; the parent calls waitpid() before exiting (see the end of main) */
- switch(rfork(RFFDG|RFREND|RFPROC|RFNOWAIT)){
+ if(pipe(pfd) < 0)
+ sysfatal("pipe: %r");
+
+ switch(rfork(RFFDG|RFREND|RFPROC)){
case -1:
sysfatal("fork: %r");
case 0:
/*
 * Child: run the conversion command under rc. The grouping changed so
 * that the cat fallback now applies only to the first tcs, and the
 * "tcs -f html" stage always runs.
 * NOTE(review): cset is inserted into the command line unquoted and may
 * come from the document itself -- check what attr() can return.
 * NOTE(review): no dup() of pfd onto descriptors 0/1 is visible in this
 * view; those lines were probably omitted between diff hunks -- confirm
 * against the full file.
 */
arg[0] = "rc";
arg[1] = "-c";
- arg[2] = smprint("{tcs -f %s | tcs -f html} || cat", cset);
+ arg[2] = smprint("{tcs -f %s || cat} | tcs -f html", cset);
arg[3] = nil;
exec("/bin/rc", arg);
}
close(pfd[1]);
/*
 * Parent: write what is buffered to descriptor 1, then refill buf from
 * descriptor 0 until read() returns 0. The new code (+) writes from p,
 * which may have been advanced past a BOM, and resets p to buf before
 * each refill.
 */
while(nbuf > 0){
- if(write(1, buf, nbuf) != nbuf)
+ if(write(1, p, nbuf) != nbuf)
sysfatal("write: %r");
- if((nbuf = read(0, buf, sizeof(buf))) < 0)
+ p = buf;
+ if((nbuf = read(0, p, sizeof(buf))) < 0)
sysfatal("read: %r");
}
/* the new code (+) closes descriptor 1 and waits for the child before exiting */
+ close(1);
+ waitpid();
exits(0);
}