]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/file.c
merge
[plan9front.git] / sys / src / cmd / file.c
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6
7 /*
8  * file - determine type of file
9  */
/*
 * LENDIAN assembles the four bytes at p into a 32-bit little-endian value.
 * Each byte is widened to unsigned long before it is shifted, so a top
 * byte of 0x80 or more is never shifted into the sign bit of an int.
 * p is evaluated four times; pass an expression without side effects.
 */
#define LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11
12 uchar   buf[6001];
13 short   cfreq[140];
14 short   wfreq[50];
15 int     nbuf;
16 Dir*    mbuf;
17 int     fd;
18 char    *fname;
19 char    *slash;
20
/*
 * Class codes.  The first group indexes wfreq[] and labels dictionary
 * words; the second group (from Clatin up) indexes cfreq[] for runes
 * that are not plain printable ASCII.
 */
enum
{
	Cword,		/* words such as char, struct, void */
	Fword,		/* words such as subroutine, dimension, integer */
	Aword,		/* TEXT */
	Alword,		/* adt, aggr, alef */
	Lword,		/* words such as implement, module, import */
	I1,		/* the word "include" */
	I2,		/* header base names: bio, libc, stdio, u */
	I3,		/* the header suffix "h" */
	Clatin	= 128,
	Cbinary,
	Cnull,
	Ceascii,
	Cutf,
};

/*
 * Dictionary consulted by wordfreq().  wordfreq() does a binary search
 * with strcmp(), so the entries MUST stay in strcmp() order (upper case
 * sorts before lower case).  "bio" used to sit after "extern", which made
 * "bio", "data", "dimension", "double" and "extern" impossible to find.
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	{ "PATH",	Lword },
	{ "TEXT",	Aword },
	{ "adt",	Alword },
	{ "aggr",	Alword },
	{ "alef",	Alword },
	{ "array",	Lword },
	{ "bio",	I2 },
	{ "block",	Fword },
	{ "char",	Cword },
	{ "common",	Fword },
	{ "con",	Lword },
	{ "data",	Fword },
	{ "dimension",	Fword },
	{ "double",	Cword },
	{ "extern",	Cword },
	{ "float",	Cword },
	{ "fn",		Lword },
	{ "function",	Fword },
	{ "h",		I3 },
	{ "implement",	Lword },
	{ "import",	Lword },
	{ "include",	I1 },
	{ "int",	Cword },
	{ "integer",	Fword },
	{ "iota",	Lword },
	{ "libc",	I2 },
	{ "long",	Cword },
	{ "module",	Lword },
	{ "real",	Fword },
	{ "ref",	Lword },
	{ "register",	Cword },
	{ "self",	Lword },
	{ "short",	Cword },
	{ "static",	Cword },
	{ "stdio",	I2 },
	{ "struct",	Cword },
	{ "subroutine",	Fword },
	{ "u",		I2 },
	{ "void",	Cword },
};
83
/* values of the 'mode' field in language[] */
enum
{
	Normal	= 0,	/* counted by utf_count() and named by print_utf() */
	First,		/* first entry for language spanning several ranges */
	Multi,		/* later entries for such a language */
	Shared,		/* codes used in several languages */
};

/*
 * Rune ranges and the language each one suggests.  bump_utf_count()
 * binary searches this table on low/high, so entries are kept in
 * ascending order; the entry with a zero name terminates the table.
 */
struct
{
	int	mode;		/* one of the codes above */
	int	count;		/* runes seen in [low, high] for the current file */
	int	low;
	int	high;
	char	*name;
} language[] =
{
	{ Normal,	0,	0x0100,	0x01FF,	"Extended Latin" },
	{ Normal,	0,	0x0370,	0x03FF,	"Greek" },
	{ Normal,	0,	0x0400,	0x04FF,	"Cyrillic" },
	{ Normal,	0,	0x0530,	0x058F,	"Armenian" },
	{ Normal,	0,	0x0590,	0x05FF,	"Hebrew" },
	{ Normal,	0,	0x0600,	0x06FF,	"Arabic" },
	{ Normal,	0,	0x0900,	0x097F,	"Devanagari" },
	{ Normal,	0,	0x0980,	0x09FF,	"Bengali" },
	{ Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi" },
	{ Normal,	0,	0x0A80,	0x0AFF,	"Gujarati" },
	{ Normal,	0,	0x0B00,	0x0B7F,	"Oriya" },
	{ Normal,	0,	0x0B80,	0x0BFF,	"Tamil" },
	{ Normal,	0,	0x0C00,	0x0C7F,	"Telugu" },
	{ Normal,	0,	0x0C80,	0x0CFF,	"Kannada" },
	{ Normal,	0,	0x0D00,	0x0D7F,	"Malayalam" },
	{ Normal,	0,	0x0E00,	0x0E7F,	"Thai" },
	{ Normal,	0,	0x0E80,	0x0EFF,	"Lao" },
	{ Normal,	0,	0x1000,	0x105F,	"Tibetan" },
	{ Normal,	0,	0x10A0,	0x10FF,	"Georgian" },
	{ Normal,	0,	0x3040,	0x30FF,	"Japanese" },
	{ Normal,	0,	0x3100,	0x312F,	"Chinese" },
	{ First,	0,	0x3130,	0x318F,	"Korean" },
	{ Multi,	0,	0x3400,	0x3D2F,	"Korean" },
	{ Shared,	0,	0x4e00,	0x9fff,	"CJK" },
	{ Normal,	0,	0,	0,	0 },		/* terminal entry */
};
128
129
/* gross classification of the file's contents, chosen by filetype() */
enum
{
	Fascii	= 0,	/* printable ascii */
	Flatin	= 1,	/* latin 1 */
	Futf	= 2,	/* UTF character set */
	Fbinary	= 3,	/* binary */
	Feascii	= 4,	/* ASCII with control chars */
	Fnull	= 5,	/* NULL in file */
} guess;
139
140 void    bump_utf_count(Rune);
141 int     cistrncmp(char*, char*, int);
142 void    filetype(int);
143 int     getfontnum(uchar*, uchar**);
144 int     isas(void);
145 int     isc(void);
146 int     iscint(void);
147 int     isenglish(void);
148 int     ishp(void);
149 int     ishtml(void);
150 int     isrfc822(void);
151 int     ismbox(void);
152 int     islimbo(void);
153 int     ismung(void);
154 int     isp9bit(void);
155 int     isp9font(void);
156 int     isrtf(void);
157 int     ismsdos(void);
158 int     iself(void);
159 int     istring(void);
160 int     isoffstr(void);
161 int     iff(void);
162 int     long0(void);
163 int     longoff(void);
164 int     istar(void);
165 int     isface(void);
166 int     isexec(void);
167 int     p9bitnum(uchar*);
168 int     p9subfont(uchar*);
169 void    print_utf(void);
170 void    type(char*, int);
171 int     utf_count(void);
172 void    wordfreq(void);
173
/*
 * Classifiers tried by filetype(), in this order.  filetype() stops at
 * the first one that returns non-zero; the 0 entry ends the list.
 */
int     (*call[])(void) =
{
        long0,          /* recognizable by first 4 bytes */
        istring,        /* recognizable by first string */
        iself,          /* ELF (foreign) executable */
        isexec,         /* native executables */
        iff,            /* interchange file format (strings) */
        longoff,        /* recognizable by 4 bytes at some offset */
        isoffstr,       /* recognizable by string at some offset */
        isrfc822,       /* email file */
        ismbox,         /* mail box */
        istar,          /* recognizable by tar checksum */
        ishtml,         /* html keywords */
        iscint,         /* compiler/assembler intermediate */
        islimbo,        /* limbo source */
        isc,            /* c & alef compiler key words */
        isas,           /* assembler key words */
        isp9font,       /* plan 9 font */
        isp9bit,        /* plan 9 image (as from /dev/window) */
        isrtf,          /* rich text format */
        ismsdos,        /* msdos exe (virus file attachment) */
        isface,         /* ascii face file */

        /* last resorts */
        ismung,         /* entropy compressed/encrypted */
        isenglish,      /* char frequency English */
        0
};
202
/* set by the -m flag in main(): print MIME types instead of descriptions */
int mime;

/*
 * MIME replies shared by several routines.  Both already end in a
 * newline, so they are meant to be printed as-is.
 */
char OCTET[] =  "application/octet-stream\n";
char PLAIN[] =  "text/plain\n";
207
208 void
209 main(int argc, char *argv[])
210 {
211         int i, j, maxlen;
212         char *cp;
213         Rune r;
214
215         ARGBEGIN{
216         case 'm':
217                 mime = 1;
218                 break;
219         default:
220                 fprint(2, "usage: file [-m] [file...]\n");
221                 exits("usage");
222         }ARGEND;
223
224         maxlen = 0;
225         if(mime == 0 || argc > 1){
226                 for(i = 0; i < argc; i++) {
227                         for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
228                                         ;
229                         if(j > maxlen)
230                                 maxlen = j;
231                 }
232         }
233         if (argc <= 0) {
234                 if(!mime)
235                         print ("stdin: ");
236                 filetype(0);
237         }
238         else {
239                 for(i = 0; i < argc; i++)
240                         type(argv[i], maxlen);
241         }
242         exits(0);
243 }
244
245 void
246 type(char *file, int nlen)
247 {
248         Rune r;
249         int i;
250         char *p;
251
252         if(nlen > 0){
253                 slash = 0;
254                 for (i = 0, p = file; *p; i++) {
255                         if (*p == '/')                  /* find rightmost slash */
256                                 slash = p;
257                         p += chartorune(&r, p);         /* count runes */
258                 }
259                 print("%s:%*s",file, nlen-i+1, "");
260         }
261         fname = file;
262         if ((fd = open(file, OREAD)) < 0) {
263                 print("cannot open: %r\n");
264                 return;
265         }
266         filetype(fd);
267         close(fd);
268 }
269
/*
 * Unicode 4.0 runes encoded in up to four bytes: the rune type filled in
 * by chartorune1() and the length limit filetype() checks against.
 */
enum {
	UTFmax1 = 4,	/* longest encoded rune, in bytes */
};

typedef int Rune1;
278
/*
 * Report whether the n bytes at p are enough to hold one whole encoded
 * rune, judging only by the first byte: below 0x80 needs 1 byte, below
 * 0xE0 needs 2, below 0xF0 needs 3, anything else needs 4.
 * Returns 1 if so, 0 otherwise (always 0 when n < 1).
 */
int
fullrune1(char *p, int n)
{
	unsigned char lead;
	int need;

	if(n <= 0)
		return 0;
	lead = (unsigned char)*p;
	if(lead < 0x80)
		need = 1;
	else if(lead < 0xE0)
		need = 2;
	else if(lead < 0xF0)
		need = 3;
	else
		need = 4;
	return n >= need;
}
297
298 int
299 chartorune1(Rune1 *rune, char *str)
300 {
301         int c, c1, c2, c3, n;
302         Rune r;
303
304         c = *(uchar*)str;
305         if(c < 0xF0){
306                 r = 0;
307                 n = chartorune(&r, str);
308                 *rune = r;
309                 return n;
310         }
311         c &= ~0xF0;
312         c1 = *(uchar*)(str+1) & ~0x80;
313         c2 = *(uchar*)(str+2) & ~0x80;
314         c3 = *(uchar*)(str+3) & ~0x80;
315         n = (c<<18) | (c1<<12) | (c2<<6) | c3;
316         if(n < 0x10000 || n > 0x10FFFF){
317                 *rune = Runeerror;
318                 return 1;
319         }
320         *rune = n;
321         return 4;
322 }
323
324 void
325 filetype(int fd)
326 {
327         Rune1 r;
328         int i, f, n;
329         char *p, *eob;
330
331         free(mbuf);
332         mbuf = dirfstat(fd);
333         if(mbuf == nil){
334                 print("cannot stat: %r\n");
335                 return;
336         }
337         if(mbuf->mode & DMDIR) {
338                 print(mime ? OCTET : "directory\n");
339                 return;
340         }
341         if(mbuf->type != 'M' && mbuf->type != '|') {
342                 print(mime ? OCTET : "special file #%C/%s\n",
343                         mbuf->type, mbuf->name);
344                 return;
345         }
346         /* may be reading a pipe on standard input */
347         nbuf = readn(fd, buf, sizeof(buf)-1);
348         if(nbuf < 0) {
349                 print("cannot read: %r\n");
350                 return;
351         }
352         if(nbuf == 0) {
353                 print(mime ? PLAIN : "empty file\n");
354                 return;
355         }
356         buf[nbuf] = 0;
357
358         /*
359          * build histogram table
360          */
361         memset(cfreq, 0, sizeof(cfreq));
362         for (i = 0; language[i].name; i++)
363                 language[i].count = 0;
364         eob = (char *)buf+nbuf;
365         for(n = 0, p = (char *)buf; p < eob; n++) {
366                 if (!fullrune1(p, eob-p) && eob-p < UTFmax1)
367                         break;
368                 p += chartorune1(&r, p);
369                 if (r == 0)
370                         f = Cnull;
371                 else if (r <= 0x7f) {
372                         if (!isprint(r) && !isspace(r))
373                                 f = Ceascii;    /* ASCII control char */
374                         else f = r;
375                 } else if (r == 0x80) {
376                         bump_utf_count(r);
377                         f = Cutf;
378                 } else if (r < 0xA0)
379                         f = Cbinary;    /* Invalid Runes */
380                 else if (r <= 0xff)
381                         f = Clatin;     /* Latin 1 */
382                 else {
383                         bump_utf_count(r);
384                         f = Cutf;               /* UTF extension */
385                 }
386                 cfreq[f]++;                     /* ASCII chars peg directly */
387         }
388         /*
389          * gross classify
390          */
391         if (cfreq[Cbinary])
392                 guess = Fbinary;
393         else if (cfreq[Cutf])
394                 guess = Futf;
395         else if (cfreq[Clatin])
396                 guess = Flatin;
397         else if (cfreq[Ceascii])
398                 guess = Feascii;
399         else if (cfreq[Cnull])
400                 guess = Fbinary;
401         else
402                 guess = Fascii;
403         /*
404          * lookup dictionary words
405          */
406         memset(wfreq, 0, sizeof(wfreq));
407         if(guess == Fascii || guess == Flatin || guess == Futf)
408                 wordfreq();
409         /*
410          * call individual classify routines
411          */
412         for(i=0; call[i]; i++)
413                 if((*call[i])())
414                         return;
415
416         /*
417          * if all else fails,
418          * print out gross classification
419          */
420         if (nbuf < 100 && !mime)
421                 print(mime ? PLAIN : "short ");
422         if (guess == Fascii)
423                 print(mime ? PLAIN : "Ascii\n");
424         else if (guess == Feascii)
425                 print(mime ? PLAIN : "extended ascii\n");
426         else if (guess == Flatin)
427                 print(mime ? PLAIN : "latin ascii\n");
428         else if (guess == Futf && utf_count() < 4)
429                 print_utf();
430         else print(mime ? OCTET : "binary\n");
431 }
432
433 void
434 bump_utf_count(Rune r)
435 {
436         int low, high, mid;
437
438         high = sizeof(language)/sizeof(language[0])-1;
439         for (low = 0; low < high;) {
440                 mid = (low+high)/2;
441                 if (r >= language[mid].low) {
442                         if (r <= language[mid].high) {
443                                 language[mid].count++;
444                                 break;
445                         } else low = mid+1;
446                 } else high = mid;
447         }
448 }
449
450 int
451 utf_count(void)
452 {
453         int i, count;
454
455         count = 0;
456         for (i = 0; language[i].name; i++)
457                 if (language[i].count > 0)
458                         switch (language[i].mode) {
459                         case Normal:
460                         case First:
461                                 count++;
462                                 break;
463                         default:
464                                 break;
465                         }
466         return count;
467 }
468
469 int
470 chkascii(void)
471 {
472         int i;
473
474         for (i = 'a'; i < 'z'; i++)
475                 if (cfreq[i])
476                         return 1;
477         for (i = 'A'; i < 'Z'; i++)
478                 if (cfreq[i])
479                         return 1;
480         return 0;
481 }
482
483 int
484 find_first(char *name)
485 {
486         int i;
487
488         for (i = 0; language[i].name != 0; i++)
489                 if (language[i].mode == First
490                         && strcmp(language[i].name, name) == 0)
491                         return i;
492         return -1;
493 }
494
495 void
496 print_utf(void)
497 {
498         int i, printed, j;
499
500         if(mime){
501                 print(PLAIN);
502                 return;
503         }
504         if (chkascii()) {
505                 printed = 1;
506                 print("Ascii");
507         } else
508                 printed = 0;
509         for (i = 0; language[i].name; i++)
510                 if (language[i].count) {
511                         switch(language[i].mode) {
512                         case Multi:
513                                 j = find_first(language[i].name);
514                                 if (j < 0)
515                                         break;
516                                 if (language[j].count > 0)
517                                         break;
518                                 /* Fall through */
519                         case Normal:
520                         case First:
521                                 if (printed)
522                                         print(" & ");
523                                 else printed = 1;
524                                 print("%s", language[i].name);
525                                 break;
526                         case Shared:
527                         default:
528                                 break;
529                         }
530                 }
531         if(!printed)
532                 print("UTF");
533         print(" text\n");
534 }
535
536 void
537 wordfreq(void)
538 {
539         int low, high, mid, r;
540         uchar *p, *p2, c;
541
542         p = buf;
543         for(;;) {
544                 while (p < buf+nbuf && !isalpha(*p))
545                         p++;
546                 if (p >= buf+nbuf)
547                         return;
548                 p2 = p;
549                 while(p < buf+nbuf && isalpha(*p))
550                         p++;
551                 c = *p;
552                 *p = 0;
553                 high = sizeof(dict)/sizeof(dict[0]);
554                 for(low = 0;low < high;) {
555                         mid = (low+high)/2;
556                         r = strcmp(dict[mid].word, (char*)p2);
557                         if(r == 0) {
558                                 wfreq[dict[mid].class]++;
559                                 break;
560                         }
561                         if(r < 0)
562                                 low = mid+1;
563                         else
564                                 high = mid;
565                 }
566                 *p++ = c;
567         }
568 }
569
typedef struct Filemagic Filemagic;
/*
 * One magic-number rule: a value v matches when (v & mask) == x.
 * filemagic() passes desc or mime straight to print() as the format.
 */
struct Filemagic {
        ulong x;        /* expected value after masking */
        ulong mask;     /* bits that take part in the comparison */
        char *desc;     /* printed when not in mime mode */
        char *mime;     /* printed in mime mode */
};
577
578 /*
579  * integers in this table must be as seen on a little-endian machine
580  * when read from a file.
581  */
582 Filemagic long0tab[] = {
583         0xF16DF16D,     0xFFFFFFFF,     "pac1 audio file\n",    OCTET,
584         /* "pac1" */
585         0x31636170,     0xFFFFFFFF,     "pac3 audio file\n",    OCTET,
586         /* "pXc2 */
587         0x32630070,     0xFFFF00FF,     "pac4 audio file\n",    OCTET,
588         0xBA010000,     0xFFFFFFFF,     "mpeg system stream\n", OCTET,
589         0x43614c66,     0xFFFFFFFF,     "FLAC audio file\n",    OCTET,
590         0x30800CC0,     0xFFFFFFFF,     "inferno .dis executable\n", OCTET,
591         0x04034B50,     0xFFFFFFFF,     "zip archive\n", "application/zip",
592         070707,         0xFFFF,         "cpio archive\n", OCTET,
593         0x2F7,          0xFFFF,         "tex dvi\n", "application/dvi",
594         0xfaff,         0xfeff,         "mp3 audio\n",  "audio/mpeg",
595         0xfeff0000,     0xffffffff,     "utf-32be\n",   "text/plain charset=utf-32be",
596         0xfffe,         0xffffffff,     "utf-32le\n",   "text/plain charset=utf-32le",
597         0xfeff,         0xffff,         "utf-16be\n",   "text/plain charset=utf-16be",
598         0xfffe,         0xffff,         "utf-16le\n",   "text/plain charset=utf-16le",
599         /* 0xfeedface: this could alternately be a Next Plan 9 boot image */
600         0xcefaedfe,     0xFFFFFFFF,     "32-bit power Mach-O executable\n", OCTET,
601         /* 0xfeedfacf */
602         0xcffaedfe,     0xFFFFFFFF,     "64-bit power Mach-O executable\n", OCTET,
603         /* 0xcefaedfe */
604         0xfeedface,     0xFFFFFFFF,     "386 Mach-O executable\n", OCTET,
605         /* 0xcffaedfe */
606         0xfeedfacf,     0xFFFFFFFF,     "amd64 Mach-O executable\n", OCTET,
607         /* 0xcafebabe */
608         0xbebafeca,     0xFFFFFFFF,     "Mach-O universal executable\n", OCTET,
609         /*
610          * venti & fossil magic numbers are stored big-endian on disk,
611          * thus the numbers appear reversed in this table.
612          */
613         0xad4e5cd1,     0xFFFFFFFF,     "venti arena\n", OCTET,
614 };
615
616 int
617 filemagic(Filemagic *tab, int ntab, ulong x)
618 {
619         int i;
620
621         for(i=0; i<ntab; i++)
622                 if((x&tab[i].mask) == tab[i].x){
623                         print(mime ? tab[i].mime : tab[i].desc);
624                         return 1;
625                 }
626         return 0;
627 }
628
629 int
630 long0(void)
631 {
632         return filemagic(long0tab, nelem(long0tab), LENDIAN(buf));
633 }
634
typedef struct Fileoffmag Fileoffmag;
/* a magic-number rule applied to the bytes found at a file offset */
struct Fileoffmag {
        ulong   off;    /* offset fileoffmagic() seeks to before reading */
        Filemagic;      /* unnamed member: x, mask, desc, mime are used directly */
};
640
/*
 * Magic numbers found at a given offset into the file; each row is
 * offset, value, mask, description, mime string.
 *
 * integers in this table must be as seen on a little-endian machine
 * when read from a file.
 */
Fileoffmag longofftab[] = {
        /*
         * venti & fossil magic numbers are stored big-endian on disk,
         * thus the numbers appear reversed in this table.
         */
        256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET,
        256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET,
        128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET,
        /* 0x31647542 is the bytes 'B' 'u' 'd' '1' in file order */
        4,        0x31647542, 0xFFFFFFFF, "OS X finder properties\n", OCTET,
};
655
656 int
657 fileoffmagic(Fileoffmag *tab, int ntab)
658 {
659         int i;
660         ulong x;
661         Fileoffmag *tp;
662         uchar buf[sizeof(long)];
663
664         for(i=0; i<ntab; i++) {
665                 tp = tab + i;
666                 seek(fd, tp->off, 0);
667                 if (readn(fd, buf, sizeof buf) != sizeof buf)
668                         continue;
669                 x = LENDIAN(buf);
670                 if((x&tp->mask) == tp->x){
671                         print(mime? tp->mime: tp->desc);
672                         return 1;
673                 }
674         }
675         return 0;
676 }
677
678 int
679 longoff(void)
680 {
681         return fileoffmagic(longofftab, nelem(longofftab));
682 }
683
684 int
685 isexec(void)
686 {
687         Fhdr f;
688
689         seek(fd, 0, 0);         /* reposition to start of file */
690         if(crackhdr(fd, &f)) {
691                 print(mime ? OCTET : "%s\n", f.name);
692                 return 1;
693         }
694         return 0;
695 }
696
697
/* from tar.c */
enum { NAMSIZ = 100, TBLOCK = 512 };

/*
 * One tar header block, viewed either as TBLOCK raw bytes (dummy, which
 * checksum() sums) or as the header fields (dbuf).
 */
union   hblock
{
        char    dummy[TBLOCK];
        struct  header
        {
                char    name[NAMSIZ];
                char    mode[8];
                char    uid[8];
                char    gid[8];
                char    size[12];
                char    mtime[12];
                char    chksum[8];      /* parsed as octal by istar() */
                char    linkflag;
                char    linkname[NAMSIZ];
                /* rest are defined by POSIX's ustar format; see p1003.2b */
                char    magic[6];       /* "ustar" */
                char    version[2];
                char    uname[32];
                char    gname[32];
                char    devmajor[8];
                char    devminor[8];
                char    prefix[155];  /* if non-null, path = prefix "/" name */
        } dbuf;
};
725
726 int
727 checksum(union hblock *hp)
728 {
729         int i;
730         char *cp;
731         struct header *hdr = &hp->dbuf;
732
733         for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
734                 *cp = ' ';
735         i = 0;
736         for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
737                 i += *cp & 0xff;
738         return i;
739 }
740
741 int
742 istar(void)
743 {
744         int chksum;
745         char tblock[TBLOCK];
746         union hblock *hp = (union hblock *)tblock;
747         struct header *hdr = &hp->dbuf;
748
749         seek(fd, 0, 0);         /* reposition to start of file */
750         if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
751                 return 0;
752         chksum = strtol(hdr->chksum, 0, 8);
753         if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
754                 if (strcmp(hdr->magic, "ustar") == 0)
755                         print(mime? "application/x-ustar\n":
756                                 "posix tar archive\n");
757                 else
758                         print(mime? "application/x-tar\n": "tar archive\n");
759                 return 1;
760         }
761         return 0;
762 }
763
/*
 * initial words to classify file
 *
 * istring() compares the first `length' bytes of the file with key
 * (length -1 means strlen(key)) and prints filetype or mime followed by
 * a newline.  The strings here must therefore NOT end in a newline; in
 * particular OCTET, which carries its own, must not be used.
 * The first matching entry wins, so longer keys precede their prefixes.
 */
struct	FILE_STRING
{
	char	*key;
	char	*filetype;
	int	length;
	char	*mime;
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "#!/bin/rc",		"rc executable file",		9,	"text/plain" },
	{ "#!/bin/sh",		"sh executable file",		9,	"text/plain" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image",			3,	"image/gif" },
	{ "\0PC Research, Inc\0", "ghostscript fax file",	18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033E\033",		"HP PCL printer data",		3,	"application/octet-stream" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ "ID3",		"mp3 audio with id3",		3,	"audio/mpeg" },
	{ "\211PNG",		"PNG image",			4,	"image/png" },
	{ "P3\n",		"ppm",				3,	"image/ppm" },
	{ "P6\n",		"ppm",				3,	"image/ppm" },
	/* the XPM header comment marks an X pixmap, not an X bitmap */
	{ "/* XPM */\n",	"xpm",				10,	"image/x-xpixmap" },
	{ ".HTML ",		"troff -ms input",		6,	"text/troff" },
	{ ".LP",		"troff -ms input",		3,	"text/troff" },
	{ ".ND",		"troff -ms input",		3,	"text/troff" },
	{ ".PP",		"troff -ms input",		3,	"text/troff" },
	{ ".TL",		"troff -ms input",		3,	"text/troff" },
	{ ".TR",		"troff -ms input",		3,	"text/troff" },
	{ ".TH",		"manual page",			3,	"text/troff" },
	{ ".\\\"",		"troff input",			3,	"text/troff" },
	{ ".de",		"troff input",			3,	"text/troff" },
	{ ".if",		"troff input",			3,	"text/troff" },
	{ ".nr",		"troff input",			3,	"text/troff" },
	{ ".tr",		"troff input",			3,	"text/troff" },
	{ "vac:",		"venti score",			4,	"text/plain" },
	{ "-----BEGIN CERTIFICATE-----\n",
				"pem certificate",		-1,	"text/plain" },
	{ "-----BEGIN TRUSTED CERTIFICATE-----\n",
				"pem trusted certificate",	-1,	"text/plain" },
	{ "-----BEGIN X509 CERTIFICATE-----\n",
				"pem x.509 certificate",	-1,	"text/plain" },
	{ "subject=/C=",	"pem certificate with header",	-1,	"text/plain" },
	{ "process snapshot ",	"process snapshot",		-1,	"application/snapfs" },
	{ 0, 0, 0, 0 }
};
830
831 int
832 istring(void)
833 {
834         int i, l;
835         struct FILE_STRING *p;
836
837         for(p = file_string; p->key; p++) {
838                 l = p->length;
839                 if(l == -1)
840                         l = strlen(p->key);
841                 if(nbuf >= l && memcmp(buf, p->key, l) == 0) {
842                         if(mime)
843                                 print("%s\n", p->mime);
844                         else
845                                 print("%s\n", p->filetype);
846                         return 1;
847                 }
848         }
849         if(strncmp((char*)buf, "TYPE=", 5) == 0) {      /* td */
850                 for(i = 5; i < nbuf; i++)
851                         if(buf[i] == '\n')
852                                 break;
853                 if(mime)
854                         print(OCTET);
855                 else
856                         print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
857                 return 1;
858         }
859         return 0;
860 }
861
862 struct offstr
863 {
864         ulong   off;
865         struct FILE_STRING;
866 } offstrs[] = {
867         32*1024, "\001CD001\001",       "ISO9660 CD image",     7,      OCTET,
868         0, 0, 0, 0, 0
869 };
870
871 int
872 isoffstr(void)
873 {
874         int n;
875         char buf[256];
876         struct offstr *p;
877
878         for(p = offstrs; p->key; p++) {
879                 seek(fd, p->off, 0);
880                 n = p->length;
881                 if (n > sizeof buf)
882                         n = sizeof buf;
883                 if (readn(fd, buf, n) != n)
884                         continue;
885                 if(memcmp(buf, p->key, n) == 0) {
886                         if(mime)
887                                 print("%s\n", p->mime);
888                         else
889                                 print("%s\n", p->filetype);
890                         return 1;
891                 }
892         }
893         return 0;
894 }
895
896 int
897 iff(void)
898 {
899         if (strncmp((char*)buf, "FORM", 4) == 0 &&
900             strncmp((char*)buf+8, "AIFF", 4) == 0) {
901                 print("%s\n", mime? "audio/x-aiff": "aiff audio");
902                 return 1;
903         }
904         if (strncmp((char*)buf, "RIFF", 4) == 0) {
905                 if (strncmp((char*)buf+8, "WAVE", 4) == 0)
906                         print("%s\n", mime? "audio/wave": "wave audio");
907                 else if (strncmp((char*)buf+8, "AVI ", 4) == 0)
908                         print("%s\n", mime? "video/avi": "avi video");
909                 else
910                         print("%s\n", mime? "application/octet-stream":
911                                 "riff file");
912                 return 1;
913         }
914         return 0;
915 }
916
/*
 * Tag names ishtml() looks for between '<' and '>'.
 * The list is terminated by a nil pointer.
 */
char*	html_string[] =
{
	"title", "body", "head", "strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl", "br", "em",
	0,
};
936
937 int
938 ishtml(void)
939 {
940         uchar *p, *q;
941         int i, count;
942
943                 /* compare strings between '<' and '>' to html table */
944         count = 0;
945         p = buf;
946         for(;;) {
947                 while (p < buf+nbuf && *p != '<')
948                         p++;
949                 p++;
950                 if (p >= buf+nbuf)
951                         break;
952                 if(*p == '/')
953                         p++;
954                 q = p;
955                 while(p < buf+nbuf && *p != '>')
956                         p++;
957                 if (p >= buf+nbuf)
958                         break;
959                 for(i = 0; html_string[i]; i++) {
960                         if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
961                                 if(count++ > 4) {
962                                         print(mime ? "text/html\n" : "HTML file\n");
963                                         return 1;
964                                 }
965                                 break;
966                         }
967                 }
968                 p++;
969         }
970         return 0;
971 }
972
/*
 * Header field names counted by isrfc822(); each is compared
 * against the start of a line with cistrncmp(), so case does not
 * matter.  The mail header is spelled "Reply-To:"; the older
 * "reply to:" entry is kept so nothing that matched before stops
 * matching.  The list is terminated by a nil pointer.
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",
	"reply to:",
	"sender:",
	0,
};
984
985 int
986 isrfc822(void)
987 {
988
989         char *p, *q, *r;
990         int i, count;
991
992         count = 0;
993         p = (char*)buf;
994         for(;;) {
995                 q = strchr(p, '\n');
996                 if(q == nil)
997                         break;
998                 *q = 0;
999                 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
1000                         count++;
1001                         *q = '\n';
1002                         p = q+1;
1003                         continue;
1004                 }
1005                 *q = '\n';
1006                 if(*p != '\t' && *p != ' '){
1007                         r = strchr(p, ':');
1008                         if(r == 0 || r > q)
1009                                 break;
1010                         for(i = 0; rfc822_string[i]; i++) {
1011                                 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
1012                                         count++;
1013                                         break;
1014                                 }
1015                         }
1016                 }
1017                 p = q+1;
1018         }
1019         if(count >= 3){
1020                 print(mime ? "message/rfc822\n" : "email file\n");
1021                 return 1;
1022         }
1023         return 0;
1024 }
1025
1026 int
1027 ismbox(void)
1028 {
1029         char *p, *q;
1030
1031         p = (char*)buf;
1032         q = strchr(p, '\n');
1033         if(q == nil)
1034                 return 0;
1035         *q = 0;
1036         if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
1037                 print(mime ? "text/plain\n" : "mail box\n");
1038                 return 1;
1039         }
1040         *q = '\n';
1041         return 0;
1042 }
1043
1044 int
1045 iscint(void)
1046 {
1047         int type;
1048         char *name;
1049         Biobuf b;
1050
1051         if(Binit(&b, fd, OREAD) == Beof)
1052                 return 0;
1053         seek(fd, 0, 0);
1054         type = objtype(&b, &name);
1055         if(type < 0)
1056                 return 0;
1057         if(mime)
1058                 print(OCTET);
1059         else
1060                 print("%s intermediate\n", name);
1061         return 1;
1062 }
1063
1064 int
1065 isc(void)
1066 {
1067         int n;
1068
1069         n = wfreq[I1];
1070         /*
1071          * includes
1072          */
1073         if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1074                 goto yes;
1075         if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1076                 goto yes;
1077         /*
1078          * declarations
1079          */
1080         if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
1081                 goto yes;
1082         /*
1083          * assignments
1084          */
1085         if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
1086                 goto yes;
1087         return 0;
1088
1089 yes:
1090         if(mime){
1091                 print(PLAIN);
1092                 return 1;
1093         }
1094         if(wfreq[Alword] > 0)
1095                 print("alef program\n");
1096         else
1097                 print("c program\n");
1098         return 1;
1099 }
1100
1101 int
1102 islimbo(void)
1103 {
1104
1105         /*
1106          * includes
1107          */
1108         if(wfreq[Lword] < 4)
1109                 return 0;
1110         print(mime ? PLAIN : "limbo program\n");
1111         return 1;
1112 }
1113
1114 int
1115 isas(void)
1116 {
1117
1118         /*
1119          * includes
1120          */
1121         if(wfreq[Aword] < 2)
1122                 return 0;
1123         print(mime ? PLAIN : "as program\n");
1124         return 1;
1125 }
1126
1127 /*
1128  * low entropy means encrypted
1129  */
1130 int
1131 ismung(void)
1132 {
1133         int i, bucket[8];
1134         float cs;
1135
1136         if(nbuf < 64)
1137                 return 0;
1138         memset(bucket, 0, sizeof(bucket));
1139         for(i=nbuf-64; i<nbuf; i++)
1140                 bucket[(buf[i]>>5)&07] += 1;
1141
1142         cs = 0.;
1143         for(i=0; i<8; i++)
1144                 cs += (bucket[i]-8)*(bucket[i]-8);
1145         cs /= 8.;
1146         if(cs <= 24.322) {
1147                 if(buf[0]==0x1f && buf[1]==0x9d)
1148                         print(mime ? OCTET : "compressed\n");
1149                 else
1150                 if(buf[0]==0x1f && buf[1]==0x8b)
1151                         print(mime ? OCTET : "gzip compressed\n");
1152                 else
1153                 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h')
1154                         print(mime ? OCTET : "bzip2 compressed\n");
1155                 else
1156                         print(mime ? OCTET : "encrypted\n");
1157                 return 1;
1158         }
1159         return 0;
1160 }
1161
1162 /*
1163  * english by punctuation and frequencies
1164  */
1165 int
1166 isenglish(void)
1167 {
1168         int vow, comm, rare, badpun, punct;
1169         char *p;
1170
1171         if(guess != Fascii && guess != Feascii)
1172                 return 0;
1173         badpun = 0;
1174         punct = 0;
1175         for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
1176                 switch(*p) {
1177                 case '.':
1178                 case ',':
1179                 case ')':
1180                 case '%':
1181                 case ';':
1182                 case ':':
1183                 case '?':
1184                         punct++;
1185                         if(p[1] != ' ' && p[1] != '\n')
1186                                 badpun++;
1187                 }
1188         if(badpun*5 > punct)
1189                 return 0;
1190         if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])       /* shell file test */
1191                 return 0;
1192         if(2*cfreq[';'] > cfreq['e'])
1193                 return 0;
1194
1195         vow = 0;
1196         for(p="AEIOU"; *p; p++) {
1197                 vow += cfreq[*p];
1198                 vow += cfreq[tolower(*p)];
1199         }
1200         comm = 0;
1201         for(p="ETAION"; *p; p++) {
1202                 comm += cfreq[*p];
1203                 comm += cfreq[tolower(*p)];
1204         }
1205         rare = 0;
1206         for(p="VJKQXZ"; *p; p++) {
1207                 rare += cfreq[*p];
1208                 rare += cfreq[tolower(*p)];
1209         }
1210         if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
1211                 print(mime ? PLAIN : "English text\n");
1212                 return 1;
1213         }
1214         return 0;
1215 }
1216
1217 /*
1218  * pick up a number with
1219  * syntax _*[0-9]+_
1220  */
1221 #define P9BITLEN        12
1222 int
1223 p9bitnum(uchar *bp)
1224 {
1225         int n, c, len;
1226
1227         len = P9BITLEN;
1228         while(*bp == ' ') {
1229                 bp++;
1230                 len--;
1231                 if(len <= 0)
1232                         return -1;
1233         }
1234         n = 0;
1235         while(len > 1) {
1236                 c = *bp++;
1237                 if(!isdigit(c))
1238                         return -1;
1239                 n = n*10 + c-'0';
1240                 len--;
1241         }
1242         if(*bp != ' ')
1243                 return -1;
1244         return n;
1245 }
1246
/*
 * Decode the first 12-byte field of an image header into a depth
 * in bits per pixel.  Leading blanks are skipped.
 *  - If the field starts with a digit it is taken as a number n
 *    and the depth is 1<<n; *newp is set to 0.
 *  - Otherwise the field is read as letter/number pairs
 *    (e.g. "r8g8b8") and the depth is the sum of the numbers;
 *    *newp is set to 1.  The sum is accepted only if it is a
 *    multiple of 8 or a divisor of 8.
 * Returns -1 for a blank field, for a shift count that would not
 * fit in an int, and for a sum that is zero or otherwise
 * unacceptable (a zero depth would make callers divide by zero).
 */
int
depthof(char *s, int *newp)
{
	char *es;
	long ld;
	int d;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtol(s, 0, 0);
		if(ld < 0 || ld >= 31)	/* 1<<ld must stay a positive int */
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;			/* skip letter */
		d += strtoul(s, &s, 10);
	}

	if(d <= 0)			/* no digits at all, or wrapped */
		return -1;
	if(d % 8 == 0 || 8 % d == 0)
		return d;
	return -1;
}
1274
1275 int
1276 isp9bit(void)
1277 {
1278         int dep, lox, loy, hix, hiy, px, new, cmpr;
1279         ulong t;
1280         long len;
1281         char *newlabel;
1282         uchar *cp;
1283
1284         cp = buf;
1285         cmpr = 0;
1286         newlabel = "old ";
1287
1288         if(memcmp(cp, "compressed\n", 11) == 0) {
1289                 cmpr = 1;
1290                 cp = buf + 11;
1291         }
1292
1293         dep = depthof((char*)cp + 0*P9BITLEN, &new);
1294         if(new)
1295                 newlabel = "";
1296         lox = p9bitnum(cp + 1*P9BITLEN);
1297         loy = p9bitnum(cp + 2*P9BITLEN);
1298         hix = p9bitnum(cp + 3*P9BITLEN);
1299         hiy = p9bitnum(cp + 4*P9BITLEN);
1300         if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1301                 return 0;
1302
1303         if(dep < 8){
1304                 px = 8/dep;             /* pixels per byte */
1305                 /* set l to number of bytes of data per scan line */
1306                 if(lox >= 0)
1307                         len = (hix+px-1)/px - lox/px;
1308                 else{                   /* make positive before divide */
1309                         t = (-lox)+px-1;
1310                         t = (t/px)*px;
1311                         len = (t+hix+px-1)/px;
1312                 }
1313         }else
1314                 len = (hix-lox)*dep/8;
1315         len *= hiy - loy;               /* col length */
1316         len += 5 * P9BITLEN;            /* size of initial ascii */
1317
1318         /*
1319          * for compressed images, don't look any further. otherwise:
1320          * for image file, length is non-zero and must match calculation above.
1321          * for /dev/window and /dev/screen the length is always zero.
1322          * for subfont, the subfont header should follow immediately.
1323          */
1324         if (cmpr) {
1325                 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n",
1326                         newlabel, dep);
1327                 return 1;
1328         }
1329         /*
1330          * mbuf->length == 0 probably indicates reading a pipe.
1331          * Ghostscript sometimes produces a little extra on the end.
1332          */
1333         if (len != 0 && (mbuf->length == 0 || mbuf->length == len ||
1334             mbuf->length > len && mbuf->length < len+P9BITLEN)) {
1335                 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep);
1336                 return 1;
1337         }
1338         if (p9subfont(buf+len)) {
1339                 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep);
1340                 return 1;
1341         }
1342         return 0;
1343 }
1344
1345 int
1346 p9subfont(uchar *p)
1347 {
1348         int n, h, a;
1349
1350         /* if image too big, assume it's a subfont */
1351         if (p+3*P9BITLEN > buf+sizeof(buf))
1352                 return 1;
1353
1354         n = p9bitnum(p + 0*P9BITLEN);   /* char count */
1355         if (n < 0)
1356                 return 0;
1357         h = p9bitnum(p + 1*P9BITLEN);   /* height */
1358         if (h < 0)
1359                 return 0;
1360         a = p9bitnum(p + 2*P9BITLEN);   /* ascent */
1361         if (a < 0)
1362                 return 0;
1363         return 1;
1364 }
1365
1366 #define WHITESPACE(c)           ((c) == ' ' || (c) == '\t' || (c) == '\n')
1367
1368 int
1369 isp9font(void)
1370 {
1371         uchar *cp, *p;
1372         int i, n;
1373         char pathname[1024];
1374
1375         cp = buf;
1376         if (!getfontnum(cp, &cp))       /* height */
1377                 return 0;
1378         if (!getfontnum(cp, &cp))       /* ascent */
1379                 return 0;
1380         for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) {
1381                 if (!getfontnum(cp, &cp))       /* min */
1382                         break;
1383                 if (!getfontnum(cp, &cp))       /* max */
1384                         return 0;
1385                 getfontnum(cp, &cp);    /* optional offset */
1386                 while (WHITESPACE(*cp))
1387                         cp++;
1388                 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1389                                 ;
1390                         /* construct a path name, if needed */
1391                 n = 0;
1392                 if (*p != '/' && slash) {
1393                         n = slash-fname+1;
1394                         if (n < sizeof(pathname))
1395                                 memcpy(pathname, fname, n);
1396                         else n = 0;
1397                 }
1398                 if (n+cp-p+4 < sizeof(pathname)) {
1399                         memcpy(pathname+n, p, cp-p);
1400                         n += cp-p;
1401                         pathname[n] = 0;
1402                         if (access(pathname, AEXIST) < 0) {
1403                                 strcpy(pathname+n, ".0");
1404                                 if (access(pathname, AEXIST) < 0)
1405                                         return 0;
1406                         }
1407                 }
1408         }
1409         if (i) {
1410                 print(mime ? "text/plain\n" : "font file\n");
1411                 return 1;
1412         }
1413         return 0;
1414 }
1415
1416 int
1417 getfontnum(uchar *cp, uchar **rp)
1418 {
1419         while (WHITESPACE(*cp))         /* extract ulong delimited by whitespace */
1420                 cp++;
1421         if (*cp < '0' || *cp > '9')
1422                 return 0;
1423         strtoul((char *)cp, (char **)rp, 0);
1424         if (!WHITESPACE(**rp)) {
1425                 *rp = cp;
1426                 return 0;
1427         }
1428         return 1;
1429 }
1430
1431 int
1432 isrtf(void)
1433 {
1434         if(strstr((char *)buf, "\\rtf1")){
1435                 print(mime ? "application/rtf\n" : "rich text format\n");
1436                 return 1;
1437         }
1438         return 0;
1439 }
1440
1441 int
1442 ismsdos(void)
1443 {
1444         if (buf[0] == 0x4d && buf[1] == 0x5a){
1445                 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1446                 return 1;
1447         }
1448         return 0;
1449 }
1450
1451 int
1452 iself(void)
1453 {
1454         static char *cpu[] = {          /* NB: incomplete and arbitary list */
1455         [1]     "WE32100",
1456         [2]     "SPARC",
1457         [3]     "i386",
1458         [4]     "M68000",
1459         [5]     "M88000",
1460         [6]     "i486",
1461         [7]     "i860",
1462         [8]     "R3000",
1463         [9]     "S370",
1464         [10]    "R4000",
1465         [15]    "HP-PA",
1466         [18]    "sparc v8+",
1467         [19]    "i960",
1468         [20]    "PPC-32",
1469         [21]    "PPC-64",
1470         [40]    "ARM",
1471         [41]    "Alpha",
1472         [43]    "sparc v9",
1473         [50]    "IA-64",
1474         [62]    "AMD64",
1475         [75]    "VAX",
1476         };
1477         static char *type[] = {
1478         [1]     "relocatable object",
1479         [2]     "executable",
1480         [3]     "shared library",
1481         [4]     "core dump",
1482         };
1483
1484         if (memcmp(buf, "\x7fELF", 4) == 0){
1485                 if (!mime){
1486                         int isdifend = 0;
1487                         int n = (buf[19] << 8) | buf[18];
1488                         char *p = "unknown";
1489                         char *t = "unknown";
1490
1491                         if (n > 0 && n < nelem(cpu) && cpu[n])
1492                                 p = cpu[n];
1493                         else {
1494                                 /* try the other byte order */
1495                                 isdifend = 1;
1496                                 n = (buf[18] << 8) | buf[19];
1497                                 if (n > 0 && n < nelem(cpu) && cpu[n])
1498                                         p = cpu[n];
1499                         }
1500                         if(isdifend)
1501                                 n = (buf[16]<< 8) | buf[17];
1502                         else
1503                                 n = (buf[17]<< 8) | buf[16];
1504
1505                         if(n>0 && n < nelem(type) && type[n])
1506                                 t = type[n];
1507                         print("%s ELF %s\n", p, t);
1508                 }
1509                 else
1510                         print("application/x-elf-executable");
1511                 return 1;
1512         }
1513
1514         return 0;
1515 }
1516
1517 int
1518 isface(void)
1519 {
1520         int i, j, ldepth, l;
1521         char *p;
1522
1523         ldepth = -1;
1524         for(j = 0; j < 3; j++){
1525                 for(p = (char*)buf, i=0; i<3; i++){
1526                         if(p[0] != '0' || p[1] != 'x')
1527                                 return 0;
1528                         if(buf[2+8] == ',')
1529                                 l = 2;
1530                         else if(buf[2+4] == ',')
1531                                 l = 1;
1532                         else
1533                                 return 0;
1534                         if(ldepth == -1)
1535                                 ldepth = l;
1536                         if(l != ldepth)
1537                                 return 0;
1538                         strtoul(p, &p, 16);
1539                         if(*p++ != ',')
1540                                 return 0;
1541                         while(*p == ' ' || *p == '\t')
1542                                 p++;
1543                 }
1544                 if (*p++ != '\n')
1545                         return 0;
1546         }
1547
1548         if(mime)
1549                 print("application/x-face\n");
1550         else
1551                 print("face image depth %d\n", ldepth);
1552         return 1;
1553 }
1554