1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6
7 /*
8 * file - determine type of file
9 */
/*
 * Assemble the four bytes at p into one 32-bit value, least significant
 * byte first.  Each byte is widened to unsigned long before shifting so a
 * top byte >= 0x80 is never shifted into the sign bit of an int and the
 * result cannot sign-extend when compared with the ulong magic numbers.
 */
#define LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | ((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11
/*
 * State shared between filetype() and the individual classifiers.
 */
uchar	buf[6001];	/* leading bytes of the file; filetype() reads at most sizeof(buf)-1 and NUL-terminates */
short	cfreq[140];	/* rune histogram: ASCII runes index directly, others use the Clatin..Cutf codes */
short	wfreq[50];	/* dictionary hits, indexed by the word class stored in dict[] */
int	nbuf;		/* number of bytes readn() placed in buf */
Dir*	mbuf;		/* dirfstat() result for the file being examined */
int	fd;		/* descriptor opened by type(); classifiers seek and read on it */
char	*fname;		/* name of the file being examined, set by type() */
char	*slash;		/* rightmost '/' in the file name, or nil; used by isp9font() */
20
/*
 * Classes used as indices into wfreq[] (the word classes) and
 * cfreq[] (the codes from Clatin upward).
 */
enum
{
	Cword,
	Fword,
	Aword,
	Alword,
	Lword,
	I1,
	I2,
	I3,
	Clatin	= 128,
	Cbinary,
	Cnull,
	Ceascii,
	Cutf,
};

/*
 * Keyword dictionary.  wordfreq() looks words up with a binary search
 * using strcmp(), so the entries MUST stay sorted in strcmp() order
 * (upper case sorts before lower case).  "bio" used to sit after
 * "extern", which made it unreachable by the search.
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	"PATH",		Lword,
	"TEXT",		Aword,
	"adt",		Alword,
	"aggr",		Alword,
	"alef",		Alword,
	"array",	Lword,
	"bio",		I2,
	"block",	Fword,
	"char",		Cword,
	"common",	Fword,
	"con",		Lword,
	"data",		Fword,
	"dimension",	Fword,
	"double",	Cword,
	"extern",	Cword,
	"float",	Cword,
	"fn",		Lword,
	"function",	Fword,
	"h",		I3,
	"implement",	Lword,
	"import",	Lword,
	"include",	I1,
	"int",		Cword,
	"integer",	Fword,
	"iota",		Lword,
	"libc",		I2,
	"long",		Cword,
	"module",	Lword,
	"real",		Fword,
	"ref",		Lword,
	"register",	Cword,
	"self",		Lword,
	"short",	Cword,
	"static",	Cword,
	"stdio",	I2,
	"struct",	Cword,
	"subroutine",	Fword,
	"u",		I2,
	"void",		Cword,
};
83
/* codes for 'mode' field in language structure */
enum {
	Normal	= 0,
	First,		/* first entry for language spanning several ranges */
	Multi,		/* later entries " " " ... */
	Shared,		/* codes used in several languages */
};

/*
 * Rune ranges used to name the scripts found in a UTF file.
 * bump_utf_count() binary-searches this table on low/high, so the
 * entries must stay in ascending, non-overlapping order; the terminal
 * entry (name == 0) ends the linear scans and is excluded from the search.
 */
struct
{
	int	mode;		/* see enum above */
	int	count;		/* runes seen in [low, high]; reset by filetype() */
	int	low;		/* first rune of the range */
	int	high;		/* last rune of the range (inclusive) */
	char	*name;		/* name printed by print_utf() */

} language[] =
{
	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
	Normal,	0,	0x0370,	0x03FF,	"Greek",
	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
	Normal,	0,	0x0530,	0x058F,	"Armenian",
	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
	Normal,	0,	0x0600,	0x06FF,	"Arabic",
	Normal,	0,	0x0900,	0x097F,	"Devanagari",
	Normal,	0,	0x0980,	0x09FF,	"Bengali",
	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
	Normal,	0,	0x1000,	0x105F,	"Tibetan",
	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
	Normal,	0,	0x3040,	0x30FF,	"Japanese",
	Normal,	0,	0x3100,	0x312F,	"Chinese",
	First,	0,	0x3130,	0x318F,	"Korean",
	Multi,	0,	0x3400,	0x3D2F,	"Korean",
	Shared,	0,	0x4e00,	0x9fff,	"CJK",
	Normal,	0,	0,	0,	0,		/* terminal entry */
};
128
129
/*
 * Gross classification of the buffer, derived by filetype() from the
 * cfreq[] histogram and used as the last-resort answer.
 */
enum
{
	Fascii,		/* printable ascii */
	Flatin,		/* latin 1*/
	Futf,		/* UTF character set */
	Fbinary,	/* binary */
	Feascii,	/* ASCII with control chars */
	Fnull,		/* NULL in file */
} guess;
139
/*
 * Forward declarations.  The int-returning (void) routines are the
 * classifiers listed in call[]: each prints its verdict and returns
 * non-zero when it recognizes the file.
 */
void	bump_utf_count(Rune);
int	cistrncmp(char*, char*, int);
void	filetype(int);
int	getfontnum(uchar*, uchar**);
int	isas(void);
int	isc(void);
int	iscint(void);
int	isenglish(void);
int	ishp(void);
int	ishtml(void);
int	isrfc822(void);
int	ismbox(void);
int	islimbo(void);
int	ismung(void);
int	isp9bit(void);
int	isp9font(void);
int	isrtf(void);
int	ismsdos(void);
int	iself(void);
int	istring(void);
int	isoffstr(void);
int	iff(void);
int	long0(void);
int	longoff(void);
int	istar(void);
int	isface(void);
int	isexec(void);
int	p9bitnum(uchar*);
int	p9subfont(uchar*);
void	print_utf(void);
void	type(char*, int);
int	utf_count(void);
void	wordfreq(void);
173
/*
 * Classifiers tried by filetype(), in order; the first one that returns
 * non-zero wins, so more specific tests come before the heuristics.
 * The list is terminated by a nil entry.
 */
int	(*call[])(void) =
{
	long0,		/* recognizable by first 4 bytes */
	istring,	/* recognizable by first string */
	iself,		/* ELF (foreign) executable */
	isexec,		/* native executables */
	iff,		/* interchange file format (strings) */
	longoff,	/* recognizable by 4 bytes at some offset */
	isoffstr,	/* recognizable by string at some offset */
	isrfc822,	/* email file */
	ismbox,		/* mail box */
	istar,		/* recognizable by tar checksum */
	ishtml,		/* html keywords */
	iscint,		/* compiler/assembler intermediate */
	islimbo,	/* limbo source */
	isc,		/* c & alef compiler key words */
	isas,		/* assembler key words */
	isp9font,	/* plan 9 font */
	isp9bit,	/* plan 9 image (as from /dev/window) */
	isrtf,		/* rich text format */
	ismsdos,	/* msdos exe (virus file attachment) */
	isface,		/* ascii face file */

	/* last resorts */
	ismung,		/* entropy compressed/encrypted */
	isenglish,	/* char frequency English */
	0
};
202
int mime;	/* set by the -m flag: print MIME types instead of descriptions */

/*
 * Default MIME answers.  Both already end in a newline, so they are
 * passed to print() directly rather than through "%s\n".
 */
char OCTET[] =	"application/octet-stream\n";
char PLAIN[] =	"text/plain\n";
207
208 void
main(int argc,char * argv[])209 main(int argc, char *argv[])
210 {
211 int i, j, maxlen;
212 char *cp;
213 Rune r;
214
215 ARGBEGIN{
216 case 'm':
217 mime = 1;
218 break;
219 default:
220 fprint(2, "usage: file [-m] [file...]\n");
221 exits("usage");
222 }ARGEND;
223
224 maxlen = 0;
225 if(mime == 0 || argc > 1){
226 for(i = 0; i < argc; i++) {
227 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
228 ;
229 if(j > maxlen)
230 maxlen = j;
231 }
232 }
233 if (argc <= 0) {
234 if(!mime)
235 print ("stdin: ");
236 filetype(0);
237 }
238 else {
239 for(i = 0; i < argc; i++)
240 type(argv[i], maxlen);
241 }
242 exits(0);
243 }
244
245 void
type(char * file,int nlen)246 type(char *file, int nlen)
247 {
248 Rune r;
249 int i;
250 char *p;
251
252 if(nlen > 0){
253 slash = 0;
254 for (i = 0, p = file; *p; i++) {
255 if (*p == '/') /* find rightmost slash */
256 slash = p;
257 p += chartorune(&r, p); /* count runes */
258 }
259 print("%s:%*s",file, nlen-i+1, "");
260 }
261 fname = file;
262 if ((fd = open(file, OREAD)) < 0) {
263 print("cannot open: %r\n");
264 return;
265 }
266 filetype(fd);
267 close(fd);
268 }
269
270 void
filetype(int fd)271 filetype(int fd)
272 {
273 Rune r;
274 int i, f, n;
275 char *p, *eob;
276
277 free(mbuf);
278 mbuf = dirfstat(fd);
279 if(mbuf == nil){
280 print("cannot stat: %r\n");
281 return;
282 }
283 if(mbuf->mode & DMDIR) {
284 print(mime ? OCTET : "directory\n");
285 return;
286 }
287 if(mbuf->type != 'M' && mbuf->type != '|') {
288 print(mime ? OCTET : "special file #%C/%s\n",
289 mbuf->type, mbuf->name);
290 return;
291 }
292 /* may be reading a pipe on standard input */
293 nbuf = readn(fd, buf, sizeof(buf)-1);
294 if(nbuf < 0) {
295 print("cannot read: %r\n");
296 return;
297 }
298 if(nbuf == 0) {
299 print(mime ? PLAIN : "empty file\n");
300 return;
301 }
302 buf[nbuf] = 0;
303
304 /*
305 * build histogram table
306 */
307 memset(cfreq, 0, sizeof(cfreq));
308 for (i = 0; language[i].name; i++)
309 language[i].count = 0;
310 eob = (char *)buf+nbuf;
311 for(n = 0, p = (char *)buf; p < eob; n++) {
312 if (!fullrune(p, eob-p) && eob-p < UTFmax)
313 break;
314 p += chartorune(&r, p);
315 if (r == 0)
316 f = Cnull;
317 else if (r <= 0x7f) {
318 if (!isprint(r) && !isspace(r))
319 f = Ceascii; /* ASCII control char */
320 else f = r;
321 } else if (r == 0x80) {
322 bump_utf_count(r);
323 f = Cutf;
324 } else if (r < 0xA0)
325 f = Cbinary; /* Invalid Runes */
326 else if (r <= 0xff)
327 f = Clatin; /* Latin 1 */
328 else {
329 bump_utf_count(r);
330 f = Cutf; /* UTF extension */
331 }
332 cfreq[f]++; /* ASCII chars peg directly */
333 }
334 /*
335 * gross classify
336 */
337 if (cfreq[Cbinary])
338 guess = Fbinary;
339 else if (cfreq[Cutf])
340 guess = Futf;
341 else if (cfreq[Clatin])
342 guess = Flatin;
343 else if (cfreq[Ceascii])
344 guess = Feascii;
345 else if (cfreq[Cnull])
346 guess = Fbinary;
347 else
348 guess = Fascii;
349 /*
350 * lookup dictionary words
351 */
352 memset(wfreq, 0, sizeof(wfreq));
353 if(guess == Fascii || guess == Flatin || guess == Futf)
354 wordfreq();
355 /*
356 * call individual classify routines
357 */
358 for(i=0; call[i]; i++)
359 if((*call[i])())
360 return;
361
362 /*
363 * if all else fails,
364 * print out gross classification
365 */
366 if (nbuf < 100 && !mime)
367 print(mime ? PLAIN : "short ");
368 if (guess == Fascii)
369 print(mime ? PLAIN : "Ascii\n");
370 else if (guess == Feascii)
371 print(mime ? PLAIN : "extended ascii\n");
372 else if (guess == Flatin)
373 print(mime ? PLAIN : "latin ascii\n");
374 else if (guess == Futf && utf_count() < 4)
375 print_utf();
376 else print(mime ? OCTET : "binary\n");
377 }
378
379 void
bump_utf_count(Rune r)380 bump_utf_count(Rune r)
381 {
382 int low, high, mid;
383
384 high = sizeof(language)/sizeof(language[0])-1;
385 for (low = 0; low < high;) {
386 mid = (low+high)/2;
387 if (r >= language[mid].low) {
388 if (r <= language[mid].high) {
389 language[mid].count++;
390 break;
391 } else low = mid+1;
392 } else high = mid;
393 }
394 }
395
396 int
utf_count(void)397 utf_count(void)
398 {
399 int i, count;
400
401 count = 0;
402 for (i = 0; language[i].name; i++)
403 if (language[i].count > 0)
404 switch (language[i].mode) {
405 case Normal:
406 case First:
407 count++;
408 break;
409 default:
410 break;
411 }
412 return count;
413 }
414
415 int
chkascii(void)416 chkascii(void)
417 {
418 int i;
419
420 for (i = 'a'; i < 'z'; i++)
421 if (cfreq[i])
422 return 1;
423 for (i = 'A'; i < 'Z'; i++)
424 if (cfreq[i])
425 return 1;
426 return 0;
427 }
428
429 int
find_first(char * name)430 find_first(char *name)
431 {
432 int i;
433
434 for (i = 0; language[i].name != 0; i++)
435 if (language[i].mode == First
436 && strcmp(language[i].name, name) == 0)
437 return i;
438 return -1;
439 }
440
441 void
print_utf(void)442 print_utf(void)
443 {
444 int i, printed, j;
445
446 if(mime){
447 print(PLAIN);
448 return;
449 }
450 if (chkascii()) {
451 printed = 1;
452 print("Ascii");
453 } else
454 printed = 0;
455 for (i = 0; language[i].name; i++)
456 if (language[i].count) {
457 switch(language[i].mode) {
458 case Multi:
459 j = find_first(language[i].name);
460 if (j < 0)
461 break;
462 if (language[j].count > 0)
463 break;
464 /* Fall through */
465 case Normal:
466 case First:
467 if (printed)
468 print(" & ");
469 else printed = 1;
470 print("%s", language[i].name);
471 break;
472 case Shared:
473 default:
474 break;
475 }
476 }
477 if(!printed)
478 print("UTF");
479 print(" text\n");
480 }
481
482 void
wordfreq(void)483 wordfreq(void)
484 {
485 int low, high, mid, r;
486 uchar *p, *p2, c;
487
488 p = buf;
489 for(;;) {
490 while (p < buf+nbuf && !isalpha(*p))
491 p++;
492 if (p >= buf+nbuf)
493 return;
494 p2 = p;
495 while(p < buf+nbuf && isalpha(*p))
496 p++;
497 c = *p;
498 *p = 0;
499 high = sizeof(dict)/sizeof(dict[0]);
500 for(low = 0;low < high;) {
501 mid = (low+high)/2;
502 r = strcmp(dict[mid].word, (char*)p2);
503 if(r == 0) {
504 wfreq[dict[mid].class]++;
505 break;
506 }
507 if(r < 0)
508 low = mid+1;
509 else
510 high = mid;
511 }
512 *p++ = c;
513 }
514 }
515
516 typedef struct Filemagic Filemagic;
517 struct Filemagic {
518 ulong x;
519 ulong mask;
520 char *desc;
521 char *mime;
522 };
523
524 /*
525 * integers in this table must be as seen on a little-endian machine
526 * when read from a file.
527 */
528 Filemagic long0tab[] = {
529 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
530 /* "pac1" */
531 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
532 /* "pXc2 */
533 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET,
534 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
535 0x43614c66, 0xFFFFFFFF, "FLAC audio file\n", OCTET,
536 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
537 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip",
538 070707, 0xFFFF, "cpio archive\n", OCTET,
539 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi",
540 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg",
541 0xf0ff, 0xf6ff, "aac audio\n", "audio/mpeg",
542 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be",
543 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le",
544 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be",
545 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le",
546 /* 0xfeedface: this could alternately be a Next Plan 9 boot image */
547 0xcefaedfe, 0xFFFFFFFF, "32-bit power Mach-O executable\n", OCTET,
548 /* 0xfeedfacf */
549 0xcffaedfe, 0xFFFFFFFF, "64-bit power Mach-O executable\n", OCTET,
550 /* 0xcefaedfe */
551 0xfeedface, 0xFFFFFFFF, "386 Mach-O executable\n", OCTET,
552 /* 0xcffaedfe */
553 0xfeedfacf, 0xFFFFFFFF, "amd64 Mach-O executable\n", OCTET,
554 /* 0xcafebabe */
555 0xbebafeca, 0xFFFFFFFF, "Mach-O universal executable\n", OCTET,
556 /*
557 * these magic numbers are stored big-endian on disk,
558 * thus the numbers appear reversed in this table.
559 */
560 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET,
561 0x2bb19a52, 0xFFFFFFFF, "paq archive\n", OCTET,
562 };
563
564 int
filemagic(Filemagic * tab,int ntab,ulong x)565 filemagic(Filemagic *tab, int ntab, ulong x)
566 {
567 int i;
568
569 for(i=0; i<ntab; i++)
570 if((x&tab[i].mask) == tab[i].x){
571 print(mime ? tab[i].mime : tab[i].desc);
572 return 1;
573 }
574 return 0;
575 }
576
577 int
long0(void)578 long0(void)
579 {
580 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf));
581 }
582
/*
 * A magic-number test applied at byte offset off in the file.
 * The unnamed Filemagic member makes x, mask, desc and mime
 * directly accessible, as fileoffmagic() uses them.
 */
typedef struct Fileoffmag Fileoffmag;
struct Fileoffmag {
	ulong	off;
	Filemagic;
};

/*
 * integers in this table must be as seen on a little-endian machine
 * when read from a file.
 */
Fileoffmag longofftab[] = {
	/*
	 * these magic numbers are stored big-endian on disk,
	 * thus the numbers appear reversed in this table.
	 */
	256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET,
	256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET,
	128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET,
	4,	  0x31647542, 0xFFFFFFFF, "OS X finder properties\n", OCTET,
};
603
604 int
fileoffmagic(Fileoffmag * tab,int ntab)605 fileoffmagic(Fileoffmag *tab, int ntab)
606 {
607 int i;
608 ulong x;
609 Fileoffmag *tp;
610 uchar buf[sizeof(long)];
611
612 for(i=0; i<ntab; i++) {
613 tp = tab + i;
614 seek(fd, tp->off, 0);
615 if (readn(fd, buf, sizeof buf) != sizeof buf)
616 continue;
617 x = LENDIAN(buf);
618 if((x&tp->mask) == tp->x){
619 print(mime? tp->mime: tp->desc);
620 return 1;
621 }
622 }
623 return 0;
624 }
625
626 int
longoff(void)627 longoff(void)
628 {
629 return fileoffmagic(longofftab, nelem(longofftab));
630 }
631
632 int
isexec(void)633 isexec(void)
634 {
635 Fhdr f;
636
637 seek(fd, 0, 0); /* reposition to start of file */
638 if(crackhdr(fd, &f)) {
639 print(mime ? OCTET : "%s\n", f.name);
640 return 1;
641 }
642 return 0;
643 }
644
645
/* from tar.c */
enum { NAMSIZ = 100, TBLOCK = 512 };

/* one tar header block, viewed either as raw bytes or as its fields */
union	hblock
{
	char	dummy[TBLOCK];
	struct	header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];
		char	linkflag;
		char	linkname[NAMSIZ];
		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];  /* if non-null, path = prefix "/" name */
	} dbuf;
};

/*
 * Return the sum of all TBLOCK bytes of the header, each taken as an
 * unsigned value, with the chksum field counted as blanks.  The chksum
 * field of *hp is overwritten with blanks as a side effect.
 */
int
checksum(union hblock *hp)
{
	int sum, k;

	memset(hp->dbuf.chksum, ' ', sizeof hp->dbuf.chksum);
	sum = 0;
	for(k = 0; k < TBLOCK; k++)
		sum += hp->dummy[k] & 0xff;
	return sum;
}
688
689 int
istar(void)690 istar(void)
691 {
692 int chksum;
693 char tblock[TBLOCK];
694 union hblock *hp = (union hblock *)tblock;
695 struct header *hdr = &hp->dbuf;
696
697 seek(fd, 0, 0); /* reposition to start of file */
698 if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
699 return 0;
700 chksum = strtol(hdr->chksum, 0, 8);
701 if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
702 if (strcmp(hdr->magic, "ustar") == 0)
703 print(mime? "application/x-ustar\n":
704 "posix tar archive\n");
705 else
706 print(mime? "application/x-tar\n": "tar archive\n");
707 return 1;
708 }
709 return 0;
710 }
711
/*
 * initial words to classify file
 *
 * key is compared with the start of the file over length bytes;
 * a length of -1 means strlen(key).  Entries are tried in order, so
 * longer keys must precede their own prefixes.
 *
 * filetype and mime are printed with "%s\n" and therefore must NOT end
 * in a newline: the OCTET array (which carries its own newline) is not
 * used here, because it made the -m output end in a blank line.
 */
struct	FILE_STRING
{
	char 	*key;
	char	*filetype;
	int	length;
	char	*mime;
} file_string[] =
{
	"!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream",
	"!<arch>\n",		"archive",			8,	"application/octet-stream",
	"070707",		"cpio archive - ascii header",	6,	"application/octet-stream",
	"#!/bin/rc",		"rc executable file",		9,	"text/plain",
	"#!/bin/sh",		"sh executable file",		9,	"text/plain",
	"%!",			"postscript",			2,	"application/postscript",
	"\004%!",		"postscript",			3,	"application/postscript",
	"x T post",		"troff output for post",	8,	"application/troff",
	"x T Latin1",		"troff output for Latin1",	10,	"application/troff",
	"x T utf",		"troff output for UTF",		7,	"application/troff",
	"x T 202",		"troff output for 202",		7,	"application/troff",
	"x T aps",		"troff output for aps",		7,	"application/troff",
	"x T ",			"troff output",			4,	"application/troff",
	"GIF",			"GIF image", 			3,	"image/gif",
	"\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript",
	"%PDF",			"PDF",				4,	"application/pdf",
	"<html>\n",		"HTML file",			7,	"text/html",
	"<HTML>\n",		"HTML file",			7,	"text/html",
	"\111\111\052\000",	"tiff",				4,	"image/tiff",
	"\115\115\000\052",	"tiff",				4,	"image/tiff",
	"\377\330\377\340",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\341",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\333",	"jpeg",				4,	"image/jpeg",
	"BM",			"bmp",				2,	"image/bmp",
	"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream",
	"<MakerFile ",		"FrameMaker file",		11,	"application/framemaker",
	"\033E\033",		"HP PCL printer data",		3,	"application/octet-stream",
	"\033&",		"HP PCL printer data",		2,	"application/octet-stream",
	"\033%-12345X",		"HPJCL file",			9,	"application/hpjcl",
	"\033Lua",		"Lua bytecode",			4,	"application/octet-stream",
	"ID3",			"mp3 audio with id3",		3,	"audio/mpeg",
	"\211PNG",		"PNG image",			4,	"image/png",
	"P3\n",			"ppm",				3,	"image/ppm",
	"P6\n",			"ppm",				3,	"image/ppm",
	"/* XPM */\n",		"xbm",				10,	"image/xbm",
	".HTML ",		"troff -ms input",		6,	"text/troff",
	".LP",			"troff -ms input",		3,	"text/troff",
	".ND",			"troff -ms input",		3,	"text/troff",
	".PP",			"troff -ms input",		3,	"text/troff",
	".TL",			"troff -ms input",		3,	"text/troff",
	".TR",			"troff -ms input",		3,	"text/troff",
	".TH",			"manual page",			3,	"text/troff",
	".\\\"",		"troff input",			3,	"text/troff",
	".de",			"troff input",			3,	"text/troff",
	".if",			"troff input",			3,	"text/troff",
	".nr",			"troff input",			3,	"text/troff",
	".tr",			"troff input",			3,	"text/troff",
	"vac:",			"venti score",			4,	"text/plain",
	"-----BEGIN CERTIFICATE-----\n",
				"pem certificate",		-1,	"text/plain",
	"-----BEGIN TRUSTED CERTIFICATE-----\n",
				"pem trusted certificate",	-1,	"text/plain",
	"-----BEGIN X509 CERTIFICATE-----\n",
				"pem x.509 certificate",	-1,	"text/plain",
	"subject=/C=",		"pem certificate with header",	-1,	"text/plain",
	"process snapshot ",	"process snapshot",		-1,	"application/snapfs",
	"BEGIN:VCARD\r\n",	"vCard",			13,	"text/directory;profile=vcard",
	"BEGIN:VCARD\n",	"vCard",			12,	"text/directory;profile=vcard",
	0,0,0,0
};
783
784 int
istring(void)785 istring(void)
786 {
787 int i, l;
788 struct FILE_STRING *p;
789
790 for(p = file_string; p->key; p++) {
791 l = p->length;
792 if(l == -1)
793 l = strlen(p->key);
794 if(nbuf >= l && memcmp(buf, p->key, l) == 0) {
795 if(mime)
796 print("%s\n", p->mime);
797 else
798 print("%s\n", p->filetype);
799 return 1;
800 }
801 }
802 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
803 for(i = 5; i < nbuf; i++)
804 if(buf[i] == '\n')
805 break;
806 if(mime)
807 print(OCTET);
808 else
809 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
810 return 1;
811 }
812 return 0;
813 }
814
815 struct offstr
816 {
817 ulong off;
818 struct FILE_STRING;
819 } offstrs[] = {
820 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET,
821 0, 0, 0, 0, 0
822 };
823
824 int
isoffstr(void)825 isoffstr(void)
826 {
827 int n;
828 char buf[256];
829 struct offstr *p;
830
831 for(p = offstrs; p->key; p++) {
832 seek(fd, p->off, 0);
833 n = p->length;
834 if (n > sizeof buf)
835 n = sizeof buf;
836 if (readn(fd, buf, n) != n)
837 continue;
838 if(memcmp(buf, p->key, n) == 0) {
839 if(mime)
840 print("%s\n", p->mime);
841 else
842 print("%s\n", p->filetype);
843 return 1;
844 }
845 }
846 return 0;
847 }
848
849 int
iff(void)850 iff(void)
851 {
852 if (strncmp((char*)buf, "FORM", 4) == 0 &&
853 strncmp((char*)buf+8, "AIFF", 4) == 0) {
854 print("%s\n", mime? "audio/x-aiff": "aiff audio");
855 return 1;
856 }
857 if (strncmp((char*)buf, "RIFF", 4) == 0) {
858 if (strncmp((char*)buf+8, "WAVE", 4) == 0)
859 print("%s\n", mime? "audio/wave": "wave audio");
860 else if (strncmp((char*)buf+8, "AVI ", 4) == 0)
861 print("%s\n", mime? "video/avi": "avi video");
862 else
863 print("%s\n", mime? "application/octet-stream":
864 "riff file");
865 return 1;
866 }
867 return 0;
868 }
869
870 char* html_string[] =
871 {
872 "title",
873 "body",
874 "head",
875 "strong",
876 "h1",
877 "h2",
878 "h3",
879 "h4",
880 "h5",
881 "h6",
882 "ul",
883 "li",
884 "dl",
885 "br",
886 "em",
887 0,
888 };
889
890 int
ishtml(void)891 ishtml(void)
892 {
893 uchar *p, *q;
894 int i, count;
895
896 /* compare strings between '<' and '>' to html table */
897 count = 0;
898 p = buf;
899 for(;;) {
900 while (p < buf+nbuf && *p != '<')
901 p++;
902 p++;
903 if (p >= buf+nbuf)
904 break;
905 if(*p == '/')
906 p++;
907 q = p;
908 while(p < buf+nbuf && *p != '>')
909 p++;
910 if (p >= buf+nbuf)
911 break;
912 for(i = 0; html_string[i]; i++) {
913 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
914 if(count++ > 4) {
915 print(mime ? "text/html\n" : "HTML file\n");
916 return 1;
917 }
918 break;
919 }
920 }
921 p++;
922 }
923 return 0;
924 }
925
/*
 * Mail header field names (lower case, with the colon) that isrfc822()
 * matches, ignoring case, against the start of each header line.
 * The list is nil-terminated.  Field names cannot contain a space: the
 * reply address header is "Reply-To:", not "Reply to:".
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",
	"sender:",
	0,
};
937
938 int
isrfc822(void)939 isrfc822(void)
940 {
941
942 char *p, *q, *r;
943 int i, count;
944
945 count = 0;
946 p = (char*)buf;
947 for(;;) {
948 q = strchr(p, '\n');
949 if(q == nil)
950 break;
951 *q = 0;
952 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
953 count++;
954 *q = '\n';
955 p = q+1;
956 continue;
957 }
958 *q = '\n';
959 if(*p != '\t' && *p != ' '){
960 r = strchr(p, ':');
961 if(r == 0 || r > q)
962 break;
963 for(i = 0; rfc822_string[i]; i++) {
964 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
965 count++;
966 break;
967 }
968 }
969 }
970 p = q+1;
971 }
972 if(count >= 3){
973 print(mime ? "message/rfc822\n" : "email file\n");
974 return 1;
975 }
976 return 0;
977 }
978
979 int
ismbox(void)980 ismbox(void)
981 {
982 char *p, *q;
983
984 p = (char*)buf;
985 q = strchr(p, '\n');
986 if(q == nil)
987 return 0;
988 *q = 0;
989 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
990 print(mime ? "text/plain\n" : "mail box\n");
991 return 1;
992 }
993 *q = '\n';
994 return 0;
995 }
996
997 int
iscint(void)998 iscint(void)
999 {
1000 int type;
1001 char *name;
1002 Biobuf b;
1003
1004 if(Binit(&b, fd, OREAD) == Beof)
1005 return 0;
1006 seek(fd, 0, 0);
1007 type = objtype(&b, &name);
1008 if(type < 0)
1009 return 0;
1010 if(mime)
1011 print(OCTET);
1012 else
1013 print("%s intermediate\n", name);
1014 return 1;
1015 }
1016
1017 int
isc(void)1018 isc(void)
1019 {
1020 int n;
1021
1022 n = wfreq[I1];
1023 /*
1024 * includes
1025 */
1026 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1027 goto yes;
1028 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1029 goto yes;
1030 /*
1031 * declarations
1032 */
1033 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
1034 goto yes;
1035 /*
1036 * assignments
1037 */
1038 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
1039 goto yes;
1040 return 0;
1041
1042 yes:
1043 if(mime){
1044 print(PLAIN);
1045 return 1;
1046 }
1047 if(wfreq[Alword] > 0)
1048 print("alef program\n");
1049 else
1050 print("c program\n");
1051 return 1;
1052 }
1053
1054 int
islimbo(void)1055 islimbo(void)
1056 {
1057
1058 /*
1059 * includes
1060 */
1061 if(wfreq[Lword] < 4)
1062 return 0;
1063 print(mime ? PLAIN : "limbo program\n");
1064 return 1;
1065 }
1066
1067 int
isas(void)1068 isas(void)
1069 {
1070
1071 /*
1072 * includes
1073 */
1074 if(wfreq[Aword] < 2)
1075 return 0;
1076 print(mime ? PLAIN : "as program\n");
1077 return 1;
1078 }
1079
1080 /*
1081 * low entropy means encrypted
1082 */
1083 int
ismung(void)1084 ismung(void)
1085 {
1086 int i, bucket[8];
1087 float cs;
1088
1089 if(nbuf < 64)
1090 return 0;
1091 memset(bucket, 0, sizeof(bucket));
1092 for(i=nbuf-64; i<nbuf; i++)
1093 bucket[(buf[i]>>5)&07] += 1;
1094
1095 cs = 0.;
1096 for(i=0; i<8; i++)
1097 cs += (bucket[i]-8)*(bucket[i]-8);
1098 cs /= 8.;
1099 if(cs <= 24.322) {
1100 if(buf[0]==0x1f && buf[1]==0x9d)
1101 print(mime ? OCTET : "compressed\n");
1102 else
1103 if(buf[0]==0x1f && buf[1]==0x8b)
1104 print(mime ? OCTET : "gzip compressed\n");
1105 else
1106 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h')
1107 print(mime ? OCTET : "bzip2 compressed\n");
1108 else
1109 print(mime ? OCTET : "encrypted\n");
1110 return 1;
1111 }
1112 return 0;
1113 }
1114
1115 /*
1116 * english by punctuation and frequencies
1117 */
1118 int
isenglish(void)1119 isenglish(void)
1120 {
1121 int vow, comm, rare, badpun, punct;
1122 char *p;
1123
1124 if(guess != Fascii && guess != Feascii)
1125 return 0;
1126 badpun = 0;
1127 punct = 0;
1128 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
1129 switch(*p) {
1130 case '.':
1131 case ',':
1132 case ')':
1133 case '%':
1134 case ';':
1135 case ':':
1136 case '?':
1137 punct++;
1138 if(p[1] != ' ' && p[1] != '\n')
1139 badpun++;
1140 }
1141 if(badpun*5 > punct)
1142 return 0;
1143 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
1144 return 0;
1145 if(2*cfreq[';'] > cfreq['e'])
1146 return 0;
1147
1148 vow = 0;
1149 for(p="AEIOU"; *p; p++) {
1150 vow += cfreq[*p];
1151 vow += cfreq[tolower(*p)];
1152 }
1153 comm = 0;
1154 for(p="ETAION"; *p; p++) {
1155 comm += cfreq[*p];
1156 comm += cfreq[tolower(*p)];
1157 }
1158 rare = 0;
1159 for(p="VJKQXZ"; *p; p++) {
1160 rare += cfreq[*p];
1161 rare += cfreq[tolower(*p)];
1162 }
1163 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
1164 print(mime ? PLAIN : "English text\n");
1165 return 1;
1166 }
1167 return 0;
1168 }
1169
1170 /*
1171 * pick up a number with
1172 * syntax _*[0-9]+_
1173 */
1174 #define P9BITLEN 12
1175 int
p9bitnum(uchar * bp)1176 p9bitnum(uchar *bp)
1177 {
1178 int n, c, len;
1179
1180 len = P9BITLEN;
1181 while(*bp == ' ') {
1182 bp++;
1183 len--;
1184 if(len <= 0)
1185 return -1;
1186 }
1187 n = 0;
1188 while(len > 1) {
1189 c = *bp++;
1190 if(!isdigit(c))
1191 return -1;
1192 n = n*10 + c-'0';
1193 len--;
1194 }
1195 if(*bp != ' ')
1196 return -1;
1197 return n;
1198 }
1199
/*
 * Parse the first 12-byte field of an image header and return the
 * pixel depth in bits, or -1 if the field is not a valid depth.
 *
 * Old format: a decimal ldepth; the depth is 1<<ldepth and *newp is 0.
 * New format: a channel descriptor such as r8g8b8 (letter, bit count,
 * ...); the depth is the sum of the counts and *newp is 1.  The sum
 * must be a multiple of 8 or a divisor of 8.
 *
 * A depth of zero (e.g. "x0") and an ldepth that cannot be shifted
 * within an int are rejected: the former made the caller divide by
 * zero, the latter was an undefined shift.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	long ld;
	int d;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtol(s, 0, 0);
		if(ld < 0 || ld > 30)
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;			/* skip letter */
		d += strtoul(s, &s, 10);
	}

	if(d <= 0)
		return -1;
	if(d % 8 == 0 || 8 % d == 0)
		return d;
	return -1;
}
1227
1228 int
isp9bit(void)1229 isp9bit(void)
1230 {
1231 int dep, lox, loy, hix, hiy, px, new, cmpr;
1232 ulong t;
1233 long len;
1234 char *newlabel;
1235 uchar *cp;
1236
1237 cp = buf;
1238 cmpr = 0;
1239 newlabel = "old ";
1240
1241 if(memcmp(cp, "compressed\n", 11) == 0) {
1242 cmpr = 1;
1243 cp = buf + 11;
1244 }
1245
1246 dep = depthof((char*)cp + 0*P9BITLEN, &new);
1247 if(new)
1248 newlabel = "";
1249 lox = p9bitnum(cp + 1*P9BITLEN);
1250 loy = p9bitnum(cp + 2*P9BITLEN);
1251 hix = p9bitnum(cp + 3*P9BITLEN);
1252 hiy = p9bitnum(cp + 4*P9BITLEN);
1253 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1254 return 0;
1255
1256 if(dep < 8){
1257 px = 8/dep; /* pixels per byte */
1258 /* set l to number of bytes of data per scan line */
1259 if(lox >= 0)
1260 len = (hix+px-1)/px - lox/px;
1261 else{ /* make positive before divide */
1262 t = (-lox)+px-1;
1263 t = (t/px)*px;
1264 len = (t+hix+px-1)/px;
1265 }
1266 }else
1267 len = (hix-lox)*dep/8;
1268 len *= hiy - loy; /* col length */
1269 len += 5 * P9BITLEN; /* size of initial ascii */
1270
1271 /*
1272 * for compressed images, don't look any further. otherwise:
1273 * for image file, length is non-zero and must match calculation above.
1274 * for /dev/window and /dev/screen the length is always zero.
1275 * for subfont, the subfont header should follow immediately.
1276 */
1277 if (cmpr) {
1278 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n",
1279 newlabel, dep);
1280 return 1;
1281 }
1282 /*
1283 * mbuf->length == 0 probably indicates reading a pipe.
1284 * Ghostscript sometimes produces a little extra on the end.
1285 */
1286 if (len != 0 && (mbuf->length == 0 || mbuf->length == len ||
1287 mbuf->length > len && mbuf->length < len+P9BITLEN)) {
1288 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep);
1289 return 1;
1290 }
1291 if (p9subfont(buf+len)) {
1292 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep);
1293 return 1;
1294 }
1295 return 0;
1296 }
1297
1298 int
p9subfont(uchar * p)1299 p9subfont(uchar *p)
1300 {
1301 int n, h, a;
1302
1303 /* if image too big, assume it's a subfont */
1304 if (p+3*P9BITLEN > buf+sizeof(buf))
1305 return 1;
1306
1307 n = p9bitnum(p + 0*P9BITLEN); /* char count */
1308 if (n < 0)
1309 return 0;
1310 h = p9bitnum(p + 1*P9BITLEN); /* height */
1311 if (h < 0)
1312 return 0;
1313 a = p9bitnum(p + 2*P9BITLEN); /* ascent */
1314 if (a < 0)
1315 return 0;
1316 return 1;
1317 }
1318
1319 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
1320
1321 int
isp9font(void)1322 isp9font(void)
1323 {
1324 uchar *cp, *p;
1325 int i, n;
1326 char pathname[1024];
1327
1328 cp = buf;
1329 if (!getfontnum(cp, &cp)) /* height */
1330 return 0;
1331 if (!getfontnum(cp, &cp)) /* ascent */
1332 return 0;
1333 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) {
1334 if (!getfontnum(cp, &cp)) /* min */
1335 break;
1336 if (!getfontnum(cp, &cp)) /* max */
1337 return 0;
1338 getfontnum(cp, &cp); /* optional offset */
1339 while (WHITESPACE(*cp))
1340 cp++;
1341 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1342 ;
1343 /* construct a path name, if needed */
1344 n = 0;
1345 if (*p != '/' && slash) {
1346 n = slash-fname+1;
1347 if (n < sizeof(pathname))
1348 memcpy(pathname, fname, n);
1349 else n = 0;
1350 }
1351 if (n+cp-p+4 < sizeof(pathname)) {
1352 memcpy(pathname+n, p, cp-p);
1353 n += cp-p;
1354 pathname[n] = 0;
1355 if (access(pathname, AEXIST) < 0) {
1356 strcpy(pathname+n, ".0");
1357 if (access(pathname, AEXIST) < 0)
1358 return 0;
1359 }
1360 }
1361 }
1362 if (i) {
1363 print(mime ? "text/plain\n" : "font file\n");
1364 return 1;
1365 }
1366 return 0;
1367 }
1368
1369 int
getfontnum(uchar * cp,uchar ** rp)1370 getfontnum(uchar *cp, uchar **rp)
1371 {
1372 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
1373 cp++;
1374 if (*cp < '0' || *cp > '9')
1375 return 0;
1376 strtoul((char *)cp, (char **)rp, 0);
1377 if (!WHITESPACE(**rp)) {
1378 *rp = cp;
1379 return 0;
1380 }
1381 return 1;
1382 }
1383
1384 int
isrtf(void)1385 isrtf(void)
1386 {
1387 if(strstr((char *)buf, "\\rtf1")){
1388 print(mime ? "application/rtf\n" : "rich text format\n");
1389 return 1;
1390 }
1391 return 0;
1392 }
1393
1394 int
ismsdos(void)1395 ismsdos(void)
1396 {
1397 if (buf[0] == 0x4d && buf[1] == 0x5a){
1398 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1399 return 1;
1400 }
1401 return 0;
1402 }
1403
1404 int
iself(void)1405 iself(void)
1406 {
1407 static char *cpu[] = { /* NB: incomplete and arbitary list */
1408 [1] "WE32100",
1409 [2] "SPARC",
1410 [3] "i386",
1411 [4] "M68000",
1412 [5] "M88000",
1413 [6] "i486",
1414 [7] "i860",
1415 [8] "R3000",
1416 [9] "S370",
1417 [10] "R4000",
1418 [15] "HP-PA",
1419 [18] "sparc v8+",
1420 [19] "i960",
1421 [20] "PPC-32",
1422 [21] "PPC-64",
1423 [40] "ARM",
1424 [41] "Alpha",
1425 [43] "sparc v9",
1426 [50] "IA-64",
1427 [62] "AMD64",
1428 [75] "VAX",
1429 };
1430 static char *type[] = {
1431 [1] "relocatable object",
1432 [2] "executable",
1433 [3] "shared library",
1434 [4] "core dump",
1435 };
1436
1437 if (memcmp(buf, "\x7fELF", 4) == 0){
1438 if (!mime){
1439 int isdifend = 0;
1440 int n = (buf[19] << 8) | buf[18];
1441 char *p = "unknown";
1442 char *t = "unknown";
1443
1444 if (n > 0 && n < nelem(cpu) && cpu[n])
1445 p = cpu[n];
1446 else {
1447 /* try the other byte order */
1448 isdifend = 1;
1449 n = (buf[18] << 8) | buf[19];
1450 if (n > 0 && n < nelem(cpu) && cpu[n])
1451 p = cpu[n];
1452 }
1453 if(isdifend)
1454 n = (buf[16]<< 8) | buf[17];
1455 else
1456 n = (buf[17]<< 8) | buf[16];
1457
1458 if(n>0 && n < nelem(type) && type[n])
1459 t = type[n];
1460 print("%s ELF%s %s\n", p, (buf[4] == 2? "64": "32"), t);
1461 }
1462 else
1463 print("application/x-elf-executable");
1464 return 1;
1465 }
1466
1467 return 0;
1468 }
1469
1470 int
isface(void)1471 isface(void)
1472 {
1473 int i, j, ldepth, l;
1474 char *p;
1475
1476 ldepth = -1;
1477 for(j = 0; j < 3; j++){
1478 for(p = (char*)buf, i=0; i<3; i++){
1479 if(p[0] != '0' || p[1] != 'x')
1480 return 0;
1481 if(buf[2+8] == ',')
1482 l = 2;
1483 else if(buf[2+4] == ',')
1484 l = 1;
1485 else
1486 return 0;
1487 if(ldepth == -1)
1488 ldepth = l;
1489 if(l != ldepth)
1490 return 0;
1491 strtoul(p, &p, 16);
1492 if(*p++ != ',')
1493 return 0;
1494 while(*p == ' ' || *p == '\t')
1495 p++;
1496 }
1497 if (*p++ != '\n')
1498 return 0;
1499 }
1500
1501 if(mime)
1502 print("application/x-face\n");
1503 else
1504 print("face image depth %d\n", ldepth);
1505 return 1;
1506 }
1507
1508