xref: /plan9-contrib/sys/src/cmd/file.c (revision 34c2901791623ea03308d4cc8cd056b841394d48)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/*
 * Assemble the four bytes at p, least significant first, into one
 * 32-bit quantity.  Every byte is widened to unsigned long before it
 * is shifted so that a top byte >= 0x80 never lands in the sign bit
 * of an int (and is never sign-extended when the result is widened).
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11 
12 uchar	buf[6001];
13 short	cfreq[140];
14 short	wfreq[50];
15 int	nbuf;
16 Dir*	mbuf;
17 int	fd;
18 char 	*fname;
19 char	*slash;
20 
/*
 * Word classes index wfreq; character classes index cfreq.
 * ASCII characters index cfreq directly, so the character
 * classes start above the ASCII range.
 */
enum
{
	/* word classes assigned by dict[] */
	Cword	= 0,
	Fword	= 1,
	Aword	= 2,
	Alword	= 3,
	Lword	= 4,
	I1	= 5,
	I2	= 6,
	I3	= 7,

	/* character classes assigned by filetype() */
	Clatin	= 128,
	Cbinary	= 129,
	Cnull	= 130,
	Ceascii	= 131,
	Cutf	= 132,
};
37 struct
38 {
39 	char*	word;
40 	int	class;
41 } dict[] =
42 {
43 	"PATH",		Lword,
44 	"TEXT",		Aword,
45 	"adt",		Alword,
46 	"aggr",		Alword,
47 	"alef",		Alword,
48 	"array",	Lword,
49 	"block",	Fword,
50 	"char",		Cword,
51 	"common",	Fword,
52 	"con",		Lword,
53 	"data",		Fword,
54 	"dimension",	Fword,
55 	"double",	Cword,
56 	"extern",	Cword,
57 	"bio",		I2,
58 	"float",	Cword,
59 	"fn",		Lword,
60 	"function",	Fword,
61 	"h",		I3,
62 	"implement",	Lword,
63 	"import",	Lword,
64 	"include",	I1,
65 	"int",		Cword,
66 	"integer",	Fword,
67 	"iota",		Lword,
68 	"libc",		I2,
69 	"long",		Cword,
70 	"module",	Lword,
71 	"real",		Fword,
72 	"ref",		Lword,
73 	"register",	Cword,
74 	"self",		Lword,
75 	"short",	Cword,
76 	"static",	Cword,
77 	"stdio",	I2,
78 	"struct",	Cword,
79 	"subroutine",	Fword,
80 	"u",		I2,
81 	"void",		Cword,
82 };
83 
/*
 * Values of the 'mode' field in the language table.
 */
enum
{
	Normal	= 0,	/* language occupying a single range */
	First	= 1,	/* first range of a language spanning several ranges */
	Multi	= 2,	/* a later range of such a language */
	Shared	= 3,	/* codes used by several languages */
};
91 
92 struct
93 {
94 	int	mode;		/* see enum above */
95 	int 	count;
96 	int	low;
97 	int	high;
98 	char	*name;
99 
100 } language[] =
101 {
102 	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
103 	Normal,	0,	0x0370,	0x03FF,	"Greek",
104 	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
105 	Normal,	0,	0x0530,	0x058F,	"Armenian",
106 	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
107 	Normal,	0,	0x0600,	0x06FF,	"Arabic",
108 	Normal,	0,	0x0900,	0x097F,	"Devanagari",
109 	Normal,	0,	0x0980,	0x09FF,	"Bengali",
110 	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
111 	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
112 	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
113 	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
114 	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
115 	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
116 	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
117 	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
118 	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
119 	Normal,	0,	0x1000,	0x105F,	"Tibetan",
120 	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
121 	Normal,	0,	0x3040,	0x30FF,	"Japanese",
122 	Normal,	0,	0x3100,	0x312F,	"Chinese",
123 	First,	0,	0x3130,	0x318F,	"Korean",
124 	Multi,	0,	0x3400,	0x3D2F,	"Korean",
125 	Shared,	0,	0x4e00,	0x9fff,	"CJK",
126 	Normal,	0,	0,	0,	0,		/* terminal entry */
127 };
128 
129 
/*
 * Gross classification of the file, chosen by filetype()
 * from the character-class histogram.
 */
enum
{
	Fascii	= 0,	/* printable ascii */
	Flatin	= 1,	/* latin 1 */
	Futf	= 2,	/* UTF character set */
	Fbinary	= 3,	/* binary */
	Feascii	= 4,	/* ASCII with control chars */
	Fnull	= 5,	/* NULL in file */
} guess;
139 
140 void	bump_utf_count(Rune);
141 int	cistrncmp(char*, char*, int);
142 void	filetype(int);
143 int	getfontnum(uchar*, uchar**);
144 int	isas(void);
145 int	isc(void);
146 int	iscint(void);
147 int	isenglish(void);
148 int	ishp(void);
149 int	ishtml(void);
150 int	isrfc822(void);
151 int	ismbox(void);
152 int	islimbo(void);
153 int	ismung(void);
154 int	isp9bit(void);
155 int	isp9font(void);
156 int	isrtf(void);
157 int	ismsdos(void);
158 int	iself(void);
159 int	istring(void);
160 int	isoffstr(void);
161 int	iff(void);
162 int	long0(void);
163 int	longoff(void);
164 int	istar(void);
165 int	isface(void);
166 int	isexec(void);
167 int	p9bitnum(uchar*);
168 int	p9subfont(uchar*);
169 void	print_utf(void);
170 void	type(char*, int);
171 int	utf_count(void);
172 void	wordfreq(void);
173 
174 int	(*call[])(void) =
175 {
176 	long0,		/* recognizable by first 4 bytes */
177 	istring,	/* recognizable by first string */
178 	iself,		/* ELF (foreign) executable */
179 	isexec,		/* native executables */
180 	iff,		/* interchange file format (strings) */
181 	longoff,	/* recognizable by 4 bytes at some offset */
182 	isoffstr,	/* recognizable by string at some offset */
183 	isrfc822,	/* email file */
184 	ismbox,		/* mail box */
185 	istar,		/* recognizable by tar checksum */
186 	ishtml,		/* html keywords */
187 	iscint,		/* compiler/assembler intermediate */
188 	islimbo,	/* limbo source */
189 	isc,		/* c & alef compiler key words */
190 	isas,		/* assembler key words */
191 	isp9font,	/* plan 9 font */
192 	isp9bit,	/* plan 9 image (as from /dev/window) */
193 	ismung,		/* entropy compressed/encrypted */
194 	isenglish,	/* char frequency English */
195 	isrtf,		/* rich text format */
196 	ismsdos,	/* msdos exe (virus file attachement) */
197 	isface,		/* ascii face file */
198 	0
199 };
200 
/* fallback MIME types; each already carries its trailing newline */
#define PLAIN	"text/plain\n"
#define OCTET	"application/octet-stream\n"

int mime;	/* set by -m: print MIME types instead of descriptions */
205 
206 void
207 main(int argc, char *argv[])
208 {
209 	int i, j, maxlen;
210 	char *cp;
211 	Rune r;
212 
213 	ARGBEGIN{
214 	case 'm':
215 		mime = 1;
216 		break;
217 	default:
218 		fprint(2, "usage: file [-m] [file...]\n");
219 		exits("usage");
220 	}ARGEND;
221 
222 	maxlen = 0;
223 	if(mime == 0 || argc > 1){
224 		for(i = 0; i < argc; i++) {
225 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
226 					;
227 			if(j > maxlen)
228 				maxlen = j;
229 		}
230 	}
231 	if (argc <= 0) {
232 		if(!mime)
233 			print ("stdin: ");
234 		filetype(0);
235 	}
236 	else {
237 		for(i = 0; i < argc; i++)
238 			type(argv[i], maxlen);
239 	}
240 	exits(0);
241 }
242 
243 void
244 type(char *file, int nlen)
245 {
246 	Rune r;
247 	int i;
248 	char *p;
249 
250 	if(nlen > 0){
251 		slash = 0;
252 		for (i = 0, p = file; *p; i++) {
253 			if (*p == '/')			/* find rightmost slash */
254 				slash = p;
255 			p += chartorune(&r, p);		/* count runes */
256 		}
257 		print("%s:%*s",file, nlen-i+1, "");
258 	}
259 	fname = file;
260 	if ((fd = open(file, OREAD)) < 0) {
261 		print("cannot open\n");
262 		return;
263 	}
264 	filetype(fd);
265 	close(fd);
266 }
267 
/*
 * Runes of up to four UTF-8 bytes (Unicode 4.0), held in an int
 * so that values beyond the range of Rune can be represented.
 */
typedef int Rune1;

enum
{
	UTFmax1	= 4,	/* longest UTF-8 sequence handled, in bytes */
};
276 
/*
 * Report whether the n bytes at p are enough to hold the complete
 * UTF-8 sequence announced by the first byte.  Returns 1 if so, else 0.
 */
int
fullrune1(char *p, int n)
{
	int lead, need;

	if(n < 1)
		return 0;
	lead = *(unsigned char*)p;
	if(lead < 0x80)
		need = 1;
	else if(lead < 0xE0)
		need = 2;
	else if(lead < 0xF0)
		need = 3;
	else
		need = 4;
	return n >= need;
}
295 
296 int
297 chartorune1(Rune1 *rune, char *str)
298 {
299 	int c, c1, c2, c3, n;
300 	Rune r;
301 
302 	c = *(uchar*)str;
303 	if(c < 0xF0){
304 		r = 0;
305 		n = chartorune(&r, str);
306 		*rune = r;
307 		return n;
308 	}
309 	c &= ~0xF0;
310 	c1 = *(uchar*)(str+1) & ~0x80;
311 	c2 = *(uchar*)(str+2) & ~0x80;
312 	c3 = *(uchar*)(str+3) & ~0x80;
313 	n = (c<<18) | (c1<<12) | (c2<<6) | c3;
314 	if(n < 0x10000 || n > 0x10FFFF){
315 		*rune = Runeerror;
316 		return 1;
317 	}
318 	*rune = n;
319 	return 4;
320 }
321 
322 void
323 filetype(int fd)
324 {
325 	Rune1 r;
326 	int i, f, n;
327 	char *p, *eob;
328 
329 	free(mbuf);
330 	mbuf = dirfstat(fd);
331 	if(mbuf == nil){
332 		print("cannot stat: %r\n");
333 		return;
334 	}
335 	if(mbuf->mode & DMDIR) {
336 		print(mime ? "text/directory\n" : "directory\n");
337 		return;
338 	}
339 	if(mbuf->type != 'M' && mbuf->type != '|') {
340 		print(mime ? OCTET : "special file #%c/%s\n",
341 			mbuf->type, mbuf->name);
342 		return;
343 	}
344 	/* may be reading a pipe on standard input */
345 	nbuf = readn(fd, buf, sizeof(buf)-1);
346 	if(nbuf < 0) {
347 		print("cannot read\n");
348 		return;
349 	}
350 	if(nbuf == 0) {
351 		print(mime ? PLAIN : "empty file\n");
352 		return;
353 	}
354 	buf[nbuf] = 0;
355 
356 	/*
357 	 * build histogram table
358 	 */
359 	memset(cfreq, 0, sizeof(cfreq));
360 	for (i = 0; language[i].name; i++)
361 		language[i].count = 0;
362 	eob = (char *)buf+nbuf;
363 	for(n = 0, p = (char *)buf; p < eob; n++) {
364 		if (!fullrune1(p, eob-p) && eob-p < UTFmax1)
365 			break;
366 		p += chartorune1(&r, p);
367 		if (r == 0)
368 			f = Cnull;
369 		else if (r <= 0x7f) {
370 			if (!isprint(r) && !isspace(r))
371 				f = Ceascii;	/* ASCII control char */
372 			else f = r;
373 		} else if (r == 0x80) {
374 			bump_utf_count(r);
375 			f = Cutf;
376 		} else if (r < 0xA0)
377 			f = Cbinary;	/* Invalid Runes */
378 		else if (r <= 0xff)
379 			f = Clatin;	/* Latin 1 */
380 		else {
381 			bump_utf_count(r);
382 			f = Cutf;		/* UTF extension */
383 		}
384 		cfreq[f]++;			/* ASCII chars peg directly */
385 	}
386 	/*
387 	 * gross classify
388 	 */
389 	if (cfreq[Cbinary])
390 		guess = Fbinary;
391 	else if (cfreq[Cutf])
392 		guess = Futf;
393 	else if (cfreq[Clatin])
394 		guess = Flatin;
395 	else if (cfreq[Ceascii])
396 		guess = Feascii;
397 	else if (cfreq[Cnull])
398 		guess = Fbinary;
399 	else
400 		guess = Fascii;
401 	/*
402 	 * lookup dictionary words
403 	 */
404 	memset(wfreq, 0, sizeof(wfreq));
405 	if(guess == Fascii || guess == Flatin || guess == Futf)
406 		wordfreq();
407 	/*
408 	 * call individual classify routines
409 	 */
410 	for(i=0; call[i]; i++)
411 		if((*call[i])())
412 			return;
413 
414 	/*
415 	 * if all else fails,
416 	 * print out gross classification
417 	 */
418 	if (nbuf < 100 && !mime)
419 		print(mime ? PLAIN : "short ");
420 	if (guess == Fascii)
421 		print(mime ? PLAIN : "Ascii\n");
422 	else if (guess == Feascii)
423 		print(mime ? PLAIN : "extended ascii\n");
424 	else if (guess == Flatin)
425 		print(mime ? PLAIN : "latin ascii\n");
426 	else if (guess == Futf && utf_count() < 4)
427 		print_utf();
428 	else print(mime ? OCTET : "binary\n");
429 }
430 
431 void
432 bump_utf_count(Rune r)
433 {
434 	int low, high, mid;
435 
436 	high = sizeof(language)/sizeof(language[0])-1;
437 	for (low = 0; low < high;) {
438 		mid = (low+high)/2;
439 		if (r >= language[mid].low) {
440 			if (r <= language[mid].high) {
441 				language[mid].count++;
442 				break;
443 			} else low = mid+1;
444 		} else high = mid;
445 	}
446 }
447 
448 int
449 utf_count(void)
450 {
451 	int i, count;
452 
453 	count = 0;
454 	for (i = 0; language[i].name; i++)
455 		if (language[i].count > 0)
456 			switch (language[i].mode) {
457 			case Normal:
458 			case First:
459 				count++;
460 				break;
461 			default:
462 				break;
463 			}
464 	return count;
465 }
466 
467 int
468 chkascii(void)
469 {
470 	int i;
471 
472 	for (i = 'a'; i < 'z'; i++)
473 		if (cfreq[i])
474 			return 1;
475 	for (i = 'A'; i < 'Z'; i++)
476 		if (cfreq[i])
477 			return 1;
478 	return 0;
479 }
480 
481 int
482 find_first(char *name)
483 {
484 	int i;
485 
486 	for (i = 0; language[i].name != 0; i++)
487 		if (language[i].mode == First
488 			&& strcmp(language[i].name, name) == 0)
489 			return i;
490 	return -1;
491 }
492 
493 void
494 print_utf(void)
495 {
496 	int i, printed, j;
497 
498 	if(mime){
499 		print(PLAIN);
500 		return;
501 	}
502 	if (chkascii()) {
503 		printed = 1;
504 		print("Ascii");
505 	} else
506 		printed = 0;
507 	for (i = 0; language[i].name; i++)
508 		if (language[i].count) {
509 			switch(language[i].mode) {
510 			case Multi:
511 				j = find_first(language[i].name);
512 				if (j < 0)
513 					break;
514 				if (language[j].count > 0)
515 					break;
516 				/* Fall through */
517 			case Normal:
518 			case First:
519 				if (printed)
520 					print(" & ");
521 				else printed = 1;
522 				print("%s", language[i].name);
523 				break;
524 			case Shared:
525 			default:
526 				break;
527 			}
528 		}
529 	if(!printed)
530 		print("UTF");
531 	print(" text\n");
532 }
533 
534 void
535 wordfreq(void)
536 {
537 	int low, high, mid, r;
538 	uchar *p, *p2, c;
539 
540 	p = buf;
541 	for(;;) {
542 		while (p < buf+nbuf && !isalpha(*p))
543 			p++;
544 		if (p >= buf+nbuf)
545 			return;
546 		p2 = p;
547 		while(p < buf+nbuf && isalpha(*p))
548 			p++;
549 		c = *p;
550 		*p = 0;
551 		high = sizeof(dict)/sizeof(dict[0]);
552 		for(low = 0;low < high;) {
553 			mid = (low+high)/2;
554 			r = strcmp(dict[mid].word, (char*)p2);
555 			if(r == 0) {
556 				wfreq[dict[mid].class]++;
557 				break;
558 			}
559 			if(r < 0)
560 				low = mid+1;
561 			else
562 				high = mid;
563 		}
564 		*p++ = c;
565 	}
566 }
567 
568 typedef struct Filemagic Filemagic;
569 struct Filemagic {
570 	ulong x;
571 	ulong mask;
572 	char *desc;
573 	char *mime;
574 };
575 
576 /*
577  * integers in this table must be as seen on a little-endian machine
578  * when read from a file.
579  */
580 Filemagic long0tab[] = {
581 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
582 	/* "pac1" */
583 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
584 	/* "pXc2 */
585 	0x32630070,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
586 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
587 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
588 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip",
589 	070707,		0xFFFF,		"cpio archive\n", OCTET,
590 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
591 	0xfaff,		0xfeff,		"mp3 audio\n",	"audio/mpeg",
592 	0xfeff0000,	0xffffffff,	"utf-32be\n",	"text/plain charset=utf-32be",
593 	0xfffe,		0xffffffff,	"utf-32le\n",	"text/plain charset=utf-32le",
594 	0xfeff,		0xffff,		"utf-16be\n",	"text/plain charset=utf-16be",
595 	0xfffe,		0xffff,		"utf-16le\n",	"text/plain charset=utf-16le",
596 	/*
597 	 * venti & fossil magic numbers are stored big-endian on disk,
598 	 * thus the numbers appear reversed in this table.
599 	 */
600 	0xad4e5cd1,	0xFFFFFFFF,	"venti arena\n", OCTET,
601 };
602 
603 int
604 filemagic(Filemagic *tab, int ntab, ulong x)
605 {
606 	int i;
607 
608 	for(i=0; i<ntab; i++)
609 		if((x&tab[i].mask) == tab[i].x){
610 			print(mime ? tab[i].mime : tab[i].desc);
611 			return 1;
612 		}
613 	return 0;
614 }
615 
616 int
617 long0(void)
618 {
619 	return filemagic(long0tab, nelem(long0tab), LENDIAN(buf));
620 }
621 
622 typedef struct Fileoffmag Fileoffmag;
623 struct Fileoffmag {
624 	ulong	off;
625 	Filemagic;
626 };
627 
628 /*
629  * integers in this table must be as seen on a little-endian machine
630  * when read from a file.
631  */
632 Fileoffmag longofftab[] = {
633 	/*
634 	 * venti & fossil magic numbers are stored big-endian on disk,
635 	 * thus the numbers appear reversed in this table.
636 	 */
637 	256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET,
638 	256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET,
639 	128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET,
640 };
641 
642 int
643 fileoffmagic(Fileoffmag *tab, int ntab)
644 {
645 	int i;
646 	ulong x;
647 	Fileoffmag *tp;
648 	uchar buf[sizeof(long)];
649 
650 	for(i=0; i<ntab; i++) {
651 		tp = tab + i;
652 		seek(fd, tp->off, 0);
653 		if (readn(fd, buf, sizeof buf) != sizeof buf)
654 			continue;
655 		x = LENDIAN(buf);
656 		if((x&tp->mask) == tp->x){
657 			print(mime? tp->mime: tp->desc);
658 			return 1;
659 		}
660 	}
661 	return 0;
662 }
663 
664 int
665 longoff(void)
666 {
667 	return fileoffmagic(longofftab, nelem(longofftab));
668 }
669 
670 int
671 isexec(void)
672 {
673 	Fhdr f;
674 
675 	seek(fd, 0, 0);		/* reposition to start of file */
676 	if(crackhdr(fd, &f)) {
677 		print(mime ? OCTET : "%s\n", f.name);
678 		return 1;
679 	}
680 	return 0;
681 }
682 
683 
/*
 * Tar header layout, from tar.c.
 */
enum
{
	NAMSIZ	= 100,	/* size of the name fields */
	TBLOCK	= 512,	/* size of a header block */
};

union	hblock
{
	char	dummy[TBLOCK];		/* the block as raw bytes */
	struct	header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];
		char	linkflag;
		char	linkname[NAMSIZ];
		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];  /* if non-null, path = prefix "/" name */
	} dbuf;
};
711 
712 int
713 checksum(union hblock *hp)
714 {
715 	int i;
716 	char *cp;
717 	struct header *hdr = &hp->dbuf;
718 
719 	for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
720 		*cp = ' ';
721 	i = 0;
722 	for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
723 		i += *cp & 0xff;
724 	return i;
725 }
726 
727 int
728 istar(void)
729 {
730 	int chksum;
731 	char tblock[TBLOCK];
732 	union hblock *hp = (union hblock *)tblock;
733 	struct header *hdr = &hp->dbuf;
734 
735 	seek(fd, 0, 0);		/* reposition to start of file */
736 	if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
737 		return 0;
738 	chksum = strtol(hdr->chksum, 0, 8);
739 	if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
740 		if (strcmp(hdr->magic, "ustar") == 0)
741 			print(mime? "application/x-ustar\n":
742 				"posix tar archive\n");
743 		else
744 			print(mime? "application/x-tar\n": "tar archive\n");
745 		return 1;
746 	}
747 	return 0;
748 }
749 
/*
 * initial words to classify file
 *
 * key is compared with the first 'length' bytes of the file; a length
 * of -1 means the whole key.  filetype and mime are printed followed
 * by a newline, so neither carries one.  Rows are tried in order and
 * the table ends with an all-zero row.
 */
struct	FILE_STRING
{
	char 	*key;
	char	*filetype;
	int	length;
	char	*mime;
} file_string[] =
{
	"!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream",
	"!<arch>\n",		"archive",			8,	"application/octet-stream",
	"070707",		"cpio archive - ascii header",	6,	"application/octet-stream",
	"#!/bin/rc",		"rc executable file",		9,	"text/plain",
	"#!/bin/sh",		"sh executable file",		9,	"text/plain",
	"%!",			"postscript",			2,	"application/postscript",
	"\004%!",		"postscript",			3,	"application/postscript",
	"x T post",		"troff output for post",	8,	"application/troff",
	"x T Latin1",		"troff output for Latin1",	10,	"application/troff",
	"x T utf",		"troff output for UTF",		7,	"application/troff",
	"x T 202",		"troff output for 202",		7,	"application/troff",
	"x T aps",		"troff output for aps",		7,	"application/troff",
	"GIF",			"GIF image", 			3,	"image/gif",
	"\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript",
	"%PDF",			"PDF",				4,	"application/pdf",
	"<html>\n",		"HTML file",			7,	"text/html",
	"<HTML>\n",		"HTML file",			7,	"text/html",
	"\111\111\052\000",	"tiff",				4,	"image/tiff",
	"\115\115\000\052",	"tiff",				4,	"image/tiff",
	"\377\330\377\340",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\341",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\333",	"jpeg",				4,	"image/jpeg",
	"BM",			"bmp",				2,	"image/bmp",
	"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream",
	"<MakerFile ",		"FrameMaker file",		11,	"application/framemaker",
	"\033%-12345X",	"HPJCL file",		9,	"application/hpjcl",
	"ID3",			"mp3 audio with id3",	3,	"audio/mpeg",
	"\211PNG",		"PNG image",		4,	"image/png",
	"P3\n",			"ppm",				3,	"image/ppm",
	"P6\n",			"ppm",				3,	"image/ppm",
	/* the "XPM" comment opens an X pixmap, not an X bitmap */
	"/* XPM */\n",	"xpm",				10,	"image/x-xpixmap",
	".HTML ",		"troff -ms input",	6,	"text/troff",
	".LP",			"troff -ms input",	3,	"text/troff",
	".ND",			"troff -ms input",	3,	"text/troff",
	".PP",			"troff -ms input",	3,	"text/troff",
	".TL",			"troff -ms input",	3,	"text/troff",
	".TR",			"troff -ms input",	3,	"text/troff",
	".TH",			"manual page",		3,	"text/troff",
	".\\\"",		"troff input",		3,	"text/troff",
	".de",			"troff input",		3,	"text/troff",
	".if",			"troff input",		3,	"text/troff",
	".nr",			"troff input",		3,	"text/troff",
	".tr",			"troff input",		3,	"text/troff",
	"vac:",			"venti score",		4,	"text/plain",
	"-----BEGIN CERTIFICATE-----\n",
				"pem certificate",	-1,	"text/plain",
	"-----BEGIN TRUSTED CERTIFICATE-----\n",
				"pem trusted certificate", -1,	"text/plain",
	"-----BEGIN X509 CERTIFICATE-----\n",
				"pem x.509 certificate", -1,	"text/plain",
	"subject=/C=",		"pem certificate with header", -1, "text/plain",
	"process snapshot ",	"process snapshot",	-1,	"application/snapfs",
	0,0,0,0
};
815 
816 int
817 istring(void)
818 {
819 	int i, l;
820 	struct FILE_STRING *p;
821 
822 	for(p = file_string; p->key; p++) {
823 		l = p->length;
824 		if(l == -1)
825 			l = strlen(p->key);
826 		if(nbuf >= l && memcmp(buf, p->key, l) == 0) {
827 			if(mime)
828 				print("%s\n", p->mime);
829 			else
830 				print("%s\n", p->filetype);
831 			return 1;
832 		}
833 	}
834 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
835 		for(i = 5; i < nbuf; i++)
836 			if(buf[i] == '\n')
837 				break;
838 		if(mime)
839 			print(OCTET);
840 		else
841 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
842 		return 1;
843 	}
844 	return 0;
845 }
846 
847 struct offstr
848 {
849 	ulong	off;
850 	struct FILE_STRING;
851 } offstrs[] = {
852 	32*1024, "\001CD001\001",	"ISO9660 CD image",	7,	OCTET,
853 	0, 0, 0, 0, 0
854 };
855 
856 int
857 isoffstr(void)
858 {
859 	int n;
860 	char buf[256];
861 	struct offstr *p;
862 
863 	for(p = offstrs; p->key; p++) {
864 		seek(fd, p->off, 0);
865 		n = p->length;
866 		if (n > sizeof buf)
867 			n = sizeof buf;
868 		if (readn(fd, buf, n) != n)
869 			continue;
870 		if(memcmp(buf, p->key, n) == 0) {
871 			if(mime)
872 				print("%s\n", p->mime);
873 			else
874 				print("%s\n", p->filetype);
875 			return 1;
876 		}
877 	}
878 	return 0;
879 }
880 
881 int
882 iff(void)
883 {
884 	if (strncmp((char*)buf, "FORM", 4) == 0 &&
885 	    strncmp((char*)buf+8, "AIFF", 4) == 0) {
886 		print("%s\n", mime? "audio/x-aiff": "aiff audio");
887 		return 1;
888 	}
889 	return 0;
890 }
891 
892 char*	html_string[] =
893 {
894 	"title",
895 	"body",
896 	"head",
897 	"strong",
898 	"h1",
899 	"h2",
900 	"h3",
901 	"h4",
902 	"h5",
903 	"h6",
904 	"ul",
905 	"li",
906 	"dl",
907 	"br",
908 	"em",
909 	0,
910 };
911 
912 int
913 ishtml(void)
914 {
915 	uchar *p, *q;
916 	int i, count;
917 
918 		/* compare strings between '<' and '>' to html table */
919 	count = 0;
920 	p = buf;
921 	for(;;) {
922 		while (p < buf+nbuf && *p != '<')
923 			p++;
924 		p++;
925 		if (p >= buf+nbuf)
926 			break;
927 		if(*p == '/')
928 			p++;
929 		q = p;
930 		while(p < buf+nbuf && *p != '>')
931 			p++;
932 		if (p >= buf+nbuf)
933 			break;
934 		for(i = 0; html_string[i]; i++) {
935 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
936 				if(count++ > 4) {
937 					print(mime ? "text/html\n" : "HTML file\n");
938 					return 1;
939 				}
940 				break;
941 			}
942 		}
943 		p++;
944 	}
945 	return 0;
946 }
947 
/*
 * Header field names, colon included, that isrfc822() counts;
 * they are compared without regard to case.  The list ends with 0.
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",	/* the field is spelled with a hyphen */
	"sender:",
	0,
};
959 
960 int
961 isrfc822(void)
962 {
963 
964 	char *p, *q, *r;
965 	int i, count;
966 
967 	count = 0;
968 	p = (char*)buf;
969 	for(;;) {
970 		q = strchr(p, '\n');
971 		if(q == nil)
972 			break;
973 		*q = 0;
974 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
975 			count++;
976 			*q = '\n';
977 			p = q+1;
978 			continue;
979 		}
980 		*q = '\n';
981 		if(*p != '\t' && *p != ' '){
982 			r = strchr(p, ':');
983 			if(r == 0 || r > q)
984 				break;
985 			for(i = 0; rfc822_string[i]; i++) {
986 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
987 					count++;
988 					break;
989 				}
990 			}
991 		}
992 		p = q+1;
993 	}
994 	if(count >= 3){
995 		print(mime ? "message/rfc822\n" : "email file\n");
996 		return 1;
997 	}
998 	return 0;
999 }
1000 
1001 int
1002 ismbox(void)
1003 {
1004 	char *p, *q;
1005 
1006 	p = (char*)buf;
1007 	q = strchr(p, '\n');
1008 	if(q == nil)
1009 		return 0;
1010 	*q = 0;
1011 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
1012 		print(mime ? "text/plain\n" : "mail box\n");
1013 		return 1;
1014 	}
1015 	*q = '\n';
1016 	return 0;
1017 }
1018 
1019 int
1020 iscint(void)
1021 {
1022 	int type;
1023 	char *name;
1024 	Biobuf b;
1025 
1026 	if(Binit(&b, fd, OREAD) == Beof)
1027 		return 0;
1028 	seek(fd, 0, 0);
1029 	type = objtype(&b, &name);
1030 	if(type < 0)
1031 		return 0;
1032 	if(mime)
1033 		print(OCTET);
1034 	else
1035 		print("%s intermediate\n", name);
1036 	return 1;
1037 }
1038 
1039 int
1040 isc(void)
1041 {
1042 	int n;
1043 
1044 	n = wfreq[I1];
1045 	/*
1046 	 * includes
1047 	 */
1048 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1049 		goto yes;
1050 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1051 		goto yes;
1052 	/*
1053 	 * declarations
1054 	 */
1055 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
1056 		goto yes;
1057 	/*
1058 	 * assignments
1059 	 */
1060 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
1061 		goto yes;
1062 	return 0;
1063 
1064 yes:
1065 	if(mime){
1066 		print(PLAIN);
1067 		return 1;
1068 	}
1069 	if(wfreq[Alword] > 0)
1070 		print("alef program\n");
1071 	else
1072 		print("c program\n");
1073 	return 1;
1074 }
1075 
1076 int
1077 islimbo(void)
1078 {
1079 
1080 	/*
1081 	 * includes
1082 	 */
1083 	if(wfreq[Lword] < 4)
1084 		return 0;
1085 	print(mime ? PLAIN : "limbo program\n");
1086 	return 1;
1087 }
1088 
1089 int
1090 isas(void)
1091 {
1092 
1093 	/*
1094 	 * includes
1095 	 */
1096 	if(wfreq[Aword] < 2)
1097 		return 0;
1098 	print(mime ? PLAIN : "as program\n");
1099 	return 1;
1100 }
1101 
1102 /*
1103  * low entropy means encrypted
1104  */
1105 int
1106 ismung(void)
1107 {
1108 	int i, bucket[8];
1109 	float cs;
1110 
1111 	if(nbuf < 64)
1112 		return 0;
1113 	memset(bucket, 0, sizeof(bucket));
1114 	for(i=nbuf-64; i<nbuf; i++)
1115 		bucket[(buf[i]>>5)&07] += 1;
1116 
1117 	cs = 0.;
1118 	for(i=0; i<8; i++)
1119 		cs += (bucket[i]-8)*(bucket[i]-8);
1120 	cs /= 8.;
1121 	if(cs <= 24.322) {
1122 		if(buf[0]==0x1f && buf[1]==0x9d)
1123 			print(mime ? OCTET : "compressed\n");
1124 		else
1125 		if(buf[0]==0x1f && buf[1]==0x8b)
1126 			print(mime ? OCTET : "gzip compressed\n");
1127 		else
1128 		if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h')
1129 			print(mime ? OCTET : "bzip2 compressed\n");
1130 		else
1131 			print(mime ? OCTET : "encrypted\n");
1132 		return 1;
1133 	}
1134 	return 0;
1135 }
1136 
1137 /*
1138  * english by punctuation and frequencies
1139  */
1140 int
1141 isenglish(void)
1142 {
1143 	int vow, comm, rare, badpun, punct;
1144 	char *p;
1145 
1146 	if(guess != Fascii && guess != Feascii)
1147 		return 0;
1148 	badpun = 0;
1149 	punct = 0;
1150 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
1151 		switch(*p) {
1152 		case '.':
1153 		case ',':
1154 		case ')':
1155 		case '%':
1156 		case ';':
1157 		case ':':
1158 		case '?':
1159 			punct++;
1160 			if(p[1] != ' ' && p[1] != '\n')
1161 				badpun++;
1162 		}
1163 	if(badpun*5 > punct)
1164 		return 0;
1165 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
1166 		return 0;
1167 	if(2*cfreq[';'] > cfreq['e'])
1168 		return 0;
1169 
1170 	vow = 0;
1171 	for(p="AEIOU"; *p; p++) {
1172 		vow += cfreq[*p];
1173 		vow += cfreq[tolower(*p)];
1174 	}
1175 	comm = 0;
1176 	for(p="ETAION"; *p; p++) {
1177 		comm += cfreq[*p];
1178 		comm += cfreq[tolower(*p)];
1179 	}
1180 	rare = 0;
1181 	for(p="VJKQXZ"; *p; p++) {
1182 		rare += cfreq[*p];
1183 		rare += cfreq[tolower(*p)];
1184 	}
1185 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
1186 		print(mime ? PLAIN : "English text\n");
1187 		return 1;
1188 	}
1189 	return 0;
1190 }
1191 
1192 /*
1193  * pick up a number with
1194  * syntax _*[0-9]+_
1195  */
1196 #define	P9BITLEN	12
1197 int
1198 p9bitnum(uchar *bp)
1199 {
1200 	int n, c, len;
1201 
1202 	len = P9BITLEN;
1203 	while(*bp == ' ') {
1204 		bp++;
1205 		len--;
1206 		if(len <= 0)
1207 			return -1;
1208 	}
1209 	n = 0;
1210 	while(len > 1) {
1211 		c = *bp++;
1212 		if(!isdigit(c))
1213 			return -1;
1214 		n = n*10 + c-'0';
1215 		len--;
1216 	}
1217 	if(*bp != ' ')
1218 		return -1;
1219 	return n;
1220 }
1221 
/*
 * Work out the pixel depth from the first 12-byte field of an image
 * header.
 *
 * s points at the field; *newp is set to 0 for the old format, where
 * the field holds a number n and the depth is 1<<n, and to 1 for the
 * new format, where it holds letter/number pairs whose numbers are
 * summed.
 *
 * Returns the depth, or -1 if the field is blank, the old-format
 * exponent is too large to shift by, or the new-format sum is not
 * positive or neither divides nor is a multiple of 8.  A depth of 0
 * is never returned, so callers may divide by the result.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d;
	long ld;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtol(s, 0, 0);
		if(ld < 0 || ld > 30)
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;			/* skip letter */
		d += strtoul(s, &s, 10);
	}

	if(d <= 0)
		return -1;
	if(d % 8 == 0 || 8 % d == 0)
		return d;
	return -1;
}
1249 
1250 int
1251 isp9bit(void)
1252 {
1253 	int dep, lox, loy, hix, hiy, px, new, cmpr;
1254 	ulong t;
1255 	long len;
1256 	char *newlabel;
1257 	uchar *cp;
1258 
1259 	cp = buf;
1260 	cmpr = 0;
1261 	newlabel = "old ";
1262 
1263 	if(memcmp(cp, "compressed\n", 11) == 0) {
1264 		cmpr = 1;
1265 		cp = buf + 11;
1266 	}
1267 
1268 	dep = depthof((char*)cp + 0*P9BITLEN, &new);
1269 	if(new)
1270 		newlabel = "";
1271 	lox = p9bitnum(cp + 1*P9BITLEN);
1272 	loy = p9bitnum(cp + 2*P9BITLEN);
1273 	hix = p9bitnum(cp + 3*P9BITLEN);
1274 	hiy = p9bitnum(cp + 4*P9BITLEN);
1275 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1276 		return 0;
1277 
1278 	if(dep < 8){
1279 		px = 8/dep;		/* pixels per byte */
1280 		/* set l to number of bytes of data per scan line */
1281 		if(lox >= 0)
1282 			len = (hix+px-1)/px - lox/px;
1283 		else{			/* make positive before divide */
1284 			t = (-lox)+px-1;
1285 			t = (t/px)*px;
1286 			len = (t+hix+px-1)/px;
1287 		}
1288 	}else
1289 		len = (hix-lox)*dep/8;
1290 	len *= hiy - loy;		/* col length */
1291 	len += 5 * P9BITLEN;		/* size of initial ascii */
1292 
1293 	/*
1294 	 * for compressed images, don't look any further. otherwise:
1295 	 * for image file, length is non-zero and must match calculation above.
1296 	 * for /dev/window and /dev/screen the length is always zero.
1297 	 * for subfont, the subfont header should follow immediately.
1298 	 */
1299 	if (cmpr) {
1300 		print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n",
1301 			newlabel, dep);
1302 		return 1;
1303 	}
1304 	/*
1305 	 * mbuf->length == 0 probably indicates reading a pipe.
1306 	 * Ghostscript sometimes produces a little extra on the end.
1307 	 */
1308 	if (len != 0 && (mbuf->length == 0 || mbuf->length == len ||
1309 	    mbuf->length > len && mbuf->length < len+P9BITLEN)) {
1310 		print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep);
1311 		return 1;
1312 	}
1313 	if (p9subfont(buf+len)) {
1314 		print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep);
1315 		return 1;
1316 	}
1317 	return 0;
1318 }
1319 
1320 int
1321 p9subfont(uchar *p)
1322 {
1323 	int n, h, a;
1324 
1325 	/* if image too big, assume it's a subfont */
1326 	if (p+3*P9BITLEN > buf+sizeof(buf))
1327 		return 1;
1328 
1329 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1330 	if (n < 0)
1331 		return 0;
1332 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1333 	if (h < 0)
1334 		return 0;
1335 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1336 	if (a < 0)
1337 		return 0;
1338 	return 1;
1339 }
1340 
/* true for the newline, tab and blank that separate fields in a font file */
#define	WHITESPACE(c)		((c) == '\n' || (c) == '\t' || (c) == ' ')
1342 
1343 int
1344 isp9font(void)
1345 {
1346 	uchar *cp, *p;
1347 	int i, n;
1348 	char pathname[1024];
1349 
1350 	cp = buf;
1351 	if (!getfontnum(cp, &cp))	/* height */
1352 		return 0;
1353 	if (!getfontnum(cp, &cp))	/* ascent */
1354 		return 0;
1355 	for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) {
1356 		if (!getfontnum(cp, &cp))	/* min */
1357 			break;
1358 		if (!getfontnum(cp, &cp))	/* max */
1359 			return 0;
1360 		getfontnum(cp, &cp);	/* optional offset */
1361 		while (WHITESPACE(*cp))
1362 			cp++;
1363 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1364 				;
1365 			/* construct a path name, if needed */
1366 		n = 0;
1367 		if (*p != '/' && slash) {
1368 			n = slash-fname+1;
1369 			if (n < sizeof(pathname))
1370 				memcpy(pathname, fname, n);
1371 			else n = 0;
1372 		}
1373 		if (n+cp-p+4 < sizeof(pathname)) {
1374 			memcpy(pathname+n, p, cp-p);
1375 			n += cp-p;
1376 			pathname[n] = 0;
1377 			if (access(pathname, AEXIST) < 0) {
1378 				strcpy(pathname+n, ".0");
1379 				if (access(pathname, AEXIST) < 0)
1380 					return 0;
1381 			}
1382 		}
1383 	}
1384 	if (i) {
1385 		print(mime ? "text/plain\n" : "font file\n");
1386 		return 1;
1387 	}
1388 	return 0;
1389 }
1390 
1391 int
1392 getfontnum(uchar *cp, uchar **rp)
1393 {
1394 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1395 		cp++;
1396 	if (*cp < '0' || *cp > '9')
1397 		return 0;
1398 	strtoul((char *)cp, (char **)rp, 0);
1399 	if (!WHITESPACE(**rp)) {
1400 		*rp = cp;
1401 		return 0;
1402 	}
1403 	return 1;
1404 }
1405 
1406 int
1407 isrtf(void)
1408 {
1409 	if(strstr((char *)buf, "\\rtf1")){
1410 		print(mime ? "application/rtf\n" : "rich text format\n");
1411 		return 1;
1412 	}
1413 	return 0;
1414 }
1415 
1416 int
1417 ismsdos(void)
1418 {
1419 	if (buf[0] == 0x4d && buf[1] == 0x5a){
1420 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1421 		return 1;
1422 	}
1423 	return 0;
1424 }
1425 
1426 int
1427 iself(void)
1428 {
1429 	static char *cpu[] = {		/* NB: incomplete and arbitary list */
1430 	[1]	"WE32100",
1431 	[2]	"SPARC",
1432 	[3]	"i386",
1433 	[4]	"M68000",
1434 	[5]	"M88000",
1435 	[6]	"i486",
1436 	[7]	"i860",
1437 	[8]	"R3000",
1438 	[9]	"S370",
1439 	[10]	"R4000",
1440 	[15]	"HP-PA",
1441 	[18]	"sparc v8+",
1442 	[19]	"i960",
1443 	[20]	"PPC-32",
1444 	[21]	"PPC-64",
1445 	[40]	"ARM",
1446 	[41]	"Alpha",
1447 	[43]	"sparc v9",
1448 	[50]	"IA-64",
1449 	[62]	"AMD64",
1450 	[75]	"VAX",
1451 	};
1452 	static char *type[] = {
1453 	[1]	"relocatable object",
1454 	[2]	"executable",
1455 	[3]	"shared library",
1456 	[4]	"core dump",
1457 	};
1458 
1459 	if (memcmp(buf, "\x7fELF", 4) == 0){
1460 		if (!mime){
1461 			int isdifend = 0;
1462 			int n = (buf[19] << 8) | buf[18];
1463 			char *p = "unknown";
1464 			char *t = "unknown";
1465 
1466 			if (n > 0 && n < nelem(cpu) && cpu[n])
1467 				p = cpu[n];
1468 			else {
1469 				/* try the other byte order */
1470 				isdifend = 1;
1471 				n = (buf[18] << 8) | buf[19];
1472 				if (n > 0 && n < nelem(cpu) && cpu[n])
1473 					p = cpu[n];
1474 			}
1475 			if(isdifend)
1476 				n = (buf[16]<< 8) | buf[17];
1477 			else
1478 				n = (buf[17]<< 8) | buf[16];
1479 
1480 			if(n>0 && n < nelem(type) && type[n])
1481 				t = type[n];
1482 			print("%s ELF %s\n", p, t);
1483 		}
1484 		else
1485 			print("application/x-elf-executable");
1486 		return 1;
1487 	}
1488 
1489 	return 0;
1490 }
1491 
1492 int
1493 isface(void)
1494 {
1495 	int i, j, ldepth, l;
1496 	char *p;
1497 
1498 	ldepth = -1;
1499 	for(j = 0; j < 3; j++){
1500 		for(p = (char*)buf, i=0; i<3; i++){
1501 			if(p[0] != '0' || p[1] != 'x')
1502 				return 0;
1503 			if(buf[2+8] == ',')
1504 				l = 2;
1505 			else if(buf[2+4] == ',')
1506 				l = 1;
1507 			else
1508 				return 0;
1509 			if(ldepth == -1)
1510 				ldepth = l;
1511 			if(l != ldepth)
1512 				return 0;
1513 			strtoul(p, &p, 16);
1514 			if(*p++ != ',')
1515 				return 0;
1516 			while(*p == ' ' || *p == '\t')
1517 				p++;
1518 		}
1519 		if (*p++ != '\n')
1520 			return 0;
1521 	}
1522 
1523 	if(mime)
1524 		print("application/x-face\n");
1525 	else
1526 		print("face image depth %d\n", ldepth);
1527 	return 1;
1528 }
1529 
1530