xref: /plan9/sys/src/cmd/file.c (revision ec59a3ddbfceee0efe34584c2c9981a5e5ff1ec4)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/* assemble a little-endian 32-bit value from the four bytes at p */
#define	LENDIAN(p)	((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24))

uchar	buf[6001];	/* leading bytes of the current file; filetype NUL-terminates them */
short	cfreq[140];	/* histogram of character classes, built by filetype */
short	wfreq[50];	/* counts per dictionary word class, built by wordfreq */
int	nbuf;		/* number of bytes read into buf */
Dir*	mbuf;		/* dirfstat result for the current file */
int	fd;		/* descriptor opened by type; re-read by isexec, istar and iscint */
char 	*fname;		/* name of the current file, set by type */
char	*slash;		/* rightmost '/' in fname, or 0; used by isp9font */
20 
/*
 * Cword..I3 are the word classes stored in dict[] and counted in wfreq[];
 * Clatin..Cutf are the non-ASCII character classes counted in cfreq[]
 * (plain ASCII characters index cfreq[] by their own value).
 */
enum
{
	Cword	= 0,	/* C keyword */
	Fword	= 1,	/* presumably Fortran keyword; not consulted by any classifier here */
	Aword	= 2,	/* assembler keyword */
	Alword	= 3,	/* alef keyword */
	Lword	= 4,	/* limbo keyword */
	I1	= 5,	/* the word "include" */
	I2	= 6,	/* header base name (bio, libc, stdio, u) */
	I3	= 7,	/* the word "h" */

	Clatin	= 128,	/* Latin-1 character */
	Cbinary	= 129,	/* invalid rune */
	Cnull	= 130,	/* NUL */
	Ceascii	= 131,	/* ASCII control character */
	Cutf	= 132,	/* rune beyond Latin-1 */
};
37 struct
38 {
39 	char*	word;
40 	int	class;
41 } dict[] =
42 {
43 	"PATH",		Lword,
44 	"TEXT",		Aword,
45 	"adt",		Alword,
46 	"aggr",		Alword,
47 	"alef",		Alword,
48 	"array",	Lword,
49 	"block",	Fword,
50 	"chan",		Alword,
51 	"char",		Cword,
52 	"common",	Fword,
53 	"con",		Lword,
54 	"data",		Fword,
55 	"dimension",	Fword,
56 	"double",	Cword,
57 	"extern",	Cword,
58 	"bio",		I2,
59 	"float",	Cword,
60 	"fn",		Lword,
61 	"function",	Fword,
62 	"h",		I3,
63 	"implement",	Lword,
64 	"import",	Lword,
65 	"include",	I1,
66 	"int",		Cword,
67 	"integer",	Fword,
68 	"iota",		Lword,
69 	"libc",		I2,
70 	"long",		Cword,
71 	"module",	Lword,
72 	"real",		Fword,
73 	"ref",		Lword,
74 	"register",	Cword,
75 	"self",		Lword,
76 	"short",	Cword,
77 	"static",	Cword,
78 	"stdio",	I2,
79 	"struct",	Cword,
80 	"subroutine",	Fword,
81 	"u",		I2,
82 	"void",		Cword,
83 };
84 
/* values of the 'mode' field in a language[] entry */
enum
{
	Normal	= 0,	/* ordinary entry */
	First	= 1,	/* first entry for a language spanning several ranges */
	Multi	= 2,	/* later entries for such a language */
	Shared	= 3,	/* codes used in several languages */
};
92 
93 struct
94 {
95 	int	mode;		/* see enum above */
96 	int 	count;
97 	int	low;
98 	int	high;
99 	char	*name;
100 
101 } language[] =
102 {
103 	Normal, 0,	0x0080, 0x0080,	"Extended Latin",
104 	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
105 	Normal,	0,	0x0370,	0x03FF,	"Greek",
106 	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
107 	Normal,	0,	0x0530,	0x058F,	"Armenian",
108 	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
109 	Normal,	0,	0x0600,	0x06FF,	"Arabic",
110 	Normal,	0,	0x0900,	0x097F,	"Devanagari",
111 	Normal,	0,	0x0980,	0x09FF,	"Bengali",
112 	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
113 	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
114 	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
115 	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
116 	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
117 	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
118 	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
119 	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
120 	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
121 	Normal,	0,	0x1000,	0x105F,	"Tibetan",
122 	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
123 	Normal,	0,	0x3040,	0x30FF,	"Japanese",
124 	Normal,	0,	0x3100,	0x312F,	"Chinese",
125 	First,	0,	0x3130,	0x318F,	"Korean",
126 	Multi,	0,	0x3400,	0x3D2F,	"Korean",
127 	Shared,	0,	0x4e00,	0x9fff,	"CJK",
128 	Normal,	0,	0,	0,	0,		/* terminal entry */
129 };
130 
131 
/* gross classification of the current file, chosen by filetype */
enum
{
	Fascii	= 0,	/* printable ascii */
	Flatin	= 1,	/* latin 1 */
	Futf	= 2,	/* UTF character set */
	Fbinary	= 3,	/* binary */
	Feascii	= 4,	/* ASCII with control chars */
	Fnull	= 5,	/* NULL in file */
} guess;
141 
/*
 * Forward declarations.  The is*(), iff(), istring() and long0()
 * routines are the classifiers listed in call[]: each returns non-zero
 * after printing a verdict for the current file, or zero to let the
 * next classifier try.
 */
void	bump_utf_count(Rune);
int	cistrncmp(char*, char*, int);
void	filetype(int);
int	getfontnum(uchar*, uchar**);
int	isas(void);
int	isc(void);
int	iscint(void);
int	isenglish(void);
int	ishp(void);
int	ishtml(void);
int	isrfc822(void);
int	ismbox(void);
int	islimbo(void);
int	ismung(void);
int	isp9bit(void);
int	isp9font(void);
int	isrtf(void);
int	ismsdos(void);
int	iself(void);
int	istring(void);
int	iff(void);
int	long0(void);
int	istar(void);
int	isface(void);
int	isexec(void);
int	p9bitnum(uchar*);
int	p9subfont(uchar*);
void	print_utf(void);
void	type(char*, int);
int	utf_count(void);
void	wordfreq(void);
173 
/*
 * Classifiers, tried by filetype in this order until one returns
 * non-zero.  The nil entry terminates the list.
 */
int	(*call[])(void) =
{
	long0,		/* recognizable by first 4 bytes */
	istring,	/* recognizable by first string */
	iself,		/* ELF (foreign) executable */
	isexec,		/* native executables */
	iff,		/* interchange file format (strings) */
	isrfc822,	/* email file */
	ismbox,		/* mail box */
	istar,		/* recognizable by tar checksum */
	ishtml,		/* html keywords */
	iscint,		/* compiler/assembler intermediate */
	islimbo,	/* limbo source */
	isc,		/* c & alef compiler key words */
	isas,		/* assembler key words */
	ismung,		/* entropy compressed/encrypted */
	isp9font,	/* plan 9 font */
	isp9bit,	/* plan 9 image (as from /dev/window) */
	isenglish,	/* char frequency English */
	isrtf,		/* rich text format */
	ismsdos,	/* msdos exe (virus file attachment) */
	isface,		/* ascii face file */
	0
};
198 
int mime;	/* set by the -m flag: print MIME types instead of descriptions */

/* MIME answers used by many classifiers; both include the trailing newline */
#define OCTET	"application/octet-stream\n"
#define PLAIN	"text/plain\n"
203 
204 void
205 main(int argc, char *argv[])
206 {
207 	int i, j, maxlen;
208 	char *cp;
209 	Rune r;
210 
211 	ARGBEGIN{
212 	case 'm':
213 		mime = 1;
214 		break;
215 	default:
216 		fprint(2, "usage: file [-m] [file...]\n");
217 		exits("usage");
218 	}ARGEND;
219 
220 	maxlen = 0;
221 	if(mime == 0 || argc > 1){
222 		for(i = 0; i < argc; i++) {
223 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
224 					;
225 			if(j > maxlen)
226 				maxlen = j;
227 		}
228 	}
229 	if (argc <= 0) {
230 		if(!mime)
231 			print ("stdin: ");
232 		filetype(0);
233 	}
234 	else {
235 		for(i = 0; i < argc; i++)
236 			type(argv[i], maxlen);
237 	}
238 	exits(0);
239 }
240 
241 void
242 type(char *file, int nlen)
243 {
244 	Rune r;
245 	int i;
246 	char *p;
247 
248 	if(nlen > 0){
249 		slash = 0;
250 		for (i = 0, p = file; *p; i++) {
251 			if (*p == '/')			/* find rightmost slash */
252 				slash = p;
253 			p += chartorune(&r, p);		/* count runes */
254 		}
255 		print("%s:%*s",file, nlen-i+1, "");
256 	}
257 	fname = file;
258 	if ((fd = open(file, OREAD)) < 0) {
259 		print("cannot open\n");
260 		return;
261 	}
262 	filetype(fd);
263 	close(fd);
264 }
265 
266 void
267 filetype(int fd)
268 {
269 	Rune r;
270 	int i, f, n;
271 	char *p, *eob;
272 
273 	free(mbuf);
274 	mbuf = dirfstat(fd);
275 	if(mbuf == nil){
276 		print("cannot stat: %r\n");
277 		return;
278 	}
279 	if(mbuf->mode & DMDIR) {
280 		print(mime ? "text/directory\n" : "directory\n");
281 		return;
282 	}
283 	if(mbuf->type != 'M' && mbuf->type != '|') {
284 		print(mime ? OCTET : "special file #%c/%s\n",
285 			mbuf->type, mbuf->name);
286 		return;
287 	}
288 	nbuf = read(fd, buf, sizeof(buf)-1);
289 
290 	if(nbuf < 0) {
291 		print("cannot read\n");
292 		return;
293 	}
294 	if(nbuf == 0) {
295 		print(mime ? PLAIN : "empty file\n");
296 		return;
297 	}
298 	buf[nbuf] = 0;
299 
300 	/*
301 	 * build histogram table
302 	 */
303 	memset(cfreq, 0, sizeof(cfreq));
304 	for (i = 0; language[i].name; i++)
305 		language[i].count = 0;
306 	eob = (char *)buf+nbuf;
307 	for(n = 0, p = (char *)buf; p < eob; n++) {
308 		if (!fullrune(p, eob-p) && eob-p < UTFmax)
309 			break;
310 		p += chartorune(&r, p);
311 		if (r == 0)
312 			f = Cnull;
313 		else if (r <= 0x7f) {
314 			if (!isprint(r) && !isspace(r))
315 				f = Ceascii;	/* ASCII control char */
316 			else f = r;
317 		} else if (r == 0x080) {
318 			bump_utf_count(r);
319 			f = Cutf;
320 		} else if (r < 0xA0)
321 				f = Cbinary;	/* Invalid Runes */
322 		else if (r <= 0xff)
323 				f = Clatin;	/* Latin 1 */
324 		else {
325 			bump_utf_count(r);
326 			f = Cutf;		/* UTF extension */
327 		}
328 		cfreq[f]++;			/* ASCII chars peg directly */
329 	}
330 	/*
331 	 * gross classify
332 	 */
333 	if (cfreq[Cbinary])
334 		guess = Fbinary;
335 	else if (cfreq[Cutf])
336 		guess = Futf;
337 	else if (cfreq[Clatin])
338 		guess = Flatin;
339 	else if (cfreq[Ceascii])
340 		guess = Feascii;
341 	else if (cfreq[Cnull] == n) {
342 		print(mime ? OCTET : "first block all null bytes\n");
343 		return;
344 	}
345 	else guess = Fascii;
346 	/*
347 	 * lookup dictionary words
348 	 */
349 	memset(wfreq, 0, sizeof(wfreq));
350 	if(guess == Fascii || guess == Flatin || guess == Futf)
351 		wordfreq();
352 	/*
353 	 * call individual classify routines
354 	 */
355 	for(i=0; call[i]; i++)
356 		if((*call[i])())
357 			return;
358 
359 	/*
360 	 * if all else fails,
361 	 * print out gross classification
362 	 */
363 	if (nbuf < 100 && !mime)
364 		print(mime ? PLAIN : "short ");
365 	if (guess == Fascii)
366 		print(mime ? PLAIN : "Ascii\n");
367 	else if (guess == Feascii)
368 		print(mime ? PLAIN : "extended ascii\n");
369 	else if (guess == Flatin)
370 		print(mime ? PLAIN : "latin ascii\n");
371 	else if (guess == Futf && utf_count() < 4)
372 		print_utf();
373 	else print(mime ? OCTET : "binary\n");
374 }
375 
376 void
377 bump_utf_count(Rune r)
378 {
379 	int low, high, mid;
380 
381 	high = sizeof(language)/sizeof(language[0])-1;
382 	for (low = 0; low < high;) {
383 		mid = (low+high)/2;
384 		if (r >=language[mid].low) {
385 			if (r <= language[mid].high) {
386 				language[mid].count++;
387 				break;
388 			} else low = mid+1;
389 		} else high = mid;
390 	}
391 }
392 
393 int
394 utf_count(void)
395 {
396 	int i, count;
397 
398 	count = 0;
399 	for (i = 0; language[i].name; i++)
400 		if (language[i].count > 0)
401 			switch (language[i].mode) {
402 			case Normal:
403 			case First:
404 				count++;
405 				break;
406 			default:
407 				break;
408 			}
409 	return count;
410 }
411 
412 int
413 chkascii(void)
414 {
415 	int i;
416 
417 	for (i = 'a'; i < 'z'; i++)
418 		if (cfreq[i])
419 			return 1;
420 	for (i = 'A'; i < 'Z'; i++)
421 		if (cfreq[i])
422 			return 1;
423 	return 0;
424 }
425 
426 int
427 find_first(char *name)
428 {
429 	int i;
430 
431 	for (i = 0; language[i].name != 0; i++)
432 		if (language[i].mode == First
433 			&& strcmp(language[i].name, name) == 0)
434 			return i;
435 	return -1;
436 }
437 
438 void
439 print_utf(void)
440 {
441 	int i, printed, j;
442 
443 	if(mime){
444 		print(PLAIN);
445 		return;
446 	}
447 	if (chkascii()) {
448 		printed = 1;
449 		print("Ascii");
450 	} else
451 		printed = 0;
452 	for (i = 0; language[i].name; i++)
453 		if (language[i].count) {
454 			switch(language[i].mode) {
455 			case Multi:
456 				j = find_first(language[i].name);
457 				if (j < 0)
458 					break;
459 				if (language[j].count > 0)
460 					break;
461 				/* Fall through */
462 			case Normal:
463 			case First:
464 				if (printed)
465 					print(" & ");
466 				else printed = 1;
467 				print("%s", language[i].name);
468 				break;
469 			case Shared:
470 			default:
471 				break;
472 			}
473 		}
474 	if(!printed)
475 		print("UTF");
476 	print(" text\n");
477 }
478 
479 void
480 wordfreq(void)
481 {
482 	int low, high, mid, r;
483 	uchar *p, *p2, c;
484 
485 	p = buf;
486 	for(;;) {
487 		while (p < buf+nbuf && !isalpha(*p))
488 			p++;
489 		if (p >= buf+nbuf)
490 			return;
491 		p2 = p;
492 		while(p < buf+nbuf && isalpha(*p))
493 			p++;
494 		c = *p;
495 		*p = 0;
496 		high = sizeof(dict)/sizeof(dict[0]);
497 		for(low = 0;low < high;) {
498 			mid = (low+high)/2;
499 			r = strcmp(dict[mid].word, (char*)p2);
500 			if(r == 0) {
501 				wfreq[dict[mid].class]++;
502 				break;
503 			}
504 			if(r < 0)
505 				low = mid+1;
506 			else
507 				high = mid;
508 		}
509 		*p++ = c;
510 	}
511 }
512 
/*
 * One magic-number rule: a value matches when (value & mask) == x.
 * filemagic prints desc, or mime in mime mode, for the first match.
 */
typedef struct Filemagic Filemagic;
struct Filemagic {
	ulong x;	/* expected value after masking */
	ulong mask;	/* bits of the value that are compared */
	char *desc;	/* description printed in normal mode */
	char *mime;	/* MIME type printed in mime mode */
};
520 
521 Filemagic long0tab[] = {
522 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
523 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
524 	0x32636170,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
525 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
526 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
527 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip",
528 	070707,		0xFFFF,		"cpio archive\n", OCTET,
529 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
530 	0xfaff,		0xfeff,		"mp3 audio\n",	"audio/mpeg",
531 };
532 
533 int
534 filemagic(Filemagic *tab, int ntab, ulong x)
535 {
536 	int i;
537 
538 	for(i=0; i<ntab; i++)
539 		if((x&tab[i].mask) == tab[i].x){
540 			print(mime ? tab[i].mime : tab[i].desc);
541 			return 1;
542 		}
543 	return 0;
544 }
545 
546 int
547 long0(void)
548 {
549 	long x;
550 
551 	x = LENDIAN(buf);
552 	if(filemagic(long0tab, nelem(long0tab), x))
553 		return 1;
554 	return 0;
555 }
556 
557 int
558 isexec(void)
559 {
560 	Fhdr f;
561 
562 	seek(fd, 0, 0);		/* reposition to start of file */
563 	if(crackhdr(fd, &f)) {
564 		print(mime ? OCTET : "%s\n", f.name);
565 		return 1;
566 	}
567 	return 0;
568 }
569 
570 
/* from tar.c */
enum { NAMSIZ = 100, TBLOCK = 512 };

/*
 * One tar header block, viewable either as TBLOCK raw bytes (dummy,
 * summed by checksum) or as the header fields (dbuf).
 */
union	hblock
{
	char	dummy[TBLOCK];
	struct	header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];	/* parsed as octal by istar */
		char	linkflag;
		char	linkname[NAMSIZ];
		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];  /* if non-null, path = prefix "/" name */
	} dbuf;
};
598 
599 int
600 checksum(union hblock *hp)
601 {
602 	int i;
603 	char *cp;
604 	struct header *hdr = &hp->dbuf;
605 
606 	for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
607 		*cp = ' ';
608 	i = 0;
609 	for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
610 		i += *cp & 0xff;
611 	return i;
612 }
613 
614 int
615 istar(void)
616 {
617 	int chksum;
618 	char tblock[TBLOCK];
619 	union hblock *hp = (union hblock *)tblock;
620 	struct header *hdr = &hp->dbuf;
621 
622 	seek(fd, 0, 0);		/* reposition to start of file */
623 	if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
624 		return 0;
625 	chksum = strtol(hdr->chksum, 0, 8);
626 	if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
627 		if (strcmp(hdr->magic, "ustar") == 0)
628 			print(mime? "application/x-ustar\n":
629 				"posix tar archive\n");
630 		else
631 			print(mime? "application/x-tar\n": "tar archive\n");
632 		return 1;
633 	}
634 	return 0;
635 }
636 
/*
 * initial words to classify file
 *
 * istring compares the first 'length' bytes of the file with 'key'
 * (memcmp, so keys may contain NUL bytes) and prints 'filetype', or
 * 'mime' in mime mode, for the first row that matches.  Longer keys
 * must precede keys that are their prefixes.  The row with a nil key
 * terminates the table.
 */
struct	FILE_STRING
{
	char 	*key;
	char	*filetype;
	int	length;
	char	*mime;
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "#!/bin/rc",		"rc executable file",		9,	"text/plain" },
	{ "#!/bin/sh",		"sh executable file",		9,	"text/plain" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0",	"ghostscript fax file",	18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ "ID3",		"mp3 audio with id3",		3,	"audio/mpeg" },
	{ "\211PNG",		"PNG image",			4,	"image/png" },
	{ "P3\n",		"ppm",				3,	"image/ppm" },
	{ "P6\n",		"ppm",				3,	"image/ppm" },
	/* the XPM comment header identifies an X pixmap, not an X bitmap */
	{ "/* XPM */\n",	"xpm",				10,	"image/x-xpixmap" },
	{ 0, 0, 0, 0 }
};
682 
683 int
684 istring(void)
685 {
686 	int i;
687 	struct FILE_STRING *p;
688 
689 	for(p = file_string; p->key; p++) {
690 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
691 			if(mime)
692 				print("%s\n", p->mime);
693 			else
694 				print("%s\n", p->filetype);
695 			return 1;
696 		}
697 	}
698 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
699 		for(i = 5; i < nbuf; i++)
700 			if(buf[i] == '\n')
701 				break;
702 		if(mime)
703 			print(OCTET);
704 		else
705 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
706 		return 1;
707 	}
708 	return 0;
709 }
710 
711 int
712 iff(void)
713 {
714 	if (strncmp((char*)buf, "FORM", 4) == 0 &&
715 	    strncmp((char*)buf+8, "AIFF", 4) == 0) {
716 		print("%s\n", mime? "audio/x-aiff": "aiff audio");
717 		return 1;
718 	}
719 	return 0;
720 }
721 
/* tag names ishtml looks for between '<' and '>'; nil-terminated */
char*	html_string[] =
{
	"title", "body", "head",
	"strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl",
	"br", "em",
	0,
};
741 
742 int
743 ishtml(void)
744 {
745 	uchar *p, *q;
746 	int i, count;
747 
748 		/* compare strings between '<' and '>' to html table */
749 	count = 0;
750 	p = buf;
751 	for(;;) {
752 		while (p < buf+nbuf && *p != '<')
753 			p++;
754 		p++;
755 		if (p >= buf+nbuf)
756 			break;
757 		if(*p == '/')
758 			p++;
759 		q = p;
760 		while(p < buf+nbuf && *p != '>')
761 			p++;
762 		if (p >= buf+nbuf)
763 			break;
764 		for(i = 0; html_string[i]; i++) {
765 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
766 				if(count++ > 4) {
767 					print(mime ? "text/html\n" : "HTML file\n");
768 					return 1;
769 				}
770 				break;
771 			}
772 		}
773 		p++;
774 	}
775 	return 0;
776 }
777 
/*
 * Header field names (lower case, including the colon) that isrfc822
 * counts at the start of a line; nil-terminated.  Field names contain
 * no spaces: the reply field is spelled "Reply-To".
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",
	"sender:",
	0,
};
789 
790 int
791 isrfc822(void)
792 {
793 
794 	char *p, *q, *r;
795 	int i, count;
796 
797 	count = 0;
798 	p = (char*)buf;
799 	for(;;) {
800 		q = strchr(p, '\n');
801 		if(q == nil)
802 			break;
803 		*q = 0;
804 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
805 			count++;
806 			*q = '\n';
807 			p = q+1;
808 			continue;
809 		}
810 		*q = '\n';
811 		if(*p != '\t' && *p != ' '){
812 			r = strchr(p, ':');
813 			if(r == 0 || r > q)
814 				break;
815 			for(i = 0; rfc822_string[i]; i++) {
816 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
817 					count++;
818 					break;
819 				}
820 			}
821 		}
822 		p = q+1;
823 	}
824 	if(count >= 3){
825 		print(mime ? "message/rfc822\n" : "email file\n");
826 		return 1;
827 	}
828 	return 0;
829 }
830 
831 int
832 ismbox(void)
833 {
834 	char *p, *q;
835 
836 	p = (char*)buf;
837 	q = strchr(p, '\n');
838 	if(q == nil)
839 		return 0;
840 	*q = 0;
841 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
842 		print(mime ? "text/plain\n" : "mail box\n");
843 		return 1;
844 	}
845 	*q = '\n';
846 	return 0;
847 }
848 
849 int
850 iscint(void)
851 {
852 	int type;
853 	char *name;
854 	Biobuf b;
855 
856 	if(Binit(&b, fd, OREAD) == Beof)
857 		return 0;
858 	seek(fd, 0, 0);
859 	type = objtype(&b, &name);
860 	if(type < 0)
861 		return 0;
862 	if(mime)
863 		print(OCTET);
864 	else
865 		print("%s intermediate\n", name);
866 	return 1;
867 }
868 
869 int
870 isc(void)
871 {
872 	int n;
873 
874 	n = wfreq[I1];
875 	/*
876 	 * includes
877 	 */
878 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
879 		goto yes;
880 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
881 		goto yes;
882 	/*
883 	 * declarations
884 	 */
885 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
886 		goto yes;
887 	/*
888 	 * assignments
889 	 */
890 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
891 		goto yes;
892 	return 0;
893 
894 yes:
895 	if(mime){
896 		print(PLAIN);
897 		return 1;
898 	}
899 	if(wfreq[Alword] > 0)
900 		print("alef program\n");
901 	else
902 		print("c program\n");
903 	return 1;
904 }
905 
906 int
907 islimbo(void)
908 {
909 
910 	/*
911 	 * includes
912 	 */
913 	if(wfreq[Lword] < 4)
914 		return 0;
915 	print(mime ? PLAIN : "limbo program\n");
916 	return 1;
917 }
918 
919 int
920 isas(void)
921 {
922 
923 	/*
924 	 * includes
925 	 */
926 	if(wfreq[Aword] < 2)
927 		return 0;
928 	print(mime ? PLAIN : "as program\n");
929 	return 1;
930 }
931 
932 /*
933  * low entropy means encrypted
934  */
935 int
936 ismung(void)
937 {
938 	int i, bucket[8];
939 	float cs;
940 
941 	if(nbuf < 64)
942 		return 0;
943 	memset(bucket, 0, sizeof(bucket));
944 	for(i=nbuf-64; i<nbuf; i++)
945 		bucket[(buf[i]>>5)&07] += 1;
946 
947 	cs = 0.;
948 	for(i=0; i<8; i++)
949 		cs += (bucket[i]-8)*(bucket[i]-8);
950 	cs /= 8.;
951 	if(cs <= 24.322) {
952 		if(buf[0]==0x1f && buf[1]==0x9d)
953 			print(mime ? OCTET : "compressed\n");
954 		else
955 		if(buf[0]==0x1f && buf[1]==0x8b)
956 			print(mime ? OCTET : "gzip compressed\n");
957 		else
958 		if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h')
959 			print(mime ? OCTET : "bzip2 compressed\n");
960 		else
961 			print(mime ? OCTET : "encrypted\n");
962 		return 1;
963 	}
964 	return 0;
965 }
966 
967 /*
968  * english by punctuation and frequencies
969  */
970 int
971 isenglish(void)
972 {
973 	int vow, comm, rare, badpun, punct;
974 	char *p;
975 
976 	if(guess != Fascii && guess != Feascii)
977 		return 0;
978 	badpun = 0;
979 	punct = 0;
980 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
981 		switch(*p) {
982 		case '.':
983 		case ',':
984 		case ')':
985 		case '%':
986 		case ';':
987 		case ':':
988 		case '?':
989 			punct++;
990 			if(p[1] != ' ' && p[1] != '\n')
991 				badpun++;
992 		}
993 	if(badpun*5 > punct)
994 		return 0;
995 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
996 		return 0;
997 	if(2*cfreq[';'] > cfreq['e'])
998 		return 0;
999 
1000 	vow = 0;
1001 	for(p="AEIOU"; *p; p++) {
1002 		vow += cfreq[*p];
1003 		vow += cfreq[tolower(*p)];
1004 	}
1005 	comm = 0;
1006 	for(p="ETAION"; *p; p++) {
1007 		comm += cfreq[*p];
1008 		comm += cfreq[tolower(*p)];
1009 	}
1010 	rare = 0;
1011 	for(p="VJKQXZ"; *p; p++) {
1012 		rare += cfreq[*p];
1013 		rare += cfreq[tolower(*p)];
1014 	}
1015 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
1016 		print(mime ? PLAIN : "English text\n");
1017 		return 1;
1018 	}
1019 	return 0;
1020 }
1021 
1022 /*
1023  * pick up a number with
1024  * syntax _*[0-9]+_
1025  */
1026 #define	P9BITLEN	12
1027 int
1028 p9bitnum(uchar *bp)
1029 {
1030 	int n, c, len;
1031 
1032 	len = P9BITLEN;
1033 	while(*bp == ' ') {
1034 		bp++;
1035 		len--;
1036 		if(len <= 0)
1037 			return -1;
1038 	}
1039 	n = 0;
1040 	while(len > 1) {
1041 		c = *bp++;
1042 		if(!isdigit(c))
1043 			return -1;
1044 		n = n*10 + c-'0';
1045 		len--;
1046 	}
1047 	if(*bp != ' ')
1048 		return -1;
1049 	return n;
1050 }
1051 
/*
 * Work out the pixel depth from the first 12-byte field of an image
 * header at s.
 *
 * A field that starts with a digit holds a number d and the depth is
 * 1<<d; *newp is set to 0.  Otherwise the field is read as a sequence
 * of letter-number pairs (e.g. r8g8b8), the depth is the sum of the
 * numbers, and *newp is set to 1; only sums of 8, 16, 24 and 32 are
 * accepted.  Returns -1 for an all-blank field or an unaccepted sum.
 */
int
depthof(char *s, int *newp)
{
	char *end;
	int bits;

	*newp = 0;
	for(end = s+12; s < end && *s == ' '; s++)
		;
	if(s == end)
		return -1;
	if(*s >= '0' && *s <= '9')
		return 1<<strtol(s, 0, 0);

	*newp = 1;
	bits = 0;
	while(s < end && *s != ' '){
		s++;	/* skip letter */
		bits += strtoul(s, &s, 10);
	}
	if(bits == 8 || bits == 16 || bits == 24 || bits == 32)
		return bits;
	return -1;
}
1083 
1084 int
1085 isp9bit(void)
1086 {
1087 	int dep, lox, loy, hix, hiy, px, new;
1088 	ulong t;
1089 	long len;
1090 	char *newlabel;
1091 
1092 	newlabel = "old ";
1093 
1094 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
1095 	if(new)
1096 		newlabel = "";
1097 	lox = p9bitnum(buf + 1*P9BITLEN);
1098 	loy = p9bitnum(buf + 2*P9BITLEN);
1099 	hix = p9bitnum(buf + 3*P9BITLEN);
1100 	hiy = p9bitnum(buf + 4*P9BITLEN);
1101 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1102 		return 0;
1103 
1104 	if(dep < 8){
1105 		px = 8/dep;	/* pixels per byte */
1106 		/* set l to number of bytes of data per scan line */
1107 		if(lox >= 0)
1108 			len = (hix+px-1)/px - lox/px;
1109 		else{	/* make positive before divide */
1110 			t = (-lox)+px-1;
1111 			t = (t/px)*px;
1112 			len = (t+hix+px-1)/px;
1113 		}
1114 	}else
1115 		len = (hix-lox)*dep/8;
1116 	len *= (hiy-loy);		/* col length */
1117 	len += 5*P9BITLEN;		/* size of initial ascii */
1118 
1119 	/*
1120 	 * for image file, length is non-zero and must match calculation above
1121 	 * for /dev/window and /dev/screen the length is always zero
1122 	 * for subfont, the subfont header should follow immediately.
1123 	 */
1124 	if (len != 0 && mbuf->length == 0) {
1125 		print("%splan 9 image\n", newlabel);
1126 		return 1;
1127 	}
1128 	if (mbuf->length == len) {
1129 		print("%splan 9 image\n", newlabel);
1130 		return 1;
1131 	}
1132 	/* Ghostscript sometimes produces a little extra on the end */
1133 	if (mbuf->length < len+P9BITLEN) {
1134 		print("%splan 9 image\n", newlabel);
1135 		return 1;
1136 	}
1137 	if (p9subfont(buf+len)) {
1138 		print("%ssubfont file\n", newlabel);
1139 		return 1;
1140 	}
1141 	return 0;
1142 }
1143 
1144 int
1145 p9subfont(uchar *p)
1146 {
1147 	int n, h, a;
1148 
1149 		/* if image too big, assume it's a subfont */
1150 	if (p+3*P9BITLEN > buf+sizeof(buf))
1151 		return 1;
1152 
1153 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1154 	if (n < 0)
1155 		return 0;
1156 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1157 	if (h < 0)
1158 		return 0;
1159 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1160 	if (a < 0)
1161 		return 0;
1162 	return 1;
1163 }
1164 
1165 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1166 
1167 int
1168 isp9font(void)
1169 {
1170 	uchar *cp, *p;
1171 	int i, n;
1172 	char pathname[1024];
1173 
1174 	cp = buf;
1175 	if (!getfontnum(cp, &cp))	/* height */
1176 		return 0;
1177 	if (!getfontnum(cp, &cp))	/* ascent */
1178 		return 0;
1179 	for (i = 0;; i++) {
1180 		if (!getfontnum(cp, &cp))	/* min */
1181 			break;
1182 		if (!getfontnum(cp, &cp))	/* max */
1183 			return 0;
1184 		while (WHITESPACE(*cp))
1185 			cp++;
1186 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1187 				;
1188 			/* construct a path name, if needed */
1189 		n = 0;
1190 		if (*p != '/' && slash) {
1191 			n = slash-fname+1;
1192 			if (n < sizeof(pathname))
1193 				memcpy(pathname, fname, n);
1194 			else n = 0;
1195 		}
1196 		if (n+cp-p < sizeof(pathname)) {
1197 			memcpy(pathname+n, p, cp-p);
1198 			n += cp-p;
1199 			pathname[n] = 0;
1200 			if (access(pathname, AEXIST) < 0)
1201 				return 0;
1202 		}
1203 	}
1204 	if (i) {
1205 		print(mime ? "text/plain\n" : "font file\n");
1206 		return 1;
1207 	}
1208 	return 0;
1209 }
1210 
1211 int
1212 getfontnum(uchar *cp, uchar **rp)
1213 {
1214 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1215 		cp++;
1216 	if (*cp < '0' || *cp > '9')
1217 		return 0;
1218 	strtoul((char *)cp, (char **)rp, 0);
1219 	if (!WHITESPACE(**rp))
1220 		return 0;
1221 	return 1;
1222 }
1223 
1224 int
1225 isrtf(void)
1226 {
1227 	if(strstr((char *)buf, "\\rtf1")){
1228 		print(mime ? "application/rtf\n" : "rich text format\n");
1229 		return 1;
1230 	}
1231 	return 0;
1232 }
1233 
1234 int
1235 ismsdos(void)
1236 {
1237 	if (buf[0] == 0x4d && buf[1] == 0x5a){
1238 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1239 		return 1;
1240 	}
1241 	return 0;
1242 }
1243 
1244 int
1245 iself(void)
1246 {
1247 	char *cpu[] = {		/* NB: incomplete and arbitary list */
1248 	[1]	"WE32100",
1249 	[2]	"SPARC",
1250 	[3]	"i386",
1251 	[4]	"M68000",
1252 	[5]	"M88000",
1253 	[6]	"i486",
1254 	[7]	"i860",
1255 	[8]	"R3000",
1256 	[9]	"S370",
1257 	[10]	"R4000",
1258 	[15]	"HP-PA",
1259 	[18]	"sparc v8+",
1260 	[19]	"i960",
1261 	[20]	"PPC-32",
1262 	[21]	"PPC-64",
1263 	[40]	"ARM",
1264 	[41]	"Alpha",
1265 	[43]	"sparc v9",
1266 	[50]	"IA-46",
1267 	[62]	"AMD64",
1268 	[75]	"VAX",
1269 	};
1270 
1271 
1272 	if (memcmp(buf, "\x7fELF", 4) == 0){
1273 		if (!mime){
1274 			int n = (buf[19] << 8) | buf[18];
1275 			char *p = "unknown";
1276 
1277 			if (n > 0 && n < nelem(cpu) && cpu[n])
1278 				p = cpu[n];
1279 			else {
1280 				/* try the other byte order */
1281 				n = (buf[18] << 8) | buf[19];
1282 				if (n > 0 && n < nelem(cpu) && cpu[n])
1283 					p = cpu[n];
1284 			}
1285 			print("%s ELF executable\n", p);
1286 		}
1287 		else
1288 			print("application/x-elf-executable");
1289 		return 1;
1290 	}
1291 
1292 	return 0;
1293 }
1294 
1295 int
1296 isface(void)
1297 {
1298 	int i, j, ldepth, l;
1299 	char *p;
1300 
1301 	ldepth = -1;
1302 	for(j = 0; j < 3; j++){
1303 		for(p = (char*)buf, i=0; i<3; i++){
1304 			if(p[0] != '0' || p[1] != 'x')
1305 				return 0;
1306 			if(buf[2+8] == ',')
1307 				l = 2;
1308 			else if(buf[2+4] == ',')
1309 				l = 1;
1310 			else
1311 				return 0;
1312 			if(ldepth == -1)
1313 				ldepth = l;
1314 			if(l != ldepth)
1315 				return 0;
1316 			strtoul(p, &p, 16);
1317 			if(*p++ != ',')
1318 				return 0;
1319 			while(*p == ' ' || *p == '\t')
1320 				p++;
1321 		}
1322 		if (*p++ != '\n')
1323 			return 0;
1324 	}
1325 
1326 	if(mime)
1327 		print("application/x-face\n");
1328 	else
1329 		print("face image depth %d\n", ldepth);
1330 	return 1;
1331 }
1332 
1333