xref: /plan9/sys/src/cmd/file.c (revision 2009dc88df672c2a416b2d1e1e9b7b5bedb3ce4e)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/*
 * Assemble the four bytes at p into a 32-bit little-endian value.
 * Each byte is widened to unsigned long before shifting so that a
 * top byte >= 0x80 is never shifted into the sign bit of an int.
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | ((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11 
/*
 * State shared between filetype() and the individual classifiers.
 */
uchar	buf[6001];	/* leading bytes of the file; filetype() reads at most sizeof(buf)-1 and NUL-terminates */
short	cfreq[140];	/* histogram of buf: indexed by ASCII code or by the Clatin..Cutf classes */
short	wfreq[50];	/* histogram of dict[] word classes found in buf */
int	nbuf;		/* number of bytes read into buf */
Dir*	mbuf;		/* dirfstat() result for the file being examined */
int	fd;		/* descriptor opened by type(); used by classifiers that re-read the file */
char 	*fname;		/* name of the file opened by type() */
char	*slash;		/* rightmost '/' found in the name by type(), or 0 */
20 
/*
 * Word classes counted in wfreq[] (Cword..I3) and character
 * classes counted in cfreq[] (Clatin..Cutf).
 */
enum
{
	Cword,
	Fword,
	Aword,
	Alword,
	Lword,
	I1,
	I2,
	I3,
	Clatin	= 128,
	Cbinary,
	Cnull,
	Ceascii,
	Cutf,
};

/*
 * Keyword dictionary.  wordfreq() looks words up with a binary
 * search using strcmp, so the entries MUST stay in strcmp (byte)
 * order: upper case sorts before lower case.
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	"PATH",		Lword,
	"TEXT",		Aword,
	"adt",		Alword,
	"aggr",		Alword,
	"alef",		Alword,
	"array",	Lword,
	"bio",		I2,
	"block",	Fword,
	"chan",		Alword,
	"char",		Cword,
	"common",	Fword,
	"con",		Lword,
	"data",		Fword,
	"dimension",	Fword,
	"double",	Cword,
	"extern",	Cword,
	"float",	Cword,
	"fn",		Lword,
	"function",	Fword,
	"h",		I3,
	"implement",	Lword,
	"import",	Lword,
	"include",	I1,
	"int",		Cword,
	"integer",	Fword,
	"iota",		Lword,
	"libc",		I2,
	"long",		Cword,
	"module",	Lword,
	"real",		Fword,
	"ref",		Lword,
	"register",	Cword,
	"self",		Lword,
	"short",	Cword,
	"static",	Cword,
	"stdio",	I2,
	"struct",	Cword,
	"subroutine",	Fword,
	"u",		I2,
	"void",		Cword,
};
84 
/* codes for 'mode' field in language structure */
enum	{
		Normal	= 0,
		First,		/* first entry for language spanning several ranges */
		Multi,		/* later entries "   "       "  ... */
		Shared,		/* codes used in several languages */
	};

/*
 * Rune ranges and the language name reported for each.
 * bump_utf_count() binary-searches this table on low/high, so the
 * ranges must stay in ascending order; the final all-zero entry
 * terminates the linear scans that test .name.
 */
struct
{
	int	mode;		/* see enum above */
	int 	count;		/* runes of this range seen in the current buffer */
	int	low;		/* first rune of the range (inclusive) */
	int	high;		/* last rune of the range (inclusive) */
	char	*name;		/* name printed by print_utf() */

} language[] =
{
	Normal, 0,	0x0080, 0x0080,	"Extended Latin",
	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
	Normal,	0,	0x0370,	0x03FF,	"Greek",
	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
	Normal,	0,	0x0530,	0x058F,	"Armenian",
	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
	Normal,	0,	0x0600,	0x06FF,	"Arabic",
	Normal,	0,	0x0900,	0x097F,	"Devanagari",
	Normal,	0,	0x0980,	0x09FF,	"Bengali",
	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
	Normal,	0,	0x1000,	0x105F,	"Tibetan",
	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
	Normal,	0,	0x3040,	0x30FF,	"Japanese",
	Normal,	0,	0x3100,	0x312F,	"Chinese",
	First,	0,	0x3130,	0x318F,	"Korean",
	Multi,	0,	0x3400,	0x3D2F,	"Korean",
	Shared,	0,	0x4e00,	0x9fff,	"CJK",
	Normal,	0,	0,	0,	0,		/* terminal entry */
};
130 
131 
/*
 * Gross classification of the buffer, set by filetype() from the
 * character histogram and consulted by the classifiers.
 */
enum
{
	Fascii,		/* printable ascii */
	Flatin,		/* latin 1 */
	Futf,		/* UTF character set */
	Fbinary,	/* binary */
	Feascii,	/* ASCII with control chars */
	Fnull,		/* NULL in file; NOTE(review): never assigned in the code visible here */
} guess;
141 
142 void	bump_utf_count(Rune);
143 int	cistrncmp(char*, char*, int);
144 void	filetype(int);
145 int	getfontnum(uchar*, uchar**);
146 int	isas(void);
147 int	isc(void);
148 int	iscint(void);
149 int	isenglish(void);
150 int	ishp(void);
151 int	ishtml(void);
152 int	isrfc822(void);
153 int	ismbox(void);
154 int	islimbo(void);
155 int	ismung(void);
156 int	isp9bit(void);
157 int	isp9font(void);
158 int	isrtf(void);
159 int	ismsdos(void);
160 int	iself(void);
161 int	istring(void);
162 int	long0(void);
163 int	istar(void);
164 int	p9bitnum(uchar*);
165 int	p9subfont(uchar*);
166 void	print_utf(void);
167 void	type(char*, int);
168 int	utf_count(void);
169 void	wordfreq(void);
170 
/*
 * Classifiers tried in order by filetype(); the first one that
 * returns non-zero has printed its verdict and ends the search,
 * so more specific tests must come before more general ones.
 * The list is terminated by a 0 entry.
 */
int	(*call[])(void) =
{
	long0,		/* recognizable by first 4 bytes */
	istring,	/* recognizable by first string */
	isrfc822,	/* email file */
	ismbox,		/* mail box */
	istar,		/* recognizable by tar checksum */
	ishtml,		/* html keywords */
	iscint,		/* compiler/assembler intermediate */
	islimbo,	/* limbo source */
	isc,		/* c & alef compiler key words */
	isas,		/* assembler key words */
	ismung,		/* entropy compressed/encrypted */
	isp9font,	/* plan 9 font */
	isp9bit,	/* plan 9 image (as from /dev/window) */
	isenglish,	/* char frequency English */
	isrtf,		/* rich text format */
	ismsdos,	/* msdos exe (virus file attachment) */
	iself,		/* ELF (foreign) executable */
	0
};
192 
int mime;	/* set by the -m option: print MIME types instead of descriptions */

/* default MIME answers; both already end in a newline */
#define OCTET	"application/octet-stream\n"
#define PLAIN	"text/plain\n"
197 
198 void
199 main(int argc, char *argv[])
200 {
201 	int i, j, maxlen;
202 	char *cp;
203 	Rune r;
204 
205 	ARGBEGIN{
206 	case 'm':
207 		mime = 1;
208 		break;
209 	default:
210 		fprint(2, "usage: file [-m] [file...]\n");
211 		exits("usage");
212 	}ARGEND;
213 
214 	maxlen = 0;
215 	if(mime == 0 || argc > 1){
216 		for(i = 0; i < argc; i++) {
217 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
218 					;
219 			if(j > maxlen)
220 				maxlen = j;
221 		}
222 	}
223 	if (argc <= 0) {
224 		if(!mime)
225 			print ("stdin: ");
226 		filetype(0);
227 	}
228 	else {
229 		for(i = 0; i < argc; i++)
230 			type(argv[i], maxlen);
231 	}
232 	exits(0);
233 }
234 
235 void
236 type(char *file, int nlen)
237 {
238 	Rune r;
239 	int i;
240 	char *p;
241 
242 	if(nlen > 0){
243 		slash = 0;
244 		for (i = 0, p = file; *p; i++) {
245 			if (*p == '/')			/* find rightmost slash */
246 				slash = p;
247 			p += chartorune(&r, p);		/* count runes */
248 		}
249 		print("%s:%*s",file, nlen-i+1, "");
250 	}
251 	fname = file;
252 	if ((fd = open(file, OREAD)) < 0) {
253 		print("cannot open\n");
254 		return;
255 	}
256 	filetype(fd);
257 	close(fd);
258 }
259 
260 void
261 filetype(int fd)
262 {
263 	Rune r;
264 	int i, f, n;
265 	char *p, *eob;
266 
267 	free(mbuf);
268 	mbuf = dirfstat(fd);
269 	if(mbuf == nil){
270 		print("cannot stat: %r\n");
271 		return;
272 	}
273 	if(mbuf->mode & DMDIR) {
274 		print(mime ? "text/directory\n" : "directory\n");
275 		return;
276 	}
277 	if(mbuf->type != 'M' && mbuf->type != '|') {
278 		print(mime ? OCTET : "special file #%c/%s\n",
279 			mbuf->type, mbuf->name);
280 		return;
281 	}
282 	nbuf = read(fd, buf, sizeof(buf)-1);
283 
284 	if(nbuf < 0) {
285 		print("cannot read\n");
286 		return;
287 	}
288 	if(nbuf == 0) {
289 		print(mime ? PLAIN : "empty file\n");
290 		return;
291 	}
292 	buf[nbuf] = 0;
293 
294 	/*
295 	 * build histogram table
296 	 */
297 	memset(cfreq, 0, sizeof(cfreq));
298 	for (i = 0; language[i].name; i++)
299 		language[i].count = 0;
300 	eob = (char *)buf+nbuf;
301 	for(n = 0, p = (char *)buf; p < eob; n++) {
302 		if (!fullrune(p, eob-p) && eob-p < UTFmax)
303 			break;
304 		p += chartorune(&r, p);
305 		if (r == 0)
306 			f = Cnull;
307 		else if (r <= 0x7f) {
308 			if (!isprint(r) && !isspace(r))
309 				f = Ceascii;	/* ASCII control char */
310 			else f = r;
311 		} else if (r == 0x080) {
312 			bump_utf_count(r);
313 			f = Cutf;
314 		} else if (r < 0xA0)
315 				f = Cbinary;	/* Invalid Runes */
316 		else if (r <= 0xff)
317 				f = Clatin;	/* Latin 1 */
318 		else {
319 			bump_utf_count(r);
320 			f = Cutf;		/* UTF extension */
321 		}
322 		cfreq[f]++;			/* ASCII chars peg directly */
323 	}
324 	/*
325 	 * gross classify
326 	 */
327 	if (cfreq[Cbinary])
328 		guess = Fbinary;
329 	else if (cfreq[Cutf])
330 		guess = Futf;
331 	else if (cfreq[Clatin])
332 		guess = Flatin;
333 	else if (cfreq[Ceascii])
334 		guess = Feascii;
335 	else if (cfreq[Cnull] == n) {
336 		print(mime ? OCTET : "first block all null bytes\n");
337 		return;
338 	}
339 	else guess = Fascii;
340 	/*
341 	 * lookup dictionary words
342 	 */
343 	memset(wfreq, 0, sizeof(wfreq));
344 	if(guess == Fascii || guess == Flatin || guess == Futf)
345 		wordfreq();
346 	/*
347 	 * call individual classify routines
348 	 */
349 	for(i=0; call[i]; i++)
350 		if((*call[i])())
351 			return;
352 
353 	/*
354 	 * if all else fails,
355 	 * print out gross classification
356 	 */
357 	if (nbuf < 100 && !mime)
358 		print(mime ? PLAIN : "short ");
359 	if (guess == Fascii)
360 		print(mime ? PLAIN : "Ascii\n");
361 	else if (guess == Feascii)
362 		print(mime ? PLAIN : "extended ascii\n");
363 	else if (guess == Flatin)
364 		print(mime ? PLAIN : "latin ascii\n");
365 	else if (guess == Futf && utf_count() < 4)
366 		print_utf();
367 	else print(mime ? OCTET : "binary\n");
368 }
369 
370 void
371 bump_utf_count(Rune r)
372 {
373 	int low, high, mid;
374 
375 	high = sizeof(language)/sizeof(language[0])-1;
376 	for (low = 0; low < high;) {
377 		mid = (low+high)/2;
378 		if (r >=language[mid].low) {
379 			if (r <= language[mid].high) {
380 				language[mid].count++;
381 				break;
382 			} else low = mid+1;
383 		} else high = mid;
384 	}
385 }
386 
387 int
388 utf_count(void)
389 {
390 	int i, count;
391 
392 	count = 0;
393 	for (i = 0; language[i].name; i++)
394 		if (language[i].count > 0)
395 			switch (language[i].mode) {
396 			case Normal:
397 			case First:
398 				count++;
399 				break;
400 			default:
401 				break;
402 			}
403 	return count;
404 }
405 
406 int
407 chkascii(void)
408 {
409 	int i;
410 
411 	for (i = 'a'; i < 'z'; i++)
412 		if (cfreq[i])
413 			return 1;
414 	for (i = 'A'; i < 'Z'; i++)
415 		if (cfreq[i])
416 			return 1;
417 	return 0;
418 }
419 
420 int
421 find_first(char *name)
422 {
423 	int i;
424 
425 	for (i = 0; language[i].name != 0; i++)
426 		if (language[i].mode == First
427 			&& strcmp(language[i].name, name) == 0)
428 			return i;
429 	return -1;
430 }
431 
432 void
433 print_utf(void)
434 {
435 	int i, printed, j;
436 
437 	if(mime){
438 		print(PLAIN);
439 		return;
440 	}
441 	if (chkascii()) {
442 		printed = 1;
443 		print("Ascii");
444 	} else
445 		printed = 0;
446 	for (i = 0; language[i].name; i++)
447 		if (language[i].count) {
448 			switch(language[i].mode) {
449 			case Multi:
450 				j = find_first(language[i].name);
451 				if (j < 0)
452 					break;
453 				if (language[j].count > 0)
454 					break;
455 				/* Fall through */
456 			case Normal:
457 			case First:
458 				if (printed)
459 					print(" & ");
460 				else printed = 1;
461 				print("%s", language[i].name);
462 				break;
463 			case Shared:
464 			default:
465 				break;
466 			}
467 		}
468 	if(!printed)
469 		print("UTF");
470 	print(" text\n");
471 }
472 
473 void
474 wordfreq(void)
475 {
476 	int low, high, mid, r;
477 	uchar *p, *p2, c;
478 
479 	p = buf;
480 	for(;;) {
481 		while (p < buf+nbuf && !isalpha(*p))
482 			p++;
483 		if (p >= buf+nbuf)
484 			return;
485 		p2 = p;
486 		while(p < buf+nbuf && isalpha(*p))
487 			p++;
488 		c = *p;
489 		*p = 0;
490 		high = sizeof(dict)/sizeof(dict[0]);
491 		for(low = 0;low < high;) {
492 			mid = (low+high)/2;
493 			r = strcmp(dict[mid].word, (char*)p2);
494 			if(r == 0) {
495 				wfreq[dict[mid].class]++;
496 				break;
497 			}
498 			if(r < 0)
499 				low = mid+1;
500 			else
501 				high = mid;
502 		}
503 		*p++ = c;
504 	}
505 }
506 
507 typedef struct Filemagic Filemagic;
508 struct Filemagic {
509 	ulong x;
510 	ulong mask;
511 	char *desc;
512 	char *mime;
513 };
514 
515 Filemagic long0tab[] = {
516 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
517 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
518 	0x32636170,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
519 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
520 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
521 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip",
522 	070707,		0xFFFF,		"cpio archive\n", OCTET,
523 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
524 };
525 
526 int
527 filemagic(Filemagic *tab, int ntab, ulong x)
528 {
529 	int i;
530 
531 	for(i=0; i<ntab; i++)
532 		if((x&tab[i].mask) == tab[i].x){
533 			print(mime ? tab[i].mime : tab[i].desc);
534 			return 1;
535 		}
536 	return 0;
537 }
538 
539 int
540 long0(void)
541 {
542 	Fhdr f;
543 	long x;
544 
545 	seek(fd, 0, 0);		/* reposition to start of file */
546 	if(crackhdr(fd, &f)) {
547 		print(mime ? OCTET : "%s\n", f.name);
548 		return 1;
549 	}
550 	x = LENDIAN(buf);
551 	if(filemagic(long0tab, nelem(long0tab), x))
552 		return 1;
553 	return 0;
554 }
555 
/* from tar.c */
enum { NAMSIZ = 100, TBLOCK = 512 };

/* one tar header block, viewed as raw bytes or as header fields */
union	hblock
{
	char	dummy[TBLOCK];
	struct	header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];
		char	linkflag;
		char	linkname[NAMSIZ];
		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];  /* if non-null, path = prefix "/" name */
	} dbuf;
};

/*
 * Return the sum of all TBLOCK bytes of the header, each taken as
 * an unsigned value, with the chksum field counted as blanks.
 * The chksum field of *hp is overwritten with blanks as a side
 * effect, so callers must read the stored checksum beforehand.
 */
int
checksum(union hblock *hp)
{
	int sum, k;

	memset(hp->dbuf.chksum, ' ', sizeof hp->dbuf.chksum);
	sum = 0;
	for (k = 0; k < TBLOCK; k++)
		sum += (unsigned char)hp->dummy[k];
	return sum;
}
598 
599 int
600 istar(void)
601 {
602 	int chksum;
603 	char tblock[TBLOCK];
604 	union hblock *hp = (union hblock *)tblock;
605 	struct header *hdr = &hp->dbuf;
606 
607 	seek(fd, 0, 0);		/* reposition to start of file */
608 	if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
609 		return 0;
610 	chksum = strtol(hdr->chksum, 0, 8);
611 	if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
612 		if (strcmp(hdr->magic, "ustar") == 0)
613 			print(mime? "application/x-ustar\n":
614 				"posix tar archive\n");
615 		else
616 			print(mime? "application/x-tar\n": "tar archive\n");
617 		return 1;
618 	}
619 	return 0;
620 }
621 
/*
 * initial words to classify file
 *
 * istring() compares the first 'length' bytes of the buffer with
 * 'key' using memcmp, so keys may contain NUL bytes, and takes the
 * first entry that matches: a key that is a prefix of another must
 * come after it.  The strings carry no newline; istring() adds one.
 * The table ends with an all-zero entry.
 */
struct	FILE_STRING
{
	char 	*key;		/* leading bytes to match */
	char	*filetype;	/* description printed normally */
	int	length;		/* number of bytes of key compared */
	char	*mime;		/* MIME type printed in mime mode */
} file_string[] =
{
	"!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream",
	"!<arch>\n",		"archive",			8,	"application/octet-stream",
	"070707",		"cpio archive - ascii header",	6,	"application/octet-stream",
	"#!/bin/rc",		"rc executable file",		9,	"text/plain",
	"#!/bin/sh",		"sh executable file",		9,	"text/plain",
	"%!",			"postscript",			2,	"application/postscript",
	"\004%!",		"postscript",			3,	"application/postscript",
	"x T post",		"troff output for post",	8,	"application/troff",
	"x T Latin1",		"troff output for Latin1",	10,	"application/troff",
	"x T utf",		"troff output for UTF",		7,	"application/troff",
	"x T 202",		"troff output for 202",		7,	"application/troff",
	"x T aps",		"troff output for aps",		7,	"application/troff",
	"GIF",			"GIF image", 			3,	"image/gif",
	"\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript",
	"%PDF",			"PDF",				4,	"application/pdf",
	"<html>\n",		"HTML file",			7,	"text/html",
	"<HTML>\n",		"HTML file",			7,	"text/html",
	"compressed\n",		"Compressed image or subfont",	11,	"application/octet-stream",
	"\111\111\052\000",	"tiff",				4,	"image/tiff",
	"\115\115\000\052",	"tiff",				4,	"image/tiff",
	"\377\330\377\340",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\341",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\333",	"jpeg",				4,	"image/jpeg",
	"BM",			"bmp",				2,	"image/bmp",
	"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream",
	"<MakerFile ",		"FrameMaker file",		11,	"application/framemaker",
	"\033%-12345X",	"HPJCL file",		9,	"application/hpjcl",
	0,0,0,0
};
662 
663 int
664 istring(void)
665 {
666 	int i;
667 	struct FILE_STRING *p;
668 
669 	for(p = file_string; p->key; p++) {
670 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
671 			if(mime)
672 				print("%s\n", p->mime);
673 			else
674 				print("%s\n", p->filetype);
675 			return 1;
676 		}
677 	}
678 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
679 		for(i = 5; i < nbuf; i++)
680 			if(buf[i] == '\n')
681 				break;
682 		if(mime)
683 			print(OCTET);
684 		else
685 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
686 		return 1;
687 	}
688 	return 0;
689 }
690 
/*
 * Tag names counted by ishtml(); compared without regard to case.
 * The list is terminated by 0.
 */
char*	html_string[] =
{
	"title",
	"body",
	"head",
	"strong",
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	"h6",
	"ul",
	"li",
	"dl",
	"br",
	"em",
	0,
};
710 
711 int
712 ishtml(void)
713 {
714 	uchar *p, *q;
715 	int i, count;
716 
717 		/* compare strings between '<' and '>' to html table */
718 	count = 0;
719 	p = buf;
720 	for(;;) {
721 		while (p < buf+nbuf && *p != '<')
722 			p++;
723 		p++;
724 		if (p >= buf+nbuf)
725 			break;
726 		if(*p == '/')
727 			p++;
728 		q = p;
729 		while(p < buf+nbuf && *p != '>')
730 			p++;
731 		if (p >= buf+nbuf)
732 			break;
733 		for(i = 0; html_string[i]; i++) {
734 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
735 				if(count++ > 4) {
736 					print(mime ? "text/html\n" : "HTML file\n");
737 					return 1;
738 				}
739 				break;
740 			}
741 		}
742 		p++;
743 	}
744 	return 0;
745 }
746 
/*
 * Mail header field names counted by isrfc822(); each is matched,
 * ignoring case, against the start of a header line.  The reply
 * address field is spelled "Reply-To", with a hyphen.
 * The list is terminated by 0.
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",
	"sender:",
	0,
};
758 
759 int
760 isrfc822(void)
761 {
762 
763 	char *p, *q, *r;
764 	int i, count;
765 
766 	count = 0;
767 	p = (char*)buf;
768 	for(;;) {
769 		q = strchr(p, '\n');
770 		if(q == nil)
771 			break;
772 		*q = 0;
773 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
774 			count++;
775 			*q = '\n';
776 			p = q+1;
777 			continue;
778 		}
779 		*q = '\n';
780 		if(*p != '\t' && *p != ' '){
781 			r = strchr(p, ':');
782 			if(r == 0 || r > q)
783 				break;
784 			for(i = 0; rfc822_string[i]; i++) {
785 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
786 					count++;
787 					break;
788 				}
789 			}
790 		}
791 		p = q+1;
792 	}
793 	if(count >= 3){
794 		print(mime ? "message/rfc822\n" : "email file\n");
795 		return 1;
796 	}
797 	return 0;
798 }
799 
800 int
801 ismbox(void)
802 {
803 	char *p, *q;
804 
805 	p = (char*)buf;
806 	q = strchr(p, '\n');
807 	if(q == nil)
808 		return 0;
809 	*q = 0;
810 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
811 		print(mime ? "text/plain\n" : "mail box\n");
812 		return 1;
813 	}
814 	*q = '\n';
815 	return 0;
816 }
817 
818 int
819 iscint(void)
820 {
821 	int type;
822 	char *name;
823 	Biobuf b;
824 
825 	if(Binit(&b, fd, OREAD) == Beof)
826 		return 0;
827 	seek(fd, 0, 0);
828 	type = objtype(&b, &name);
829 	if(type < 0)
830 		return 0;
831 	if(mime)
832 		print(OCTET);
833 	else
834 		print("%s intermediate\n", name);
835 	return 1;
836 }
837 
838 int
839 isc(void)
840 {
841 	int n;
842 
843 	n = wfreq[I1];
844 	/*
845 	 * includes
846 	 */
847 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
848 		goto yes;
849 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
850 		goto yes;
851 	/*
852 	 * declarations
853 	 */
854 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
855 		goto yes;
856 	/*
857 	 * assignments
858 	 */
859 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
860 		goto yes;
861 	return 0;
862 
863 yes:
864 	if(mime){
865 		print(PLAIN);
866 		return 1;
867 	}
868 	if(wfreq[Alword] > 0)
869 		print("alef program\n");
870 	else
871 		print("c program\n");
872 	return 1;
873 }
874 
875 int
876 islimbo(void)
877 {
878 
879 	/*
880 	 * includes
881 	 */
882 	if(wfreq[Lword] < 4)
883 		return 0;
884 	print(mime ? PLAIN : "limbo program\n");
885 	return 1;
886 }
887 
888 int
889 isas(void)
890 {
891 
892 	/*
893 	 * includes
894 	 */
895 	if(wfreq[Aword] < 2)
896 		return 0;
897 	print(mime ? PLAIN : "as program\n");
898 	return 1;
899 }
900 
901 /*
902  * low entropy means encrypted
903  */
904 int
905 ismung(void)
906 {
907 	int i, bucket[8];
908 	float cs;
909 
910 	if(nbuf < 64)
911 		return 0;
912 	memset(bucket, 0, sizeof(bucket));
913 	for(i=0; i<64; i++)
914 		bucket[(buf[i]>>5)&07] += 1;
915 
916 	cs = 0.;
917 	for(i=0; i<8; i++)
918 		cs += (bucket[i]-8)*(bucket[i]-8);
919 	cs /= 8.;
920 	if(cs <= 24.322) {
921 		if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
922 			print(mime ? OCTET : "compressed\n");
923 		else
924 			print(mime ? OCTET : "encrypted\n");
925 		return 1;
926 	}
927 	return 0;
928 }
929 
930 /*
931  * english by punctuation and frequencies
932  */
933 int
934 isenglish(void)
935 {
936 	int vow, comm, rare, badpun, punct;
937 	char *p;
938 
939 	if(guess != Fascii && guess != Feascii)
940 		return 0;
941 	badpun = 0;
942 	punct = 0;
943 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
944 		switch(*p) {
945 		case '.':
946 		case ',':
947 		case ')':
948 		case '%':
949 		case ';':
950 		case ':':
951 		case '?':
952 			punct++;
953 			if(p[1] != ' ' && p[1] != '\n')
954 				badpun++;
955 		}
956 	if(badpun*5 > punct)
957 		return 0;
958 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
959 		return 0;
960 	if(2*cfreq[';'] > cfreq['e'])
961 		return 0;
962 
963 	vow = 0;
964 	for(p="AEIOU"; *p; p++) {
965 		vow += cfreq[*p];
966 		vow += cfreq[tolower(*p)];
967 	}
968 	comm = 0;
969 	for(p="ETAION"; *p; p++) {
970 		comm += cfreq[*p];
971 		comm += cfreq[tolower(*p)];
972 	}
973 	rare = 0;
974 	for(p="VJKQXZ"; *p; p++) {
975 		rare += cfreq[*p];
976 		rare += cfreq[tolower(*p)];
977 	}
978 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
979 		print(mime ? PLAIN : "English text\n");
980 		return 1;
981 	}
982 	return 0;
983 }
984 
985 /*
986  * pick up a number with
987  * syntax _*[0-9]+_
988  */
989 #define	P9BITLEN	12
990 int
991 p9bitnum(uchar *bp)
992 {
993 	int n, c, len;
994 
995 	len = P9BITLEN;
996 	while(*bp == ' ') {
997 		bp++;
998 		len--;
999 		if(len <= 0)
1000 			return -1;
1001 	}
1002 	n = 0;
1003 	while(len > 1) {
1004 		c = *bp++;
1005 		if(!isdigit(c))
1006 			return -1;
1007 		n = n*10 + c-'0';
1008 		len--;
1009 	}
1010 	if(*bp != ' ')
1011 		return -1;
1012 	return n;
1013 }
1014 
/*
 * Decode the 12-byte depth field at s of a plan 9 image header.
 *
 * A field starting with a digit is taken as the old format: the
 * number n gives a depth of 1<<n bits, and *newp is set to 0.
 * Otherwise the field is taken as a channel descriptor, a run of
 * letter/width pairs such as r8g8b8; the widths are summed and
 * *newp is set to 1.  Returns the depth in bits per pixel, or -1
 * if the field is blank, if an old-format number is too large to
 * shift an int by, or if a summed depth is not 8, 16, 24 or 32.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	unsigned long ld;
	int d;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtoul(s, 0, 10);
		if(ld > 30)		/* 1<<ld would overflow an int */
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;	/* skip letter */
		d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
1046 
1047 int
1048 isp9bit(void)
1049 {
1050 	int dep, lox, loy, hix, hiy, px, new;
1051 	ulong t;
1052 	long len;
1053 	char *newlabel;
1054 
1055 	newlabel = "old ";
1056 
1057 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
1058 	if(new)
1059 		newlabel = "";
1060 	lox = p9bitnum(buf + 1*P9BITLEN);
1061 	loy = p9bitnum(buf + 2*P9BITLEN);
1062 	hix = p9bitnum(buf + 3*P9BITLEN);
1063 	hiy = p9bitnum(buf + 4*P9BITLEN);
1064 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1065 		return 0;
1066 
1067 	if(dep < 8){
1068 		px = 8/dep;	/* pixels per byte */
1069 		/* set l to number of bytes of data per scan line */
1070 		if(lox >= 0)
1071 			len = (hix+px-1)/px - lox/px;
1072 		else{	/* make positive before divide */
1073 			t = (-lox)+px-1;
1074 			t = (t/px)*px;
1075 			len = (t+hix+px-1)/px;
1076 		}
1077 	}else
1078 		len = (hix-lox)*dep/8;
1079 	len *= (hiy-loy);		/* col length */
1080 	len += 5*P9BITLEN;		/* size of initial ascii */
1081 
1082 	/*
1083 	 * for image file, length is non-zero and must match calculation above
1084 	 * for /dev/window and /dev/screen the length is always zero
1085 	 * for subfont, the subfont header should follow immediately.
1086 	 */
1087 	if (len != 0 && mbuf->length == 0) {
1088 		print("%splan 9 image\n", newlabel);
1089 		return 1;
1090 	}
1091 	if (mbuf->length == len) {
1092 		print("%splan 9 image\n", newlabel);
1093 		return 1;
1094 	}
1095 	/* Ghostscript sometimes produces a little extra on the end */
1096 	if (mbuf->length < len+P9BITLEN) {
1097 		print("%splan 9 image\n", newlabel);
1098 		return 1;
1099 	}
1100 	if (p9subfont(buf+len)) {
1101 		print("%ssubfont file\n", newlabel);
1102 		return 1;
1103 	}
1104 	return 0;
1105 }
1106 
1107 int
1108 p9subfont(uchar *p)
1109 {
1110 	int n, h, a;
1111 
1112 		/* if image too big, assume it's a subfont */
1113 	if (p+3*P9BITLEN > buf+sizeof(buf))
1114 		return 1;
1115 
1116 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1117 	if (n < 0)
1118 		return 0;
1119 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1120 	if (h < 0)
1121 		return 0;
1122 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1123 	if (a < 0)
1124 		return 0;
1125 	return 1;
1126 }
1127 
1128 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1129 
1130 int
1131 isp9font(void)
1132 {
1133 	uchar *cp, *p;
1134 	int i, n;
1135 	char pathname[1024];
1136 
1137 	cp = buf;
1138 	if (!getfontnum(cp, &cp))	/* height */
1139 		return 0;
1140 	if (!getfontnum(cp, &cp))	/* ascent */
1141 		return 0;
1142 	for (i = 0; 1; i++) {
1143 		if (!getfontnum(cp, &cp))	/* min */
1144 			break;
1145 		if (!getfontnum(cp, &cp))	/* max */
1146 			return 0;
1147 		while (WHITESPACE(*cp))
1148 			cp++;
1149 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1150 				;
1151 			/* construct a path name, if needed */
1152 		n = 0;
1153 		if (*p != '/' && slash) {
1154 			n = slash-fname+1;
1155 			if (n < sizeof(pathname))
1156 				memcpy(pathname, fname, n);
1157 			else n = 0;
1158 		}
1159 		if (n+cp-p < sizeof(pathname)) {
1160 			memcpy(pathname+n, p, cp-p);
1161 			n += cp-p;
1162 			pathname[n] = 0;
1163 			if (access(pathname, AEXIST) < 0)
1164 				return 0;
1165 		}
1166 	}
1167 	if (i) {
1168 		print(mime ? "text/plain\n" : "font file\n");
1169 		return 1;
1170 	}
1171 	return 0;
1172 }
1173 
1174 int
1175 getfontnum(uchar *cp, uchar **rp)
1176 {
1177 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1178 		cp++;
1179 	if (*cp < '0' || *cp > '9')
1180 		return 0;
1181 	strtoul((char *)cp, (char **)rp, 0);
1182 	if (!WHITESPACE(**rp))
1183 		return 0;
1184 	return 1;
1185 }
1186 
1187 int
1188 isrtf(void)
1189 {
1190 	if(strstr((char *)buf, "\\rtf1")){
1191 		print(mime ? "application/rtf\n" : "rich text format\n");
1192 		return 1;
1193 	}
1194 	return 0;
1195 }
1196 
1197 int
1198 ismsdos(void)
1199 {
1200 	if (buf[0] == 0x4d && buf[1] == 0x5a){
1201 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1202 		return 1;
1203 	}
1204 	return 0;
1205 }
1206 
1207 int
1208 iself(void)
1209 {
1210 	char *cpu[] = {		/* NB: incomplete and arbitary list */
1211 	[1]	"WE32100",
1212 	[2]	"SPARC",
1213 	[3]	"i386",
1214 	[4]	"M68000",
1215 	[5]	"M88000",
1216 	[6]	"i486",
1217 	[7]	"i860",
1218 	[8]	"R3000",
1219 	[9]	"S370",
1220 	[10]	"R4000",
1221 	[15]	"HP-PA",
1222 	[18]	"sparc v8+",
1223 	[19]	"i960",
1224 	[20]	"PPC-32",
1225 	[21]	"PPC-64",
1226 	[40]	"ARM",
1227 	[41]	"Alpha",
1228 	[43]	"sparc v9",
1229 	[50]	"IA-46",
1230 	[62]	"AMD x86-64",
1231 	[75]	"VAX",
1232 	};
1233 
1234 
1235 	if (memcmp(buf, "\x7fELF", 4) == 0){
1236 		if (!mime){
1237 			int n = (buf[19] << 8) | buf[18];
1238 			char *p = "unknown";
1239 
1240 			if (n > 0 && n < nelem(cpu) && cpu[n])
1241 				p = cpu[n];
1242 			else {
1243 				/* try the other byte order */
1244 				n = (buf[18] << 8) | buf[19];
1245 				if (n > 0 && n < nelem(cpu) && cpu[n])
1246 					p = cpu[n];
1247 			}
1248 			print("%s ELF executable\n", p);
1249 		}
1250 		else
1251 			print("application/x-elf-executable");
1252 		return 1;
1253 	}
1254 
1255 	return 0;
1256 }
1257