xref: /plan9-contrib/sys/src/cmd/file.c (revision 4d44ba9b9ee4246ddbd96c7fcaf0918ab92ab35a)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/*
 * Assemble a 32-bit value from the four bytes at p, least significant
 * byte first.  Each byte is widened to unsigned long before shifting so
 * that a top byte >= 0x80 is never shifted into the sign bit of an int
 * (and the result is never sign-extended where long is wider than 32 bits).
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11 
uchar	buf[6001];	/* leading bytes of the file; filetype() reads at most sizeof(buf)-1 and NUL-terminates */
short	cfreq[140];	/* character histogram, indexed by ASCII code or by a C* class code */
short	wfreq[50];	/* dictionary word counts, indexed by word class (Cword, Fword, ...) */
int	nbuf;		/* number of bytes read into buf */
Dir*	mbuf;		/* dirfstat() result for the file being classified */
int	fd;		/* descriptor opened by type(); used by the classifiers that seek/re-read */
char 	*fname;		/* name passed to type() */
char	*slash;		/* rightmost '/' in fname as found by type(), or 0 */
20 
/*
 * Word classes (indices into wfreq) followed by character
 * classes (indices into cfreq, placed above the ASCII range).
 */
enum
{
	Cword,		/* C keyword */
	Fword,		/* presumably a Fortran keyword */
	Aword,		/* assembler keyword */
	Alword,		/* alef keyword */
	Lword,		/* limbo keyword */
	I1,		/* the word "include" */
	I2,		/* a well-known header name */
	I3,		/* the "h" suffix of a header name */
	Clatin	= 128,	/* Latin 1 character */
	Cbinary,	/* invalid rune */
	Cnull,		/* NUL byte */
	Ceascii,	/* ASCII control character */
	Cutf,		/* rune beyond Latin 1 */
};

/*
 * Keyword dictionary.  wordfreq() looks words up with a binary search
 * using strcmp, so the entries MUST stay sorted in strcmp order
 * (upper case sorts before lower case).  A single misplaced entry
 * hides both itself and its neighbours from the search.
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	{ "PATH",	Lword },
	{ "TEXT",	Aword },
	{ "adt",	Alword },
	{ "aggr",	Alword },
	{ "alef",	Alword },
	{ "array",	Lword },
	{ "bio",	I2 },
	{ "block",	Fword },
	{ "chan",	Alword },
	{ "char",	Cword },
	{ "common",	Fword },
	{ "con",	Lword },
	{ "data",	Fword },
	{ "dimension",	Fword },
	{ "double",	Cword },
	{ "extern",	Cword },
	{ "float",	Cword },
	{ "fn",		Lword },
	{ "function",	Fword },
	{ "h",		I3 },
	{ "implement",	Lword },
	{ "import",	Lword },
	{ "include",	I1 },
	{ "int",	Cword },
	{ "integer",	Fword },
	{ "iota",	Lword },
	{ "libc",	I2 },
	{ "long",	Cword },
	{ "module",	Lword },
	{ "real",	Fword },
	{ "ref",	Lword },
	{ "register",	Cword },
	{ "self",	Lword },
	{ "short",	Cword },
	{ "static",	Cword },
	{ "stdio",	I2 },
	{ "struct",	Cword },
	{ "subroutine",	Fword },
	{ "u",		I2 },
	{ "void",	Cword },
};
84 
/* values of the 'mode' field of a language[] entry */
enum
{
	Normal	= 0,	/* language with a single code range */
	First	= 1,	/* first range of a language spanning several ranges */
	Multi	= 2,	/* any later range of such a language */
	Shared	= 3,	/* range used by several languages */
};

/*
 * Rune ranges and the language each one suggests.  bump_utf_count()
 * binary-searches this table on [low, high], so rows are kept in
 * ascending order; the row with a null name terminates the table.
 */
struct
{
	int	mode;		/* one of the codes above */
	int 	count;		/* runes seen in this range */
	int	low;		/* first rune of the range */
	int	high;		/* last rune of the range */
	char	*name;
} language[] =
{
	{ Normal,	0,	0x0080,	0x0080,	"Extended Latin" },
	{ Normal,	0,	0x0100,	0x01FF,	"Extended Latin" },
	{ Normal,	0,	0x0370,	0x03FF,	"Greek" },
	{ Normal,	0,	0x0400,	0x04FF,	"Cyrillic" },
	{ Normal,	0,	0x0530,	0x058F,	"Armenian" },
	{ Normal,	0,	0x0590,	0x05FF,	"Hebrew" },
	{ Normal,	0,	0x0600,	0x06FF,	"Arabic" },
	{ Normal,	0,	0x0900,	0x097F,	"Devanagari" },
	{ Normal,	0,	0x0980,	0x09FF,	"Bengali" },
	{ Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi" },
	{ Normal,	0,	0x0A80,	0x0AFF,	"Gujarati" },
	{ Normal,	0,	0x0B00,	0x0B7F,	"Oriya" },
	{ Normal,	0,	0x0B80,	0x0BFF,	"Tamil" },
	{ Normal,	0,	0x0C00,	0x0C7F,	"Telugu" },
	{ Normal,	0,	0x0C80,	0x0CFF,	"Kannada" },
	{ Normal,	0,	0x0D00,	0x0D7F,	"Malayalam" },
	{ Normal,	0,	0x0E00,	0x0E7F,	"Thai" },
	{ Normal,	0,	0x0E80,	0x0EFF,	"Lao" },
	{ Normal,	0,	0x1000,	0x105F,	"Tibetan" },
	{ Normal,	0,	0x10A0,	0x10FF,	"Georgian" },
	{ Normal,	0,	0x3040,	0x30FF,	"Japanese" },
	{ Normal,	0,	0x3100,	0x312F,	"Chinese" },
	{ First,	0,	0x3130,	0x318F,	"Korean" },
	{ Multi,	0,	0x3400,	0x3D2F,	"Korean" },
	{ Shared,	0,	0x4e00,	0x9fff,	"CJK" },
	{ Normal,	0,	0,	0,	0 },	/* terminal entry */
};
130 
131 
/* gross classification of the current buffer, assigned by filetype() */
enum
{
	Fascii,		/* printable ascii */
	Flatin,		/* latin 1 */
	Futf,		/* UTF character set */
	Fbinary,	/* binary */
	Feascii,	/* ASCII with control chars */
	Fnull,		/* NULL in file */
} guess;
141 
142 void	bump_utf_count(Rune);
143 int	cistrncmp(char*, char*, int);
144 void	filetype(int);
145 int	getfontnum(uchar*, uchar**);
146 int	isas(void);
147 int	isc(void);
148 int	iscint(void);
149 int	isenglish(void);
150 int	ishp(void);
151 int	ishtml(void);
152 int	isrfc822(void);
153 int	ismbox(void);
154 int	islimbo(void);
155 int	ismung(void);
156 int	isp9bit(void);
157 int	isp9font(void);
158 int	isrtf(void);
159 int	ismsdos(void);
160 int	iself(void);
161 int	istring(void);
162 int	iff(void);
163 int	long0(void);
164 int	istar(void);
165 int	p9bitnum(uchar*);
166 int	p9subfont(uchar*);
167 void	print_utf(void);
168 void	type(char*, int);
169 int	utf_count(void);
170 void	wordfreq(void);
171 
172 int	(*call[])(void) =
173 {
174 	long0,		/* recognizable by first 4 bytes */
175 	istring,	/* recognizable by first string */
176 	iff,		/* interchange file format (strings) */
177 	isrfc822,	/* email file */
178 	ismbox,		/* mail box */
179 	istar,		/* recognizable by tar checksum */
180 	ishtml,		/* html keywords */
181 	iscint,		/* compiler/assembler intermediate */
182 	islimbo,	/* limbo source */
183 	isc,		/* c & alef compiler key words */
184 	isas,		/* assembler key words */
185 	ismung,		/* entropy compressed/encrypted */
186 	isp9font,	/* plan 9 font */
187 	isp9bit,	/* plan 9 image (as from /dev/window) */
188 	isenglish,	/* char frequency English */
189 	isrtf,		/* rich text format */
190 	ismsdos,	/* msdos exe (virus file attachement) */
191 	iself,		/* ELF (foreign) executable */
192 	0
193 };
194 
int mime;	/* set by the -m flag: print MIME types rather than descriptions */

#define OCTET	"application/octet-stream\n"	/* MIME fallback for non-text data */
#define PLAIN	"text/plain\n"			/* MIME fallback for text */
199 
200 void
201 main(int argc, char *argv[])
202 {
203 	int i, j, maxlen;
204 	char *cp;
205 	Rune r;
206 
207 	ARGBEGIN{
208 	case 'm':
209 		mime = 1;
210 		break;
211 	default:
212 		fprint(2, "usage: file [-m] [file...]\n");
213 		exits("usage");
214 	}ARGEND;
215 
216 	maxlen = 0;
217 	if(mime == 0 || argc > 1){
218 		for(i = 0; i < argc; i++) {
219 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
220 					;
221 			if(j > maxlen)
222 				maxlen = j;
223 		}
224 	}
225 	if (argc <= 0) {
226 		if(!mime)
227 			print ("stdin: ");
228 		filetype(0);
229 	}
230 	else {
231 		for(i = 0; i < argc; i++)
232 			type(argv[i], maxlen);
233 	}
234 	exits(0);
235 }
236 
237 void
238 type(char *file, int nlen)
239 {
240 	Rune r;
241 	int i;
242 	char *p;
243 
244 	if(nlen > 0){
245 		slash = 0;
246 		for (i = 0, p = file; *p; i++) {
247 			if (*p == '/')			/* find rightmost slash */
248 				slash = p;
249 			p += chartorune(&r, p);		/* count runes */
250 		}
251 		print("%s:%*s",file, nlen-i+1, "");
252 	}
253 	fname = file;
254 	if ((fd = open(file, OREAD)) < 0) {
255 		print("cannot open\n");
256 		return;
257 	}
258 	filetype(fd);
259 	close(fd);
260 }
261 
262 void
263 filetype(int fd)
264 {
265 	Rune r;
266 	int i, f, n;
267 	char *p, *eob;
268 
269 	free(mbuf);
270 	mbuf = dirfstat(fd);
271 	if(mbuf == nil){
272 		print("cannot stat: %r\n");
273 		return;
274 	}
275 	if(mbuf->mode & DMDIR) {
276 		print(mime ? "text/directory\n" : "directory\n");
277 		return;
278 	}
279 	if(mbuf->type != 'M' && mbuf->type != '|') {
280 		print(mime ? OCTET : "special file #%c/%s\n",
281 			mbuf->type, mbuf->name);
282 		return;
283 	}
284 	nbuf = read(fd, buf, sizeof(buf)-1);
285 
286 	if(nbuf < 0) {
287 		print("cannot read\n");
288 		return;
289 	}
290 	if(nbuf == 0) {
291 		print(mime ? PLAIN : "empty file\n");
292 		return;
293 	}
294 	buf[nbuf] = 0;
295 
296 	/*
297 	 * build histogram table
298 	 */
299 	memset(cfreq, 0, sizeof(cfreq));
300 	for (i = 0; language[i].name; i++)
301 		language[i].count = 0;
302 	eob = (char *)buf+nbuf;
303 	for(n = 0, p = (char *)buf; p < eob; n++) {
304 		if (!fullrune(p, eob-p) && eob-p < UTFmax)
305 			break;
306 		p += chartorune(&r, p);
307 		if (r == 0)
308 			f = Cnull;
309 		else if (r <= 0x7f) {
310 			if (!isprint(r) && !isspace(r))
311 				f = Ceascii;	/* ASCII control char */
312 			else f = r;
313 		} else if (r == 0x080) {
314 			bump_utf_count(r);
315 			f = Cutf;
316 		} else if (r < 0xA0)
317 				f = Cbinary;	/* Invalid Runes */
318 		else if (r <= 0xff)
319 				f = Clatin;	/* Latin 1 */
320 		else {
321 			bump_utf_count(r);
322 			f = Cutf;		/* UTF extension */
323 		}
324 		cfreq[f]++;			/* ASCII chars peg directly */
325 	}
326 	/*
327 	 * gross classify
328 	 */
329 	if (cfreq[Cbinary])
330 		guess = Fbinary;
331 	else if (cfreq[Cutf])
332 		guess = Futf;
333 	else if (cfreq[Clatin])
334 		guess = Flatin;
335 	else if (cfreq[Ceascii])
336 		guess = Feascii;
337 	else if (cfreq[Cnull] == n) {
338 		print(mime ? OCTET : "first block all null bytes\n");
339 		return;
340 	}
341 	else guess = Fascii;
342 	/*
343 	 * lookup dictionary words
344 	 */
345 	memset(wfreq, 0, sizeof(wfreq));
346 	if(guess == Fascii || guess == Flatin || guess == Futf)
347 		wordfreq();
348 	/*
349 	 * call individual classify routines
350 	 */
351 	for(i=0; call[i]; i++)
352 		if((*call[i])())
353 			return;
354 
355 	/*
356 	 * if all else fails,
357 	 * print out gross classification
358 	 */
359 	if (nbuf < 100 && !mime)
360 		print(mime ? PLAIN : "short ");
361 	if (guess == Fascii)
362 		print(mime ? PLAIN : "Ascii\n");
363 	else if (guess == Feascii)
364 		print(mime ? PLAIN : "extended ascii\n");
365 	else if (guess == Flatin)
366 		print(mime ? PLAIN : "latin ascii\n");
367 	else if (guess == Futf && utf_count() < 4)
368 		print_utf();
369 	else print(mime ? OCTET : "binary\n");
370 }
371 
372 void
373 bump_utf_count(Rune r)
374 {
375 	int low, high, mid;
376 
377 	high = sizeof(language)/sizeof(language[0])-1;
378 	for (low = 0; low < high;) {
379 		mid = (low+high)/2;
380 		if (r >=language[mid].low) {
381 			if (r <= language[mid].high) {
382 				language[mid].count++;
383 				break;
384 			} else low = mid+1;
385 		} else high = mid;
386 	}
387 }
388 
389 int
390 utf_count(void)
391 {
392 	int i, count;
393 
394 	count = 0;
395 	for (i = 0; language[i].name; i++)
396 		if (language[i].count > 0)
397 			switch (language[i].mode) {
398 			case Normal:
399 			case First:
400 				count++;
401 				break;
402 			default:
403 				break;
404 			}
405 	return count;
406 }
407 
408 int
409 chkascii(void)
410 {
411 	int i;
412 
413 	for (i = 'a'; i < 'z'; i++)
414 		if (cfreq[i])
415 			return 1;
416 	for (i = 'A'; i < 'Z'; i++)
417 		if (cfreq[i])
418 			return 1;
419 	return 0;
420 }
421 
422 int
423 find_first(char *name)
424 {
425 	int i;
426 
427 	for (i = 0; language[i].name != 0; i++)
428 		if (language[i].mode == First
429 			&& strcmp(language[i].name, name) == 0)
430 			return i;
431 	return -1;
432 }
433 
434 void
435 print_utf(void)
436 {
437 	int i, printed, j;
438 
439 	if(mime){
440 		print(PLAIN);
441 		return;
442 	}
443 	if (chkascii()) {
444 		printed = 1;
445 		print("Ascii");
446 	} else
447 		printed = 0;
448 	for (i = 0; language[i].name; i++)
449 		if (language[i].count) {
450 			switch(language[i].mode) {
451 			case Multi:
452 				j = find_first(language[i].name);
453 				if (j < 0)
454 					break;
455 				if (language[j].count > 0)
456 					break;
457 				/* Fall through */
458 			case Normal:
459 			case First:
460 				if (printed)
461 					print(" & ");
462 				else printed = 1;
463 				print("%s", language[i].name);
464 				break;
465 			case Shared:
466 			default:
467 				break;
468 			}
469 		}
470 	if(!printed)
471 		print("UTF");
472 	print(" text\n");
473 }
474 
475 void
476 wordfreq(void)
477 {
478 	int low, high, mid, r;
479 	uchar *p, *p2, c;
480 
481 	p = buf;
482 	for(;;) {
483 		while (p < buf+nbuf && !isalpha(*p))
484 			p++;
485 		if (p >= buf+nbuf)
486 			return;
487 		p2 = p;
488 		while(p < buf+nbuf && isalpha(*p))
489 			p++;
490 		c = *p;
491 		*p = 0;
492 		high = sizeof(dict)/sizeof(dict[0]);
493 		for(low = 0;low < high;) {
494 			mid = (low+high)/2;
495 			r = strcmp(dict[mid].word, (char*)p2);
496 			if(r == 0) {
497 				wfreq[dict[mid].class]++;
498 				break;
499 			}
500 			if(r < 0)
501 				low = mid+1;
502 			else
503 				high = mid;
504 		}
505 		*p++ = c;
506 	}
507 }
508 
509 typedef struct Filemagic Filemagic;
510 struct Filemagic {
511 	ulong x;
512 	ulong mask;
513 	char *desc;
514 	char *mime;
515 };
516 
517 Filemagic long0tab[] = {
518 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
519 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
520 	0x32636170,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
521 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
522 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
523 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip",
524 	070707,		0xFFFF,		"cpio archive\n", OCTET,
525 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
526 	0xfaff,		0xfeff,		"mp3 audio\n",	"audio/mpeg",
527 };
528 
529 int
530 filemagic(Filemagic *tab, int ntab, ulong x)
531 {
532 	int i;
533 
534 	for(i=0; i<ntab; i++)
535 		if((x&tab[i].mask) == tab[i].x){
536 			print(mime ? tab[i].mime : tab[i].desc);
537 			return 1;
538 		}
539 	return 0;
540 }
541 
542 int
543 long0(void)
544 {
545 	Fhdr f;
546 	long x;
547 
548 	seek(fd, 0, 0);		/* reposition to start of file */
549 	if(crackhdr(fd, &f)) {
550 		print(mime ? OCTET : "%s\n", f.name);
551 		return 1;
552 	}
553 	x = LENDIAN(buf);
554 	if(filemagic(long0tab, nelem(long0tab), x))
555 		return 1;
556 	return 0;
557 }
558 
/* tar header layout, from tar.c */
enum
{
	NAMSIZ	= 100,		/* size of the name and linkname fields */
	TBLOCK	= 512,		/* size of a tar block */
};

union	hblock
{
	char	dummy[TBLOCK];		/* the header viewed as a raw block */
	struct	header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];
		char	linkflag;
		char	linkname[NAMSIZ];

		/* the rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];	/* if non-null, path = prefix "/" name */
	} dbuf;
};
586 
587 int
588 checksum(union hblock *hp)
589 {
590 	int i;
591 	char *cp;
592 	struct header *hdr = &hp->dbuf;
593 
594 	for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
595 		*cp = ' ';
596 	i = 0;
597 	for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
598 		i += *cp & 0xff;
599 	return i;
600 }
601 
602 int
603 istar(void)
604 {
605 	int chksum;
606 	char tblock[TBLOCK];
607 	union hblock *hp = (union hblock *)tblock;
608 	struct header *hdr = &hp->dbuf;
609 
610 	seek(fd, 0, 0);		/* reposition to start of file */
611 	if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
612 		return 0;
613 	chksum = strtol(hdr->chksum, 0, 8);
614 	if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
615 		if (strcmp(hdr->magic, "ustar") == 0)
616 			print(mime? "application/x-ustar\n":
617 				"posix tar archive\n");
618 		else
619 			print(mime? "application/x-tar\n": "tar archive\n");
620 		return 1;
621 	}
622 	return 0;
623 }
624 
/*
 * Leading byte strings that identify a file type.  length is the
 * number of bytes of key that istring() compares; it is stored
 * explicitly because some keys contain NUL bytes.  The first
 * matching entry wins, so longer keys precede their prefixes.
 * An entry with a null key ends the table.
 */
struct	FILE_STRING
{
	char 	*key;		/* bytes expected at the start of the file */
	char	*filetype;	/* description */
	int	length;		/* number of key bytes to compare */
	char	*mime;		/* MIME type */
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "#!/bin/rc",		"rc executable file",		9,	"text/plain" },
	{ "#!/bin/sh",		"sh executable file",		9,	"text/plain" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0",	"ghostscript fax file",	18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ "ID3",		"mp3 audio with id3",		3,	"audio/mpeg" },
	{ "\211PNG",		"PNG image",			4,	"image/png" },
	{ 0, 0, 0, 0 }
};
667 
668 int
669 istring(void)
670 {
671 	int i;
672 	struct FILE_STRING *p;
673 
674 	for(p = file_string; p->key; p++) {
675 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
676 			if(mime)
677 				print("%s\n", p->mime);
678 			else
679 				print("%s\n", p->filetype);
680 			return 1;
681 		}
682 	}
683 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
684 		for(i = 5; i < nbuf; i++)
685 			if(buf[i] == '\n')
686 				break;
687 		if(mime)
688 			print(OCTET);
689 		else
690 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
691 		return 1;
692 	}
693 	return 0;
694 }
695 
696 int
697 iff(void)
698 {
699 	if (strncmp((char*)buf, "FORM", 4) == 0 &&
700 	    strncmp((char*)buf+8, "AIFF", 4) == 0) {
701 		print("%s\n", mime? "audio/x-aiff": "aiff audio");
702 		return 1;
703 	}
704 	return 0;
705 }
706 
/* tag names ishtml() looks for between '<' and '>'; 0 ends the list */
char*	html_string[] =
{
	"title", "body", "head", "strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl", "br", "em",
	0,
};
726 
727 int
728 ishtml(void)
729 {
730 	uchar *p, *q;
731 	int i, count;
732 
733 		/* compare strings between '<' and '>' to html table */
734 	count = 0;
735 	p = buf;
736 	for(;;) {
737 		while (p < buf+nbuf && *p != '<')
738 			p++;
739 		p++;
740 		if (p >= buf+nbuf)
741 			break;
742 		if(*p == '/')
743 			p++;
744 		q = p;
745 		while(p < buf+nbuf && *p != '>')
746 			p++;
747 		if (p >= buf+nbuf)
748 			break;
749 		for(i = 0; html_string[i]; i++) {
750 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
751 				if(count++ > 4) {
752 					print(mime ? "text/html\n" : "HTML file\n");
753 					return 1;
754 				}
755 				break;
756 			}
757 		}
758 		p++;
759 	}
760 	return 0;
761 }
762 
/*
 * Header field names isrfc822() counts, compared without regard to
 * case; 0 ends the list.  The reply field is spelled with a hyphen
 * ("Reply-To:"); a name containing a space cannot match a header line.
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",
	"sender:",
	0,
};
774 
775 int
776 isrfc822(void)
777 {
778 
779 	char *p, *q, *r;
780 	int i, count;
781 
782 	count = 0;
783 	p = (char*)buf;
784 	for(;;) {
785 		q = strchr(p, '\n');
786 		if(q == nil)
787 			break;
788 		*q = 0;
789 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
790 			count++;
791 			*q = '\n';
792 			p = q+1;
793 			continue;
794 		}
795 		*q = '\n';
796 		if(*p != '\t' && *p != ' '){
797 			r = strchr(p, ':');
798 			if(r == 0 || r > q)
799 				break;
800 			for(i = 0; rfc822_string[i]; i++) {
801 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
802 					count++;
803 					break;
804 				}
805 			}
806 		}
807 		p = q+1;
808 	}
809 	if(count >= 3){
810 		print(mime ? "message/rfc822\n" : "email file\n");
811 		return 1;
812 	}
813 	return 0;
814 }
815 
816 int
817 ismbox(void)
818 {
819 	char *p, *q;
820 
821 	p = (char*)buf;
822 	q = strchr(p, '\n');
823 	if(q == nil)
824 		return 0;
825 	*q = 0;
826 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
827 		print(mime ? "text/plain\n" : "mail box\n");
828 		return 1;
829 	}
830 	*q = '\n';
831 	return 0;
832 }
833 
834 int
835 iscint(void)
836 {
837 	int type;
838 	char *name;
839 	Biobuf b;
840 
841 	if(Binit(&b, fd, OREAD) == Beof)
842 		return 0;
843 	seek(fd, 0, 0);
844 	type = objtype(&b, &name);
845 	if(type < 0)
846 		return 0;
847 	if(mime)
848 		print(OCTET);
849 	else
850 		print("%s intermediate\n", name);
851 	return 1;
852 }
853 
854 int
855 isc(void)
856 {
857 	int n;
858 
859 	n = wfreq[I1];
860 	/*
861 	 * includes
862 	 */
863 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
864 		goto yes;
865 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
866 		goto yes;
867 	/*
868 	 * declarations
869 	 */
870 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
871 		goto yes;
872 	/*
873 	 * assignments
874 	 */
875 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
876 		goto yes;
877 	return 0;
878 
879 yes:
880 	if(mime){
881 		print(PLAIN);
882 		return 1;
883 	}
884 	if(wfreq[Alword] > 0)
885 		print("alef program\n");
886 	else
887 		print("c program\n");
888 	return 1;
889 }
890 
891 int
892 islimbo(void)
893 {
894 
895 	/*
896 	 * includes
897 	 */
898 	if(wfreq[Lword] < 4)
899 		return 0;
900 	print(mime ? PLAIN : "limbo program\n");
901 	return 1;
902 }
903 
904 int
905 isas(void)
906 {
907 
908 	/*
909 	 * includes
910 	 */
911 	if(wfreq[Aword] < 2)
912 		return 0;
913 	print(mime ? PLAIN : "as program\n");
914 	return 1;
915 }
916 
917 /*
918  * low entropy means encrypted
919  */
920 int
921 ismung(void)
922 {
923 	int i, bucket[8];
924 	float cs;
925 
926 	if(nbuf < 64)
927 		return 0;
928 	memset(bucket, 0, sizeof(bucket));
929 	for(i=0; i<64; i++)
930 		bucket[(buf[i]>>5)&07] += 1;
931 
932 	cs = 0.;
933 	for(i=0; i<8; i++)
934 		cs += (bucket[i]-8)*(bucket[i]-8);
935 	cs /= 8.;
936 	if(cs <= 24.322) {
937 		if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
938 			print(mime ? OCTET : "compressed\n");
939 		else
940 			print(mime ? OCTET : "encrypted\n");
941 		return 1;
942 	}
943 	return 0;
944 }
945 
946 /*
947  * english by punctuation and frequencies
948  */
949 int
950 isenglish(void)
951 {
952 	int vow, comm, rare, badpun, punct;
953 	char *p;
954 
955 	if(guess != Fascii && guess != Feascii)
956 		return 0;
957 	badpun = 0;
958 	punct = 0;
959 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
960 		switch(*p) {
961 		case '.':
962 		case ',':
963 		case ')':
964 		case '%':
965 		case ';':
966 		case ':':
967 		case '?':
968 			punct++;
969 			if(p[1] != ' ' && p[1] != '\n')
970 				badpun++;
971 		}
972 	if(badpun*5 > punct)
973 		return 0;
974 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
975 		return 0;
976 	if(2*cfreq[';'] > cfreq['e'])
977 		return 0;
978 
979 	vow = 0;
980 	for(p="AEIOU"; *p; p++) {
981 		vow += cfreq[*p];
982 		vow += cfreq[tolower(*p)];
983 	}
984 	comm = 0;
985 	for(p="ETAION"; *p; p++) {
986 		comm += cfreq[*p];
987 		comm += cfreq[tolower(*p)];
988 	}
989 	rare = 0;
990 	for(p="VJKQXZ"; *p; p++) {
991 		rare += cfreq[*p];
992 		rare += cfreq[tolower(*p)];
993 	}
994 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
995 		print(mime ? PLAIN : "English text\n");
996 		return 1;
997 	}
998 	return 0;
999 }
1000 
1001 /*
1002  * pick up a number with
1003  * syntax _*[0-9]+_
1004  */
1005 #define	P9BITLEN	12
1006 int
1007 p9bitnum(uchar *bp)
1008 {
1009 	int n, c, len;
1010 
1011 	len = P9BITLEN;
1012 	while(*bp == ' ') {
1013 		bp++;
1014 		len--;
1015 		if(len <= 0)
1016 			return -1;
1017 	}
1018 	n = 0;
1019 	while(len > 1) {
1020 		c = *bp++;
1021 		if(!isdigit(c))
1022 			return -1;
1023 		n = n*10 + c-'0';
1024 		len--;
1025 	}
1026 	if(*bp != ' ')
1027 		return -1;
1028 	return n;
1029 }
1030 
/*
 * Decode the first 12-character field of an image header and return
 * the pixel depth in bits, or -1 if the field is not acceptable.
 *
 * A field that starts with a digit is taken as a log2 depth (old
 * format): *newp is set to 0 and 1<<value is returned.  Values above
 * 5 (32 bits, the deepest pixel the other branch accepts) are refused
 * rather than shifted, since an oversized shift is undefined.
 *
 * Otherwise the field is read as letter/number pairs (new format):
 * *newp is set to 1 and the numbers are summed; only totals of 8, 16,
 * 24 and 32 are accepted.  Every letter must be followed by a digit
 * inside the field, so the scan cannot slide over blanks into the
 * field that follows.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d, ldepth;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ldepth = atoi(s);
		if(ldepth < 0 || ldepth > 5)
			return -1;
		return 1<<ldepth;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;	/* skip letter */
		if(s >= es || *s < '0' || *s > '9')
			return -1;
		d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
1062 
1063 int
1064 isp9bit(void)
1065 {
1066 	int dep, lox, loy, hix, hiy, px, new;
1067 	ulong t;
1068 	long len;
1069 	char *newlabel;
1070 
1071 	newlabel = "old ";
1072 
1073 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
1074 	if(new)
1075 		newlabel = "";
1076 	lox = p9bitnum(buf + 1*P9BITLEN);
1077 	loy = p9bitnum(buf + 2*P9BITLEN);
1078 	hix = p9bitnum(buf + 3*P9BITLEN);
1079 	hiy = p9bitnum(buf + 4*P9BITLEN);
1080 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1081 		return 0;
1082 
1083 	if(dep < 8){
1084 		px = 8/dep;	/* pixels per byte */
1085 		/* set l to number of bytes of data per scan line */
1086 		if(lox >= 0)
1087 			len = (hix+px-1)/px - lox/px;
1088 		else{	/* make positive before divide */
1089 			t = (-lox)+px-1;
1090 			t = (t/px)*px;
1091 			len = (t+hix+px-1)/px;
1092 		}
1093 	}else
1094 		len = (hix-lox)*dep/8;
1095 	len *= (hiy-loy);		/* col length */
1096 	len += 5*P9BITLEN;		/* size of initial ascii */
1097 
1098 	/*
1099 	 * for image file, length is non-zero and must match calculation above
1100 	 * for /dev/window and /dev/screen the length is always zero
1101 	 * for subfont, the subfont header should follow immediately.
1102 	 */
1103 	if (len != 0 && mbuf->length == 0) {
1104 		print("%splan 9 image\n", newlabel);
1105 		return 1;
1106 	}
1107 	if (mbuf->length == len) {
1108 		print("%splan 9 image\n", newlabel);
1109 		return 1;
1110 	}
1111 	/* Ghostscript sometimes produces a little extra on the end */
1112 	if (mbuf->length < len+P9BITLEN) {
1113 		print("%splan 9 image\n", newlabel);
1114 		return 1;
1115 	}
1116 	if (p9subfont(buf+len)) {
1117 		print("%ssubfont file\n", newlabel);
1118 		return 1;
1119 	}
1120 	return 0;
1121 }
1122 
1123 int
1124 p9subfont(uchar *p)
1125 {
1126 	int n, h, a;
1127 
1128 		/* if image too big, assume it's a subfont */
1129 	if (p+3*P9BITLEN > buf+sizeof(buf))
1130 		return 1;
1131 
1132 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1133 	if (n < 0)
1134 		return 0;
1135 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1136 	if (h < 0)
1137 		return 0;
1138 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1139 	if (a < 0)
1140 		return 0;
1141 	return 1;
1142 }
1143 
1144 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1145 
1146 int
1147 isp9font(void)
1148 {
1149 	uchar *cp, *p;
1150 	int i, n;
1151 	char pathname[1024];
1152 
1153 	cp = buf;
1154 	if (!getfontnum(cp, &cp))	/* height */
1155 		return 0;
1156 	if (!getfontnum(cp, &cp))	/* ascent */
1157 		return 0;
1158 	for (i = 0; 1; i++) {
1159 		if (!getfontnum(cp, &cp))	/* min */
1160 			break;
1161 		if (!getfontnum(cp, &cp))	/* max */
1162 			return 0;
1163 		while (WHITESPACE(*cp))
1164 			cp++;
1165 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1166 				;
1167 			/* construct a path name, if needed */
1168 		n = 0;
1169 		if (*p != '/' && slash) {
1170 			n = slash-fname+1;
1171 			if (n < sizeof(pathname))
1172 				memcpy(pathname, fname, n);
1173 			else n = 0;
1174 		}
1175 		if (n+cp-p < sizeof(pathname)) {
1176 			memcpy(pathname+n, p, cp-p);
1177 			n += cp-p;
1178 			pathname[n] = 0;
1179 			if (access(pathname, AEXIST) < 0)
1180 				return 0;
1181 		}
1182 	}
1183 	if (i) {
1184 		print(mime ? "text/plain\n" : "font file\n");
1185 		return 1;
1186 	}
1187 	return 0;
1188 }
1189 
1190 int
1191 getfontnum(uchar *cp, uchar **rp)
1192 {
1193 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1194 		cp++;
1195 	if (*cp < '0' || *cp > '9')
1196 		return 0;
1197 	strtoul((char *)cp, (char **)rp, 0);
1198 	if (!WHITESPACE(**rp))
1199 		return 0;
1200 	return 1;
1201 }
1202 
1203 int
1204 isrtf(void)
1205 {
1206 	if(strstr((char *)buf, "\\rtf1")){
1207 		print(mime ? "application/rtf\n" : "rich text format\n");
1208 		return 1;
1209 	}
1210 	return 0;
1211 }
1212 
1213 int
1214 ismsdos(void)
1215 {
1216 	if (buf[0] == 0x4d && buf[1] == 0x5a){
1217 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1218 		return 1;
1219 	}
1220 	return 0;
1221 }
1222 
1223 int
1224 iself(void)
1225 {
1226 	char *cpu[] = {		/* NB: incomplete and arbitary list */
1227 	[1]	"WE32100",
1228 	[2]	"SPARC",
1229 	[3]	"i386",
1230 	[4]	"M68000",
1231 	[5]	"M88000",
1232 	[6]	"i486",
1233 	[7]	"i860",
1234 	[8]	"R3000",
1235 	[9]	"S370",
1236 	[10]	"R4000",
1237 	[15]	"HP-PA",
1238 	[18]	"sparc v8+",
1239 	[19]	"i960",
1240 	[20]	"PPC-32",
1241 	[21]	"PPC-64",
1242 	[40]	"ARM",
1243 	[41]	"Alpha",
1244 	[43]	"sparc v9",
1245 	[50]	"IA-46",
1246 	[62]	"AMD64",
1247 	[75]	"VAX",
1248 	};
1249 
1250 
1251 	if (memcmp(buf, "\x7fELF", 4) == 0){
1252 		if (!mime){
1253 			int n = (buf[19] << 8) | buf[18];
1254 			char *p = "unknown";
1255 
1256 			if (n > 0 && n < nelem(cpu) && cpu[n])
1257 				p = cpu[n];
1258 			else {
1259 				/* try the other byte order */
1260 				n = (buf[18] << 8) | buf[19];
1261 				if (n > 0 && n < nelem(cpu) && cpu[n])
1262 					p = cpu[n];
1263 			}
1264 			print("%s ELF executable\n", p);
1265 		}
1266 		else
1267 			print("application/x-elf-executable");
1268 		return 1;
1269 	}
1270 
1271 	return 0;
1272 }
1273