xref: /plan9/sys/src/cmd/file.c (revision ff45c2c04efebdf51664af5ba60b1861b8b1ac95)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/*
 * Assemble the four bytes at p into one word, least significant
 * byte first.  Each byte is widened to unsigned long before it is
 * shifted, so a top byte of 0x80 or more is never shifted into the
 * sign bit of an int.
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | ((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11 
12 uchar	buf[6001];
13 short	cfreq[140];
14 short	wfreq[50];
15 int	nbuf;
16 Dir*	mbuf;
17 int	fd;
18 char 	*fname;
19 char	*slash;
20 
/*
 * Word classes (indices into wfreq[]) followed by the character
 * classes that filetype() counts in cfreq[] above the ASCII range.
 */
enum
{
	Cword,		/* C keyword */
	Fword,		/* fortran keyword */
	Aword,		/* assembler keyword */
	Alword,		/* alef keyword */
	Lword,		/* limbo keyword */
	I1,		/* "include" */
	I2,		/* name of a commonly included header */
	I3,		/* the "h" of a header name */
	Clatin	= 128,
	Cbinary,
	Cnull,
	Ceascii,
	Cutf,
};

/*
 * Keyword dictionary.  wordfreq() looks words up with a binary
 * search using strcmp(), so the entries MUST stay in strcmp order
 * (upper case sorts before lower case).  "bio" used to sit after
 * "extern", which made the search miss entries.
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	{ "PATH",	Lword },
	{ "TEXT",	Aword },
	{ "adt",	Alword },
	{ "aggr",	Alword },
	{ "alef",	Alword },
	{ "array",	Lword },
	{ "bio",	I2 },
	{ "block",	Fword },
	{ "chan",	Alword },
	{ "char",	Cword },
	{ "common",	Fword },
	{ "con",	Lword },
	{ "data",	Fword },
	{ "dimension",	Fword },
	{ "double",	Cword },
	{ "extern",	Cword },
	{ "float",	Cword },
	{ "fn",		Lword },
	{ "function",	Fword },
	{ "h",		I3 },
	{ "implement",	Lword },
	{ "import",	Lword },
	{ "include",	I1 },
	{ "int",	Cword },
	{ "integer",	Fword },
	{ "iota",	Lword },
	{ "libc",	I2 },
	{ "long",	Cword },
	{ "module",	Lword },
	{ "real",	Fword },
	{ "ref",	Lword },
	{ "register",	Cword },
	{ "self",	Lword },
	{ "short",	Cword },
	{ "static",	Cword },
	{ "stdio",	I2 },
	{ "struct",	Cword },
	{ "subroutine",	Fword },
	{ "u",		I2 },
	{ "void",	Cword },
};
84 
/*
 * Values of the 'mode' field of a language[] entry.
 */
enum
{
	Normal	= 0,
	First,		/* first range of a language that spans several ranges */
	Multi,		/* a later range of such a language */
	Shared,		/* code points used by several languages */
};

/*
 * Rune ranges and the language each is reported as.
 * bump_utf_count() searches the table by binary search on low/high,
 * so entries are kept in increasing range order; the entry with a
 * zero name terminates the table and is not searched.
 */
struct
{
	int	mode;		/* one of the codes above */
	int	count;		/* runes seen in this range */
	int	low;		/* first rune of the range */
	int	high;		/* last rune of the range */
	char	*name;
} language[] =
{
	{ Normal,	0,	0x0080,	0x0080,	"Extended Latin" },
	{ Normal,	0,	0x0100,	0x01FF,	"Extended Latin" },
	{ Normal,	0,	0x0370,	0x03FF,	"Greek" },
	{ Normal,	0,	0x0400,	0x04FF,	"Cyrillic" },
	{ Normal,	0,	0x0530,	0x058F,	"Armenian" },
	{ Normal,	0,	0x0590,	0x05FF,	"Hebrew" },
	{ Normal,	0,	0x0600,	0x06FF,	"Arabic" },
	{ Normal,	0,	0x0900,	0x097F,	"Devanagari" },
	{ Normal,	0,	0x0980,	0x09FF,	"Bengali" },
	{ Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi" },
	{ Normal,	0,	0x0A80,	0x0AFF,	"Gujarati" },
	{ Normal,	0,	0x0B00,	0x0B7F,	"Oriya" },
	{ Normal,	0,	0x0B80,	0x0BFF,	"Tamil" },
	{ Normal,	0,	0x0C00,	0x0C7F,	"Telugu" },
	{ Normal,	0,	0x0C80,	0x0CFF,	"Kannada" },
	{ Normal,	0,	0x0D00,	0x0D7F,	"Malayalam" },
	{ Normal,	0,	0x0E00,	0x0E7F,	"Thai" },
	{ Normal,	0,	0x0E80,	0x0EFF,	"Lao" },
	{ Normal,	0,	0x1000,	0x105F,	"Tibetan" },
	{ Normal,	0,	0x10A0,	0x10FF,	"Georgian" },
	{ Normal,	0,	0x3040,	0x30FF,	"Japanese" },
	{ Normal,	0,	0x3100,	0x312F,	"Chinese" },
	{ First,	0,	0x3130,	0x318F,	"Korean" },
	{ Multi,	0,	0x3400,	0x3D2F,	"Korean" },
	{ Shared,	0,	0x4e00,	0x9fff,	"CJK" },
	{ Normal,	0,	0,	0,	0 },		/* terminal entry */
};
130 
131 
/*
 * Gross classification of the buffer, set by filetype() from the
 * character histogram before the specific classifiers run.
 */
enum
{
	Fascii,		/* printable ascii only */
	Flatin,		/* latin 1 */
	Futf,		/* UTF character set */
	Fbinary,	/* binary */
	Feascii,	/* ascii with control characters */
	Fnull,		/* NUL bytes in file */
} guess;
141 
142 void	bump_utf_count(Rune);
143 int	cistrncmp(char*, char*, int);
144 void	filetype(int);
145 int	getfontnum(uchar*, uchar**);
146 int	isas(void);
147 int	isc(void);
148 int	iscint(void);
149 int	isenglish(void);
150 int	ishp(void);
151 int	ishtml(void);
152 int	isrfc822(void);
153 int	ismbox(void);
154 int	islimbo(void);
155 int	ismung(void);
156 int	isp9bit(void);
157 int	isp9font(void);
158 int	isrtf(void);
159 int	ismsdos(void);
160 int	iself(void);
161 int	istring(void);
162 int	iff(void);
163 int	long0(void);
164 int	istar(void);
165 int	p9bitnum(uchar*);
166 int	p9subfont(uchar*);
167 void	print_utf(void);
168 void	type(char*, int);
169 int	utf_count(void);
170 void	wordfreq(void);
171 
172 int	(*call[])(void) =
173 {
174 	long0,		/* recognizable by first 4 bytes */
175 	istring,	/* recognizable by first string */
176 	iff,		/* interchange file format (strings) */
177 	isrfc822,	/* email file */
178 	ismbox,		/* mail box */
179 	istar,		/* recognizable by tar checksum */
180 	ishtml,		/* html keywords */
181 	iscint,		/* compiler/assembler intermediate */
182 	islimbo,	/* limbo source */
183 	isc,		/* c & alef compiler key words */
184 	isas,		/* assembler key words */
185 	ismung,		/* entropy compressed/encrypted */
186 	isp9font,	/* plan 9 font */
187 	isp9bit,	/* plan 9 image (as from /dev/window) */
188 	isenglish,	/* char frequency English */
189 	isrtf,		/* rich text format */
190 	ismsdos,	/* msdos exe (virus file attachement) */
191 	iself,		/* ELF (foreign) executable */
192 	0
193 };
194 
/* set by the -m option: print MIME types instead of descriptions */
int	mime;

/* fallback MIME types; both already end in a newline */
#define	PLAIN	"text/plain\n"
#define	OCTET	"application/octet-stream\n"
199 
200 void
201 main(int argc, char *argv[])
202 {
203 	int i, j, maxlen;
204 	char *cp;
205 	Rune r;
206 
207 	ARGBEGIN{
208 	case 'm':
209 		mime = 1;
210 		break;
211 	default:
212 		fprint(2, "usage: file [-m] [file...]\n");
213 		exits("usage");
214 	}ARGEND;
215 
216 	maxlen = 0;
217 	if(mime == 0 || argc > 1){
218 		for(i = 0; i < argc; i++) {
219 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
220 					;
221 			if(j > maxlen)
222 				maxlen = j;
223 		}
224 	}
225 	if (argc <= 0) {
226 		if(!mime)
227 			print ("stdin: ");
228 		filetype(0);
229 	}
230 	else {
231 		for(i = 0; i < argc; i++)
232 			type(argv[i], maxlen);
233 	}
234 	exits(0);
235 }
236 
237 void
238 type(char *file, int nlen)
239 {
240 	Rune r;
241 	int i;
242 	char *p;
243 
244 	if(nlen > 0){
245 		slash = 0;
246 		for (i = 0, p = file; *p; i++) {
247 			if (*p == '/')			/* find rightmost slash */
248 				slash = p;
249 			p += chartorune(&r, p);		/* count runes */
250 		}
251 		print("%s:%*s",file, nlen-i+1, "");
252 	}
253 	fname = file;
254 	if ((fd = open(file, OREAD)) < 0) {
255 		print("cannot open\n");
256 		return;
257 	}
258 	filetype(fd);
259 	close(fd);
260 }
261 
262 void
263 filetype(int fd)
264 {
265 	Rune r;
266 	int i, f, n;
267 	char *p, *eob;
268 
269 	free(mbuf);
270 	mbuf = dirfstat(fd);
271 	if(mbuf == nil){
272 		print("cannot stat: %r\n");
273 		return;
274 	}
275 	if(mbuf->mode & DMDIR) {
276 		print(mime ? "text/directory\n" : "directory\n");
277 		return;
278 	}
279 	if(mbuf->type != 'M' && mbuf->type != '|') {
280 		print(mime ? OCTET : "special file #%c/%s\n",
281 			mbuf->type, mbuf->name);
282 		return;
283 	}
284 	nbuf = read(fd, buf, sizeof(buf)-1);
285 
286 	if(nbuf < 0) {
287 		print("cannot read\n");
288 		return;
289 	}
290 	if(nbuf == 0) {
291 		print(mime ? PLAIN : "empty file\n");
292 		return;
293 	}
294 	buf[nbuf] = 0;
295 
296 	/*
297 	 * build histogram table
298 	 */
299 	memset(cfreq, 0, sizeof(cfreq));
300 	for (i = 0; language[i].name; i++)
301 		language[i].count = 0;
302 	eob = (char *)buf+nbuf;
303 	for(n = 0, p = (char *)buf; p < eob; n++) {
304 		if (!fullrune(p, eob-p) && eob-p < UTFmax)
305 			break;
306 		p += chartorune(&r, p);
307 		if (r == 0)
308 			f = Cnull;
309 		else if (r <= 0x7f) {
310 			if (!isprint(r) && !isspace(r))
311 				f = Ceascii;	/* ASCII control char */
312 			else f = r;
313 		} else if (r == 0x080) {
314 			bump_utf_count(r);
315 			f = Cutf;
316 		} else if (r < 0xA0)
317 				f = Cbinary;	/* Invalid Runes */
318 		else if (r <= 0xff)
319 				f = Clatin;	/* Latin 1 */
320 		else {
321 			bump_utf_count(r);
322 			f = Cutf;		/* UTF extension */
323 		}
324 		cfreq[f]++;			/* ASCII chars peg directly */
325 	}
326 	/*
327 	 * gross classify
328 	 */
329 	if (cfreq[Cbinary])
330 		guess = Fbinary;
331 	else if (cfreq[Cutf])
332 		guess = Futf;
333 	else if (cfreq[Clatin])
334 		guess = Flatin;
335 	else if (cfreq[Ceascii])
336 		guess = Feascii;
337 	else if (cfreq[Cnull] == n) {
338 		print(mime ? OCTET : "first block all null bytes\n");
339 		return;
340 	}
341 	else guess = Fascii;
342 	/*
343 	 * lookup dictionary words
344 	 */
345 	memset(wfreq, 0, sizeof(wfreq));
346 	if(guess == Fascii || guess == Flatin || guess == Futf)
347 		wordfreq();
348 	/*
349 	 * call individual classify routines
350 	 */
351 	for(i=0; call[i]; i++)
352 		if((*call[i])())
353 			return;
354 
355 	/*
356 	 * if all else fails,
357 	 * print out gross classification
358 	 */
359 	if (nbuf < 100 && !mime)
360 		print(mime ? PLAIN : "short ");
361 	if (guess == Fascii)
362 		print(mime ? PLAIN : "Ascii\n");
363 	else if (guess == Feascii)
364 		print(mime ? PLAIN : "extended ascii\n");
365 	else if (guess == Flatin)
366 		print(mime ? PLAIN : "latin ascii\n");
367 	else if (guess == Futf && utf_count() < 4)
368 		print_utf();
369 	else print(mime ? OCTET : "binary\n");
370 }
371 
372 void
373 bump_utf_count(Rune r)
374 {
375 	int low, high, mid;
376 
377 	high = sizeof(language)/sizeof(language[0])-1;
378 	for (low = 0; low < high;) {
379 		mid = (low+high)/2;
380 		if (r >=language[mid].low) {
381 			if (r <= language[mid].high) {
382 				language[mid].count++;
383 				break;
384 			} else low = mid+1;
385 		} else high = mid;
386 	}
387 }
388 
389 int
390 utf_count(void)
391 {
392 	int i, count;
393 
394 	count = 0;
395 	for (i = 0; language[i].name; i++)
396 		if (language[i].count > 0)
397 			switch (language[i].mode) {
398 			case Normal:
399 			case First:
400 				count++;
401 				break;
402 			default:
403 				break;
404 			}
405 	return count;
406 }
407 
408 int
409 chkascii(void)
410 {
411 	int i;
412 
413 	for (i = 'a'; i < 'z'; i++)
414 		if (cfreq[i])
415 			return 1;
416 	for (i = 'A'; i < 'Z'; i++)
417 		if (cfreq[i])
418 			return 1;
419 	return 0;
420 }
421 
422 int
423 find_first(char *name)
424 {
425 	int i;
426 
427 	for (i = 0; language[i].name != 0; i++)
428 		if (language[i].mode == First
429 			&& strcmp(language[i].name, name) == 0)
430 			return i;
431 	return -1;
432 }
433 
434 void
435 print_utf(void)
436 {
437 	int i, printed, j;
438 
439 	if(mime){
440 		print(PLAIN);
441 		return;
442 	}
443 	if (chkascii()) {
444 		printed = 1;
445 		print("Ascii");
446 	} else
447 		printed = 0;
448 	for (i = 0; language[i].name; i++)
449 		if (language[i].count) {
450 			switch(language[i].mode) {
451 			case Multi:
452 				j = find_first(language[i].name);
453 				if (j < 0)
454 					break;
455 				if (language[j].count > 0)
456 					break;
457 				/* Fall through */
458 			case Normal:
459 			case First:
460 				if (printed)
461 					print(" & ");
462 				else printed = 1;
463 				print("%s", language[i].name);
464 				break;
465 			case Shared:
466 			default:
467 				break;
468 			}
469 		}
470 	if(!printed)
471 		print("UTF");
472 	print(" text\n");
473 }
474 
475 void
476 wordfreq(void)
477 {
478 	int low, high, mid, r;
479 	uchar *p, *p2, c;
480 
481 	p = buf;
482 	for(;;) {
483 		while (p < buf+nbuf && !isalpha(*p))
484 			p++;
485 		if (p >= buf+nbuf)
486 			return;
487 		p2 = p;
488 		while(p < buf+nbuf && isalpha(*p))
489 			p++;
490 		c = *p;
491 		*p = 0;
492 		high = sizeof(dict)/sizeof(dict[0]);
493 		for(low = 0;low < high;) {
494 			mid = (low+high)/2;
495 			r = strcmp(dict[mid].word, (char*)p2);
496 			if(r == 0) {
497 				wfreq[dict[mid].class]++;
498 				break;
499 			}
500 			if(r < 0)
501 				low = mid+1;
502 			else
503 				high = mid;
504 		}
505 		*p++ = c;
506 	}
507 }
508 
509 typedef struct Filemagic Filemagic;
510 struct Filemagic {
511 	ulong x;
512 	ulong mask;
513 	char *desc;
514 	char *mime;
515 };
516 
517 Filemagic long0tab[] = {
518 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
519 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
520 	0x32636170,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
521 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
522 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
523 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip",
524 	070707,		0xFFFF,		"cpio archive\n", OCTET,
525 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
526 	0xfffa0000,	0xfffe0000,	"mp3 audio",	"audio/mpeg",
527 };
528 
529 int
530 filemagic(Filemagic *tab, int ntab, ulong x)
531 {
532 	int i;
533 
534 	for(i=0; i<ntab; i++)
535 		if((x&tab[i].mask) == tab[i].x){
536 			print(mime ? tab[i].mime : tab[i].desc);
537 			return 1;
538 		}
539 	return 0;
540 }
541 
542 int
543 long0(void)
544 {
545 	Fhdr f;
546 	long x;
547 
548 	seek(fd, 0, 0);		/* reposition to start of file */
549 	if(crackhdr(fd, &f)) {
550 		print(mime ? OCTET : "%s\n", f.name);
551 		return 1;
552 	}
553 	x = LENDIAN(buf);
554 	if(filemagic(long0tab, nelem(long0tab), x))
555 		return 1;
556 	return 0;
557 }
558 
/*
 * Tar header layout, taken from tar.c.
 */
enum { NAMSIZ = 100, TBLOCK = 512 };

union	hblock
{
	char	dummy[TBLOCK];		/* the whole block as raw bytes */
	struct	header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];
		char	linkflag;
		char	linkname[NAMSIZ];
		/* the remaining fields come from POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];	/* if non-null, path = prefix "/" name */
	} dbuf;
};

/*
 * Return the checksum of header block hp: the sum of all TBLOCK
 * bytes, each taken as unsigned, with the chksum field counted as
 * blanks.  The chksum field of *hp is overwritten with blanks, so
 * callers must read the stored value before calling.
 */
int
checksum(union hblock *hp)
{
	int i, sum;

	memset(hp->dbuf.chksum, ' ', sizeof hp->dbuf.chksum);
	sum = 0;
	for(i = 0; i < TBLOCK; i++)
		sum += hp->dummy[i] & 0xff;
	return sum;
}
601 
602 int
603 istar(void)
604 {
605 	int chksum;
606 	char tblock[TBLOCK];
607 	union hblock *hp = (union hblock *)tblock;
608 	struct header *hdr = &hp->dbuf;
609 
610 	seek(fd, 0, 0);		/* reposition to start of file */
611 	if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
612 		return 0;
613 	chksum = strtol(hdr->chksum, 0, 8);
614 	if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
615 		if (strcmp(hdr->magic, "ustar") == 0)
616 			print(mime? "application/x-ustar\n":
617 				"posix tar archive\n");
618 		else
619 			print(mime? "application/x-tar\n": "tar archive\n");
620 		return 1;
621 	}
622 	return 0;
623 }
624 
/*
 * Files recognizable by their opening bytes.  istring() compares
 * the first 'length' bytes of the file with key using memcmp, so a
 * key may contain NUL bytes.  An entry with a zero key ends the
 * table.
 */
struct	FILE_STRING
{
	char 	*key;		/* bytes expected at the start of the file */
	char	*filetype;	/* human readable verdict */
	int	length;		/* number of bytes of key to compare */
	char	*mime;		/* MIME verdict */
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "#!/bin/rc",		"rc executable file",		9,	"text/plain" },
	{ "#!/bin/sh",		"sh executable file",		9,	"text/plain" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0",	"ghostscript fax file",	18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ "ID3",		"mp3 audio with id3",		3,	"audio/mpeg" },
	{ 0, 0, 0, 0 }
};
666 
667 int
668 istring(void)
669 {
670 	int i;
671 	struct FILE_STRING *p;
672 
673 	for(p = file_string; p->key; p++) {
674 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
675 			if(mime)
676 				print("%s\n", p->mime);
677 			else
678 				print("%s\n", p->filetype);
679 			return 1;
680 		}
681 	}
682 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
683 		for(i = 5; i < nbuf; i++)
684 			if(buf[i] == '\n')
685 				break;
686 		if(mime)
687 			print(OCTET);
688 		else
689 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
690 		return 1;
691 	}
692 	return 0;
693 }
694 
695 int
696 iff(void)
697 {
698 	if (strncmp((char*)buf, "FORM", 4) == 0 &&
699 	    strncmp((char*)buf+8, "AIFF", 4) == 0) {
700 		print("%s\n", mime? "audio/x-aiff": "aiff audio");
701 		return 1;
702 	}
703 	return 0;
704 }
705 
706 char*	html_string[] =
707 {
708 	"title",
709 	"body",
710 	"head",
711 	"strong",
712 	"h1",
713 	"h2",
714 	"h3",
715 	"h4",
716 	"h5",
717 	"h6",
718 	"ul",
719 	"li",
720 	"dl",
721 	"br",
722 	"em",
723 	0,
724 };
725 
726 int
727 ishtml(void)
728 {
729 	uchar *p, *q;
730 	int i, count;
731 
732 		/* compare strings between '<' and '>' to html table */
733 	count = 0;
734 	p = buf;
735 	for(;;) {
736 		while (p < buf+nbuf && *p != '<')
737 			p++;
738 		p++;
739 		if (p >= buf+nbuf)
740 			break;
741 		if(*p == '/')
742 			p++;
743 		q = p;
744 		while(p < buf+nbuf && *p != '>')
745 			p++;
746 		if (p >= buf+nbuf)
747 			break;
748 		for(i = 0; html_string[i]; i++) {
749 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
750 				if(count++ > 4) {
751 					print(mime ? "text/html\n" : "HTML file\n");
752 					return 1;
753 				}
754 				break;
755 			}
756 		}
757 		p++;
758 	}
759 	return 0;
760 }
761 
762 char*	rfc822_string[] =
763 {
764 	"from:",
765 	"date:",
766 	"to:",
767 	"subject:",
768 	"received:",
769 	"reply to:",
770 	"sender:",
771 	0,
772 };
773 
774 int
775 isrfc822(void)
776 {
777 
778 	char *p, *q, *r;
779 	int i, count;
780 
781 	count = 0;
782 	p = (char*)buf;
783 	for(;;) {
784 		q = strchr(p, '\n');
785 		if(q == nil)
786 			break;
787 		*q = 0;
788 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
789 			count++;
790 			*q = '\n';
791 			p = q+1;
792 			continue;
793 		}
794 		*q = '\n';
795 		if(*p != '\t' && *p != ' '){
796 			r = strchr(p, ':');
797 			if(r == 0 || r > q)
798 				break;
799 			for(i = 0; rfc822_string[i]; i++) {
800 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
801 					count++;
802 					break;
803 				}
804 			}
805 		}
806 		p = q+1;
807 	}
808 	if(count >= 3){
809 		print(mime ? "message/rfc822\n" : "email file\n");
810 		return 1;
811 	}
812 	return 0;
813 }
814 
815 int
816 ismbox(void)
817 {
818 	char *p, *q;
819 
820 	p = (char*)buf;
821 	q = strchr(p, '\n');
822 	if(q == nil)
823 		return 0;
824 	*q = 0;
825 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
826 		print(mime ? "text/plain\n" : "mail box\n");
827 		return 1;
828 	}
829 	*q = '\n';
830 	return 0;
831 }
832 
833 int
834 iscint(void)
835 {
836 	int type;
837 	char *name;
838 	Biobuf b;
839 
840 	if(Binit(&b, fd, OREAD) == Beof)
841 		return 0;
842 	seek(fd, 0, 0);
843 	type = objtype(&b, &name);
844 	if(type < 0)
845 		return 0;
846 	if(mime)
847 		print(OCTET);
848 	else
849 		print("%s intermediate\n", name);
850 	return 1;
851 }
852 
853 int
854 isc(void)
855 {
856 	int n;
857 
858 	n = wfreq[I1];
859 	/*
860 	 * includes
861 	 */
862 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
863 		goto yes;
864 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
865 		goto yes;
866 	/*
867 	 * declarations
868 	 */
869 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
870 		goto yes;
871 	/*
872 	 * assignments
873 	 */
874 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
875 		goto yes;
876 	return 0;
877 
878 yes:
879 	if(mime){
880 		print(PLAIN);
881 		return 1;
882 	}
883 	if(wfreq[Alword] > 0)
884 		print("alef program\n");
885 	else
886 		print("c program\n");
887 	return 1;
888 }
889 
890 int
891 islimbo(void)
892 {
893 
894 	/*
895 	 * includes
896 	 */
897 	if(wfreq[Lword] < 4)
898 		return 0;
899 	print(mime ? PLAIN : "limbo program\n");
900 	return 1;
901 }
902 
903 int
904 isas(void)
905 {
906 
907 	/*
908 	 * includes
909 	 */
910 	if(wfreq[Aword] < 2)
911 		return 0;
912 	print(mime ? PLAIN : "as program\n");
913 	return 1;
914 }
915 
916 /*
917  * low entropy means encrypted
918  */
919 int
920 ismung(void)
921 {
922 	int i, bucket[8];
923 	float cs;
924 
925 	if(nbuf < 64)
926 		return 0;
927 	memset(bucket, 0, sizeof(bucket));
928 	for(i=0; i<64; i++)
929 		bucket[(buf[i]>>5)&07] += 1;
930 
931 	cs = 0.;
932 	for(i=0; i<8; i++)
933 		cs += (bucket[i]-8)*(bucket[i]-8);
934 	cs /= 8.;
935 	if(cs <= 24.322) {
936 		if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
937 			print(mime ? OCTET : "compressed\n");
938 		else
939 			print(mime ? OCTET : "encrypted\n");
940 		return 1;
941 	}
942 	return 0;
943 }
944 
945 /*
946  * english by punctuation and frequencies
947  */
948 int
949 isenglish(void)
950 {
951 	int vow, comm, rare, badpun, punct;
952 	char *p;
953 
954 	if(guess != Fascii && guess != Feascii)
955 		return 0;
956 	badpun = 0;
957 	punct = 0;
958 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
959 		switch(*p) {
960 		case '.':
961 		case ',':
962 		case ')':
963 		case '%':
964 		case ';':
965 		case ':':
966 		case '?':
967 			punct++;
968 			if(p[1] != ' ' && p[1] != '\n')
969 				badpun++;
970 		}
971 	if(badpun*5 > punct)
972 		return 0;
973 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
974 		return 0;
975 	if(2*cfreq[';'] > cfreq['e'])
976 		return 0;
977 
978 	vow = 0;
979 	for(p="AEIOU"; *p; p++) {
980 		vow += cfreq[*p];
981 		vow += cfreq[tolower(*p)];
982 	}
983 	comm = 0;
984 	for(p="ETAION"; *p; p++) {
985 		comm += cfreq[*p];
986 		comm += cfreq[tolower(*p)];
987 	}
988 	rare = 0;
989 	for(p="VJKQXZ"; *p; p++) {
990 		rare += cfreq[*p];
991 		rare += cfreq[tolower(*p)];
992 	}
993 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
994 		print(mime ? PLAIN : "English text\n");
995 		return 1;
996 	}
997 	return 0;
998 }
999 
1000 /*
1001  * pick up a number with
1002  * syntax _*[0-9]+_
1003  */
1004 #define	P9BITLEN	12
1005 int
1006 p9bitnum(uchar *bp)
1007 {
1008 	int n, c, len;
1009 
1010 	len = P9BITLEN;
1011 	while(*bp == ' ') {
1012 		bp++;
1013 		len--;
1014 		if(len <= 0)
1015 			return -1;
1016 	}
1017 	n = 0;
1018 	while(len > 1) {
1019 		c = *bp++;
1020 		if(!isdigit(c))
1021 			return -1;
1022 		n = n*10 + c-'0';
1023 		len--;
1024 	}
1025 	if(*bp != ' ')
1026 		return -1;
1027 	return n;
1028 }
1029 
/*
 * Decode the first 12-byte field of a plan 9 image header, which
 * starts at s, and return the pixel depth in bits, or -1 if the
 * field is not a valid depth.
 *
 * An old-style field holds a number n and means depth 1<<n; *newp
 * is set to 0.  A new-style field is a channel descriptor made of
 * letters each followed by a bit count (e.g. r8g8b8); the counts
 * are added up, only totals of 8, 16, 24 and 32 are accepted, and
 * *newp is set to 1.
 *
 * Two guards were added:
 *  - an old-style n above 5 is rejected instead of being shifted
 *    (1<<5 is 32, the deepest format accepted here; a larger shift
 *    is undefined once it reaches the width of an int);
 *  - a bit count is only parsed when a digit directly follows the
 *    channel letter, so strtoul cannot skip blanks or a sign and
 *    swallow the next header field.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d;
	unsigned long ld;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtoul(s, 0, 10);
		if(ld > 5)
			return -1;
		return 1<<(int)ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;	/* skip letter */
		if('0'<=*s && *s<='9')
			d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
1061 
1062 int
1063 isp9bit(void)
1064 {
1065 	int dep, lox, loy, hix, hiy, px, new;
1066 	ulong t;
1067 	long len;
1068 	char *newlabel;
1069 
1070 	newlabel = "old ";
1071 
1072 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
1073 	if(new)
1074 		newlabel = "";
1075 	lox = p9bitnum(buf + 1*P9BITLEN);
1076 	loy = p9bitnum(buf + 2*P9BITLEN);
1077 	hix = p9bitnum(buf + 3*P9BITLEN);
1078 	hiy = p9bitnum(buf + 4*P9BITLEN);
1079 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1080 		return 0;
1081 
1082 	if(dep < 8){
1083 		px = 8/dep;	/* pixels per byte */
1084 		/* set l to number of bytes of data per scan line */
1085 		if(lox >= 0)
1086 			len = (hix+px-1)/px - lox/px;
1087 		else{	/* make positive before divide */
1088 			t = (-lox)+px-1;
1089 			t = (t/px)*px;
1090 			len = (t+hix+px-1)/px;
1091 		}
1092 	}else
1093 		len = (hix-lox)*dep/8;
1094 	len *= (hiy-loy);		/* col length */
1095 	len += 5*P9BITLEN;		/* size of initial ascii */
1096 
1097 	/*
1098 	 * for image file, length is non-zero and must match calculation above
1099 	 * for /dev/window and /dev/screen the length is always zero
1100 	 * for subfont, the subfont header should follow immediately.
1101 	 */
1102 	if (len != 0 && mbuf->length == 0) {
1103 		print("%splan 9 image\n", newlabel);
1104 		return 1;
1105 	}
1106 	if (mbuf->length == len) {
1107 		print("%splan 9 image\n", newlabel);
1108 		return 1;
1109 	}
1110 	/* Ghostscript sometimes produces a little extra on the end */
1111 	if (mbuf->length < len+P9BITLEN) {
1112 		print("%splan 9 image\n", newlabel);
1113 		return 1;
1114 	}
1115 	if (p9subfont(buf+len)) {
1116 		print("%ssubfont file\n", newlabel);
1117 		return 1;
1118 	}
1119 	return 0;
1120 }
1121 
1122 int
1123 p9subfont(uchar *p)
1124 {
1125 	int n, h, a;
1126 
1127 		/* if image too big, assume it's a subfont */
1128 	if (p+3*P9BITLEN > buf+sizeof(buf))
1129 		return 1;
1130 
1131 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1132 	if (n < 0)
1133 		return 0;
1134 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1135 	if (h < 0)
1136 		return 0;
1137 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1138 	if (a < 0)
1139 		return 0;
1140 	return 1;
1141 }
1142 
1143 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1144 
1145 int
1146 isp9font(void)
1147 {
1148 	uchar *cp, *p;
1149 	int i, n;
1150 	char pathname[1024];
1151 
1152 	cp = buf;
1153 	if (!getfontnum(cp, &cp))	/* height */
1154 		return 0;
1155 	if (!getfontnum(cp, &cp))	/* ascent */
1156 		return 0;
1157 	for (i = 0; 1; i++) {
1158 		if (!getfontnum(cp, &cp))	/* min */
1159 			break;
1160 		if (!getfontnum(cp, &cp))	/* max */
1161 			return 0;
1162 		while (WHITESPACE(*cp))
1163 			cp++;
1164 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1165 				;
1166 			/* construct a path name, if needed */
1167 		n = 0;
1168 		if (*p != '/' && slash) {
1169 			n = slash-fname+1;
1170 			if (n < sizeof(pathname))
1171 				memcpy(pathname, fname, n);
1172 			else n = 0;
1173 		}
1174 		if (n+cp-p < sizeof(pathname)) {
1175 			memcpy(pathname+n, p, cp-p);
1176 			n += cp-p;
1177 			pathname[n] = 0;
1178 			if (access(pathname, AEXIST) < 0)
1179 				return 0;
1180 		}
1181 	}
1182 	if (i) {
1183 		print(mime ? "text/plain\n" : "font file\n");
1184 		return 1;
1185 	}
1186 	return 0;
1187 }
1188 
1189 int
1190 getfontnum(uchar *cp, uchar **rp)
1191 {
1192 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1193 		cp++;
1194 	if (*cp < '0' || *cp > '9')
1195 		return 0;
1196 	strtoul((char *)cp, (char **)rp, 0);
1197 	if (!WHITESPACE(**rp))
1198 		return 0;
1199 	return 1;
1200 }
1201 
1202 int
1203 isrtf(void)
1204 {
1205 	if(strstr((char *)buf, "\\rtf1")){
1206 		print(mime ? "application/rtf\n" : "rich text format\n");
1207 		return 1;
1208 	}
1209 	return 0;
1210 }
1211 
1212 int
1213 ismsdos(void)
1214 {
1215 	if (buf[0] == 0x4d && buf[1] == 0x5a){
1216 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1217 		return 1;
1218 	}
1219 	return 0;
1220 }
1221 
1222 int
1223 iself(void)
1224 {
1225 	char *cpu[] = {		/* NB: incomplete and arbitary list */
1226 	[1]	"WE32100",
1227 	[2]	"SPARC",
1228 	[3]	"i386",
1229 	[4]	"M68000",
1230 	[5]	"M88000",
1231 	[6]	"i486",
1232 	[7]	"i860",
1233 	[8]	"R3000",
1234 	[9]	"S370",
1235 	[10]	"R4000",
1236 	[15]	"HP-PA",
1237 	[18]	"sparc v8+",
1238 	[19]	"i960",
1239 	[20]	"PPC-32",
1240 	[21]	"PPC-64",
1241 	[40]	"ARM",
1242 	[41]	"Alpha",
1243 	[43]	"sparc v9",
1244 	[50]	"IA-46",
1245 	[62]	"AMD64",
1246 	[75]	"VAX",
1247 	};
1248 
1249 
1250 	if (memcmp(buf, "\x7fELF", 4) == 0){
1251 		if (!mime){
1252 			int n = (buf[19] << 8) | buf[18];
1253 			char *p = "unknown";
1254 
1255 			if (n > 0 && n < nelem(cpu) && cpu[n])
1256 				p = cpu[n];
1257 			else {
1258 				/* try the other byte order */
1259 				n = (buf[18] << 8) | buf[19];
1260 				if (n > 0 && n < nelem(cpu) && cpu[n])
1261 					p = cpu[n];
1262 			}
1263 			print("%s ELF executable\n", p);
1264 		}
1265 		else
1266 			print("application/x-elf-executable");
1267 		return 1;
1268 	}
1269 
1270 	return 0;
1271 }
1272