xref: /plan9/sys/src/cmd/file.c (revision f9e1cf08d3be51592e03e639fc848a68dc31a55e)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/*
 * LENDIAN(p) - assemble the four bytes at p into one 32-bit value,
 * least significant byte first.  Each byte is widened to unsigned
 * long before it is shifted, so a top byte >= 0x80 is never shifted
 * into the sign bit of an int.
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11 
12 uchar	buf[6001];
13 short	cfreq[140];
14 short	wfreq[50];
15 int	nbuf;
16 Dir*	mbuf;
17 int	fd;
18 char 	*fname;
19 char	*slash;
20 
/*
 * Histogram indices.  The small values are the word classes kept in
 * dict[].class and counted in wfreq.  Those from Clatin up are
 * character classes counted in cfreq; they start at 128 so that
 * 7-bit ASCII codes can index cfreq directly.
 */
enum
{
	Cword	= 0,
	Fword	= 1,
	Aword	= 2,
	Alword	= 3,
	Lword	= 4,
	I1	= 5,
	I2	= 6,
	I3	= 7,
	Clatin	= 128,
	Cbinary	= 129,
	Cnull	= 130,
	Ceascii	= 131,
	Cutf	= 132,
};
37 struct
38 {
39 	char*	word;
40 	int	class;
41 } dict[] =
42 {
43 	"PATH",		Lword,
44 	"TEXT",		Aword,
45 	"adt",		Alword,
46 	"aggr",		Alword,
47 	"alef",		Alword,
48 	"array",	Lword,
49 	"block",	Fword,
50 	"char",		Cword,
51 	"common",	Fword,
52 	"con",		Lword,
53 	"data",		Fword,
54 	"dimension",	Fword,
55 	"double",	Cword,
56 	"extern",	Cword,
57 	"bio",		I2,
58 	"float",	Cword,
59 	"fn",		Lword,
60 	"function",	Fword,
61 	"h",		I3,
62 	"implement",	Lword,
63 	"import",	Lword,
64 	"include",	I1,
65 	"int",		Cword,
66 	"integer",	Fword,
67 	"iota",		Lword,
68 	"libc",		I2,
69 	"long",		Cword,
70 	"module",	Lword,
71 	"real",		Fword,
72 	"ref",		Lword,
73 	"register",	Cword,
74 	"self",		Lword,
75 	"short",	Cword,
76 	"static",	Cword,
77 	"stdio",	I2,
78 	"struct",	Cword,
79 	"subroutine",	Fword,
80 	"u",		I2,
81 	"void",		Cword,
82 };
83 
/*
 * values of the 'mode' field in the language table
 */
enum
{
	Normal	= 0,	/* language occupying a single range */
	First	= 1,	/* first entry for a language spanning several ranges */
	Multi	= 2,	/* later entries for such a language */
	Shared	= 3,	/* codes used in several languages */
};
91 
92 struct
93 {
94 	int	mode;		/* see enum above */
95 	int 	count;
96 	int	low;
97 	int	high;
98 	char	*name;
99 
100 } language[] =
101 {
102 	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
103 	Normal,	0,	0x0370,	0x03FF,	"Greek",
104 	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
105 	Normal,	0,	0x0530,	0x058F,	"Armenian",
106 	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
107 	Normal,	0,	0x0600,	0x06FF,	"Arabic",
108 	Normal,	0,	0x0900,	0x097F,	"Devanagari",
109 	Normal,	0,	0x0980,	0x09FF,	"Bengali",
110 	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
111 	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
112 	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
113 	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
114 	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
115 	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
116 	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
117 	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
118 	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
119 	Normal,	0,	0x1000,	0x105F,	"Tibetan",
120 	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
121 	Normal,	0,	0x3040,	0x30FF,	"Japanese",
122 	Normal,	0,	0x3100,	0x312F,	"Chinese",
123 	First,	0,	0x3130,	0x318F,	"Korean",
124 	Multi,	0,	0x3400,	0x3D2F,	"Korean",
125 	Shared,	0,	0x4e00,	0x9fff,	"CJK",
126 	Normal,	0,	0,	0,	0,		/* terminal entry */
127 };
128 
129 
/*
 * gross classification of the buffer, chosen by filetype
 */
enum
{
	Fascii	= 0,	/* printable ascii */
	Flatin	= 1,	/* latin 1 */
	Futf	= 2,	/* UTF character set */
	Fbinary	= 3,	/* binary */
	Feascii	= 4,	/* ASCII with control chars */
	Fnull	= 5,	/* NULL in file */
} guess;
139 
140 void	bump_utf_count(Rune);
141 int	cistrncmp(char*, char*, int);
142 void	filetype(int);
143 int	getfontnum(uchar*, uchar**);
144 int	isas(void);
145 int	isc(void);
146 int	iscint(void);
147 int	isenglish(void);
148 int	ishp(void);
149 int	ishtml(void);
150 int	isrfc822(void);
151 int	ismbox(void);
152 int	islimbo(void);
153 int	ismung(void);
154 int	isp9bit(void);
155 int	isp9font(void);
156 int	isrtf(void);
157 int	ismsdos(void);
158 int	iself(void);
159 int	istring(void);
160 int	isoffstr(void);
161 int	iff(void);
162 int	long0(void);
163 int	longoff(void);
164 int	istar(void);
165 int	isface(void);
166 int	isexec(void);
167 int	p9bitnum(uchar*);
168 int	p9subfont(uchar*);
169 void	print_utf(void);
170 void	type(char*, int);
171 int	utf_count(void);
172 void	wordfreq(void);
173 
174 int	(*call[])(void) =
175 {
176 	long0,		/* recognizable by first 4 bytes */
177 	istring,	/* recognizable by first string */
178 	iself,		/* ELF (foreign) executable */
179 	isexec,		/* native executables */
180 	iff,		/* interchange file format (strings) */
181 	longoff,	/* recognizable by 4 bytes at some offset */
182 	isoffstr,	/* recognizable by string at some offset */
183 	isrfc822,	/* email file */
184 	ismbox,		/* mail box */
185 	istar,		/* recognizable by tar checksum */
186 	ishtml,		/* html keywords */
187 	iscint,		/* compiler/assembler intermediate */
188 	islimbo,	/* limbo source */
189 	isc,		/* c & alef compiler key words */
190 	isas,		/* assembler key words */
191 	ismung,		/* entropy compressed/encrypted */
192 	isp9font,	/* plan 9 font */
193 	isp9bit,	/* plan 9 image (as from /dev/window) */
194 	isenglish,	/* char frequency English */
195 	isrtf,		/* rich text format */
196 	ismsdos,	/* msdos exe (virus file attachement) */
197 	isface,		/* ascii face file */
198 	0
199 };
200 
int	mime;		/* set by -m: print MIME types rather than descriptions */

/* MIME replies shared by many classifiers; each includes its newline */
#define	OCTET	"application/octet-stream\n"
#define	PLAIN	"text/plain\n"
205 
206 void
207 main(int argc, char *argv[])
208 {
209 	int i, j, maxlen;
210 	char *cp;
211 	Rune r;
212 
213 	ARGBEGIN{
214 	case 'm':
215 		mime = 1;
216 		break;
217 	default:
218 		fprint(2, "usage: file [-m] [file...]\n");
219 		exits("usage");
220 	}ARGEND;
221 
222 	maxlen = 0;
223 	if(mime == 0 || argc > 1){
224 		for(i = 0; i < argc; i++) {
225 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
226 					;
227 			if(j > maxlen)
228 				maxlen = j;
229 		}
230 	}
231 	if (argc <= 0) {
232 		if(!mime)
233 			print ("stdin: ");
234 		filetype(0);
235 	}
236 	else {
237 		for(i = 0; i < argc; i++)
238 			type(argv[i], maxlen);
239 	}
240 	exits(0);
241 }
242 
243 void
244 type(char *file, int nlen)
245 {
246 	Rune r;
247 	int i;
248 	char *p;
249 
250 	if(nlen > 0){
251 		slash = 0;
252 		for (i = 0, p = file; *p; i++) {
253 			if (*p == '/')			/* find rightmost slash */
254 				slash = p;
255 			p += chartorune(&r, p);		/* count runes */
256 		}
257 		print("%s:%*s",file, nlen-i+1, "");
258 	}
259 	fname = file;
260 	if ((fd = open(file, OREAD)) < 0) {
261 		print("cannot open\n");
262 		return;
263 	}
264 	filetype(fd);
265 	close(fd);
266 }
267 
/*
 * Runes of up to four UTF-8 bytes (Unicode 4.0), handled locally by
 * fullrune1 and chartorune1.
 */
typedef int Rune1;

enum {
	UTFmax1 = 4,	/* longest encoding, in bytes */
};
276 
/*
 * fullrune1 - report whether the n bytes at p are enough to hold the
 * character whose first byte is p[0].  The length needed (1 to 4) is
 * taken from the first byte alone; the following bytes are not
 * examined.  Returns 1 if enough bytes are present, else 0.
 */
int
fullrune1(char *p, int n)
{
	int lead, need;

	if(n < 1)
		return 0;
	lead = (unsigned char)p[0];
	if(lead < 0x80)
		need = 1;
	else if(lead < 0xE0)
		need = 2;
	else if(lead < 0xF0)
		need = 3;
	else
		need = 4;
	return n >= need;
}
295 
296 int
297 chartorune1(Rune1 *rune, char *str)
298 {
299 	int c, c1, c2, c3, n;
300 	Rune r;
301 
302 	c = *(uchar*)str;
303 	if(c < 0xF0){
304 		r = 0;
305 		n = chartorune(&r, str);
306 		*rune = r;
307 		return n;
308 	}
309 	c &= ~0xF0;
310 	c1 = *(uchar*)(str+1) & ~0x80;
311 	c2 = *(uchar*)(str+2) & ~0x80;
312 	c3 = *(uchar*)(str+3) & ~0x80;
313 	n = (c<<18) | (c1<<12) | (c2<<6) | c3;
314 	if(n < 0x10000 || n > 0x10FFFF){
315 		*rune = Runeerror;
316 		return 1;
317 	}
318 	*rune = n;
319 	return 4;
320 }
321 
322 void
323 filetype(int fd)
324 {
325 	Rune1 r;
326 	int i, f, n;
327 	char *p, *eob;
328 
329 	free(mbuf);
330 	mbuf = dirfstat(fd);
331 	if(mbuf == nil){
332 		print("cannot stat: %r\n");
333 		return;
334 	}
335 	if(mbuf->mode & DMDIR) {
336 		print(mime ? "text/directory\n" : "directory\n");
337 		return;
338 	}
339 	if(mbuf->type != 'M' && mbuf->type != '|') {
340 		print(mime ? OCTET : "special file #%c/%s\n",
341 			mbuf->type, mbuf->name);
342 		return;
343 	}
344 	nbuf = read(fd, buf, sizeof(buf)-1);
345 
346 	if(nbuf < 0) {
347 		print("cannot read\n");
348 		return;
349 	}
350 	if(nbuf == 0) {
351 		print(mime ? PLAIN : "empty file\n");
352 		return;
353 	}
354 	buf[nbuf] = 0;
355 
356 	/*
357 	 * build histogram table
358 	 */
359 	memset(cfreq, 0, sizeof(cfreq));
360 	for (i = 0; language[i].name; i++)
361 		language[i].count = 0;
362 	eob = (char *)buf+nbuf;
363 	for(n = 0, p = (char *)buf; p < eob; n++) {
364 		if (!fullrune1(p, eob-p) && eob-p < UTFmax1)
365 			break;
366 		p += chartorune1(&r, p);
367 		if (r == 0)
368 			f = Cnull;
369 		else if (r <= 0x7f) {
370 			if (!isprint(r) && !isspace(r))
371 				f = Ceascii;	/* ASCII control char */
372 			else f = r;
373 		} else if (r == 0x80) {
374 			bump_utf_count(r);
375 			f = Cutf;
376 		} else if (r < 0xA0)
377 			f = Cbinary;	/* Invalid Runes */
378 		else if (r <= 0xff)
379 			f = Clatin;	/* Latin 1 */
380 		else {
381 			bump_utf_count(r);
382 			f = Cutf;		/* UTF extension */
383 		}
384 		cfreq[f]++;			/* ASCII chars peg directly */
385 	}
386 	/*
387 	 * gross classify
388 	 */
389 	if (cfreq[Cbinary])
390 		guess = Fbinary;
391 	else if (cfreq[Cutf])
392 		guess = Futf;
393 	else if (cfreq[Clatin])
394 		guess = Flatin;
395 	else if (cfreq[Ceascii])
396 		guess = Feascii;
397 	else if (cfreq[Cnull])
398 		guess = Fbinary;
399 	else
400 		guess = Fascii;
401 	/*
402 	 * lookup dictionary words
403 	 */
404 	memset(wfreq, 0, sizeof(wfreq));
405 	if(guess == Fascii || guess == Flatin || guess == Futf)
406 		wordfreq();
407 	/*
408 	 * call individual classify routines
409 	 */
410 	for(i=0; call[i]; i++)
411 		if((*call[i])())
412 			return;
413 
414 	/*
415 	 * if all else fails,
416 	 * print out gross classification
417 	 */
418 	if (nbuf < 100 && !mime)
419 		print(mime ? PLAIN : "short ");
420 	if (guess == Fascii)
421 		print(mime ? PLAIN : "Ascii\n");
422 	else if (guess == Feascii)
423 		print(mime ? PLAIN : "extended ascii\n");
424 	else if (guess == Flatin)
425 		print(mime ? PLAIN : "latin ascii\n");
426 	else if (guess == Futf && utf_count() < 4)
427 		print_utf();
428 	else print(mime ? OCTET : "binary\n");
429 }
430 
431 void
432 bump_utf_count(Rune r)
433 {
434 	int low, high, mid;
435 
436 	high = sizeof(language)/sizeof(language[0])-1;
437 	for (low = 0; low < high;) {
438 		mid = (low+high)/2;
439 		if (r >= language[mid].low) {
440 			if (r <= language[mid].high) {
441 				language[mid].count++;
442 				break;
443 			} else low = mid+1;
444 		} else high = mid;
445 	}
446 }
447 
448 int
449 utf_count(void)
450 {
451 	int i, count;
452 
453 	count = 0;
454 	for (i = 0; language[i].name; i++)
455 		if (language[i].count > 0)
456 			switch (language[i].mode) {
457 			case Normal:
458 			case First:
459 				count++;
460 				break;
461 			default:
462 				break;
463 			}
464 	return count;
465 }
466 
467 int
468 chkascii(void)
469 {
470 	int i;
471 
472 	for (i = 'a'; i < 'z'; i++)
473 		if (cfreq[i])
474 			return 1;
475 	for (i = 'A'; i < 'Z'; i++)
476 		if (cfreq[i])
477 			return 1;
478 	return 0;
479 }
480 
481 int
482 find_first(char *name)
483 {
484 	int i;
485 
486 	for (i = 0; language[i].name != 0; i++)
487 		if (language[i].mode == First
488 			&& strcmp(language[i].name, name) == 0)
489 			return i;
490 	return -1;
491 }
492 
493 void
494 print_utf(void)
495 {
496 	int i, printed, j;
497 
498 	if(mime){
499 		print(PLAIN);
500 		return;
501 	}
502 	if (chkascii()) {
503 		printed = 1;
504 		print("Ascii");
505 	} else
506 		printed = 0;
507 	for (i = 0; language[i].name; i++)
508 		if (language[i].count) {
509 			switch(language[i].mode) {
510 			case Multi:
511 				j = find_first(language[i].name);
512 				if (j < 0)
513 					break;
514 				if (language[j].count > 0)
515 					break;
516 				/* Fall through */
517 			case Normal:
518 			case First:
519 				if (printed)
520 					print(" & ");
521 				else printed = 1;
522 				print("%s", language[i].name);
523 				break;
524 			case Shared:
525 			default:
526 				break;
527 			}
528 		}
529 	if(!printed)
530 		print("UTF");
531 	print(" text\n");
532 }
533 
534 void
535 wordfreq(void)
536 {
537 	int low, high, mid, r;
538 	uchar *p, *p2, c;
539 
540 	p = buf;
541 	for(;;) {
542 		while (p < buf+nbuf && !isalpha(*p))
543 			p++;
544 		if (p >= buf+nbuf)
545 			return;
546 		p2 = p;
547 		while(p < buf+nbuf && isalpha(*p))
548 			p++;
549 		c = *p;
550 		*p = 0;
551 		high = sizeof(dict)/sizeof(dict[0]);
552 		for(low = 0;low < high;) {
553 			mid = (low+high)/2;
554 			r = strcmp(dict[mid].word, (char*)p2);
555 			if(r == 0) {
556 				wfreq[dict[mid].class]++;
557 				break;
558 			}
559 			if(r < 0)
560 				low = mid+1;
561 			else
562 				high = mid;
563 		}
564 		*p++ = c;
565 	}
566 }
567 
568 typedef struct Filemagic Filemagic;
569 struct Filemagic {
570 	ulong x;
571 	ulong mask;
572 	char *desc;
573 	char *mime;
574 };
575 
576 /*
577  * integers in this table must be as seen on a little-endian machine
578  * when read from a file.
579  */
580 Filemagic long0tab[] = {
581 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
582 	/* "pac1" */
583 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
584 	/* "pXc2 */
585 	0x32630070,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
586 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
587 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
588 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip",
589 	070707,		0xFFFF,		"cpio archive\n", OCTET,
590 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
591 	0xfaff,		0xfeff,		"mp3 audio\n",	"audio/mpeg",
592 	0xfeff0000,	0xffffffff,	"utf-32be\n",	"text/plain charset=utf-32be",
593 	0xfffe,		0xffffffff,	"utf-32le\n",	"text/plain charset=utf-32le",
594 	0xfeff,		0xffff,		"utf-16be\n",	"text/plain charset=utf-16be",
595 	0xfffe,		0xffff,		"utf-16le\n",	"text/plain charset=utf-16le",
596 	/*
597 	 * venti & fossil magic numbers are stored big-endian on disk,
598 	 * thus the numbers appear reversed in this table.
599 	 */
600 	0xad4e5cd1,	0xFFFFFFFF,	"venti arena\n", OCTET,
601 };
602 
603 int
604 filemagic(Filemagic *tab, int ntab, ulong x)
605 {
606 	int i;
607 
608 	for(i=0; i<ntab; i++)
609 		if((x&tab[i].mask) == tab[i].x){
610 			print(mime ? tab[i].mime : tab[i].desc);
611 			return 1;
612 		}
613 	return 0;
614 }
615 
616 int
617 long0(void)
618 {
619 	return filemagic(long0tab, nelem(long0tab), LENDIAN(buf));
620 }
621 
622 typedef struct Fileoffmag Fileoffmag;
623 struct Fileoffmag {
624 	ulong	off;
625 	Filemagic;
626 };
627 
628 /*
629  * integers in this table must be as seen on a little-endian machine
630  * when read from a file.
631  */
632 Fileoffmag longofftab[] = {
633 	/*
634 	 * venti & fossil magic numbers are stored big-endian on disk,
635 	 * thus the numbers appear reversed in this table.
636 	 */
637 	256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET,
638 	256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET,
639 	128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET,
640 };
641 
642 int
643 fileoffmagic(Fileoffmag *tab, int ntab)
644 {
645 	int i;
646 	ulong x;
647 	Fileoffmag *tp;
648 	uchar buf[sizeof(long)];
649 
650 	for(i=0; i<ntab; i++) {
651 		tp = tab + i;
652 		seek(fd, tp->off, 0);
653 		if (read(fd, buf, sizeof buf) != sizeof buf)
654 			continue;
655 		x = LENDIAN(buf);
656 		if((x&tp->mask) == tp->x){
657 			print(mime? tp->mime: tp->desc);
658 			return 1;
659 		}
660 	}
661 	return 0;
662 }
663 
664 int
665 longoff(void)
666 {
667 	return fileoffmagic(longofftab, nelem(longofftab));
668 }
669 
670 int
671 isexec(void)
672 {
673 	Fhdr f;
674 
675 	seek(fd, 0, 0);		/* reposition to start of file */
676 	if(crackhdr(fd, &f)) {
677 		print(mime ? OCTET : "%s\n", f.name);
678 		return 1;
679 	}
680 	return 0;
681 }
682 
683 
/*
 * tar header block, from tar.c
 */
enum { NAMSIZ = 100, TBLOCK = 512 };

union	hblock
{
	char	dummy[TBLOCK];		/* the block as raw bytes */
	struct	header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];
		char	linkflag;
		char	linkname[NAMSIZ];
		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];  /* if non-null, path = prefix "/" name */
	} dbuf;
};

/*
 * checksum - return the sum of all TBLOCK bytes of *hp, each taken
 * as an unsigned value, with the chksum field counted as blanks.
 * The chksum field of *hp is overwritten with blanks in the process.
 */
int
checksum(union hblock *hp)
{
	int sum, k;

	memset(hp->dbuf.chksum, ' ', sizeof hp->dbuf.chksum);
	sum = 0;
	for(k = 0; k < TBLOCK; k++)
		sum += hp->dummy[k] & 0xff;
	return sum;
}
726 
727 int
728 istar(void)
729 {
730 	int chksum;
731 	char tblock[TBLOCK];
732 	union hblock *hp = (union hblock *)tblock;
733 	struct header *hdr = &hp->dbuf;
734 
735 	seek(fd, 0, 0);		/* reposition to start of file */
736 	if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
737 		return 0;
738 	chksum = strtol(hdr->chksum, 0, 8);
739 	if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
740 		if (strcmp(hdr->magic, "ustar") == 0)
741 			print(mime? "application/x-ustar\n":
742 				"posix tar archive\n");
743 		else
744 			print(mime? "application/x-tar\n": "tar archive\n");
745 		return 1;
746 	}
747 	return 0;
748 }
749 
/*
 * Strings that identify a file when found at its very start.
 * istring compares the first 'length' bytes of the file with key,
 * in table order, so a longer key must come before any key that is
 * a prefix of it.  filetype and mime are printed followed by a
 * newline.  The entry with a null key ends the table.
 */
struct	FILE_STRING
{
	char 	*key;
	char	*filetype;
	int	length;
	char	*mime;
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "#!/bin/rc",		"rc executable file",		9,	"text/plain" },
	{ "#!/bin/sh",		"sh executable file",		9,	"text/plain" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0",	"ghostscript fax file",	18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ "ID3",		"mp3 audio with id3",		3,	"audio/mpeg" },
	{ "\211PNG",		"PNG image",			4,	"image/png" },
	{ "P3\n",		"ppm",				3,	"image/ppm" },
	{ "P6\n",		"ppm",				3,	"image/ppm" },
	{ "/* XPM */\n",	"xbm",				10,	"image/xbm" },
	{ ".HTML ",		"troff -ms input",		6,	"text/troff" },
	{ ".LP",		"troff -ms input",		3,	"text/troff" },
	{ ".ND",		"troff -ms input",		3,	"text/troff" },
	{ ".PP",		"troff -ms input",		3,	"text/troff" },
	{ ".TL",		"troff -ms input",		3,	"text/troff" },
	{ ".TR",		"troff -ms input",		3,	"text/troff" },
	{ ".TH",		"manual page",			3,	"text/troff" },
	{ ".\\\"",		"troff input",			3,	"text/troff" },
	{ ".de",		"troff input",			3,	"text/troff" },
	{ ".if",		"troff input",			3,	"text/troff" },
	{ ".nr",		"troff input",			3,	"text/troff" },
	{ ".tr",		"troff input",			3,	"text/troff" },
	{ "vac:",		"venti score",			4,	"text/plain" },
	{ 0, 0, 0, 0 }
};
808 
809 int
810 istring(void)
811 {
812 	int i;
813 	struct FILE_STRING *p;
814 
815 	for(p = file_string; p->key; p++) {
816 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
817 			if(mime)
818 				print("%s\n", p->mime);
819 			else
820 				print("%s\n", p->filetype);
821 			return 1;
822 		}
823 	}
824 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
825 		for(i = 5; i < nbuf; i++)
826 			if(buf[i] == '\n')
827 				break;
828 		if(mime)
829 			print(OCTET);
830 		else
831 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
832 		return 1;
833 	}
834 	return 0;
835 }
836 
837 struct offstr
838 {
839 	ulong	off;
840 	struct FILE_STRING;
841 } offstrs[] = {
842 	32*1024, "\001CD001\001",	"ISO9660 CD image",	7,	OCTET,
843 	0, 0, 0, 0, 0
844 };
845 
846 int
847 isoffstr(void)
848 {
849 	int n;
850 	char buf[256];
851 	struct offstr *p;
852 
853 	for(p = offstrs; p->key; p++) {
854 		seek(fd, p->off, 0);
855 		n = p->length;
856 		if (n > sizeof buf)
857 			n = sizeof buf;
858 		if (read(fd, buf, n) != n)
859 			continue;
860 		if(memcmp(buf, p->key, n) == 0) {
861 			if(mime)
862 				print("%s\n", p->mime);
863 			else
864 				print("%s\n", p->filetype);
865 			return 1;
866 		}
867 	}
868 	return 0;
869 }
870 
871 int
872 iff(void)
873 {
874 	if (strncmp((char*)buf, "FORM", 4) == 0 &&
875 	    strncmp((char*)buf+8, "AIFF", 4) == 0) {
876 		print("%s\n", mime? "audio/x-aiff": "aiff audio");
877 		return 1;
878 	}
879 	return 0;
880 }
881 
882 char*	html_string[] =
883 {
884 	"title",
885 	"body",
886 	"head",
887 	"strong",
888 	"h1",
889 	"h2",
890 	"h3",
891 	"h4",
892 	"h5",
893 	"h6",
894 	"ul",
895 	"li",
896 	"dl",
897 	"br",
898 	"em",
899 	0,
900 };
901 
902 int
903 ishtml(void)
904 {
905 	uchar *p, *q;
906 	int i, count;
907 
908 		/* compare strings between '<' and '>' to html table */
909 	count = 0;
910 	p = buf;
911 	for(;;) {
912 		while (p < buf+nbuf && *p != '<')
913 			p++;
914 		p++;
915 		if (p >= buf+nbuf)
916 			break;
917 		if(*p == '/')
918 			p++;
919 		q = p;
920 		while(p < buf+nbuf && *p != '>')
921 			p++;
922 		if (p >= buf+nbuf)
923 			break;
924 		for(i = 0; html_string[i]; i++) {
925 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
926 				if(count++ > 4) {
927 					print(mime ? "text/html\n" : "HTML file\n");
928 					return 1;
929 				}
930 				break;
931 			}
932 		}
933 		p++;
934 	}
935 	return 0;
936 }
937 
938 char*	rfc822_string[] =
939 {
940 	"from:",
941 	"date:",
942 	"to:",
943 	"subject:",
944 	"received:",
945 	"reply to:",
946 	"sender:",
947 	0,
948 };
949 
950 int
951 isrfc822(void)
952 {
953 
954 	char *p, *q, *r;
955 	int i, count;
956 
957 	count = 0;
958 	p = (char*)buf;
959 	for(;;) {
960 		q = strchr(p, '\n');
961 		if(q == nil)
962 			break;
963 		*q = 0;
964 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
965 			count++;
966 			*q = '\n';
967 			p = q+1;
968 			continue;
969 		}
970 		*q = '\n';
971 		if(*p != '\t' && *p != ' '){
972 			r = strchr(p, ':');
973 			if(r == 0 || r > q)
974 				break;
975 			for(i = 0; rfc822_string[i]; i++) {
976 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
977 					count++;
978 					break;
979 				}
980 			}
981 		}
982 		p = q+1;
983 	}
984 	if(count >= 3){
985 		print(mime ? "message/rfc822\n" : "email file\n");
986 		return 1;
987 	}
988 	return 0;
989 }
990 
991 int
992 ismbox(void)
993 {
994 	char *p, *q;
995 
996 	p = (char*)buf;
997 	q = strchr(p, '\n');
998 	if(q == nil)
999 		return 0;
1000 	*q = 0;
1001 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
1002 		print(mime ? "text/plain\n" : "mail box\n");
1003 		return 1;
1004 	}
1005 	*q = '\n';
1006 	return 0;
1007 }
1008 
1009 int
1010 iscint(void)
1011 {
1012 	int type;
1013 	char *name;
1014 	Biobuf b;
1015 
1016 	if(Binit(&b, fd, OREAD) == Beof)
1017 		return 0;
1018 	seek(fd, 0, 0);
1019 	type = objtype(&b, &name);
1020 	if(type < 0)
1021 		return 0;
1022 	if(mime)
1023 		print(OCTET);
1024 	else
1025 		print("%s intermediate\n", name);
1026 	return 1;
1027 }
1028 
1029 int
1030 isc(void)
1031 {
1032 	int n;
1033 
1034 	n = wfreq[I1];
1035 	/*
1036 	 * includes
1037 	 */
1038 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1039 		goto yes;
1040 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1041 		goto yes;
1042 	/*
1043 	 * declarations
1044 	 */
1045 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
1046 		goto yes;
1047 	/*
1048 	 * assignments
1049 	 */
1050 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
1051 		goto yes;
1052 	return 0;
1053 
1054 yes:
1055 	if(mime){
1056 		print(PLAIN);
1057 		return 1;
1058 	}
1059 	if(wfreq[Alword] > 0)
1060 		print("alef program\n");
1061 	else
1062 		print("c program\n");
1063 	return 1;
1064 }
1065 
1066 int
1067 islimbo(void)
1068 {
1069 
1070 	/*
1071 	 * includes
1072 	 */
1073 	if(wfreq[Lword] < 4)
1074 		return 0;
1075 	print(mime ? PLAIN : "limbo program\n");
1076 	return 1;
1077 }
1078 
1079 int
1080 isas(void)
1081 {
1082 
1083 	/*
1084 	 * includes
1085 	 */
1086 	if(wfreq[Aword] < 2)
1087 		return 0;
1088 	print(mime ? PLAIN : "as program\n");
1089 	return 1;
1090 }
1091 
1092 /*
1093  * low entropy means encrypted
1094  */
1095 int
1096 ismung(void)
1097 {
1098 	int i, bucket[8];
1099 	float cs;
1100 
1101 	if(nbuf < 64)
1102 		return 0;
1103 	memset(bucket, 0, sizeof(bucket));
1104 	for(i=nbuf-64; i<nbuf; i++)
1105 		bucket[(buf[i]>>5)&07] += 1;
1106 
1107 	cs = 0.;
1108 	for(i=0; i<8; i++)
1109 		cs += (bucket[i]-8)*(bucket[i]-8);
1110 	cs /= 8.;
1111 	if(cs <= 24.322) {
1112 		if(buf[0]==0x1f && buf[1]==0x9d)
1113 			print(mime ? OCTET : "compressed\n");
1114 		else
1115 		if(buf[0]==0x1f && buf[1]==0x8b)
1116 			print(mime ? OCTET : "gzip compressed\n");
1117 		else
1118 		if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h')
1119 			print(mime ? OCTET : "bzip2 compressed\n");
1120 		else
1121 			print(mime ? OCTET : "encrypted\n");
1122 		return 1;
1123 	}
1124 	return 0;
1125 }
1126 
1127 /*
1128  * english by punctuation and frequencies
1129  */
1130 int
1131 isenglish(void)
1132 {
1133 	int vow, comm, rare, badpun, punct;
1134 	char *p;
1135 
1136 	if(guess != Fascii && guess != Feascii)
1137 		return 0;
1138 	badpun = 0;
1139 	punct = 0;
1140 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
1141 		switch(*p) {
1142 		case '.':
1143 		case ',':
1144 		case ')':
1145 		case '%':
1146 		case ';':
1147 		case ':':
1148 		case '?':
1149 			punct++;
1150 			if(p[1] != ' ' && p[1] != '\n')
1151 				badpun++;
1152 		}
1153 	if(badpun*5 > punct)
1154 		return 0;
1155 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
1156 		return 0;
1157 	if(2*cfreq[';'] > cfreq['e'])
1158 		return 0;
1159 
1160 	vow = 0;
1161 	for(p="AEIOU"; *p; p++) {
1162 		vow += cfreq[*p];
1163 		vow += cfreq[tolower(*p)];
1164 	}
1165 	comm = 0;
1166 	for(p="ETAION"; *p; p++) {
1167 		comm += cfreq[*p];
1168 		comm += cfreq[tolower(*p)];
1169 	}
1170 	rare = 0;
1171 	for(p="VJKQXZ"; *p; p++) {
1172 		rare += cfreq[*p];
1173 		rare += cfreq[tolower(*p)];
1174 	}
1175 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
1176 		print(mime ? PLAIN : "English text\n");
1177 		return 1;
1178 	}
1179 	return 0;
1180 }
1181 
1182 /*
1183  * pick up a number with
1184  * syntax _*[0-9]+_
1185  */
1186 #define	P9BITLEN	12
1187 int
1188 p9bitnum(uchar *bp)
1189 {
1190 	int n, c, len;
1191 
1192 	len = P9BITLEN;
1193 	while(*bp == ' ') {
1194 		bp++;
1195 		len--;
1196 		if(len <= 0)
1197 			return -1;
1198 	}
1199 	n = 0;
1200 	while(len > 1) {
1201 		c = *bp++;
1202 		if(!isdigit(c))
1203 			return -1;
1204 		n = n*10 + c-'0';
1205 		len--;
1206 	}
1207 	if(*bp != ' ')
1208 		return -1;
1209 	return n;
1210 }
1211 
/*
 * depthof - work out the pixel depth from the first 12-byte field
 * of an image header.
 *
 * After leading blanks, a field starting with a digit is the old
 * format: the number is a log2 depth, 1<<number is returned and
 * *newp is left 0.  Anything else is taken as the new format, a run
 * of letter/number pairs whose numbers are summed; *newp is set to 1
 * and the sum is returned if it is 8, 16, 24 or 32.
 * Returns -1 for an all-blank field or any other sum.
 */
int
depthof(char *s, int *newp)
{
	char *lim;
	int bits;

	*newp = 0;
	for(lim = s+12; s < lim && *s == ' '; s++)
		;
	if(s == lim)
		return -1;
	if(*s >= '0' && *s <= '9')
		return 1<<strtol(s, 0, 0);

	*newp = 1;
	bits = 0;
	while(s < lim && *s != ' '){
		s++;	/* step over the channel letter */
		bits += strtoul(s, &s, 10);
	}
	if(bits == 8 || bits == 16 || bits == 24 || bits == 32)
		return bits;
	return -1;
}
1243 
1244 int
1245 isp9bit(void)
1246 {
1247 	int dep, lox, loy, hix, hiy, px, new;
1248 	ulong t;
1249 	long len;
1250 	char *newlabel;
1251 
1252 	newlabel = "old ";
1253 
1254 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
1255 	if(new)
1256 		newlabel = "";
1257 	lox = p9bitnum(buf + 1*P9BITLEN);
1258 	loy = p9bitnum(buf + 2*P9BITLEN);
1259 	hix = p9bitnum(buf + 3*P9BITLEN);
1260 	hiy = p9bitnum(buf + 4*P9BITLEN);
1261 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1262 		return 0;
1263 
1264 	if(dep < 8){
1265 		px = 8/dep;	/* pixels per byte */
1266 		/* set l to number of bytes of data per scan line */
1267 		if(lox >= 0)
1268 			len = (hix+px-1)/px - lox/px;
1269 		else{	/* make positive before divide */
1270 			t = (-lox)+px-1;
1271 			t = (t/px)*px;
1272 			len = (t+hix+px-1)/px;
1273 		}
1274 	}else
1275 		len = (hix-lox)*dep/8;
1276 	len *= (hiy-loy);		/* col length */
1277 	len += 5*P9BITLEN;		/* size of initial ascii */
1278 
1279 	/*
1280 	 * for image file, length is non-zero and must match calculation above
1281 	 * for /dev/window and /dev/screen the length is always zero
1282 	 * for subfont, the subfont header should follow immediately.
1283 	 */
1284 	if (len != 0 && mbuf->length == 0) {
1285 		print("%splan 9 image\n", newlabel);
1286 		return 1;
1287 	}
1288 	if (mbuf->length == len) {
1289 		print("%splan 9 image\n", newlabel);
1290 		return 1;
1291 	}
1292 	/* Ghostscript sometimes produces a little extra on the end */
1293 	if (mbuf->length < len+P9BITLEN) {
1294 		print("%splan 9 image\n", newlabel);
1295 		return 1;
1296 	}
1297 	if (p9subfont(buf+len)) {
1298 		print("%ssubfont file\n", newlabel);
1299 		return 1;
1300 	}
1301 	return 0;
1302 }
1303 
1304 int
1305 p9subfont(uchar *p)
1306 {
1307 	int n, h, a;
1308 
1309 		/* if image too big, assume it's a subfont */
1310 	if (p+3*P9BITLEN > buf+sizeof(buf))
1311 		return 1;
1312 
1313 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1314 	if (n < 0)
1315 		return 0;
1316 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1317 	if (h < 0)
1318 		return 0;
1319 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1320 	if (a < 0)
1321 		return 0;
1322 	return 1;
1323 }
1324 
1325 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1326 
1327 int
1328 isp9font(void)
1329 {
1330 	uchar *cp, *p;
1331 	int i, n;
1332 	char pathname[1024];
1333 
1334 	cp = buf;
1335 	if (!getfontnum(cp, &cp))	/* height */
1336 		return 0;
1337 	if (!getfontnum(cp, &cp))	/* ascent */
1338 		return 0;
1339 	for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) {
1340 		if (!getfontnum(cp, &cp))	/* min */
1341 			break;
1342 		if (!getfontnum(cp, &cp))	/* max */
1343 			return 0;
1344 		getfontnum(cp, &cp);	/* optional offset */
1345 		while (WHITESPACE(*cp))
1346 			cp++;
1347 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1348 				;
1349 			/* construct a path name, if needed */
1350 		n = 0;
1351 		if (*p != '/' && slash) {
1352 			n = slash-fname+1;
1353 			if (n < sizeof(pathname))
1354 				memcpy(pathname, fname, n);
1355 			else n = 0;
1356 		}
1357 		if (n+cp-p+4 < sizeof(pathname)) {
1358 			memcpy(pathname+n, p, cp-p);
1359 			n += cp-p;
1360 			pathname[n] = 0;
1361 			if (access(pathname, AEXIST) < 0) {
1362 				strcpy(pathname+n, ".0");
1363 				if (access(pathname, AEXIST) < 0)
1364 					return 0;
1365 			}
1366 		}
1367 	}
1368 	if (i) {
1369 		print(mime ? "text/plain\n" : "font file\n");
1370 		return 1;
1371 	}
1372 	return 0;
1373 }
1374 
1375 int
1376 getfontnum(uchar *cp, uchar **rp)
1377 {
1378 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1379 		cp++;
1380 	if (*cp < '0' || *cp > '9')
1381 		return 0;
1382 	strtoul((char *)cp, (char **)rp, 0);
1383 	if (!WHITESPACE(**rp)) {
1384 		*rp = cp;
1385 		return 0;
1386 	}
1387 	return 1;
1388 }
1389 
1390 int
1391 isrtf(void)
1392 {
1393 	if(strstr((char *)buf, "\\rtf1")){
1394 		print(mime ? "application/rtf\n" : "rich text format\n");
1395 		return 1;
1396 	}
1397 	return 0;
1398 }
1399 
1400 int
1401 ismsdos(void)
1402 {
1403 	if (buf[0] == 0x4d && buf[1] == 0x5a){
1404 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1405 		return 1;
1406 	}
1407 	return 0;
1408 }
1409 
1410 int
1411 iself(void)
1412 {
1413 	static char *cpu[] = {		/* NB: incomplete and arbitary list */
1414 	[1]	"WE32100",
1415 	[2]	"SPARC",
1416 	[3]	"i386",
1417 	[4]	"M68000",
1418 	[5]	"M88000",
1419 	[6]	"i486",
1420 	[7]	"i860",
1421 	[8]	"R3000",
1422 	[9]	"S370",
1423 	[10]	"R4000",
1424 	[15]	"HP-PA",
1425 	[18]	"sparc v8+",
1426 	[19]	"i960",
1427 	[20]	"PPC-32",
1428 	[21]	"PPC-64",
1429 	[40]	"ARM",
1430 	[41]	"Alpha",
1431 	[43]	"sparc v9",
1432 	[50]	"IA-64",
1433 	[62]	"AMD64",
1434 	[75]	"VAX",
1435 	};
1436 	static char *type[] = {
1437 	[1]	"relocatable object",
1438 	[2]	"executable",
1439 	[3]	"shared library",
1440 	[4]	"core dump",
1441 	};
1442 
1443 	if (memcmp(buf, "\x7fELF", 4) == 0){
1444 		if (!mime){
1445 			int n = (buf[19] << 8) | buf[18];
1446 			char *p = "unknown";
1447 			char *t = "unknown";
1448 
1449 			if (n > 0 && n < nelem(cpu) && cpu[n])
1450 				p = cpu[n];
1451 			else {
1452 				/* try the other byte order */
1453 				n = (buf[18] << 8) | buf[19];
1454 				if (n > 0 && n < nelem(cpu) && cpu[n])
1455 					p = cpu[n];
1456 			}
1457 			n = buf[16];
1458 			if(n>0 && n < nelem(type) && type[n])
1459 				t = type[n];
1460 			print("%s ELF %s\n", p, t);
1461 		}
1462 		else
1463 			print("application/x-elf-executable");
1464 		return 1;
1465 	}
1466 
1467 	return 0;
1468 }
1469 
1470 int
1471 isface(void)
1472 {
1473 	int i, j, ldepth, l;
1474 	char *p;
1475 
1476 	ldepth = -1;
1477 	for(j = 0; j < 3; j++){
1478 		for(p = (char*)buf, i=0; i<3; i++){
1479 			if(p[0] != '0' || p[1] != 'x')
1480 				return 0;
1481 			if(buf[2+8] == ',')
1482 				l = 2;
1483 			else if(buf[2+4] == ',')
1484 				l = 1;
1485 			else
1486 				return 0;
1487 			if(ldepth == -1)
1488 				ldepth = l;
1489 			if(l != ldepth)
1490 				return 0;
1491 			strtoul(p, &p, 16);
1492 			if(*p++ != ',')
1493 				return 0;
1494 			while(*p == ' ' || *p == '\t')
1495 				p++;
1496 		}
1497 		if (*p++ != '\n')
1498 			return 0;
1499 	}
1500 
1501 	if(mime)
1502 		print("application/x-face\n");
1503 	else
1504 		print("face image depth %d\n", ldepth);
1505 	return 1;
1506 }
1507 
1508