xref: /plan9/sys/src/cmd/file.c (revision 7750a8dc5f033d67ad7919d2ab3d82124362e986)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
10 #define	LENDIAN(p)	((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24))
11 
12 uchar	buf[6001];
13 short	cfreq[140];
14 short	wfreq[50];
15 int	nbuf;
16 Dir*	mbuf;
17 int	fd;
18 char 	*fname;
19 char	*slash;
20 
/*
 * Two families of small integers share this enum:
 * the word classes stored in dict[] (indices into wfreq), and,
 * from Clatin upward, the character classes that filetype()
 * uses as indices into cfreq beyond the 7-bit ASCII codes.
 */
enum
{
	/* word classes */
	Cword	= 0,
	Fword	= 1,
	Aword	= 2,
	Alword	= 3,
	Lword	= 4,
	I1	= 5,
	I2	= 6,
	I3	= 7,

	/* character classes */
	Clatin	= 128,
	Cbinary	= 129,
	Cnull	= 130,
	Ceascii	= 131,
	Cutf	= 132,
};
37 struct
38 {
39 	char*	word;
40 	int	class;
41 } dict[] =
42 {
43 	"PATH",		Lword,
44 	"TEXT",		Aword,
45 	"adt",		Alword,
46 	"aggr",		Alword,
47 	"alef",		Alword,
48 	"array",	Lword,
49 	"block",	Fword,
50 	"char",		Cword,
51 	"common",	Fword,
52 	"con",		Lword,
53 	"data",		Fword,
54 	"dimension",	Fword,
55 	"double",	Cword,
56 	"extern",	Cword,
57 	"bio",		I2,
58 	"float",	Cword,
59 	"fn",		Lword,
60 	"function",	Fword,
61 	"h",		I3,
62 	"implement",	Lword,
63 	"import",	Lword,
64 	"include",	I1,
65 	"int",		Cword,
66 	"integer",	Fword,
67 	"iota",		Lword,
68 	"libc",		I2,
69 	"long",		Cword,
70 	"module",	Lword,
71 	"real",		Fword,
72 	"ref",		Lword,
73 	"register",	Cword,
74 	"self",		Lword,
75 	"short",	Cword,
76 	"static",	Cword,
77 	"stdio",	I2,
78 	"struct",	Cword,
79 	"subroutine",	Fword,
80 	"u",		I2,
81 	"void",		Cword,
82 };
83 
/* values of the 'mode' field in the language table */
enum
{
	Normal	= 0,	/* language covered by a single range */
	First	= 1,	/* first entry for a language spanning several ranges */
	Multi	= 2,	/* later entries for such a language */
	Shared	= 3,	/* codes used in several languages */
};

/*
 * Rune ranges and the language each is reported as.
 * bump_utf_count() binary-searches this table, so the ranges must
 * stay in ascending order; the entry with a nil name ends it.
 */
struct
{
	int	mode;		/* Normal, First, Multi or Shared */
	int 	count;		/* runes seen in [low, high] for the current file */
	int	low;
	int	high;
	char	*name;

} language[] =
{
	{ Normal,	0,	0x0100,	0x01FF,	"Extended Latin" },
	{ Normal,	0,	0x0370,	0x03FF,	"Greek" },
	{ Normal,	0,	0x0400,	0x04FF,	"Cyrillic" },
	{ Normal,	0,	0x0530,	0x058F,	"Armenian" },
	{ Normal,	0,	0x0590,	0x05FF,	"Hebrew" },
	{ Normal,	0,	0x0600,	0x06FF,	"Arabic" },
	{ Normal,	0,	0x0900,	0x097F,	"Devanagari" },
	{ Normal,	0,	0x0980,	0x09FF,	"Bengali" },
	{ Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi" },
	{ Normal,	0,	0x0A80,	0x0AFF,	"Gujarati" },
	{ Normal,	0,	0x0B00,	0x0B7F,	"Oriya" },
	{ Normal,	0,	0x0B80,	0x0BFF,	"Tamil" },
	{ Normal,	0,	0x0C00,	0x0C7F,	"Telugu" },
	{ Normal,	0,	0x0C80,	0x0CFF,	"Kannada" },
	{ Normal,	0,	0x0D00,	0x0D7F,	"Malayalam" },
	{ Normal,	0,	0x0E00,	0x0E7F,	"Thai" },
	{ Normal,	0,	0x0E80,	0x0EFF,	"Lao" },
	{ Normal,	0,	0x1000,	0x105F,	"Tibetan" },
	{ Normal,	0,	0x10A0,	0x10FF,	"Georgian" },
	{ Normal,	0,	0x3040,	0x30FF,	"Japanese" },
	{ Normal,	0,	0x3100,	0x312F,	"Chinese" },
	{ First,	0,	0x3130,	0x318F,	"Korean" },
	{ Multi,	0,	0x3400,	0x3D2F,	"Korean" },
	{ Shared,	0,	0x4e00,	0x9fff,	"CJK" },
	{ Normal,	0,	0,	0,	0 },		/* terminal entry */
};
128 
129 
/*
 * Gross classification of the buffer; filetype() derives it from
 * the cfreq histogram before the individual classifiers run.
 */
enum
{
	Fascii	= 0,	/* printable ascii */
	Flatin	= 1,	/* latin 1*/
	Futf	= 2,	/* UTF character set */
	Fbinary	= 3,	/* binary */
	Feascii	= 4,	/* ASCII with control chars */
	Fnull	= 5,	/* NULL in file */
} guess;
139 
140 void	bump_utf_count(Rune);
141 int	cistrncmp(char*, char*, int);
142 void	filetype(int);
143 int	getfontnum(uchar*, uchar**);
144 int	isas(void);
145 int	isc(void);
146 int	iscint(void);
147 int	isenglish(void);
148 int	ishp(void);
149 int	ishtml(void);
150 int	isrfc822(void);
151 int	ismbox(void);
152 int	islimbo(void);
153 int	ismung(void);
154 int	isp9bit(void);
155 int	isp9font(void);
156 int	isrtf(void);
157 int	ismsdos(void);
158 int	iself(void);
159 int	istring(void);
160 int	isoffstr(void);
161 int	iff(void);
162 int	long0(void);
163 int	longoff(void);
164 int	istar(void);
165 int	isface(void);
166 int	isexec(void);
167 int	p9bitnum(uchar*);
168 int	p9subfont(uchar*);
169 void	print_utf(void);
170 void	type(char*, int);
171 int	utf_count(void);
172 void	wordfreq(void);
173 
174 int	(*call[])(void) =
175 {
176 	long0,		/* recognizable by first 4 bytes */
177 	istring,	/* recognizable by first string */
178 	iself,		/* ELF (foreign) executable */
179 	isexec,		/* native executables */
180 	iff,		/* interchange file format (strings) */
181 	longoff,	/* recognizable by 4 bytes at some offset */
182 	isoffstr,	/* recognizable by string at some offset */
183 	isrfc822,	/* email file */
184 	ismbox,		/* mail box */
185 	istar,		/* recognizable by tar checksum */
186 	ishtml,		/* html keywords */
187 	iscint,		/* compiler/assembler intermediate */
188 	islimbo,	/* limbo source */
189 	isc,		/* c & alef compiler key words */
190 	isas,		/* assembler key words */
191 	isp9font,	/* plan 9 font */
192 	isp9bit,	/* plan 9 image (as from /dev/window) */
193 	ismung,		/* entropy compressed/encrypted */
194 	isenglish,	/* char frequency English */
195 	isrtf,		/* rich text format */
196 	ismsdos,	/* msdos exe (virus file attachement) */
197 	isface,		/* ascii face file */
198 	0
199 };
200 
/* set by the -m option: print MIME types instead of descriptions */
int	mime;

/* MIME answers shared by many classifiers; both end in a newline */
#define	OCTET	"application/octet-stream\n"
#define	PLAIN	"text/plain\n"
205 
206 void
207 main(int argc, char *argv[])
208 {
209 	int i, j, maxlen;
210 	char *cp;
211 	Rune r;
212 
213 	ARGBEGIN{
214 	case 'm':
215 		mime = 1;
216 		break;
217 	default:
218 		fprint(2, "usage: file [-m] [file...]\n");
219 		exits("usage");
220 	}ARGEND;
221 
222 	maxlen = 0;
223 	if(mime == 0 || argc > 1){
224 		for(i = 0; i < argc; i++) {
225 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
226 					;
227 			if(j > maxlen)
228 				maxlen = j;
229 		}
230 	}
231 	if (argc <= 0) {
232 		if(!mime)
233 			print ("stdin: ");
234 		filetype(0);
235 	}
236 	else {
237 		for(i = 0; i < argc; i++)
238 			type(argv[i], maxlen);
239 	}
240 	exits(0);
241 }
242 
243 void
244 type(char *file, int nlen)
245 {
246 	Rune r;
247 	int i;
248 	char *p;
249 
250 	if(nlen > 0){
251 		slash = 0;
252 		for (i = 0, p = file; *p; i++) {
253 			if (*p == '/')			/* find rightmost slash */
254 				slash = p;
255 			p += chartorune(&r, p);		/* count runes */
256 		}
257 		print("%s:%*s",file, nlen-i+1, "");
258 	}
259 	fname = file;
260 	if ((fd = open(file, OREAD)) < 0) {
261 		print("cannot open\n");
262 		return;
263 	}
264 	filetype(fd);
265 	close(fd);
266 }
267 
/*
 * Unicode 4.0 4-byte runes: a rune type wide enough for the
 * values chartorune1() produces, and the length in bytes of the
 * longest sequence it decodes.
 */
typedef int Rune1;

enum { UTFmax1 = 4 };
276 
/*
 * Report whether the n bytes at p are enough to hold the whole
 * sequence announced by the first byte: 1 byte below 0x80,
 * 2 below 0xE0, 3 below 0xF0, otherwise 4.  Only the first byte
 * is examined.  Returns 1 if so, 0 if not (or if n < 1).
 */
int
fullrune1(char *p, int n)
{
	unsigned char lead;

	if(n < 1)
		return 0;
	lead = *(unsigned char*)p;
	if(lead < 0x80)
		return 1;
	if(lead < 0xE0)
		return n >= 2;
	if(lead < 0xF0)
		return n >= 3;
	return n >= 4;
}
295 
296 int
297 chartorune1(Rune1 *rune, char *str)
298 {
299 	int c, c1, c2, c3, n;
300 	Rune r;
301 
302 	c = *(uchar*)str;
303 	if(c < 0xF0){
304 		r = 0;
305 		n = chartorune(&r, str);
306 		*rune = r;
307 		return n;
308 	}
309 	c &= ~0xF0;
310 	c1 = *(uchar*)(str+1) & ~0x80;
311 	c2 = *(uchar*)(str+2) & ~0x80;
312 	c3 = *(uchar*)(str+3) & ~0x80;
313 	n = (c<<18) | (c1<<12) | (c2<<6) | c3;
314 	if(n < 0x10000 || n > 0x10FFFF){
315 		*rune = Runeerror;
316 		return 1;
317 	}
318 	*rune = n;
319 	return 4;
320 }
321 
322 void
323 filetype(int fd)
324 {
325 	Rune1 r;
326 	int i, f, n;
327 	char *p, *eob;
328 
329 	free(mbuf);
330 	mbuf = dirfstat(fd);
331 	if(mbuf == nil){
332 		print("cannot stat: %r\n");
333 		return;
334 	}
335 	if(mbuf->mode & DMDIR) {
336 		print(mime ? "text/directory\n" : "directory\n");
337 		return;
338 	}
339 	if(mbuf->type != 'M' && mbuf->type != '|') {
340 		print(mime ? OCTET : "special file #%c/%s\n",
341 			mbuf->type, mbuf->name);
342 		return;
343 	}
344 	nbuf = read(fd, buf, sizeof(buf)-1);
345 
346 	if(nbuf < 0) {
347 		print("cannot read\n");
348 		return;
349 	}
350 	if(nbuf == 0) {
351 		print(mime ? PLAIN : "empty file\n");
352 		return;
353 	}
354 	buf[nbuf] = 0;
355 
356 	/*
357 	 * build histogram table
358 	 */
359 	memset(cfreq, 0, sizeof(cfreq));
360 	for (i = 0; language[i].name; i++)
361 		language[i].count = 0;
362 	eob = (char *)buf+nbuf;
363 	for(n = 0, p = (char *)buf; p < eob; n++) {
364 		if (!fullrune1(p, eob-p) && eob-p < UTFmax1)
365 			break;
366 		p += chartorune1(&r, p);
367 		if (r == 0)
368 			f = Cnull;
369 		else if (r <= 0x7f) {
370 			if (!isprint(r) && !isspace(r))
371 				f = Ceascii;	/* ASCII control char */
372 			else f = r;
373 		} else if (r == 0x80) {
374 			bump_utf_count(r);
375 			f = Cutf;
376 		} else if (r < 0xA0)
377 			f = Cbinary;	/* Invalid Runes */
378 		else if (r <= 0xff)
379 			f = Clatin;	/* Latin 1 */
380 		else {
381 			bump_utf_count(r);
382 			f = Cutf;		/* UTF extension */
383 		}
384 		cfreq[f]++;			/* ASCII chars peg directly */
385 	}
386 	/*
387 	 * gross classify
388 	 */
389 	if (cfreq[Cbinary])
390 		guess = Fbinary;
391 	else if (cfreq[Cutf])
392 		guess = Futf;
393 	else if (cfreq[Clatin])
394 		guess = Flatin;
395 	else if (cfreq[Ceascii])
396 		guess = Feascii;
397 	else if (cfreq[Cnull])
398 		guess = Fbinary;
399 	else
400 		guess = Fascii;
401 	/*
402 	 * lookup dictionary words
403 	 */
404 	memset(wfreq, 0, sizeof(wfreq));
405 	if(guess == Fascii || guess == Flatin || guess == Futf)
406 		wordfreq();
407 	/*
408 	 * call individual classify routines
409 	 */
410 	for(i=0; call[i]; i++)
411 		if((*call[i])())
412 			return;
413 
414 	/*
415 	 * if all else fails,
416 	 * print out gross classification
417 	 */
418 	if (nbuf < 100 && !mime)
419 		print(mime ? PLAIN : "short ");
420 	if (guess == Fascii)
421 		print(mime ? PLAIN : "Ascii\n");
422 	else if (guess == Feascii)
423 		print(mime ? PLAIN : "extended ascii\n");
424 	else if (guess == Flatin)
425 		print(mime ? PLAIN : "latin ascii\n");
426 	else if (guess == Futf && utf_count() < 4)
427 		print_utf();
428 	else print(mime ? OCTET : "binary\n");
429 }
430 
431 void
432 bump_utf_count(Rune r)
433 {
434 	int low, high, mid;
435 
436 	high = sizeof(language)/sizeof(language[0])-1;
437 	for (low = 0; low < high;) {
438 		mid = (low+high)/2;
439 		if (r >= language[mid].low) {
440 			if (r <= language[mid].high) {
441 				language[mid].count++;
442 				break;
443 			} else low = mid+1;
444 		} else high = mid;
445 	}
446 }
447 
448 int
449 utf_count(void)
450 {
451 	int i, count;
452 
453 	count = 0;
454 	for (i = 0; language[i].name; i++)
455 		if (language[i].count > 0)
456 			switch (language[i].mode) {
457 			case Normal:
458 			case First:
459 				count++;
460 				break;
461 			default:
462 				break;
463 			}
464 	return count;
465 }
466 
467 int
468 chkascii(void)
469 {
470 	int i;
471 
472 	for (i = 'a'; i < 'z'; i++)
473 		if (cfreq[i])
474 			return 1;
475 	for (i = 'A'; i < 'Z'; i++)
476 		if (cfreq[i])
477 			return 1;
478 	return 0;
479 }
480 
481 int
482 find_first(char *name)
483 {
484 	int i;
485 
486 	for (i = 0; language[i].name != 0; i++)
487 		if (language[i].mode == First
488 			&& strcmp(language[i].name, name) == 0)
489 			return i;
490 	return -1;
491 }
492 
493 void
494 print_utf(void)
495 {
496 	int i, printed, j;
497 
498 	if(mime){
499 		print(PLAIN);
500 		return;
501 	}
502 	if (chkascii()) {
503 		printed = 1;
504 		print("Ascii");
505 	} else
506 		printed = 0;
507 	for (i = 0; language[i].name; i++)
508 		if (language[i].count) {
509 			switch(language[i].mode) {
510 			case Multi:
511 				j = find_first(language[i].name);
512 				if (j < 0)
513 					break;
514 				if (language[j].count > 0)
515 					break;
516 				/* Fall through */
517 			case Normal:
518 			case First:
519 				if (printed)
520 					print(" & ");
521 				else printed = 1;
522 				print("%s", language[i].name);
523 				break;
524 			case Shared:
525 			default:
526 				break;
527 			}
528 		}
529 	if(!printed)
530 		print("UTF");
531 	print(" text\n");
532 }
533 
534 void
535 wordfreq(void)
536 {
537 	int low, high, mid, r;
538 	uchar *p, *p2, c;
539 
540 	p = buf;
541 	for(;;) {
542 		while (p < buf+nbuf && !isalpha(*p))
543 			p++;
544 		if (p >= buf+nbuf)
545 			return;
546 		p2 = p;
547 		while(p < buf+nbuf && isalpha(*p))
548 			p++;
549 		c = *p;
550 		*p = 0;
551 		high = sizeof(dict)/sizeof(dict[0]);
552 		for(low = 0;low < high;) {
553 			mid = (low+high)/2;
554 			r = strcmp(dict[mid].word, (char*)p2);
555 			if(r == 0) {
556 				wfreq[dict[mid].class]++;
557 				break;
558 			}
559 			if(r < 0)
560 				low = mid+1;
561 			else
562 				high = mid;
563 		}
564 		*p++ = c;
565 	}
566 }
567 
568 typedef struct Filemagic Filemagic;
569 struct Filemagic {
570 	ulong x;
571 	ulong mask;
572 	char *desc;
573 	char *mime;
574 };
575 
576 /*
577  * integers in this table must be as seen on a little-endian machine
578  * when read from a file.
579  */
580 Filemagic long0tab[] = {
581 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
582 	/* "pac1" */
583 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
584 	/* "pXc2 */
585 	0x32630070,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
586 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
587 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
588 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip",
589 	070707,		0xFFFF,		"cpio archive\n", OCTET,
590 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
591 	0xfaff,		0xfeff,		"mp3 audio\n",	"audio/mpeg",
592 	0xfeff0000,	0xffffffff,	"utf-32be\n",	"text/plain charset=utf-32be",
593 	0xfffe,		0xffffffff,	"utf-32le\n",	"text/plain charset=utf-32le",
594 	0xfeff,		0xffff,		"utf-16be\n",	"text/plain charset=utf-16be",
595 	0xfffe,		0xffff,		"utf-16le\n",	"text/plain charset=utf-16le",
596 	/*
597 	 * venti & fossil magic numbers are stored big-endian on disk,
598 	 * thus the numbers appear reversed in this table.
599 	 */
600 	0xad4e5cd1,	0xFFFFFFFF,	"venti arena\n", OCTET,
601 };
602 
603 int
604 filemagic(Filemagic *tab, int ntab, ulong x)
605 {
606 	int i;
607 
608 	for(i=0; i<ntab; i++)
609 		if((x&tab[i].mask) == tab[i].x){
610 			print(mime ? tab[i].mime : tab[i].desc);
611 			return 1;
612 		}
613 	return 0;
614 }
615 
616 int
617 long0(void)
618 {
619 	return filemagic(long0tab, nelem(long0tab), LENDIAN(buf));
620 }
621 
622 typedef struct Fileoffmag Fileoffmag;
623 struct Fileoffmag {
624 	ulong	off;
625 	Filemagic;
626 };
627 
628 /*
629  * integers in this table must be as seen on a little-endian machine
630  * when read from a file.
631  */
632 Fileoffmag longofftab[] = {
633 	/*
634 	 * venti & fossil magic numbers are stored big-endian on disk,
635 	 * thus the numbers appear reversed in this table.
636 	 */
637 	256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET,
638 	256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET,
639 	128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET,
640 };
641 
642 int
643 fileoffmagic(Fileoffmag *tab, int ntab)
644 {
645 	int i;
646 	ulong x;
647 	Fileoffmag *tp;
648 	uchar buf[sizeof(long)];
649 
650 	for(i=0; i<ntab; i++) {
651 		tp = tab + i;
652 		seek(fd, tp->off, 0);
653 		if (read(fd, buf, sizeof buf) != sizeof buf)
654 			continue;
655 		x = LENDIAN(buf);
656 		if((x&tp->mask) == tp->x){
657 			print(mime? tp->mime: tp->desc);
658 			return 1;
659 		}
660 	}
661 	return 0;
662 }
663 
664 int
665 longoff(void)
666 {
667 	return fileoffmagic(longofftab, nelem(longofftab));
668 }
669 
670 int
671 isexec(void)
672 {
673 	Fhdr f;
674 
675 	seek(fd, 0, 0);		/* reposition to start of file */
676 	if(crackhdr(fd, &f)) {
677 		print(mime ? OCTET : "%s\n", f.name);
678 		return 1;
679 	}
680 	return 0;
681 }
682 
683 
/* from tar.c */
enum
{
	NAMSIZ	= 100,	/* size of the name fields */
	TBLOCK	= 512,	/* size of a tar block */
};

/* a tar header, viewed either as raw bytes or as fields */
union	hblock
{
	char	dummy[TBLOCK];
	struct	header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];	/* octal text; read by istar with strtol */
		char	linkflag;
		char	linkname[NAMSIZ];
		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];  /* if non-null, path = prefix "/" name */
	} dbuf;
};
711 
712 int
713 checksum(union hblock *hp)
714 {
715 	int i;
716 	char *cp;
717 	struct header *hdr = &hp->dbuf;
718 
719 	for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
720 		*cp = ' ';
721 	i = 0;
722 	for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
723 		i += *cp & 0xff;
724 	return i;
725 }
726 
727 int
728 istar(void)
729 {
730 	int chksum;
731 	char tblock[TBLOCK];
732 	union hblock *hp = (union hblock *)tblock;
733 	struct header *hdr = &hp->dbuf;
734 
735 	seek(fd, 0, 0);		/* reposition to start of file */
736 	if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
737 		return 0;
738 	chksum = strtol(hdr->chksum, 0, 8);
739 	if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
740 		if (strcmp(hdr->magic, "ustar") == 0)
741 			print(mime? "application/x-ustar\n":
742 				"posix tar archive\n");
743 		else
744 			print(mime? "application/x-tar\n": "tar archive\n");
745 		return 1;
746 	}
747 	return 0;
748 }
749 
/*
 * initial words to classify file: a file matches an entry when
 * its first 'length' bytes equal the first 'length' bytes of key.
 * Entries are tried in order; a nil key ends the table.
 */
struct	FILE_STRING
{
	char 	*key;
	char	*filetype;	/* description, printed with a newline added */
	int	length;		/* number of bytes of key compared */
	char	*mime;		/* MIME type, printed with a newline added */
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "#!/bin/rc",		"rc executable file",		9,	"text/plain" },
	{ "#!/bin/sh",		"sh executable file",		9,	"text/plain" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ "ID3",		"mp3 audio with id3",		3,	"audio/mpeg" },
	{ "\211PNG",		"PNG image",			4,	"image/png" },
	{ "P3\n",		"ppm",				3,	"image/ppm" },
	{ "P6\n",		"ppm",				3,	"image/ppm" },
	{ "/* XPM */\n",	"xbm",				10,	"image/xbm" },
	{ ".HTML ",		"troff -ms input",		6,	"text/troff" },
	{ ".LP",		"troff -ms input",		3,	"text/troff" },
	{ ".ND",		"troff -ms input",		3,	"text/troff" },
	{ ".PP",		"troff -ms input",		3,	"text/troff" },
	{ ".TL",		"troff -ms input",		3,	"text/troff" },
	{ ".TR",		"troff -ms input",		3,	"text/troff" },
	{ ".TH",		"manual page",			3,	"text/troff" },
	{ ".\\\"",		"troff input",			3,	"text/troff" },
	{ ".de",		"troff input",			3,	"text/troff" },
	{ ".if",		"troff input",			3,	"text/troff" },
	{ ".nr",		"troff input",			3,	"text/troff" },
	{ ".tr",		"troff input",			3,	"text/troff" },
	{ "vac:",		"venti score",			4,	"text/plain" },
	{ 0, 0, 0, 0 }
};
807 
808 int
809 istring(void)
810 {
811 	int i;
812 	struct FILE_STRING *p;
813 
814 	for(p = file_string; p->key; p++) {
815 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
816 			if(mime)
817 				print("%s\n", p->mime);
818 			else
819 				print("%s\n", p->filetype);
820 			return 1;
821 		}
822 	}
823 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
824 		for(i = 5; i < nbuf; i++)
825 			if(buf[i] == '\n')
826 				break;
827 		if(mime)
828 			print(OCTET);
829 		else
830 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
831 		return 1;
832 	}
833 	return 0;
834 }
835 
836 struct offstr
837 {
838 	ulong	off;
839 	struct FILE_STRING;
840 } offstrs[] = {
841 	32*1024, "\001CD001\001",	"ISO9660 CD image",	7,	OCTET,
842 	0, 0, 0, 0, 0
843 };
844 
845 int
846 isoffstr(void)
847 {
848 	int n;
849 	char buf[256];
850 	struct offstr *p;
851 
852 	for(p = offstrs; p->key; p++) {
853 		seek(fd, p->off, 0);
854 		n = p->length;
855 		if (n > sizeof buf)
856 			n = sizeof buf;
857 		if (read(fd, buf, n) != n)
858 			continue;
859 		if(memcmp(buf, p->key, n) == 0) {
860 			if(mime)
861 				print("%s\n", p->mime);
862 			else
863 				print("%s\n", p->filetype);
864 			return 1;
865 		}
866 	}
867 	return 0;
868 }
869 
870 int
871 iff(void)
872 {
873 	if (strncmp((char*)buf, "FORM", 4) == 0 &&
874 	    strncmp((char*)buf+8, "AIFF", 4) == 0) {
875 		print("%s\n", mime? "audio/x-aiff": "aiff audio");
876 		return 1;
877 	}
878 	return 0;
879 }
880 
/* tag names ishtml() counts; 0 ends the list */
char*	html_string[] =
{
	"title", "body", "head",
	"strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl",
	"br", "em",
	0,
};
900 
901 int
902 ishtml(void)
903 {
904 	uchar *p, *q;
905 	int i, count;
906 
907 		/* compare strings between '<' and '>' to html table */
908 	count = 0;
909 	p = buf;
910 	for(;;) {
911 		while (p < buf+nbuf && *p != '<')
912 			p++;
913 		p++;
914 		if (p >= buf+nbuf)
915 			break;
916 		if(*p == '/')
917 			p++;
918 		q = p;
919 		while(p < buf+nbuf && *p != '>')
920 			p++;
921 		if (p >= buf+nbuf)
922 			break;
923 		for(i = 0; html_string[i]; i++) {
924 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
925 				if(count++ > 4) {
926 					print(mime ? "text/html\n" : "HTML file\n");
927 					return 1;
928 				}
929 				break;
930 			}
931 		}
932 		p++;
933 	}
934 	return 0;
935 }
936 
/* header names isrfc822() counts, matched without regard to case; 0 ends the list */
char*	rfc822_string[] =
{
	"from:", "date:", "to:", "subject:",
	"received:", "reply to:", "sender:",
	0,
};
948 
949 int
950 isrfc822(void)
951 {
952 
953 	char *p, *q, *r;
954 	int i, count;
955 
956 	count = 0;
957 	p = (char*)buf;
958 	for(;;) {
959 		q = strchr(p, '\n');
960 		if(q == nil)
961 			break;
962 		*q = 0;
963 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
964 			count++;
965 			*q = '\n';
966 			p = q+1;
967 			continue;
968 		}
969 		*q = '\n';
970 		if(*p != '\t' && *p != ' '){
971 			r = strchr(p, ':');
972 			if(r == 0 || r > q)
973 				break;
974 			for(i = 0; rfc822_string[i]; i++) {
975 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
976 					count++;
977 					break;
978 				}
979 			}
980 		}
981 		p = q+1;
982 	}
983 	if(count >= 3){
984 		print(mime ? "message/rfc822\n" : "email file\n");
985 		return 1;
986 	}
987 	return 0;
988 }
989 
990 int
991 ismbox(void)
992 {
993 	char *p, *q;
994 
995 	p = (char*)buf;
996 	q = strchr(p, '\n');
997 	if(q == nil)
998 		return 0;
999 	*q = 0;
1000 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
1001 		print(mime ? "text/plain\n" : "mail box\n");
1002 		return 1;
1003 	}
1004 	*q = '\n';
1005 	return 0;
1006 }
1007 
1008 int
1009 iscint(void)
1010 {
1011 	int type;
1012 	char *name;
1013 	Biobuf b;
1014 
1015 	if(Binit(&b, fd, OREAD) == Beof)
1016 		return 0;
1017 	seek(fd, 0, 0);
1018 	type = objtype(&b, &name);
1019 	if(type < 0)
1020 		return 0;
1021 	if(mime)
1022 		print(OCTET);
1023 	else
1024 		print("%s intermediate\n", name);
1025 	return 1;
1026 }
1027 
1028 int
1029 isc(void)
1030 {
1031 	int n;
1032 
1033 	n = wfreq[I1];
1034 	/*
1035 	 * includes
1036 	 */
1037 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1038 		goto yes;
1039 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
1040 		goto yes;
1041 	/*
1042 	 * declarations
1043 	 */
1044 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
1045 		goto yes;
1046 	/*
1047 	 * assignments
1048 	 */
1049 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
1050 		goto yes;
1051 	return 0;
1052 
1053 yes:
1054 	if(mime){
1055 		print(PLAIN);
1056 		return 1;
1057 	}
1058 	if(wfreq[Alword] > 0)
1059 		print("alef program\n");
1060 	else
1061 		print("c program\n");
1062 	return 1;
1063 }
1064 
1065 int
1066 islimbo(void)
1067 {
1068 
1069 	/*
1070 	 * includes
1071 	 */
1072 	if(wfreq[Lword] < 4)
1073 		return 0;
1074 	print(mime ? PLAIN : "limbo program\n");
1075 	return 1;
1076 }
1077 
1078 int
1079 isas(void)
1080 {
1081 
1082 	/*
1083 	 * includes
1084 	 */
1085 	if(wfreq[Aword] < 2)
1086 		return 0;
1087 	print(mime ? PLAIN : "as program\n");
1088 	return 1;
1089 }
1090 
1091 /*
1092  * low entropy means encrypted
1093  */
1094 int
1095 ismung(void)
1096 {
1097 	int i, bucket[8];
1098 	float cs;
1099 
1100 	if(nbuf < 64)
1101 		return 0;
1102 	memset(bucket, 0, sizeof(bucket));
1103 	for(i=nbuf-64; i<nbuf; i++)
1104 		bucket[(buf[i]>>5)&07] += 1;
1105 
1106 	cs = 0.;
1107 	for(i=0; i<8; i++)
1108 		cs += (bucket[i]-8)*(bucket[i]-8);
1109 	cs /= 8.;
1110 	if(cs <= 24.322) {
1111 		if(buf[0]==0x1f && buf[1]==0x9d)
1112 			print(mime ? OCTET : "compressed\n");
1113 		else
1114 		if(buf[0]==0x1f && buf[1]==0x8b)
1115 			print(mime ? OCTET : "gzip compressed\n");
1116 		else
1117 		if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h')
1118 			print(mime ? OCTET : "bzip2 compressed\n");
1119 		else
1120 			print(mime ? OCTET : "encrypted\n");
1121 		return 1;
1122 	}
1123 	return 0;
1124 }
1125 
1126 /*
1127  * english by punctuation and frequencies
1128  */
1129 int
1130 isenglish(void)
1131 {
1132 	int vow, comm, rare, badpun, punct;
1133 	char *p;
1134 
1135 	if(guess != Fascii && guess != Feascii)
1136 		return 0;
1137 	badpun = 0;
1138 	punct = 0;
1139 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
1140 		switch(*p) {
1141 		case '.':
1142 		case ',':
1143 		case ')':
1144 		case '%':
1145 		case ';':
1146 		case ':':
1147 		case '?':
1148 			punct++;
1149 			if(p[1] != ' ' && p[1] != '\n')
1150 				badpun++;
1151 		}
1152 	if(badpun*5 > punct)
1153 		return 0;
1154 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
1155 		return 0;
1156 	if(2*cfreq[';'] > cfreq['e'])
1157 		return 0;
1158 
1159 	vow = 0;
1160 	for(p="AEIOU"; *p; p++) {
1161 		vow += cfreq[*p];
1162 		vow += cfreq[tolower(*p)];
1163 	}
1164 	comm = 0;
1165 	for(p="ETAION"; *p; p++) {
1166 		comm += cfreq[*p];
1167 		comm += cfreq[tolower(*p)];
1168 	}
1169 	rare = 0;
1170 	for(p="VJKQXZ"; *p; p++) {
1171 		rare += cfreq[*p];
1172 		rare += cfreq[tolower(*p)];
1173 	}
1174 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
1175 		print(mime ? PLAIN : "English text\n");
1176 		return 1;
1177 	}
1178 	return 0;
1179 }
1180 
1181 /*
1182  * pick up a number with
1183  * syntax _*[0-9]+_
1184  */
1185 #define	P9BITLEN	12
1186 int
1187 p9bitnum(uchar *bp)
1188 {
1189 	int n, c, len;
1190 
1191 	len = P9BITLEN;
1192 	while(*bp == ' ') {
1193 		bp++;
1194 		len--;
1195 		if(len <= 0)
1196 			return -1;
1197 	}
1198 	n = 0;
1199 	while(len > 1) {
1200 		c = *bp++;
1201 		if(!isdigit(c))
1202 			return -1;
1203 		n = n*10 + c-'0';
1204 		len--;
1205 	}
1206 	if(*bp != ' ')
1207 		return -1;
1208 	return n;
1209 }
1210 
/*
 * Work out the pixel depth from the first 12-byte field of an
 * image header.
 *
 * s	start of the field
 * newp	set to 0 when the field is a number (old format) and to
 *	1 when it is a channel string such as r8g8b8
 *
 * Old format: the number n gives a depth of 1<<n.
 * New format: the depth is the sum of the numbers that follow
 * each channel letter.
 *
 * Returns the depth, always greater than zero, or -1 when the
 * field is blank, when the shift count is outside 0..30, when
 * the summed depth is not positive, or when the depth neither
 * divides 8 nor is a multiple of 8.  Never returning 0 matters:
 * the caller divides by the depth.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	long ld;
	int d;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtol(s, 0, 0);
		if(ld < 0 || ld > 30)	/* shift would overflow an int */
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' ' && *s!='\0'){
		s++;			/* skip letter */
		d += strtoul(s, &s, 10);
	}

	/* a string with no digits sums to 0, and 0 % 8 == 0 */
	if(d <= 0)
		return -1;
	if(d % 8 == 0 || 8 % d == 0)
		return d;
	return -1;
}
1238 
1239 int
1240 isp9bit(void)
1241 {
1242 	int dep, lox, loy, hix, hiy, px, new, cmpr;
1243 	ulong t;
1244 	long len;
1245 	char *newlabel;
1246 	uchar *cp;
1247 
1248 	cp = buf;
1249 	cmpr = 0;
1250 	newlabel = "old ";
1251 
1252 	if(memcmp(cp, "compressed\n", 11) == 0) {
1253 		cmpr = 1;
1254 		cp = buf + 11;
1255 	}
1256 
1257 	dep = depthof((char*)cp + 0*P9BITLEN, &new);
1258 	if(new)
1259 		newlabel = "";
1260 	lox = p9bitnum(cp + 1*P9BITLEN);
1261 	loy = p9bitnum(cp + 2*P9BITLEN);
1262 	hix = p9bitnum(cp + 3*P9BITLEN);
1263 	hiy = p9bitnum(cp + 4*P9BITLEN);
1264 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1265 		return 0;
1266 
1267 	if(dep < 8){
1268 		px = 8/dep;		/* pixels per byte */
1269 		/* set l to number of bytes of data per scan line */
1270 		if(lox >= 0)
1271 			len = (hix+px-1)/px - lox/px;
1272 		else{			/* make positive before divide */
1273 			t = (-lox)+px-1;
1274 			t = (t/px)*px;
1275 			len = (t+hix+px-1)/px;
1276 		}
1277 	}else
1278 		len = (hix-lox)*dep/8;
1279 	len *= hiy - loy;		/* col length */
1280 	len += 5 * P9BITLEN;		/* size of initial ascii */
1281 
1282 	/*
1283 	 * for compressed images, don't look any further. otherwise:
1284 	 * for image file, length is non-zero and must match calculation above
1285 	 * for /dev/window and /dev/screen the length is always zero
1286 	 * for subfont, the subfont header should follow immediately.
1287 	 */
1288 	if (cmpr) {
1289 		print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n",
1290 			newlabel, dep);
1291 		return 1;
1292 	}
1293 	if (len != 0 && mbuf->length == 0) {
1294 		print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep);
1295 		return 1;
1296 	}
1297 	if (mbuf->length == len) {
1298 		print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep);
1299 		return 1;
1300 	}
1301 	/* Ghostscript sometimes produces a little extra on the end */
1302 	if (mbuf->length < len+P9BITLEN) {
1303 		print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep);
1304 		return 1;
1305 	}
1306 	if (p9subfont(buf+len)) {
1307 		print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep);
1308 		return 1;
1309 	}
1310 	return 0;
1311 }
1312 
1313 int
1314 p9subfont(uchar *p)
1315 {
1316 	int n, h, a;
1317 
1318 	/* if image too big, assume it's a subfont */
1319 	if (p+3*P9BITLEN > buf+sizeof(buf))
1320 		return 1;
1321 
1322 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1323 	if (n < 0)
1324 		return 0;
1325 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1326 	if (h < 0)
1327 		return 0;
1328 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1329 	if (a < 0)
1330 		return 0;
1331 	return 1;
1332 }
1333 
1334 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1335 
1336 int
1337 isp9font(void)
1338 {
1339 	uchar *cp, *p;
1340 	int i, n;
1341 	char pathname[1024];
1342 
1343 	cp = buf;
1344 	if (!getfontnum(cp, &cp))	/* height */
1345 		return 0;
1346 	if (!getfontnum(cp, &cp))	/* ascent */
1347 		return 0;
1348 	for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) {
1349 		if (!getfontnum(cp, &cp))	/* min */
1350 			break;
1351 		if (!getfontnum(cp, &cp))	/* max */
1352 			return 0;
1353 		getfontnum(cp, &cp);	/* optional offset */
1354 		while (WHITESPACE(*cp))
1355 			cp++;
1356 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1357 				;
1358 			/* construct a path name, if needed */
1359 		n = 0;
1360 		if (*p != '/' && slash) {
1361 			n = slash-fname+1;
1362 			if (n < sizeof(pathname))
1363 				memcpy(pathname, fname, n);
1364 			else n = 0;
1365 		}
1366 		if (n+cp-p+4 < sizeof(pathname)) {
1367 			memcpy(pathname+n, p, cp-p);
1368 			n += cp-p;
1369 			pathname[n] = 0;
1370 			if (access(pathname, AEXIST) < 0) {
1371 				strcpy(pathname+n, ".0");
1372 				if (access(pathname, AEXIST) < 0)
1373 					return 0;
1374 			}
1375 		}
1376 	}
1377 	if (i) {
1378 		print(mime ? "text/plain\n" : "font file\n");
1379 		return 1;
1380 	}
1381 	return 0;
1382 }
1383 
1384 int
1385 getfontnum(uchar *cp, uchar **rp)
1386 {
1387 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1388 		cp++;
1389 	if (*cp < '0' || *cp > '9')
1390 		return 0;
1391 	strtoul((char *)cp, (char **)rp, 0);
1392 	if (!WHITESPACE(**rp)) {
1393 		*rp = cp;
1394 		return 0;
1395 	}
1396 	return 1;
1397 }
1398 
1399 int
1400 isrtf(void)
1401 {
1402 	if(strstr((char *)buf, "\\rtf1")){
1403 		print(mime ? "application/rtf\n" : "rich text format\n");
1404 		return 1;
1405 	}
1406 	return 0;
1407 }
1408 
1409 int
1410 ismsdos(void)
1411 {
1412 	if (buf[0] == 0x4d && buf[1] == 0x5a){
1413 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1414 		return 1;
1415 	}
1416 	return 0;
1417 }
1418 
1419 int
1420 iself(void)
1421 {
1422 	static char *cpu[] = {		/* NB: incomplete and arbitary list */
1423 	[1]	"WE32100",
1424 	[2]	"SPARC",
1425 	[3]	"i386",
1426 	[4]	"M68000",
1427 	[5]	"M88000",
1428 	[6]	"i486",
1429 	[7]	"i860",
1430 	[8]	"R3000",
1431 	[9]	"S370",
1432 	[10]	"R4000",
1433 	[15]	"HP-PA",
1434 	[18]	"sparc v8+",
1435 	[19]	"i960",
1436 	[20]	"PPC-32",
1437 	[21]	"PPC-64",
1438 	[40]	"ARM",
1439 	[41]	"Alpha",
1440 	[43]	"sparc v9",
1441 	[50]	"IA-64",
1442 	[62]	"AMD64",
1443 	[75]	"VAX",
1444 	};
1445 	static char *type[] = {
1446 	[1]	"relocatable object",
1447 	[2]	"executable",
1448 	[3]	"shared library",
1449 	[4]	"core dump",
1450 	};
1451 
1452 	if (memcmp(buf, "\x7fELF", 4) == 0){
1453 		if (!mime){
1454 			int isdifend = 0;
1455 			int n = (buf[19] << 8) | buf[18];
1456 			char *p = "unknown";
1457 			char *t = "unknown";
1458 
1459 			if (n > 0 && n < nelem(cpu) && cpu[n])
1460 				p = cpu[n];
1461 			else {
1462 				/* try the other byte order */
1463 				isdifend = 1;
1464 				n = (buf[18] << 8) | buf[19];
1465 				if (n > 0 && n < nelem(cpu) && cpu[n])
1466 					p = cpu[n];
1467 			}
1468 			if(isdifend)
1469 				n = (buf[16]<< 8) | buf[17];
1470 			else
1471 				n = (buf[17]<< 8) | buf[16];
1472 
1473 			if(n>0 && n < nelem(type) && type[n])
1474 				t = type[n];
1475 			print("%s ELF %s\n", p, t);
1476 		}
1477 		else
1478 			print("application/x-elf-executable");
1479 		return 1;
1480 	}
1481 
1482 	return 0;
1483 }
1484 
1485 int
1486 isface(void)
1487 {
1488 	int i, j, ldepth, l;
1489 	char *p;
1490 
1491 	ldepth = -1;
1492 	for(j = 0; j < 3; j++){
1493 		for(p = (char*)buf, i=0; i<3; i++){
1494 			if(p[0] != '0' || p[1] != 'x')
1495 				return 0;
1496 			if(buf[2+8] == ',')
1497 				l = 2;
1498 			else if(buf[2+4] == ',')
1499 				l = 1;
1500 			else
1501 				return 0;
1502 			if(ldepth == -1)
1503 				ldepth = l;
1504 			if(l != ldepth)
1505 				return 0;
1506 			strtoul(p, &p, 16);
1507 			if(*p++ != ',')
1508 				return 0;
1509 			while(*p == ' ' || *p == '\t')
1510 				p++;
1511 		}
1512 		if (*p++ != '\n')
1513 			return 0;
1514 	}
1515 
1516 	if(mime)
1517 		print("application/x-face\n");
1518 	else
1519 		print("face image depth %d\n", ldepth);
1520 	return 1;
1521 }
1522 
1523