xref: /plan9/sys/src/cmd/file.c (revision 282e677fa45fb578cdb8bc2c412ac084c367776e)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/*
 * assemble a little-endian 32-bit value from the four bytes at p.
 * each byte is widened to unsigned long before shifting so that a
 * set high bit in (p)[3] is never shifted into the sign of an int.
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | ((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11 
/*
 * state shared between filetype() and the classify routines
 */
uchar	buf[6001];	/* leading bytes of the file; filetype() reads at most sizeof(buf)-1 and NUL-terminates */
short	cfreq[140];	/* character histogram: ASCII codes index directly, other classes use Clatin..Cutf */
short	wfreq[50];	/* counts of dictionary words seen, indexed by word class */
int	nbuf;		/* number of bytes read into buf */
Dir*	mbuf;		/* dirfstat() result for the file being classified */
int	fd;		/* descriptor opened by type() for the current file */
char 	*fname;		/* name of the current file, set by type() */
char	*slash;		/* rightmost '/' in the name scanned by type(), or 0 */
20 
/*
 * Cword..I3 are word classes and index wfreq;
 * Clatin..Cutf are character classes and index cfreq
 * (values below 128 in cfreq are the ASCII codes themselves).
 */
enum
{
	Cword,		/* counted by isc() */
	Fword,		/* not consulted by any classifier in this file; presumably Fortran keywords */
	Aword,		/* counted by isas() */
	Alword,		/* makes isc() report an alef program */
	Lword,		/* counted by islimbo() */
	I1,		/* the word "include" */
	I2,		/* a header name such as "libc" */
	I3,		/* the word "h" */
	Clatin	= 128,
	Cbinary,
	Cnull,
	Ceascii,
	Cutf,
};

/*
 * words counted by wordfreq().
 * wordfreq() looks words up with a binary search using strcmp,
 * so the entries must stay in ascending strcmp order; "bio" used
 * to sit between "extern" and "float", where it could never be found.
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	"PATH",		Lword,
	"TEXT",		Aword,
	"adt",		Alword,
	"aggr",		Alword,
	"alef",		Alword,
	"array",	Lword,
	"bio",		I2,
	"block",	Fword,
	"chan",		Alword,
	"char",		Cword,
	"common",	Fword,
	"con",		Lword,
	"data",		Fword,
	"dimension",	Fword,
	"double",	Cword,
	"extern",	Cword,
	"float",	Cword,
	"fn",		Lword,
	"function",	Fword,
	"h",		I3,
	"implement",	Lword,
	"import",	Lword,
	"include",	I1,
	"int",		Cword,
	"integer",	Fword,
	"iota",		Lword,
	"libc",		I2,
	"long",		Cword,
	"module",	Lword,
	"real",		Fword,
	"ref",		Lword,
	"register",	Cword,
	"self",		Lword,
	"short",	Cword,
	"static",	Cword,
	"stdio",	I2,
	"struct",	Cword,
	"subroutine",	Fword,
	"u",		I2,
	"void",		Cword,
};
84 
/* values for the 'mode' field of a language entry */
enum
{
	Normal	= 0,
	First,		/* first of several ranges belonging to one language */
	Multi,		/* a later range of such a language */
	Shared,		/* range used by several languages */
};

/*
 * rune ranges and the language reported for each.
 * bump_utf_count() binary-searches on low/high, so the rows are
 * kept in ascending order; the row with name == 0 ends the table.
 */
struct
{
	int	mode;		/* Normal, First, Multi or Shared */
	int 	count;		/* runes seen in [low, high] for the current file */
	int	low;
	int	high;
	char	*name;
} language[] =
{
	{ Normal,	0,	0x0080,	0x0080,	"Extended Latin" },
	{ Normal,	0,	0x0100,	0x01FF,	"Extended Latin" },
	{ Normal,	0,	0x0370,	0x03FF,	"Greek" },
	{ Normal,	0,	0x0400,	0x04FF,	"Cyrillic" },
	{ Normal,	0,	0x0530,	0x058F,	"Armenian" },
	{ Normal,	0,	0x0590,	0x05FF,	"Hebrew" },
	{ Normal,	0,	0x0600,	0x06FF,	"Arabic" },
	{ Normal,	0,	0x0900,	0x097F,	"Devanagari" },
	{ Normal,	0,	0x0980,	0x09FF,	"Bengali" },
	{ Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi" },
	{ Normal,	0,	0x0A80,	0x0AFF,	"Gujarati" },
	{ Normal,	0,	0x0B00,	0x0B7F,	"Oriya" },
	{ Normal,	0,	0x0B80,	0x0BFF,	"Tamil" },
	{ Normal,	0,	0x0C00,	0x0C7F,	"Telugu" },
	{ Normal,	0,	0x0C80,	0x0CFF,	"Kannada" },
	{ Normal,	0,	0x0D00,	0x0D7F,	"Malayalam" },
	{ Normal,	0,	0x0E00,	0x0E7F,	"Thai" },
	{ Normal,	0,	0x0E80,	0x0EFF,	"Lao" },
	{ Normal,	0,	0x1000,	0x105F,	"Tibetan" },
	{ Normal,	0,	0x10A0,	0x10FF,	"Georgian" },
	{ Normal,	0,	0x3040,	0x30FF,	"Japanese" },
	{ Normal,	0,	0x3100,	0x312F,	"Chinese" },
	{ First,	0,	0x3130,	0x318F,	"Korean" },
	{ Multi,	0,	0x3400,	0x3D2F,	"Korean" },
	{ Shared,	0,	0x4e00,	0x9fff,	"CJK" },
	{ Normal,	0,	0,	0,	0 },		/* terminal entry */
};
130 
131 
/*
 * gross classification of the current file, chosen by filetype()
 */
enum
{
	Fascii	= 0,	/* printable ASCII */
	Flatin,		/* Latin-1 */
	Futf,		/* UTF character set */
	Fbinary,	/* binary */
	Feascii,	/* ASCII with control characters */
	Fnull,		/* NUL bytes in file */
} guess;
141 
/*
 * forward declarations.
 * the is*() routines, long0() and istring() are classifiers: each
 * returns non-zero after printing a description of the file, or
 * zero if the file is not of its kind.
 */
void	bump_utf_count(Rune);
int	cistrncmp(char*, char*, int);
void	filetype(int);
int	getfontnum(uchar*, uchar**);
int	isas(void);
int	isc(void);
int	iscint(void);
int	isenglish(void);
int	ishp(void);
int	ishtml(void);
int	isrfc822(void);
int	ismbox(void);
int	islimbo(void);
int	ismung(void);
int	isp9bit(void);
int	isp9font(void);
int	isrtf(void);
int	ismsdos(void);
int	iself(void);
int	istring(void);
int	long0(void);
int	p9bitnum(uchar*);
int	p9subfont(uchar*);
void	print_utf(void);
void	type(char*, int);
int	utf_count(void);
void	wordfreq(void);
169 
/*
 * classifiers in the order filetype() tries them; the first
 * one to return non-zero ends the search, so order matters.
 * a zero entry terminates the list.
 */
int	(*call[])(void) =
{
	long0,		/* recognizable by first 4 bytes */
	istring,	/* recognizable by first string */
	isrfc822,	/* email file */
	ismbox,		/* mail box */
	ishtml,		/* html keywords */
	iscint,		/* compiler/assembler intermediate */
	islimbo,	/* limbo source */
	isc,		/* c & alef compiler key words */
	isas,		/* assembler key words */
	ismung,		/* entropy compressed/encrypted */
	isp9font,	/* plan 9 font */
	isp9bit,	/* plan 9 image (as from /dev/window) */
	isenglish,	/* char frequency English */
	isrtf,		/* rich text format */
	ismsdos,	/* msdos exe (virus file attachment) */
	iself,		/* ELF (foreign) executable */
	0
};
190 
int mime;	/* set by the -m flag: print MIME types instead of descriptions */

/* MIME answers used for unrecognized binary and text files; each ends in a newline */
#define OCTET	"application/octet-stream\n"
#define PLAIN	"text/plain\n"
195 
196 void
197 main(int argc, char *argv[])
198 {
199 	int i, j, maxlen;
200 	char *cp;
201 	Rune r;
202 
203 	ARGBEGIN{
204 	case 'm':
205 		mime = 1;
206 		break;
207 	default:
208 		fprint(2, "usage: file [-m] [file...]\n");
209 		exits("usage");
210 	}ARGEND;
211 
212 	maxlen = 0;
213 	if(mime == 0 || argc > 1){
214 		for(i = 0; i < argc; i++) {
215 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
216 					;
217 			if(j > maxlen)
218 				maxlen = j;
219 		}
220 	}
221 	if (argc <= 0) {
222 		if(!mime)
223 			print ("stdin: ");
224 		filetype(0);
225 	}
226 	else {
227 		for(i = 0; i < argc; i++)
228 			type(argv[i], maxlen);
229 	}
230 	exits(0);
231 }
232 
233 void
234 type(char *file, int nlen)
235 {
236 	Rune r;
237 	int i;
238 	char *p;
239 
240 	if(nlen > 0){
241 		slash = 0;
242 		for (i = 0, p = file; *p; i++) {
243 			if (*p == '/')			/* find rightmost slash */
244 				slash = p;
245 			p += chartorune(&r, p);		/* count runes */
246 		}
247 		print("%s:%*s",file, nlen-i+1, "");
248 	}
249 	fname = file;
250 	if ((fd = open(file, OREAD)) < 0) {
251 		print("cannot open\n");
252 		return;
253 	}
254 	filetype(fd);
255 	close(fd);
256 }
257 
258 void
259 filetype(int fd)
260 {
261 	Rune r;
262 	int i, f, n;
263 	char *p, *eob;
264 
265 	free(mbuf);
266 	mbuf = dirfstat(fd);
267 	if(mbuf == nil){
268 		print("cannot stat: %r\n");
269 		return;
270 	}
271 	if(mbuf->mode & DMDIR) {
272 		print(mime ? "text/directory\n" : "directory\n");
273 		return;
274 	}
275 	if(mbuf->type != 'M' && mbuf->type != '|') {
276 		print(mime ? OCTET : "special file #%c/%s\n",
277 			mbuf->type, mbuf->name);
278 		return;
279 	}
280 	nbuf = read(fd, buf, sizeof(buf)-1);
281 
282 	if(nbuf < 0) {
283 		print("cannot read\n");
284 		return;
285 	}
286 	if(nbuf == 0) {
287 		print(mime ? PLAIN : "empty file\n");
288 		return;
289 	}
290 	buf[nbuf] = 0;
291 
292 	/*
293 	 * build histogram table
294 	 */
295 	memset(cfreq, 0, sizeof(cfreq));
296 	for (i = 0; language[i].name; i++)
297 		language[i].count = 0;
298 	eob = (char *)buf+nbuf;
299 	for(n = 0, p = (char *)buf; p < eob; n++) {
300 		if (!fullrune(p, eob-p) && eob-p < UTFmax)
301 			break;
302 		p += chartorune(&r, p);
303 		if (r == 0)
304 			f = Cnull;
305 		else if (r <= 0x7f) {
306 			if (!isprint(r) && !isspace(r))
307 				f = Ceascii;	/* ASCII control char */
308 			else f = r;
309 		} else if (r == 0x080) {
310 			bump_utf_count(r);
311 			f = Cutf;
312 		} else if (r < 0xA0)
313 				f = Cbinary;	/* Invalid Runes */
314 		else if (r <= 0xff)
315 				f = Clatin;	/* Latin 1 */
316 		else {
317 			bump_utf_count(r);
318 			f = Cutf;		/* UTF extension */
319 		}
320 		cfreq[f]++;			/* ASCII chars peg directly */
321 	}
322 	/*
323 	 * gross classify
324 	 */
325 	if (cfreq[Cbinary])
326 		guess = Fbinary;
327 	else if (cfreq[Cutf])
328 		guess = Futf;
329 	else if (cfreq[Clatin])
330 		guess = Flatin;
331 	else if (cfreq[Ceascii])
332 		guess = Feascii;
333 	else if (cfreq[Cnull] == n) {
334 		print(mime ? OCTET : "first block all null bytes\n");
335 		return;
336 	}
337 	else guess = Fascii;
338 	/*
339 	 * lookup dictionary words
340 	 */
341 	memset(wfreq, 0, sizeof(wfreq));
342 	if(guess == Fascii || guess == Flatin || guess == Futf)
343 		wordfreq();
344 	/*
345 	 * call individual classify routines
346 	 */
347 	for(i=0; call[i]; i++)
348 		if((*call[i])())
349 			return;
350 
351 	/*
352 	 * if all else fails,
353 	 * print out gross classification
354 	 */
355 	if (nbuf < 100 && !mime)
356 		print(mime ? PLAIN : "short ");
357 	if (guess == Fascii)
358 		print(mime ? PLAIN : "Ascii\n");
359 	else if (guess == Feascii)
360 		print(mime ? PLAIN : "extended ascii\n");
361 	else if (guess == Flatin)
362 		print(mime ? PLAIN : "latin ascii\n");
363 	else if (guess == Futf && utf_count() < 4)
364 		print_utf();
365 	else print(mime ? OCTET : "binary\n");
366 }
367 
368 void
369 bump_utf_count(Rune r)
370 {
371 	int low, high, mid;
372 
373 	high = sizeof(language)/sizeof(language[0])-1;
374 	for (low = 0; low < high;) {
375 		mid = (low+high)/2;
376 		if (r >=language[mid].low) {
377 			if (r <= language[mid].high) {
378 				language[mid].count++;
379 				break;
380 			} else low = mid+1;
381 		} else high = mid;
382 	}
383 }
384 
385 int
386 utf_count(void)
387 {
388 	int i, count;
389 
390 	count = 0;
391 	for (i = 0; language[i].name; i++)
392 		if (language[i].count > 0)
393 			switch (language[i].mode) {
394 			case Normal:
395 			case First:
396 				count++;
397 				break;
398 			default:
399 				break;
400 			}
401 	return count;
402 }
403 
404 int
405 chkascii(void)
406 {
407 	int i;
408 
409 	for (i = 'a'; i < 'z'; i++)
410 		if (cfreq[i])
411 			return 1;
412 	for (i = 'A'; i < 'Z'; i++)
413 		if (cfreq[i])
414 			return 1;
415 	return 0;
416 }
417 
418 int
419 find_first(char *name)
420 {
421 	int i;
422 
423 	for (i = 0; language[i].name != 0; i++)
424 		if (language[i].mode == First
425 			&& strcmp(language[i].name, name) == 0)
426 			return i;
427 	return -1;
428 }
429 
430 void
431 print_utf(void)
432 {
433 	int i, printed, j;
434 
435 	if(mime){
436 		print(PLAIN);
437 		return;
438 	}
439 	if (chkascii()) {
440 		printed = 1;
441 		print("Ascii");
442 	} else
443 		printed = 0;
444 	for (i = 0; language[i].name; i++)
445 		if (language[i].count) {
446 			switch(language[i].mode) {
447 			case Multi:
448 				j = find_first(language[i].name);
449 				if (j < 0)
450 					break;
451 				if (language[j].count > 0)
452 					break;
453 				/* Fall through */
454 			case Normal:
455 			case First:
456 				if (printed)
457 					print(" & ");
458 				else printed = 1;
459 				print("%s", language[i].name);
460 				break;
461 			case Shared:
462 			default:
463 				break;
464 			}
465 		}
466 	if(!printed)
467 		print("UTF");
468 	print(" text\n");
469 }
470 
471 void
472 wordfreq(void)
473 {
474 	int low, high, mid, r;
475 	uchar *p, *p2, c;
476 
477 	p = buf;
478 	for(;;) {
479 		while (p < buf+nbuf && !isalpha(*p))
480 			p++;
481 		if (p >= buf+nbuf)
482 			return;
483 		p2 = p;
484 		while(p < buf+nbuf && isalpha(*p))
485 			p++;
486 		c = *p;
487 		*p = 0;
488 		high = sizeof(dict)/sizeof(dict[0]);
489 		for(low = 0;low < high;) {
490 			mid = (low+high)/2;
491 			r = strcmp(dict[mid].word, (char*)p2);
492 			if(r == 0) {
493 				wfreq[dict[mid].class]++;
494 				break;
495 			}
496 			if(r < 0)
497 				low = mid+1;
498 			else
499 				high = mid;
500 		}
501 		*p++ = c;
502 	}
503 }
504 
505 typedef struct Filemagic Filemagic;
506 struct Filemagic {
507 	ulong x;
508 	ulong mask;
509 	char *desc;
510 	char *mime;
511 };
512 
513 Filemagic long0tab[] = {
514 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
515 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
516 	0x32636170,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
517 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
518 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
519 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip",
520 	070707,		0xFFFF,		"cpio archive\n", OCTET,
521 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
522 };
523 
524 int
525 filemagic(Filemagic *tab, int ntab, ulong x)
526 {
527 	int i;
528 
529 	for(i=0; i<ntab; i++)
530 		if((x&tab[i].mask) == tab[i].x){
531 			print(mime ? tab[i].mime : tab[i].desc);
532 			return 1;
533 		}
534 	return 0;
535 }
536 
537 int
538 long0(void)
539 {
540 	Fhdr f;
541 	long x;
542 
543 	seek(fd, 0, 0);		/* reposition to start of file */
544 	if(crackhdr(fd, &f)) {
545 		print(mime ? OCTET : "%s\n", f.name);
546 		return 1;
547 	}
548 	x = LENDIAN(buf);
549 	if(filemagic(long0tab, nelem(long0tab), x))
550 		return 1;
551 	return 0;
552 }
553 
/*
 * leading byte strings that identify a file.
 * istring() compares the first 'length' bytes of the file with
 * key and prints filetype, or mime with -m, followed by a newline.
 * a row with a zero key ends the table.
 */
struct	FILE_STRING
{
	char 	*key;		/* bytes expected at the start of the file */
	char	*filetype;	/* description */
	int	length;		/* number of bytes of key compared */
	char	*mime;		/* MIME type */
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "#!/bin/rc",		"rc executable file",		9,	"text/plain" },
	{ "#!/bin/sh",		"sh executable file",		9,	"text/plain" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ 0, 0, 0, 0 }
};
594 
595 int
596 istring(void)
597 {
598 	int i;
599 	struct FILE_STRING *p;
600 
601 	for(p = file_string; p->key; p++) {
602 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
603 			if(mime)
604 				print("%s\n", p->mime);
605 			else
606 				print("%s\n", p->filetype);
607 			return 1;
608 		}
609 	}
610 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
611 		for(i = 5; i < nbuf; i++)
612 			if(buf[i] == '\n')
613 				break;
614 		if(mime)
615 			print(OCTET);
616 		else
617 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
618 		return 1;
619 	}
620 	return 0;
621 }
622 
/*
 * tag names that ishtml() looks for between '<' and '>';
 * a zero entry ends the list.
 */
char*	html_string[] =
{
	"title", "body", "head", "strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl", "br", "em",
	0,
};
642 
643 int
644 ishtml(void)
645 {
646 	uchar *p, *q;
647 	int i, count;
648 
649 		/* compare strings between '<' and '>' to html table */
650 	count = 0;
651 	p = buf;
652 	for(;;) {
653 		while (p < buf+nbuf && *p != '<')
654 			p++;
655 		p++;
656 		if (p >= buf+nbuf)
657 			break;
658 		if(*p == '/')
659 			p++;
660 		q = p;
661 		while(p < buf+nbuf && *p != '>')
662 			p++;
663 		if (p >= buf+nbuf)
664 			break;
665 		for(i = 0; html_string[i]; i++) {
666 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
667 				if(count++ > 4) {
668 					print(mime ? "text/html\n" : "HTML file\n");
669 					return 1;
670 				}
671 				break;
672 			}
673 		}
674 		p++;
675 	}
676 	return 0;
677 }
678 
/*
 * mail header field names counted by isrfc822(), compared
 * without regard to case; a zero entry ends the list.
 * the field is spelled "Reply-To", with a hyphen; the entry
 * used to read "reply to:" and so could never match.
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",
	"sender:",
	0,
};
690 
691 int
692 isrfc822(void)
693 {
694 
695 	char *p, *q, *r;
696 	int i, count;
697 
698 	count = 0;
699 	p = (char*)buf;
700 	for(;;) {
701 		q = strchr(p, '\n');
702 		if(q == nil)
703 			break;
704 		*q = 0;
705 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
706 			count++;
707 			*q = '\n';
708 			p = q+1;
709 			continue;
710 		}
711 		*q = '\n';
712 		if(*p != '\t' && *p != ' '){
713 			r = strchr(p, ':');
714 			if(r == 0 || r > q)
715 				break;
716 			for(i = 0; rfc822_string[i]; i++) {
717 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
718 					count++;
719 					break;
720 				}
721 			}
722 		}
723 		p = q+1;
724 	}
725 	if(count >= 3){
726 		print(mime ? "message/rfc822\n" : "email file\n");
727 		return 1;
728 	}
729 	return 0;
730 }
731 
732 int
733 ismbox(void)
734 {
735 	char *p, *q;
736 
737 	p = (char*)buf;
738 	q = strchr(p, '\n');
739 	if(q == nil)
740 		return 0;
741 	*q = 0;
742 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
743 		print(mime ? "text/plain\n" : "mail box\n");
744 		return 1;
745 	}
746 	*q = '\n';
747 	return 0;
748 }
749 
750 int
751 iscint(void)
752 {
753 	int type;
754 	char *name;
755 	Biobuf b;
756 
757 	if(Binit(&b, fd, OREAD) == Beof)
758 		return 0;
759 	seek(fd, 0, 0);
760 	type = objtype(&b, &name);
761 	if(type < 0)
762 		return 0;
763 	if(mime)
764 		print(OCTET);
765 	else
766 		print("%s intermediate\n", name);
767 	return 1;
768 }
769 
770 int
771 isc(void)
772 {
773 	int n;
774 
775 	n = wfreq[I1];
776 	/*
777 	 * includes
778 	 */
779 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
780 		goto yes;
781 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
782 		goto yes;
783 	/*
784 	 * declarations
785 	 */
786 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
787 		goto yes;
788 	/*
789 	 * assignments
790 	 */
791 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
792 		goto yes;
793 	return 0;
794 
795 yes:
796 	if(mime){
797 		print(PLAIN);
798 		return 1;
799 	}
800 	if(wfreq[Alword] > 0)
801 		print("alef program\n");
802 	else
803 		print("c program\n");
804 	return 1;
805 }
806 
807 int
808 islimbo(void)
809 {
810 
811 	/*
812 	 * includes
813 	 */
814 	if(wfreq[Lword] < 4)
815 		return 0;
816 	print(mime ? PLAIN : "limbo program\n");
817 	return 1;
818 }
819 
820 int
821 isas(void)
822 {
823 
824 	/*
825 	 * includes
826 	 */
827 	if(wfreq[Aword] < 2)
828 		return 0;
829 	print(mime ? PLAIN : "as program\n");
830 	return 1;
831 }
832 
833 /*
834  * low entropy means encrypted
835  */
836 int
837 ismung(void)
838 {
839 	int i, bucket[8];
840 	float cs;
841 
842 	if(nbuf < 64)
843 		return 0;
844 	memset(bucket, 0, sizeof(bucket));
845 	for(i=0; i<64; i++)
846 		bucket[(buf[i]>>5)&07] += 1;
847 
848 	cs = 0.;
849 	for(i=0; i<8; i++)
850 		cs += (bucket[i]-8)*(bucket[i]-8);
851 	cs /= 8.;
852 	if(cs <= 24.322) {
853 		if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
854 			print(mime ? OCTET : "compressed\n");
855 		else
856 			print(mime ? OCTET : "encrypted\n");
857 		return 1;
858 	}
859 	return 0;
860 }
861 
862 /*
863  * english by punctuation and frequencies
864  */
865 int
866 isenglish(void)
867 {
868 	int vow, comm, rare, badpun, punct;
869 	char *p;
870 
871 	if(guess != Fascii && guess != Feascii)
872 		return 0;
873 	badpun = 0;
874 	punct = 0;
875 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
876 		switch(*p) {
877 		case '.':
878 		case ',':
879 		case ')':
880 		case '%':
881 		case ';':
882 		case ':':
883 		case '?':
884 			punct++;
885 			if(p[1] != ' ' && p[1] != '\n')
886 				badpun++;
887 		}
888 	if(badpun*5 > punct)
889 		return 0;
890 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
891 		return 0;
892 	if(2*cfreq[';'] > cfreq['e'])
893 		return 0;
894 
895 	vow = 0;
896 	for(p="AEIOU"; *p; p++) {
897 		vow += cfreq[*p];
898 		vow += cfreq[tolower(*p)];
899 	}
900 	comm = 0;
901 	for(p="ETAION"; *p; p++) {
902 		comm += cfreq[*p];
903 		comm += cfreq[tolower(*p)];
904 	}
905 	rare = 0;
906 	for(p="VJKQXZ"; *p; p++) {
907 		rare += cfreq[*p];
908 		rare += cfreq[tolower(*p)];
909 	}
910 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
911 		print(mime ? PLAIN : "English text\n");
912 		return 1;
913 	}
914 	return 0;
915 }
916 
917 /*
918  * pick up a number with
919  * syntax _*[0-9]+_
920  */
921 #define	P9BITLEN	12
922 int
923 p9bitnum(uchar *bp)
924 {
925 	int n, c, len;
926 
927 	len = P9BITLEN;
928 	while(*bp == ' ') {
929 		bp++;
930 		len--;
931 		if(len <= 0)
932 			return -1;
933 	}
934 	n = 0;
935 	while(len > 1) {
936 		c = *bp++;
937 		if(!isdigit(c))
938 			return -1;
939 		n = n*10 + c-'0';
940 		len--;
941 	}
942 	if(*bp != ' ')
943 		return -1;
944 	return n;
945 }
946 
/*
 * decode the first 12-byte field of an image header at s.
 * an old-style header holds a number, the log2 of the depth;
 * a new-style header holds a channel string such as r8g8b8,
 * whose digits are added up.  *newp is set to 1 for the new
 * style and 0 otherwise.
 * returns the depth in bits per pixel, or -1 if the field is
 * blank, the old-style number is too large to shift, or the
 * new-style total is not 8, 16, 24 or 32.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	unsigned long ld;
	int d;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		/*
		 * strtoul cannot overflow the way atoi can, and the
		 * bound keeps the shift below the width of an int.
		 */
		ld = strtoul(s, &s, 10);
		if(ld > 30)
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;	/* skip letter */
		d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
978 
979 int
980 isp9bit(void)
981 {
982 	int dep, lox, loy, hix, hiy, px, new;
983 	ulong t;
984 	long len;
985 	char *newlabel;
986 
987 	newlabel = "old ";
988 
989 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
990 	if(new)
991 		newlabel = "";
992 	lox = p9bitnum(buf + 1*P9BITLEN);
993 	loy = p9bitnum(buf + 2*P9BITLEN);
994 	hix = p9bitnum(buf + 3*P9BITLEN);
995 	hiy = p9bitnum(buf + 4*P9BITLEN);
996 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
997 		return 0;
998 
999 	if(dep < 8){
1000 		px = 8/dep;	/* pixels per byte */
1001 		/* set l to number of bytes of data per scan line */
1002 		if(lox >= 0)
1003 			len = (hix+px-1)/px - lox/px;
1004 		else{	/* make positive before divide */
1005 			t = (-lox)+px-1;
1006 			t = (t/px)*px;
1007 			len = (t+hix+px-1)/px;
1008 		}
1009 	}else
1010 		len = (hix-lox)*dep/8;
1011 	len *= (hiy-loy);		/* col length */
1012 	len += 5*P9BITLEN;		/* size of initial ascii */
1013 
1014 	/*
1015 	 * for image file, length is non-zero and must match calculation above
1016 	 * for /dev/window and /dev/screen the length is always zero
1017 	 * for subfont, the subfont header should follow immediately.
1018 	 */
1019 	if (len != 0 && mbuf->length == 0) {
1020 		print("%splan 9 image\n", newlabel);
1021 		return 1;
1022 	}
1023 	if (mbuf->length == len) {
1024 		print("%splan 9 image\n", newlabel);
1025 		return 1;
1026 	}
1027 	/* Ghostscript sometimes produces a little extra on the end */
1028 	if (mbuf->length < len+P9BITLEN) {
1029 		print("%splan 9 image\n", newlabel);
1030 		return 1;
1031 	}
1032 	if (p9subfont(buf+len)) {
1033 		print("%ssubfont file\n", newlabel);
1034 		return 1;
1035 	}
1036 	return 0;
1037 }
1038 
1039 int
1040 p9subfont(uchar *p)
1041 {
1042 	int n, h, a;
1043 
1044 		/* if image too big, assume it's a subfont */
1045 	if (p+3*P9BITLEN > buf+sizeof(buf))
1046 		return 1;
1047 
1048 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1049 	if (n < 0)
1050 		return 0;
1051 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1052 	if (h < 0)
1053 		return 0;
1054 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1055 	if (a < 0)
1056 		return 0;
1057 	return 1;
1058 }
1059 
1060 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1061 
1062 int
1063 isp9font(void)
1064 {
1065 	uchar *cp, *p;
1066 	int i, n;
1067 	char pathname[1024];
1068 
1069 	cp = buf;
1070 	if (!getfontnum(cp, &cp))	/* height */
1071 		return 0;
1072 	if (!getfontnum(cp, &cp))	/* ascent */
1073 		return 0;
1074 	for (i = 0; 1; i++) {
1075 		if (!getfontnum(cp, &cp))	/* min */
1076 			break;
1077 		if (!getfontnum(cp, &cp))	/* max */
1078 			return 0;
1079 		while (WHITESPACE(*cp))
1080 			cp++;
1081 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1082 				;
1083 			/* construct a path name, if needed */
1084 		n = 0;
1085 		if (*p != '/' && slash) {
1086 			n = slash-fname+1;
1087 			if (n < sizeof(pathname))
1088 				memcpy(pathname, fname, n);
1089 			else n = 0;
1090 		}
1091 		if (n+cp-p < sizeof(pathname)) {
1092 			memcpy(pathname+n, p, cp-p);
1093 			n += cp-p;
1094 			pathname[n] = 0;
1095 			if (access(pathname, AEXIST) < 0)
1096 				return 0;
1097 		}
1098 	}
1099 	if (i) {
1100 		print(mime ? "text/plain\n" : "font file\n");
1101 		return 1;
1102 	}
1103 	return 0;
1104 }
1105 
1106 int
1107 getfontnum(uchar *cp, uchar **rp)
1108 {
1109 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1110 		cp++;
1111 	if (*cp < '0' || *cp > '9')
1112 		return 0;
1113 	strtoul((char *)cp, (char **)rp, 0);
1114 	if (!WHITESPACE(**rp))
1115 		return 0;
1116 	return 1;
1117 }
1118 
1119 int
1120 isrtf(void)
1121 {
1122 	if(strstr((char *)buf, "\\rtf1")){
1123 		print(mime ? "application/rtf\n" : "rich text format\n");
1124 		return 1;
1125 	}
1126 	return 0;
1127 }
1128 
1129 int
1130 ismsdos(void)
1131 {
1132 	if (buf[0] == 0x4d && buf[1] == 0x5a){
1133 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1134 		return 1;
1135 	}
1136 	return 0;
1137 }
1138 
1139 int
1140 iself(void)
1141 {
1142 	char *cpu[] = {		/* NB: incomplete and arbitary list */
1143 	[1]	"WE32100",
1144 	[2]	"SPARC",
1145 	[3]	"i386",
1146 	[4]	"M68000",
1147 	[5]	"M88000",
1148 	[6]	"i486",
1149 	[7]	"i860",
1150 	[8]	"R3000",
1151 	[9]	"S370",
1152 	[10]	"R4000",
1153 	[15]	"HP-PA",
1154 	[18]	"sparc v8+",
1155 	[19]	"i960",
1156 	[20]	"PPC-32",
1157 	[21]	"PPC-64",
1158 	[40]	"ARM",
1159 	[41]	"Alpha",
1160 	[43]	"sparc v9",
1161 	[50]	"IA-46",
1162 	[62]	"AMD x86-64",
1163 	[75]	"VAX",
1164 	};
1165 
1166 
1167 	if (memcmp(buf, "\x7fELF", 4) == 0){
1168 		if (!mime){
1169 			int n = (buf[19] << 8) | buf[18];
1170 			char *p = (n > 0 && n < nelem(cpu) && cpu[n])? cpu[n]: "unknown";
1171 			print("%s ELF executable\n", p);
1172 		}
1173 		else
1174 			print("application/x-elf-executable");
1175 		return 1;
1176 	}
1177 
1178 	return 0;
1179 }
1180