xref: /plan9/sys/src/cmd/file.c (revision ff8c3af2f44d95267f67219afa20ba82ff6cf7e4)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/*
 * LENDIAN - combine the four bytes at p[0..3] into one value, p[0] being
 * the least significant byte.  Every byte is widened to unsigned long
 * before it is shifted, so a top byte >= 0x80 is never shifted into the
 * sign bit of an int and the result is never sign-extended.
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | ((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11 
12 uchar	buf[6001];
13 short	cfreq[140];
14 short	wfreq[50];
15 int	nbuf;
16 Dir*	mbuf;
17 int	fd;
18 char 	*fname;
19 char	*slash;
20 
/*
 * Word classes counted in wfreq[] (the small values) and character
 * classes counted in cfreq[] (the values from Clatin up; smaller
 * cfreq indices are plain ASCII codes).
 */
enum
{
	Cword,
	Fword,
	Aword,
	Alword,
	Lword,
	I1,
	I2,
	I3,
	Clatin	= 128,
	Cbinary,
	Cnull,
	Ceascii,
	Cutf,
};

/*
 * Dictionary of tell-tale words.  wordfreq() looks words up with a
 * binary search using strcmp(), so the entries MUST stay in strcmp
 * order (upper case sorts before lower case).
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	"PATH",		Lword,
	"TEXT",		Aword,
	"adt",		Alword,
	"aggr",		Alword,
	"alef",		Alword,
	"array",	Lword,
	"bio",		I2,
	"block",	Fword,
	"chan",		Alword,
	"char",		Cword,
	"common",	Fword,
	"con",		Lword,
	"data",		Fword,
	"dimension",	Fword,
	"double",	Cword,
	"extern",	Cword,
	"float",	Cword,
	"fn",		Lword,
	"function",	Fword,
	"h",		I3,
	"implement",	Lword,
	"import",	Lword,
	"include",	I1,
	"int",		Cword,
	"integer",	Fword,
	"iota",		Lword,
	"libc",		I2,
	"long",		Cword,
	"module",	Lword,
	"real",		Fword,
	"ref",		Lword,
	"register",	Cword,
	"self",		Lword,
	"short",	Cword,
	"static",	Cword,
	"stdio",	I2,
	"struct",	Cword,
	"subroutine",	Fword,
	"u",		I2,
	"void",		Cword,
};
84 
/*
 * values of the 'mode' member of the language table
 */
enum
{
	Normal = 0,	/* language occupying a single range */
	First,		/* first range of a language that has several */
	Multi,		/* a later range of such a language */
	Shared,		/* range common to several languages */
};

/*
 * Rune ranges, in ascending order (bump_utf_count() binary-searches
 * them), with a per-file hit count.  The entry with a zero name
 * terminates the table.
 */
struct
{
	int	mode;		/* one of the codes above */
	int	count;		/* runes seen in [low, high] */
	int	low;
	int	high;
	char	*name;
} language[] =
{
	{ Normal,	0,	0x0080,	0x0080,	"Extended Latin" },
	{ Normal,	0,	0x0100,	0x01FF,	"Extended Latin" },
	{ Normal,	0,	0x0370,	0x03FF,	"Greek" },
	{ Normal,	0,	0x0400,	0x04FF,	"Cyrillic" },
	{ Normal,	0,	0x0530,	0x058F,	"Armenian" },
	{ Normal,	0,	0x0590,	0x05FF,	"Hebrew" },
	{ Normal,	0,	0x0600,	0x06FF,	"Arabic" },
	{ Normal,	0,	0x0900,	0x097F,	"Devanagari" },
	{ Normal,	0,	0x0980,	0x09FF,	"Bengali" },
	{ Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi" },
	{ Normal,	0,	0x0A80,	0x0AFF,	"Gujarati" },
	{ Normal,	0,	0x0B00,	0x0B7F,	"Oriya" },
	{ Normal,	0,	0x0B80,	0x0BFF,	"Tamil" },
	{ Normal,	0,	0x0C00,	0x0C7F,	"Telugu" },
	{ Normal,	0,	0x0C80,	0x0CFF,	"Kannada" },
	{ Normal,	0,	0x0D00,	0x0D7F,	"Malayalam" },
	{ Normal,	0,	0x0E00,	0x0E7F,	"Thai" },
	{ Normal,	0,	0x0E80,	0x0EFF,	"Lao" },
	{ Normal,	0,	0x1000,	0x105F,	"Tibetan" },
	{ Normal,	0,	0x10A0,	0x10FF,	"Georgian" },
	{ Normal,	0,	0x3040,	0x30FF,	"Japanese" },
	{ Normal,	0,	0x3100,	0x312F,	"Chinese" },
	{ First,	0,	0x3130,	0x318F,	"Korean" },
	{ Multi,	0,	0x3400,	0x3D2F,	"Korean" },
	{ Shared,	0,	0x4e00,	0x9fff,	"CJK" },
	{ Normal,	0,	0,	0,	0 },		/* terminal entry */
};
130 
131 
/*
 * gross classification of the buffer, chosen by filetype()
 */
enum
{
	Fascii = 0,	/* printable ASCII only */
	Flatin,		/* contains Latin-1 characters */
	Futf,		/* contains UTF-encoded runes */
	Fbinary,	/* contains invalid runes: binary */
	Feascii,	/* ASCII plus control characters */
	Fnull,		/* NUL bytes in file */
} guess;
141 
142 void	bump_utf_count(Rune);
143 int	cistrncmp(char*, char*, int);
144 void	filetype(int);
145 int	getfontnum(uchar*, uchar**);
146 int	isas(void);
147 int	isc(void);
148 int	iscint(void);
149 int	isenglish(void);
150 int	ishp(void);
151 int	ishtml(void);
152 int	isrfc822(void);
153 int	ismbox(void);
154 int	islimbo(void);
155 int	ismung(void);
156 int	isp9bit(void);
157 int	isp9font(void);
158 int	isrtf(void);
159 int	ismsdos(void);
160 int	istring(void);
161 int	long0(void);
162 int	p9bitnum(uchar*);
163 int	p9subfont(uchar*);
164 void	print_utf(void);
165 void	type(char*, int);
166 int	utf_count(void);
167 void	wordfreq(void);
168 
169 int	(*call[])(void) =
170 {
171 	long0,		/* recognizable by first 4 bytes */
172 	istring,	/* recognizable by first string */
173 	isrfc822,	/* email file */
174 	ismbox,		/* mail box */
175 	ishtml,		/* html keywords */
176 	iscint,		/* compiler/assembler intermediate */
177 	islimbo,	/* limbo source */
178 	isc,		/* c & alef compiler key words */
179 	isas,		/* assembler key words */
180 	ismung,		/* entropy compressed/encrypted */
181 	isp9font,	/* plan 9 font */
182 	isp9bit,	/* plan 9 image (as from /dev/window) */
183 	isenglish,	/* char frequency English */
184 	isrtf,		/* rich text format */
185 	ismsdos,	/* msdos exe (virus file attachement) */
186 	0
187 };
188 
/* set by the -m flag: print MIME types rather than descriptions */
int	mime;

/* MIME types used by several classifiers; both include the newline */
#define	OCTET	"application/octet-stream\n"
#define	PLAIN	"text/plain\n"
193 
194 void
195 main(int argc, char *argv[])
196 {
197 	int i, j, maxlen;
198 	char *cp;
199 	Rune r;
200 
201 	ARGBEGIN{
202 	case 'm':
203 		mime = 1;
204 		break;
205 	default:
206 		fprint(2, "usage: file [-m] [file...]\n");
207 		exits("usage");
208 	}ARGEND;
209 
210 	maxlen = 0;
211 	if(mime == 0 || argc > 1){
212 		for(i = 0; i < argc; i++) {
213 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
214 					;
215 			if(j > maxlen)
216 				maxlen = j;
217 		}
218 	}
219 	if (argc <= 0) {
220 		if(!mime)
221 			print ("stdin: ");
222 		filetype(0);
223 	}
224 	else {
225 		for(i = 0; i < argc; i++)
226 			type(argv[i], maxlen);
227 	}
228 	exits(0);
229 }
230 
231 void
232 type(char *file, int nlen)
233 {
234 	Rune r;
235 	int i;
236 	char *p;
237 
238 	if(nlen > 0){
239 		slash = 0;
240 		for (i = 0, p = file; *p; i++) {
241 			if (*p == '/')			/* find rightmost slash */
242 				slash = p;
243 			p += chartorune(&r, p);		/* count runes */
244 		}
245 		print("%s:%*s",file, nlen-i+1, "");
246 	}
247 	fname = file;
248 	if ((fd = open(file, OREAD)) < 0) {
249 		print("cannot open\n");
250 		return;
251 	}
252 	filetype(fd);
253 	close(fd);
254 }
255 
256 void
257 filetype(int fd)
258 {
259 	Rune r;
260 	int i, f, n;
261 	char *p, *eob;
262 
263 	free(mbuf);
264 	mbuf = dirfstat(fd);
265 	if(mbuf == nil){
266 		print("cannot stat: %r\n");
267 		return;
268 	}
269 	if(mbuf->mode & DMDIR) {
270 		print(mime ? "text/directory\n" : "directory\n");
271 		return;
272 	}
273 	if(mbuf->type != 'M' && mbuf->type != '|') {
274 		print(mime ? OCTET : "special file #%c/%s\n",
275 			mbuf->type, mbuf->name);
276 		return;
277 	}
278 	nbuf = read(fd, buf, sizeof(buf)-1);
279 
280 	if(nbuf < 0) {
281 		print("cannot read\n");
282 		return;
283 	}
284 	if(nbuf == 0) {
285 		print(mime ? PLAIN : "empty file\n");
286 		return;
287 	}
288 	buf[nbuf] = 0;
289 
290 	/*
291 	 * build histogram table
292 	 */
293 	memset(cfreq, 0, sizeof(cfreq));
294 	for (i = 0; language[i].name; i++)
295 		language[i].count = 0;
296 	eob = (char *)buf+nbuf;
297 	for(n = 0, p = (char *)buf; p < eob; n++) {
298 		if (!fullrune(p, eob-p) && eob-p < UTFmax)
299 			break;
300 		p += chartorune(&r, p);
301 		if (r == 0)
302 			f = Cnull;
303 		else if (r <= 0x7f) {
304 			if (!isprint(r) && !isspace(r))
305 				f = Ceascii;	/* ASCII control char */
306 			else f = r;
307 		} else if (r == 0x080) {
308 			bump_utf_count(r);
309 			f = Cutf;
310 		} else if (r < 0xA0)
311 				f = Cbinary;	/* Invalid Runes */
312 		else if (r <= 0xff)
313 				f = Clatin;	/* Latin 1 */
314 		else {
315 			bump_utf_count(r);
316 			f = Cutf;		/* UTF extension */
317 		}
318 		cfreq[f]++;			/* ASCII chars peg directly */
319 	}
320 	/*
321 	 * gross classify
322 	 */
323 	if (cfreq[Cbinary])
324 		guess = Fbinary;
325 	else if (cfreq[Cutf])
326 		guess = Futf;
327 	else if (cfreq[Clatin])
328 		guess = Flatin;
329 	else if (cfreq[Ceascii])
330 		guess = Feascii;
331 	else if (cfreq[Cnull] == n) {
332 		print(mime ? OCTET : "first block all null bytes\n");
333 		return;
334 	}
335 	else guess = Fascii;
336 	/*
337 	 * lookup dictionary words
338 	 */
339 	memset(wfreq, 0, sizeof(wfreq));
340 	if(guess == Fascii || guess == Flatin || guess == Futf)
341 		wordfreq();
342 	/*
343 	 * call individual classify routines
344 	 */
345 	for(i=0; call[i]; i++)
346 		if((*call[i])())
347 			return;
348 
349 	/*
350 	 * if all else fails,
351 	 * print out gross classification
352 	 */
353 	if (nbuf < 100 && !mime)
354 		print(mime ? PLAIN : "short ");
355 	if (guess == Fascii)
356 		print(mime ? PLAIN : "Ascii\n");
357 	else if (guess == Feascii)
358 		print(mime ? PLAIN : "extended ascii\n");
359 	else if (guess == Flatin)
360 		print(mime ? PLAIN : "latin ascii\n");
361 	else if (guess == Futf && utf_count() < 4)
362 		print_utf();
363 	else print(mime ? OCTET : "binary\n");
364 }
365 
366 void
367 bump_utf_count(Rune r)
368 {
369 	int low, high, mid;
370 
371 	high = sizeof(language)/sizeof(language[0])-1;
372 	for (low = 0; low < high;) {
373 		mid = (low+high)/2;
374 		if (r >=language[mid].low) {
375 			if (r <= language[mid].high) {
376 				language[mid].count++;
377 				break;
378 			} else low = mid+1;
379 		} else high = mid;
380 	}
381 }
382 
383 int
384 utf_count(void)
385 {
386 	int i, count;
387 
388 	count = 0;
389 	for (i = 0; language[i].name; i++)
390 		if (language[i].count > 0)
391 			switch (language[i].mode) {
392 			case Normal:
393 			case First:
394 				count++;
395 				break;
396 			default:
397 				break;
398 			}
399 	return count;
400 }
401 
402 int
403 chkascii(void)
404 {
405 	int i;
406 
407 	for (i = 'a'; i < 'z'; i++)
408 		if (cfreq[i])
409 			return 1;
410 	for (i = 'A'; i < 'Z'; i++)
411 		if (cfreq[i])
412 			return 1;
413 	return 0;
414 }
415 
416 int
417 find_first(char *name)
418 {
419 	int i;
420 
421 	for (i = 0; language[i].name != 0; i++)
422 		if (language[i].mode == First
423 			&& strcmp(language[i].name, name) == 0)
424 			return i;
425 	return -1;
426 }
427 
428 void
429 print_utf(void)
430 {
431 	int i, printed, j;
432 
433 	if(mime){
434 		print(PLAIN);
435 		return;
436 	}
437 	if (chkascii()) {
438 		printed = 1;
439 		print("Ascii");
440 	} else
441 		printed = 0;
442 	for (i = 0; language[i].name; i++)
443 		if (language[i].count) {
444 			switch(language[i].mode) {
445 			case Multi:
446 				j = find_first(language[i].name);
447 				if (j < 0)
448 					break;
449 				if (language[j].count > 0)
450 					break;
451 				/* Fall through */
452 			case Normal:
453 			case First:
454 				if (printed)
455 					print(" & ");
456 				else printed = 1;
457 				print("%s", language[i].name);
458 				break;
459 			case Shared:
460 			default:
461 				break;
462 			}
463 		}
464 	if(!printed)
465 		print("UTF");
466 	print(" text\n");
467 }
468 
469 void
470 wordfreq(void)
471 {
472 	int low, high, mid, r;
473 	uchar *p, *p2, c;
474 
475 	p = buf;
476 	for(;;) {
477 		while (p < buf+nbuf && !isalpha(*p))
478 			p++;
479 		if (p >= buf+nbuf)
480 			return;
481 		p2 = p;
482 		while(p < buf+nbuf && isalpha(*p))
483 			p++;
484 		c = *p;
485 		*p = 0;
486 		high = sizeof(dict)/sizeof(dict[0]);
487 		for(low = 0;low < high;) {
488 			mid = (low+high)/2;
489 			r = strcmp(dict[mid].word, (char*)p2);
490 			if(r == 0) {
491 				wfreq[dict[mid].class]++;
492 				break;
493 			}
494 			if(r < 0)
495 				low = mid+1;
496 			else
497 				high = mid;
498 		}
499 		*p++ = c;
500 	}
501 }
502 
503 typedef struct Filemagic Filemagic;
504 struct Filemagic {
505 	ulong x;
506 	ulong mask;
507 	char *desc;
508 	char *mime;
509 };
510 
511 Filemagic long0tab[] = {
512 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
513 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
514 	0x32636170,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
515 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
516 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
517 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip",
518 	070707,		0xFFFF,		"cpio archive\n", OCTET,
519 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
520 };
521 
522 int
523 filemagic(Filemagic *tab, int ntab, ulong x)
524 {
525 	int i;
526 
527 	for(i=0; i<ntab; i++)
528 		if((x&tab[i].mask) == tab[i].x){
529 			print(mime ? tab[i].mime : tab[i].desc);
530 			return 1;
531 		}
532 	return 0;
533 }
534 
535 int
536 long0(void)
537 {
538 	Fhdr f;
539 	long x;
540 
541 	seek(fd, 0, 0);		/* reposition to start of file */
542 	if(crackhdr(fd, &f)) {
543 		print(mime ? OCTET : "%s\n", f.name);
544 		return 1;
545 	}
546 	x = LENDIAN(buf);
547 	if(filemagic(long0tab, nelem(long0tab), x))
548 		return 1;
549 	return 0;
550 }
551 
/*
 * Leading strings that identify a file.  istring() compares the first
 * 'length' bytes of the file with 'key' using memcmp, so keys may
 * contain NUL bytes and 'length' must be the true key length.  Longer
 * keys must come before keys that are their prefix.  The entry with a
 * zero key ends the table.
 */
struct	FILE_STRING
{
	char 	*key;		/* bytes expected at the start of the file */
	char	*filetype;	/* description printed without -m */
	int	length;		/* number of bytes of key to compare */
	char	*mime;		/* MIME type printed with -m */
} file_string[] =
{
	"!<arch>\n__.SYMDEF",	"archive random library",	17,	"application/octet-stream",
	"!<arch>\n",		"archive",			8,	"application/octet-stream",
	"070707",		"cpio archive - ascii header",	6,	"application/octet-stream",
	"#!/bin/rc",		"rc executable file",		9,	"text/plain",
	"#!/bin/sh",		"sh executable file",		9,	"text/plain",
	"%!",			"postscript",			2,	"application/postscript",
	"\004%!",		"postscript",			3,	"application/postscript",
	"x T post",		"troff output for post",	8,	"application/troff",
	"x T Latin1",		"troff output for Latin1",	10,	"application/troff",
	"x T utf",		"troff output for UTF",		7,	"application/troff",
	"x T 202",		"troff output for 202",		7,	"application/troff",
	"x T aps",		"troff output for aps",		7,	"application/troff",
	"GIF",			"GIF image", 			3,	"image/gif",
	"\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript",
	"%PDF",			"PDF",				4,	"application/pdf",
	"<html>\n",		"HTML file",			7,	"text/html",
	"<HTML>\n",		"HTML file",			7,	"text/html",
	"compressed\n",		"Compressed image or subfont",	11,	"application/octet-stream",
	"\111\111\052\000",	"tiff",				4,	"image/tiff",
	"\115\115\000\052",	"tiff",				4,	"image/tiff",
	"\377\330\377\340",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\341",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\333",	"jpeg",				4,	"image/jpeg",
	"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream",
	"<MakerFile ",		"FrameMaker file",		11,	"application/framemaker",
	"\033%-12345X",	"HPJCL file",		9,	"application/hpjcl",
	0,0,0,0
};
591 
592 int
593 istring(void)
594 {
595 	int i;
596 	struct FILE_STRING *p;
597 
598 	for(p = file_string; p->key; p++) {
599 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
600 			if(mime)
601 				print("%s\n", p->mime);
602 			else
603 				print("%s\n", p->filetype);
604 			return 1;
605 		}
606 	}
607 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
608 		for(i = 5; i < nbuf; i++)
609 			if(buf[i] == '\n')
610 				break;
611 		if(mime)
612 			print(OCTET);
613 		else
614 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
615 		return 1;
616 	}
617 	return 0;
618 }
619 
620 char*	html_string[] =
621 {
622 	"title",
623 	"body",
624 	"head",
625 	"strong",
626 	"h1",
627 	"h2",
628 	"h3",
629 	"h4",
630 	"h5",
631 	"h6",
632 	"ul",
633 	"li",
634 	"dl",
635 	"br",
636 	"em",
637 	0,
638 };
639 
640 int
641 ishtml(void)
642 {
643 	uchar *p, *q;
644 	int i, count;
645 
646 		/* compare strings between '<' and '>' to html table */
647 	count = 0;
648 	p = buf;
649 	for(;;) {
650 		while (p < buf+nbuf && *p != '<')
651 			p++;
652 		p++;
653 		if (p >= buf+nbuf)
654 			break;
655 		if(*p == '/')
656 			p++;
657 		q = p;
658 		while(p < buf+nbuf && *p != '>')
659 			p++;
660 		if (p >= buf+nbuf)
661 			break;
662 		for(i = 0; html_string[i]; i++) {
663 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
664 				if(count++ > 4) {
665 					print(mime ? "text/html\n" : "HTML file\n");
666 					return 1;
667 				}
668 				break;
669 			}
670 		}
671 		p++;
672 	}
673 	return 0;
674 }
675 
676 char*	rfc822_string[] =
677 {
678 	"from:",
679 	"date:",
680 	"to:",
681 	"subject:",
682 	"received:",
683 	"reply to:",
684 	"sender:",
685 	0,
686 };
687 
688 int
689 isrfc822(void)
690 {
691 
692 	char *p, *q, *r;
693 	int i, count;
694 
695 	count = 0;
696 	p = (char*)buf;
697 	for(;;) {
698 		q = strchr(p, '\n');
699 		if(q == nil)
700 			break;
701 		*q = 0;
702 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
703 			count++;
704 			*q = '\n';
705 			p = q+1;
706 			continue;
707 		}
708 		*q = '\n';
709 		if(*p != '\t' && *p != ' '){
710 			r = strchr(p, ':');
711 			if(r == 0 || r > q)
712 				break;
713 			for(i = 0; rfc822_string[i]; i++) {
714 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
715 					count++;
716 					break;
717 				}
718 			}
719 		}
720 		p = q+1;
721 	}
722 	if(count >= 3){
723 		print(mime ? "message/rfc822\n" : "email file\n");
724 		return 1;
725 	}
726 	return 0;
727 }
728 
729 int
730 ismbox(void)
731 {
732 	char *p, *q;
733 
734 	p = (char*)buf;
735 	q = strchr(p, '\n');
736 	if(q == nil)
737 		return 0;
738 	*q = 0;
739 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
740 		print(mime ? "text/plain\n" : "mail box\n");
741 		return 1;
742 	}
743 	*q = '\n';
744 	return 0;
745 }
746 
747 int
748 iscint(void)
749 {
750 	int type;
751 	char *name;
752 	Biobuf b;
753 
754 	if(Binit(&b, fd, OREAD) == Beof)
755 		return 0;
756 	seek(fd, 0, 0);
757 	type = objtype(&b, &name);
758 	if(type < 0)
759 		return 0;
760 	if(mime)
761 		print(OCTET);
762 	else
763 		print("%s intermediate\n", name);
764 	return 1;
765 }
766 
767 int
768 isc(void)
769 {
770 	int n;
771 
772 	n = wfreq[I1];
773 	/*
774 	 * includes
775 	 */
776 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
777 		goto yes;
778 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
779 		goto yes;
780 	/*
781 	 * declarations
782 	 */
783 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
784 		goto yes;
785 	/*
786 	 * assignments
787 	 */
788 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
789 		goto yes;
790 	return 0;
791 
792 yes:
793 	if(mime){
794 		print(PLAIN);
795 		return 1;
796 	}
797 	if(wfreq[Alword] > 0)
798 		print("alef program\n");
799 	else
800 		print("c program\n");
801 	return 1;
802 }
803 
804 int
805 islimbo(void)
806 {
807 
808 	/*
809 	 * includes
810 	 */
811 	if(wfreq[Lword] < 4)
812 		return 0;
813 	print(mime ? PLAIN : "limbo program\n");
814 	return 1;
815 }
816 
817 int
818 isas(void)
819 {
820 
821 	/*
822 	 * includes
823 	 */
824 	if(wfreq[Aword] < 2)
825 		return 0;
826 	print(mime ? PLAIN : "as program\n");
827 	return 1;
828 }
829 
830 /*
831  * low entropy means encrypted
832  */
833 int
834 ismung(void)
835 {
836 	int i, bucket[8];
837 	float cs;
838 
839 	if(nbuf < 64)
840 		return 0;
841 	memset(bucket, 0, sizeof(bucket));
842 	for(i=0; i<64; i++)
843 		bucket[(buf[i]>>5)&07] += 1;
844 
845 	cs = 0.;
846 	for(i=0; i<8; i++)
847 		cs += (bucket[i]-8)*(bucket[i]-8);
848 	cs /= 8.;
849 	if(cs <= 24.322) {
850 		if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
851 			print(mime ? OCTET : "compressed\n");
852 		else
853 			print(mime ? OCTET : "encrypted\n");
854 		return 1;
855 	}
856 	return 0;
857 }
858 
859 /*
860  * english by punctuation and frequencies
861  */
862 int
863 isenglish(void)
864 {
865 	int vow, comm, rare, badpun, punct;
866 	char *p;
867 
868 	if(guess != Fascii && guess != Feascii)
869 		return 0;
870 	badpun = 0;
871 	punct = 0;
872 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
873 		switch(*p) {
874 		case '.':
875 		case ',':
876 		case ')':
877 		case '%':
878 		case ';':
879 		case ':':
880 		case '?':
881 			punct++;
882 			if(p[1] != ' ' && p[1] != '\n')
883 				badpun++;
884 		}
885 	if(badpun*5 > punct)
886 		return 0;
887 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
888 		return 0;
889 	if(2*cfreq[';'] > cfreq['e'])
890 		return 0;
891 
892 	vow = 0;
893 	for(p="AEIOU"; *p; p++) {
894 		vow += cfreq[*p];
895 		vow += cfreq[tolower(*p)];
896 	}
897 	comm = 0;
898 	for(p="ETAION"; *p; p++) {
899 		comm += cfreq[*p];
900 		comm += cfreq[tolower(*p)];
901 	}
902 	rare = 0;
903 	for(p="VJKQXZ"; *p; p++) {
904 		rare += cfreq[*p];
905 		rare += cfreq[tolower(*p)];
906 	}
907 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
908 		print(mime ? PLAIN : "English text\n");
909 		return 1;
910 	}
911 	return 0;
912 }
913 
914 /*
915  * pick up a number with
916  * syntax _*[0-9]+_
917  */
918 #define	P9BITLEN	12
919 int
920 p9bitnum(uchar *bp)
921 {
922 	int n, c, len;
923 
924 	len = P9BITLEN;
925 	while(*bp == ' ') {
926 		bp++;
927 		len--;
928 		if(len <= 0)
929 			return -1;
930 	}
931 	n = 0;
932 	while(len > 1) {
933 		c = *bp++;
934 		if(!isdigit(c))
935 			return -1;
936 		n = n*10 + c-'0';
937 		len--;
938 	}
939 	if(*bp != ' ')
940 		return -1;
941 	return n;
942 }
943 
/*
 * depthof - decode the first 12-byte field of an image header.
 * A field starting with a digit is the old format: the number is
 * the log2 of the depth, *newp is set to 0 and 1<<number is returned.
 * Otherwise the field is a channel descriptor such as r8g8b8: *newp
 * is set to 1 and the sum of the numbers following each letter is
 * returned if it is 8, 16, 24 or 32.
 * Returns -1 for a blank field, an unacceptable depth, or an old
 * format number above 5 (depth 32, the largest depth accepted for
 * the new format), which also keeps the shift well defined.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d;
	unsigned long ld;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtoul(s, 0, 10);
		if(ld > 5)
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;	/* skip letter */
		d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
975 
976 int
977 isp9bit(void)
978 {
979 	int dep, lox, loy, hix, hiy, px, new;
980 	ulong t;
981 	long len;
982 	char *newlabel;
983 
984 	newlabel = "old ";
985 
986 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
987 	if(new)
988 		newlabel = "";
989 	lox = p9bitnum(buf + 1*P9BITLEN);
990 	loy = p9bitnum(buf + 2*P9BITLEN);
991 	hix = p9bitnum(buf + 3*P9BITLEN);
992 	hiy = p9bitnum(buf + 4*P9BITLEN);
993 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
994 		return 0;
995 
996 	if(dep < 8){
997 		px = 8/dep;	/* pixels per byte */
998 		/* set l to number of bytes of data per scan line */
999 		if(lox >= 0)
1000 			len = (hix+px-1)/px - lox/px;
1001 		else{	/* make positive before divide */
1002 			t = (-lox)+px-1;
1003 			t = (t/px)*px;
1004 			len = (t+hix+px-1)/px;
1005 		}
1006 	}else
1007 		len = (hix-lox)*dep/8;
1008 	len *= (hiy-loy);		/* col length */
1009 	len += 5*P9BITLEN;		/* size of initial ascii */
1010 
1011 	/*
1012 	 * for image file, length is non-zero and must match calculation above
1013 	 * for /dev/window and /dev/screen the length is always zero
1014 	 * for subfont, the subfont header should follow immediately.
1015 	 */
1016 	if (len != 0 && mbuf->length == 0) {
1017 		print("%splan 9 image\n", newlabel);
1018 		return 1;
1019 	}
1020 	if (mbuf->length == len) {
1021 		print("%splan 9 image\n", newlabel);
1022 		return 1;
1023 	}
1024 	/* Ghostscript sometimes produces a little extra on the end */
1025 	if (mbuf->length < len+P9BITLEN) {
1026 		print("%splan 9 image\n", newlabel);
1027 		return 1;
1028 	}
1029 	if (p9subfont(buf+len)) {
1030 		print("%ssubfont file\n", newlabel);
1031 		return 1;
1032 	}
1033 	return 0;
1034 }
1035 
1036 int
1037 p9subfont(uchar *p)
1038 {
1039 	int n, h, a;
1040 
1041 		/* if image too big, assume it's a subfont */
1042 	if (p+3*P9BITLEN > buf+sizeof(buf))
1043 		return 1;
1044 
1045 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1046 	if (n < 0)
1047 		return 0;
1048 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1049 	if (h < 0)
1050 		return 0;
1051 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1052 	if (a < 0)
1053 		return 0;
1054 	return 1;
1055 }
1056 
1057 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1058 
1059 int
1060 isp9font(void)
1061 {
1062 	uchar *cp, *p;
1063 	int i, n;
1064 	char pathname[1024];
1065 
1066 	cp = buf;
1067 	if (!getfontnum(cp, &cp))	/* height */
1068 		return 0;
1069 	if (!getfontnum(cp, &cp))	/* ascent */
1070 		return 0;
1071 	for (i = 0; 1; i++) {
1072 		if (!getfontnum(cp, &cp))	/* min */
1073 			break;
1074 		if (!getfontnum(cp, &cp))	/* max */
1075 			return 0;
1076 		while (WHITESPACE(*cp))
1077 			cp++;
1078 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1079 				;
1080 			/* construct a path name, if needed */
1081 		n = 0;
1082 		if (*p != '/' && slash) {
1083 			n = slash-fname+1;
1084 			if (n < sizeof(pathname))
1085 				memcpy(pathname, fname, n);
1086 			else n = 0;
1087 		}
1088 		if (n+cp-p < sizeof(pathname)) {
1089 			memcpy(pathname+n, p, cp-p);
1090 			n += cp-p;
1091 			pathname[n] = 0;
1092 			if (access(pathname, AEXIST) < 0)
1093 				return 0;
1094 		}
1095 	}
1096 	if (i) {
1097 		print("font file\n");
1098 		return 1;
1099 	}
1100 	return 0;
1101 }
1102 
1103 int
1104 getfontnum(uchar *cp, uchar **rp)
1105 {
1106 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1107 		cp++;
1108 	if (*cp < '0' || *cp > '9')
1109 		return 0;
1110 	strtoul((char *)cp, (char **)rp, 0);
1111 	if (!WHITESPACE(**rp))
1112 		return 0;
1113 	return 1;
1114 }
1115 
1116 int
1117 isrtf(void)
1118 {
1119 	if(strstr((char *)buf, "\\rtf1")){
1120 		print(mime ? "application/rtf\n" : "rich text format\n");
1121 		return 1;
1122 	}
1123 	return 0;
1124 }
1125 
1126 int
1127 ismsdos(void)
1128 {
1129 	if (buf[0] == 0x4d && buf[1] == 0x5a){
1130 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1131 		return 1;
1132 	}
1133 	return 0;
1134 }
1135