xref: /plan9/sys/src/cmd/file.c (revision 6a9fc400c33447ef5e1cda7185cb4de2c8e8010e)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/*
 * Assemble the four bytes at p into a 32-bit little-endian value.
 * Each byte is widened to unsigned long before it is shifted, so a
 * top byte >= 0x80 is never shifted into the sign bit of an int.
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | ((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11 
uchar	buf[6001];	/* start of the file; filetype() reads at most sizeof(buf)-1 bytes and NUL-terminates */
short	cfreq[140];	/* character histogram: ASCII codes index directly, classes Clatin..Cutf above them */
short	wfreq[50];	/* number of dict[] words seen, indexed by word class */
int	nbuf;		/* number of bytes read into buf */
Dir*	mbuf;		/* dirfstat() result for the file being examined */
int	fd;		/* descriptor opened by type(); also used by long0() and iscint() */
char 	*fname;		/* name of the file being examined, set by type() */
char	*slash;		/* rightmost '/' in fname, or 0; consulted by isp9font() */
20 
/*
 * Word classes assigned in dict[] (Cword..I3, used to index wfreq[])
 * followed, from Clatin on, by the character classes that index
 * cfreq[] above the ASCII range.
 */
enum
{
	Cword,		/* counted by isc() as evidence of C source */
	Fword,		/* presumably Fortran keywords -- no classifier in this file reads it */
	Aword,		/* counted by isas() */
	Alword,		/* alef keywords; isc() answers "alef program" if any were seen */
	Lword,		/* counted by islimbo() */
	I1,		/* the word "include" */
	I2,		/* header base names such as "libc" and "stdio" */
	I3,		/* the "h" of a header suffix */
	Clatin	= 128,	/* runes 0xA0 through 0xFF */
	Cbinary,	/* runes 0x81 through 0x9F */
	Cnull,		/* NUL */
	Ceascii,	/* ASCII characters that are neither printable nor white space */
	Cutf,		/* rune 0x80 and runes above 0xFF */
};
37 struct
38 {
39 	char*	word;
40 	int	class;
41 } dict[] =
42 {
43 	"PATH",		Lword,
44 	"TEXT",		Aword,
45 	"adt",		Alword,
46 	"aggr",		Alword,
47 	"alef",		Alword,
48 	"array",	Lword,
49 	"block",	Fword,
50 	"chan",		Alword,
51 	"char",		Cword,
52 	"common",	Fword,
53 	"con",		Lword,
54 	"data",		Fword,
55 	"dimension",	Fword,
56 	"double",	Cword,
57 	"extern",	Cword,
58 	"bio",		I2,
59 	"float",	Cword,
60 	"fn",		Lword,
61 	"function",	Fword,
62 	"h",		I3,
63 	"implement",	Lword,
64 	"import",	Lword,
65 	"include",	I1,
66 	"int",		Cword,
67 	"integer",	Fword,
68 	"iota",		Lword,
69 	"libc",		I2,
70 	"long",		Cword,
71 	"module",	Lword,
72 	"real",		Fword,
73 	"ref",		Lword,
74 	"register",	Cword,
75 	"self",		Lword,
76 	"short",	Cword,
77 	"static",	Cword,
78 	"stdio",	I2,
79 	"struct",	Cword,
80 	"subroutine",	Fword,
81 	"u",		I2,
82 	"void",		Cword,
83 };
84 
/* codes for 'mode' field in language structure */
enum	{
		Normal	= 0,
		First,		/* first entry for language spanning several ranges */
		Multi,		/* later entries for a language spanning several ranges */
		Shared,		/* codes used in several languages */
	};
92 
/*
 * Rune ranges and the language name reported for each.
 * bump_utf_count() locates a rune by binary search on low/high, so
 * entries are listed in ascending, non-overlapping order.  The final
 * entry has a nil name and terminates linear scans of the table.
 */
struct
{
	int	mode;		/* see enum above */
	int 	count;		/* runes seen in this range; reset by filetype() for each file */
	int	low;		/* first rune of the range */
	int	high;		/* last rune of the range (inclusive) */
	char	*name;		/* language name printed by print_utf() */

} language[] =
{
	Normal, 0,	0x0080, 0x0080,	"Extended Latin",
	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
	Normal,	0,	0x0370,	0x03FF,	"Greek",
	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
	Normal,	0,	0x0530,	0x058F,	"Armenian",
	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
	Normal,	0,	0x0600,	0x06FF,	"Arabic",
	Normal,	0,	0x0900,	0x097F,	"Devanagari",
	Normal,	0,	0x0980,	0x09FF,	"Bengali",
	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
	Normal,	0,	0x1000,	0x105F,	"Tibetan",
	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
	Normal,	0,	0x3040,	0x30FF,	"Japanese",
	Normal,	0,	0x3100,	0x312F,	"Chinese",
	First,	0,	0x3130,	0x318F,	"Korean",
	Multi,	0,	0x3400,	0x3D2F,	"Korean",
	Shared,	0,	0x4e00,	0x9fff,	"CJK",
	Normal,	0,	0,	0,	0,		/* terminal entry */
};
130 
131 
/*
 * Gross classification of the buffer, chosen by filetype() from the
 * character histogram before the individual classifiers run.
 */
enum
{
	Fascii,		/* printable ascii */
	Flatin,		/* latin 1 */
	Futf,		/* UTF character set */
	Fbinary,	/* binary */
	Feascii,	/* ASCII with control chars */
	Fnull,		/* NULL in file */
} guess;
141 
/*
 * Forward declarations.  The is*() routines, long0() and istring()
 * are the classifiers listed in call[] below: each takes no arguments
 * and returns non-zero after printing an answer.
 */
void	bump_utf_count(Rune);
int	cistrncmp(char*, char*, int);
void	filetype(int);
int	getfontnum(uchar*, uchar**);
int	isas(void);
int	isc(void);
int	iscint(void);
int	isenglish(void);
int	ishp(void);
int	ishtml(void);
int	isrfc822(void);
int	ismbox(void);
int	islimbo(void);
int	ismung(void);
int	isp9bit(void);
int	isp9font(void);
int	istring(void);
int	long0(void);
int	p9bitnum(uchar*);
int	p9subfont(uchar*);
void	print_utf(void);
void	type(char*, int);
int	utf_count(void);
void	wordfreq(void);
166 
/*
 * Classifiers tried in order by filetype().  The first one to return
 * non-zero ends the search, so more specific tests come first.
 * The list is terminated by a nil pointer.
 */
int	(*call[])(void) =
{
	long0,		/* recognizable by first 4 bytes */
	istring,	/* recognizable by first string */
	ishtml,		/* html keywords */
	isrfc822,	/* email file */
	ismbox,		/* mail box */
	iscint,		/* compiler/assembler intermediate */
	islimbo,	/* limbo source */
	isc,		/* c & alef compiler key words */
	isas,		/* assembler key words */
	ismung,		/* entropy compressed/encrypted */
	isp9font,	/* plan 9 font */
	isp9bit,	/* plan 9 image (as from /dev/window) */
	isenglish,	/* char frequency English */
	ishp,		/* HP Job Control Language - Postscript */
	0
};
185 
int mime;	/* set by the -m flag: print MIME types instead of descriptions */

/* MIME answers shared by several classifiers; each includes its newline */
#define OCTET	"application/octet-stream\n"
#define PLAIN	"text/plain\n"
190 
191 void
192 main(int argc, char *argv[])
193 {
194 	int i, j, maxlen;
195 	char *cp;
196 	Rune r;
197 
198 	ARGBEGIN{
199 	case 'm':
200 		mime = 1;
201 		break;
202 	default:
203 		fprint(2, "usage: file [-m] [file...]\n");
204 		exits("usage");
205 	}ARGEND;
206 
207 	maxlen = 0;
208 	if(mime == 0 || argc > 1){
209 		for(i = 0; i < argc; i++) {
210 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
211 					;
212 			if(j > maxlen)
213 				maxlen = j;
214 		}
215 	}
216 	if (argc <= 0) {
217 		if(!mime)
218 			print ("stdin: ");
219 		filetype(0);
220 	}
221 	else {
222 		for(i = 0; i < argc; i++)
223 			type(argv[i], maxlen);
224 	}
225 	exits(0);
226 }
227 
228 void
229 type(char *file, int nlen)
230 {
231 	Rune r;
232 	int i;
233 	char *p;
234 
235 	if(nlen > 0){
236 		slash = 0;
237 		for (i = 0, p = file; *p; i++) {
238 			if (*p == '/')			/* find rightmost slash */
239 				slash = p;
240 			p += chartorune(&r, p);		/* count runes */
241 		}
242 		print("%s:%*s",file, nlen-i+1, "");
243 	}
244 	fname = file;
245 	if ((fd = open(file, OREAD)) < 0) {
246 		print("cannot open\n");
247 		return;
248 	}
249 	filetype(fd);
250 	close(fd);
251 }
252 
/*
 * Classify the file open on fd and print the answer.
 *
 * Directories and special files are answered from the Dir entry
 * alone.  Otherwise up to sizeof(buf)-1 bytes are read into buf, a
 * histogram of character classes is built in cfreq[], a gross guess
 * is stored in 'guess', dictionary words are counted, and the
 * classifiers in call[] are tried in order.  If none of them claims
 * the file, the gross guess itself is printed.
 */
void
filetype(int fd)
{
	Rune r;
	int i, f, n;
	char *p, *eob;

	free(mbuf);		/* release the Dir left over from the previous file */
	mbuf = dirfstat(fd);
	if(mbuf == nil){
		print("cannot stat: %r\n");
		return;
	}
	if(mbuf->mode & DMDIR) {
		print(mime ? "text/directory\n" : "directory\n");
		return;
	}
	/* only device types 'M' and '|' are read; anything else is reported as special */
	if(mbuf->type != 'M' && mbuf->type != '|') {
		print(mime ? OCTET : "special file #%c/%s\n",
			mbuf->type, mbuf->name);
		return;
	}
	nbuf = read(fd, buf, sizeof(buf)-1);	/* leave room for the terminating NUL */

	if(nbuf < 0) {
		print("cannot read\n");
		return;
	}
	if(nbuf == 0) {
		print(mime ? PLAIN : "empty file\n");
		return;
	}
	buf[nbuf] = 0;		/* lets the classifiers treat buf as a C string */

	/*
	 * build histogram table
	 */
	memset(cfreq, 0, sizeof(cfreq));
	for (i = 0; language[i].name; i++)
		language[i].count = 0;
	eob = (char *)buf+nbuf;
	for(n = 0, p = (char *)buf; p < eob; n++) {	/* n counts runes decoded */
		/* stop at a rune that was cut off by the end of the buffer */
		if (!fullrune(p, eob-p) && eob-p < UTFmax)
			break;
		p += chartorune(&r, p);
		if (r == 0)
			f = Cnull;
		else if (r <= 0x7f) {
			if (!isprint(r) && !isspace(r))
				f = Ceascii;	/* ASCII control char */
			else f = r;
		} else if (r == 0x080) {
			/* 0x80 has its own language[] entry, so it counts as UTF */
			bump_utf_count(r);
			f = Cutf;
		} else if (r < 0xA0)
				f = Cbinary;	/* Invalid Runes */
		else if (r <= 0xff)
				f = Clatin;	/* Latin 1 */
		else {
			bump_utf_count(r);
			f = Cutf;		/* UTF extension */
		}
		cfreq[f]++;			/* ASCII chars peg directly */
	}
	/*
	 * gross classify
	 */
	if (cfreq[Cbinary])
		guess = Fbinary;
	else if (cfreq[Cutf])
		guess = Futf;
	else if (cfreq[Clatin])
		guess = Flatin;
	else if (cfreq[Ceascii])
		guess = Feascii;
	else if (cfreq[Cnull] == n) {
		/* every rune decoded was NUL */
		print(mime ? OCTET : "first block all null bytes\n");
		return;
	}
	else guess = Fascii;
	/*
	 * lookup dictionary words
	 */
	memset(wfreq, 0, sizeof(wfreq));
	if(guess == Fascii || guess == Flatin || guess == Futf)
		wordfreq();
	/*
	 * call individual classify routines
	 */
	for(i=0; call[i]; i++)
		if((*call[i])())
			return;

	/*
	 * if all else fails,
	 * print out gross classification
	 */
	/* the guard already excludes mime, so this always prints "short " */
	if (nbuf < 100 && !mime)
		print(mime ? PLAIN : "short ");
	if (guess == Fascii)
		print(mime ? PLAIN : "Ascii\n");
	else if (guess == Feascii)
		print(mime ? PLAIN : "extended ascii\n");
	else if (guess == Flatin)
		print(mime ? PLAIN : "latin ascii\n");
	else if (guess == Futf && utf_count() < 4)
		print_utf();	/* fewer than four languages: name them */
	else print(mime ? OCTET : "binary\n");
}
362 
363 void
364 bump_utf_count(Rune r)
365 {
366 	int low, high, mid;
367 
368 	high = sizeof(language)/sizeof(language[0])-1;
369 	for (low = 0; low < high;) {
370 		mid = (low+high)/2;
371 		if (r >=language[mid].low) {
372 			if (r <= language[mid].high) {
373 				language[mid].count++;
374 				break;
375 			} else low = mid+1;
376 		} else high = mid;
377 	}
378 }
379 
380 int
381 utf_count(void)
382 {
383 	int i, count;
384 
385 	count = 0;
386 	for (i = 0; language[i].name; i++)
387 		if (language[i].count > 0)
388 			switch (language[i].mode) {
389 			case Normal:
390 			case First:
391 				count++;
392 				break;
393 			default:
394 				break;
395 			}
396 	return count;
397 }
398 
399 int
400 chkascii(void)
401 {
402 	int i;
403 
404 	for (i = 'a'; i < 'z'; i++)
405 		if (cfreq[i])
406 			return 1;
407 	for (i = 'A'; i < 'Z'; i++)
408 		if (cfreq[i])
409 			return 1;
410 	return 0;
411 }
412 
413 int
414 find_first(char *name)
415 {
416 	int i;
417 
418 	for (i = 0; language[i].name != 0; i++)
419 		if (language[i].mode == First
420 			&& strcmp(language[i].name, name) == 0)
421 			return i;
422 	return -1;
423 }
424 
425 void
426 print_utf(void)
427 {
428 	int i, printed, j;
429 
430 	if(mime){
431 		print(PLAIN);
432 		return;
433 	}
434 	if (chkascii()) {
435 		printed = 1;
436 		print("Ascii");
437 	} else
438 		printed = 0;
439 	for (i = 0; language[i].name; i++)
440 		if (language[i].count) {
441 			switch(language[i].mode) {
442 			case Multi:
443 				j = find_first(language[i].name);
444 				if (j < 0)
445 					break;
446 				if (language[j].count > 0)
447 					break;
448 				/* Fall through */
449 			case Normal:
450 			case First:
451 				if (printed)
452 					print(" & ");
453 				else printed = 1;
454 				print("%s", language[i].name);
455 				break;
456 			case Shared:
457 			default:
458 				break;
459 			}
460 		}
461 	if(!printed)
462 		print("UTF");
463 	print(" text\n");
464 }
465 
466 void
467 wordfreq(void)
468 {
469 	int low, high, mid, r;
470 	uchar *p, *p2, c;
471 
472 	p = buf;
473 	for(;;) {
474 		while (p < buf+nbuf && !isalpha(*p))
475 			p++;
476 		if (p >= buf+nbuf)
477 			return;
478 		p2 = p;
479 		while(p < buf+nbuf && isalpha(*p))
480 			p++;
481 		c = *p;
482 		*p = 0;
483 		high = sizeof(dict)/sizeof(dict[0]);
484 		for(low = 0;low < high;) {
485 			mid = (low+high)/2;
486 			r = strcmp(dict[mid].word, (char*)p2);
487 			if(r == 0) {
488 				wfreq[dict[mid].class]++;
489 				break;
490 			}
491 			if(r < 0)
492 				low = mid+1;
493 			else
494 				high = mid;
495 		}
496 		*p++ = c;
497 	}
498 }
499 
/*
 * One magic-number rule for filemagic(): a file matches when its
 * leading 32-bit value, ANDed with mask, equals x.
 */
typedef struct Filemagic Filemagic;
struct Filemagic {
	ulong x;	/* expected value */
	ulong mask;	/* bits of the file's value that take part in the comparison */
	char *desc;	/* answer printed in normal mode, newline included */
	char *mime;	/* answer printed in MIME mode, newline included */
};
507 
/*
 * Magic numbers tested by long0() against the first four bytes of
 * the file, read as a little-endian value (so the first byte of the
 * file is the low byte of x).  A 16-bit mask tests the first two
 * bytes only.
 */
Filemagic long0tab[] = {
	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
	0x32636170,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
	0x04034B50,	0xFFFFFFFF,	"zip archive\n", OCTET,
	070707,		0xFFFF,		"cpio archive\n", OCTET,
	0x2F7,		0xFFFF,		"tex dvi\n", OCTET,
};
518 
519 int
520 filemagic(Filemagic *tab, int ntab, ulong x)
521 {
522 	int i;
523 
524 	for(i=0; i<ntab; i++)
525 		if((x&tab[i].mask) == tab[i].x){
526 			print(mime ? tab[i].mime : tab[i].desc);
527 			return 1;
528 		}
529 	return 0;
530 }
531 
532 int
533 long0(void)
534 {
535 	Fhdr f;
536 	long x;
537 
538 	seek(fd, 0, 0);		/* reposition to start of file */
539 	if(crackhdr(fd, &f)) {
540 		print(mime ? OCTET : "%s\n", f.name);
541 		return 1;
542 	}
543 	x = LENDIAN(buf);
544 	if(filemagic(long0tab, nelem(long0tab), x))
545 		return 1;
546 	return 0;
547 }
548 
/*
 * initial words to classify file
 *
 * istring() compares the first 'length' bytes of the buffer with
 * 'key' using memcmp, so keys may contain NUL bytes and 'length'
 * must equal the number of bytes in the key.  The first matching
 * entry wins, so a longer key must precede any key that is a
 * prefix of it.  The list is terminated by an entry with a nil key.
 */
struct	FILE_STRING
{
	char 	*key;		/* leading bytes of the file */
	char	*filetype;	/* description printed in normal mode */
	int	length;		/* number of bytes of key to compare */
	char	*mime;		/* MIME type printed in MIME mode */
} file_string[] =
{
	"!<arch>\n__.SYMDEF",	"archive random library",	17,	"application/octet-stream",
	"!<arch>\n",		"archive",			8,	"application/octet-stream",
	"070707",		"cpio archive - ascii header",	6,	"application/octet-stream",
	"#!/bin/rc",		"rc executable file",		9,	"text/plain",
	"#!/bin/sh",		"sh executable file",		9,	"text/plain",
	"%!",			"postscript",			2,	"application/postscript",
	"\004%!",		"postscript",			3,	"application/postscript",
	"x T post",		"troff output for post",	8,	"application/troff",
	"x T Latin1",		"troff output for Latin1",	10,	"application/troff",
	"x T utf",		"troff output for UTF",		7,	"application/troff",
	"x T 202",		"troff output for 202",		7,	"application/troff",
	"x T aps",		"troff output for aps",		7,	"application/troff",
	"GIF",			"GIF image", 			3,	"image/gif",
	"\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript",
	"%PDF",			"PDF",				4,	"application/pdf",
	"<html>\n",		"HTML file",			7,	"text/html",
	"<HTML>\n",		"HTML file",			7,	"text/html",
	"compressed\n",		"Compressed image or subfont",	11,	"application/octet-stream",
	"\111\111\052\000",	"tiff",				4,	"image/tiff",
	"\115\115\000\052",	"tiff",				4,	"image/tiff",
	"\377\330\377\340",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\341",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\333",	"jpeg",				4,	"image/jpeg",
	"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream",
	0,0,0,0
};
586 
587 int
588 istring(void)
589 {
590 	int i;
591 	struct FILE_STRING *p;
592 
593 	for(p = file_string; p->key; p++) {
594 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
595 			if(mime)
596 				print("%s\n", p->mime);
597 			else
598 				print("%s\n", p->filetype);
599 			return 1;
600 		}
601 	}
602 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
603 		for(i = 5; i < nbuf; i++)
604 			if(buf[i] == '\n')
605 				break;
606 		if(mime)
607 			print(OCTET);
608 		else
609 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
610 		return 1;
611 	}
612 	return 0;
613 }
614 
/*
 * Tag names that ishtml() counts as evidence of HTML.
 * The list is terminated by a nil pointer.
 */
char*	html_string[] =
{
	"title",
	"body",
	"head",
	"strong",
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	"h6",
	"ul",
	"li",
	"dl",
	"br",
	"em",
	0,
};
634 
/*
 * Scan the buffer for <tag> and </tag> sequences and compare the
 * text between the brackets with html_string[].  The file is
 * reported as HTML on the sixth match (count++ > 4).
 * Returns 1 after printing an answer, 0 otherwise.
 */
int
ishtml(void)
{
	uchar *p, *q;
	int i, count;

		/* compare strings between '<' and '>' to html table */
	count = 0;
	p = buf;
	for(;;) {
		while (p < buf+nbuf && *p != '<')
			p++;
		p++;			/* step over the '<' */
		if (p >= buf+nbuf)
			break;
		if(*p == '/')		/* closing tags count as well */
			p++;
		q = p;			/* q: start of the tag text */
		while(p < buf+nbuf && *p != '>')
			p++;
		if (p >= buf+nbuf)
			break;
		for(i = 0; html_string[i]; i++) {
			/* only the p-q bytes between the brackets are compared */
			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
				if(count++ > 4) {
					print(mime ? "text/html\n" : "HTML file\n");
					return 1;
				}
				break;
			}
		}
		p++;			/* step over the '>' */
	}
	return 0;
}
670 
/*
 * Header field names that isrfc822() counts as evidence of a mail
 * message.  They are compared without regard to case against the
 * start of each header line.  The reply field is spelled with a
 * hyphen ("Reply-To:"); field names cannot contain a space.
 * The list is terminated by a nil pointer.
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",
	"sender:",
	0,
};
682 
/*
 * Decide whether the buffer starts with mail headers.
 *
 * Lines are examined one at a time.  A first line of the form
 * "From ... remote from ..." counts as one hit.  Every other line
 * that does not start with a tab or space must contain a ':' before
 * its newline, otherwise scanning stops; such a line counts as a
 * hit when it starts with one of rfc822_string[].  Three or more
 * hits make the file an email message.
 * Returns 1 after printing an answer, 0 otherwise.
 */
int
isrfc822(void)
{

	char *p, *q, *r;
	int i, count;

	count = 0;
	p = (char*)buf;
	for(;;) {
		q = strchr(p, '\n');
		if(q == nil)
			break;
		*q = 0;			/* terminate the line for strstr */
		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
			count++;
			*q = '\n';
			p = q+1;
			continue;
		}
		*q = '\n';		/* restore the buffer */
		if(*p != '\t' && *p != ' '){	/* lines starting with white space are skipped */
			r = strchr(p, ':');
			if(r == 0 || r > q)	/* no colon on this line: headers are over */
				break;
			for(i = 0; rfc822_string[i]; i++) {
				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
					count++;
					break;
				}
			}
		}
		p = q+1;
	}
	if(count >= 3){
		print(mime ? "message/rfc822\n" : "email file\n");
		return 1;
	}
	return 0;
}
723 
724 int
725 ismbox(void)
726 {
727 	char *p, *q;
728 
729 	p = (char*)buf;
730 	q = strchr(p, '\n');
731 	if(q == nil)
732 		return 0;
733 	*q = 0;
734 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
735 		print(mime ? "text/plain\n" : "mail box\n");
736 		return 1;
737 	}
738 	*q = '\n';
739 	return 0;
740 }
741 
742 int
743 iscint(void)
744 {
745 	int type;
746 	char *name;
747 	Biobuf b;
748 
749 	if(Binit(&b, fd, OREAD) == Beof)
750 		return 0;
751 	seek(fd, 0, 0);
752 	type = objtype(&b, &name);
753 	if(type < 0)
754 		return 0;
755 	if(mime)
756 		print(OCTET);
757 	else
758 		print("%s intermediate\n", name);
759 	return 1;
760 }
761 
762 int
763 isc(void)
764 {
765 	int n;
766 
767 	n = wfreq[I1];
768 	/*
769 	 * includes
770 	 */
771 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
772 		goto yes;
773 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
774 		goto yes;
775 	/*
776 	 * declarations
777 	 */
778 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
779 		goto yes;
780 	/*
781 	 * assignments
782 	 */
783 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
784 		goto yes;
785 	return 0;
786 
787 yes:
788 	if(mime){
789 		print(PLAIN);
790 		return 1;
791 	}
792 	if(wfreq[Alword] > 0)
793 		print("alef program\n");
794 	else
795 		print("c program\n");
796 	return 1;
797 }
798 
799 int
800 islimbo(void)
801 {
802 
803 	/*
804 	 * includes
805 	 */
806 	if(wfreq[Lword] < 4)
807 		return 0;
808 	print(mime ? PLAIN : "limbo program\n");
809 	return 1;
810 }
811 
812 int
813 isas(void)
814 {
815 
816 	/*
817 	 * includes
818 	 */
819 	if(wfreq[Aword] < 2)
820 		return 0;
821 	print(mime ? PLAIN : "as program\n");
822 	return 1;
823 }
824 
825 /*
826  * low entropy means encrypted
827  */
828 int
829 ismung(void)
830 {
831 	int i, bucket[8];
832 	float cs;
833 
834 	if(nbuf < 64)
835 		return 0;
836 	memset(bucket, 0, sizeof(bucket));
837 	for(i=0; i<64; i++)
838 		bucket[(buf[i]>>5)&07] += 1;
839 
840 	cs = 0.;
841 	for(i=0; i<8; i++)
842 		cs += (bucket[i]-8)*(bucket[i]-8);
843 	cs /= 8.;
844 	if(cs <= 24.322) {
845 		if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
846 			print(mime ? OCTET : "compressed\n");
847 		else
848 			print(mime ? OCTET : "encrypted\n");
849 		return 1;
850 	}
851 	return 0;
852 }
853 
854 /*
855  * english by punctuation and frequencies
856  */
857 int
858 isenglish(void)
859 {
860 	int vow, comm, rare, badpun, punct;
861 	char *p;
862 
863 	if(guess != Fascii && guess != Feascii)
864 		return 0;
865 	badpun = 0;
866 	punct = 0;
867 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
868 		switch(*p) {
869 		case '.':
870 		case ',':
871 		case ')':
872 		case '%':
873 		case ';':
874 		case ':':
875 		case '?':
876 			punct++;
877 			if(p[1] != ' ' && p[1] != '\n')
878 				badpun++;
879 		}
880 	if(badpun*5 > punct)
881 		return 0;
882 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
883 		return 0;
884 	if(2*cfreq[';'] > cfreq['e'])
885 		return 0;
886 
887 	vow = 0;
888 	for(p="AEIOU"; *p; p++) {
889 		vow += cfreq[*p];
890 		vow += cfreq[tolower(*p)];
891 	}
892 	comm = 0;
893 	for(p="ETAION"; *p; p++) {
894 		comm += cfreq[*p];
895 		comm += cfreq[tolower(*p)];
896 	}
897 	rare = 0;
898 	for(p="VJKQXZ"; *p; p++) {
899 		rare += cfreq[*p];
900 		rare += cfreq[tolower(*p)];
901 	}
902 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
903 		print(mime ? PLAIN : "English text\n");
904 		return 1;
905 	}
906 	return 0;
907 }
908 
909 /*
910  * pick up a number with
911  * syntax _*[0-9]+_
912  */
913 #define	P9BITLEN	12
914 int
915 p9bitnum(uchar *bp)
916 {
917 	int n, c, len;
918 
919 	len = P9BITLEN;
920 	while(*bp == ' ') {
921 		bp++;
922 		len--;
923 		if(len <= 0)
924 			return -1;
925 	}
926 	n = 0;
927 	while(len > 1) {
928 		c = *bp++;
929 		if(!isdigit(c))
930 			return -1;
931 		n = n*10 + c-'0';
932 		len--;
933 	}
934 	if(*bp != ' ')
935 		return -1;
936 	return n;
937 }
938 
/*
 * Work out the pixel depth from the first 12-byte field of an
 * image header.
 *
 * A field that starts (after spaces) with a digit holds a number n
 * and the depth is 1<<n; *newp is set to 0.  Otherwise the field is
 * read as a sequence of letter+number pairs whose numbers are
 * summed; *newp is set to 1 and the sum is accepted only if it is
 * 8, 16, 24 or 32.  Returns the depth, or -1 if the field is blank
 * or the sum is not an accepted depth.
 */
int
depthof(char *s, int *newp)
{
	char *end;
	int bits;

	*newp = 0;
	for(end = s+12; s < end; s++)
		if(*s != ' ')
			break;
	if(s == end)
		return -1;
	if(*s >= '0' && *s <= '9')
		return 1<<atoi(s);

	*newp = 1;
	for(bits = 0; s < end && *s != ' '; ){
		s++;	/* skip letter */
		bits += strtoul(s, &s, 10);
	}
	if(bits == 8 || bits == 16 || bits == 24 || bits == 32)
		return bits;
	return -1;
}
970 
/*
 * Recognise a Plan 9 image or subfont.
 *
 * The buffer must start with five P9BITLEN-byte fields: a depth
 * (see depthof) followed by lox, loy, hix and hiy.  From these the
 * expected size of the file is computed and compared with the
 * length recorded in mbuf.
 * Returns 1 after printing an answer, 0 otherwise.
 */
int
isp9bit(void)
{
	int dep, lox, loy, hix, hiy, px, new;
	ulong t;
	long len;
	char *newlabel;

	newlabel = "old ";

	dep = depthof((char*)buf + 0*P9BITLEN, &new);
	if(new)
		newlabel = "";
	lox = p9bitnum(buf + 1*P9BITLEN);
	loy = p9bitnum(buf + 2*P9BITLEN);
	hix = p9bitnum(buf + 3*P9BITLEN);
	hiy = p9bitnum(buf + 4*P9BITLEN);
	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
		return 0;

	if(dep < 8){
		px = 8/dep;	/* pixels per byte */
		/* set len to number of bytes of data per scan line */
		if(lox >= 0)	/* always true here: negative lox was rejected above */
			len = (hix+px-1)/px - lox/px;
		else{	/* make positive before divide */
			t = (-lox)+px-1;
			t = (t/px)*px;
			len = (t+hix+px-1)/px;
		}
	}else
		len = (hix-lox)*dep/8;
	len *= (hiy-loy);		/* col length */
	len += 5*P9BITLEN;		/* size of initial ascii */

	/*
	 * for image file, length is non-zero and must match calculation above
	 * for /dev/window and /dev/screen the length is always zero
	 * for subfont, the subfont header should follow immediately.
	 */
	if (len != 0 && mbuf->length == 0) {
		print("%splan 9 image\n", newlabel);
		return 1;
	}
	if (mbuf->length == len) {
		print("%splan 9 image\n", newlabel);
		return 1;
	}
	/* Ghostscript sometimes produces a little extra on the end */
	/* NOTE(review): this test also accepts files shorter than len -- confirm intent */
	if (mbuf->length < len+P9BITLEN) {
		print("%splan 9 image\n", newlabel);
		return 1;
	}
	/* otherwise a subfont header may follow the pixel data */
	if (p9subfont(buf+len)) {
		print("%ssubfont file\n", newlabel);
		return 1;
	}
	return 0;
}
1030 
1031 int
1032 p9subfont(uchar *p)
1033 {
1034 	int n, h, a;
1035 
1036 		/* if image too big, assume it's a subfont */
1037 	if (p+3*P9BITLEN > buf+sizeof(buf))
1038 		return 1;
1039 
1040 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1041 	if (n < 0)
1042 		return 0;
1043 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1044 	if (h < 0)
1045 		return 0;
1046 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1047 	if (a < 0)
1048 		return 0;
1049 	return 1;
1050 }
1051 
/* true for the separators used in font files: space, tab and newline */
#define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1053 
/*
 * Recognise a Plan 9 font file: two numbers (height and ascent)
 * followed by one or more "min max name" entries in which every
 * name is an existing file.  A name that does not start with '/'
 * is looked up relative to the directory of the file being
 * examined when its name contained a slash.
 * Returns 1 after printing an answer, 0 otherwise.
 */
int
isp9font(void)
{
	uchar *cp, *p;
	int i, n;
	char pathname[1024];

	cp = buf;
	if (!getfontnum(cp, &cp))	/* height */
		return 0;
	if (!getfontnum(cp, &cp))	/* ascent */
		return 0;
	for (i = 0; 1; i++) {		/* i counts the entries accepted */
		if (!getfontnum(cp, &cp))	/* min */
			break;
		if (!getfontnum(cp, &cp))	/* max */
			return 0;
		while (WHITESPACE(*cp))
			cp++;
		/* p..cp brackets the subfont file name */
		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
				;
			/* construct a path name, if needed */
		n = 0;
		if (*p != '/' && slash) {
			n = slash-fname+1;	/* directory part, including the slash */
			if (n < sizeof(pathname))
				memcpy(pathname, fname, n);
			else n = 0;
		}
		/* a name too long for pathname is accepted without being checked */
		if (n+cp-p < sizeof(pathname)) {
			memcpy(pathname+n, p, cp-p);
			n += cp-p;
			pathname[n] = 0;
			if (access(pathname, AEXIST) < 0)
				return 0;
		}
	}
	if (i) {
		print("font file\n");
		return 1;
	}
	return 0;
}
1097 
1098 int
1099 getfontnum(uchar *cp, uchar **rp)
1100 {
1101 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1102 		cp++;
1103 	if (*cp < '0' || *cp > '9')
1104 		return 0;
1105 	strtoul((char *)cp, (char **)rp, 0);
1106 	if (!WHITESPACE(**rp))
1107 		return 0;
1108 	return 1;
1109 }
1110 
1111 int
1112 ishp(void)
1113 {
1114 	if (strncmp("\033%-12345X", (char *)buf, 9)==0) {
1115 		print("HPJCL file\n");
1116 		return 1;
1117 	}
1118 	return 0;
1119 }
1120