xref: /plan9/sys/src/cmd/file.c (revision 9a747e4fd48b9f4522c70c07e8f882a15030f964)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/*
 * Assemble a 32-bit little-endian value from the four bytes at p.
 * Each byte is widened to unsigned long before shifting so that a
 * top byte >= 0x80 is never shifted into the sign bit of an int.
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			 ((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
11 
12 uchar	buf[6001];
13 short	cfreq[140];
14 short	wfreq[50];
15 int	nbuf;
16 Dir*	mbuf;
17 int	fd;
18 char 	*fname;
19 char	*slash;
20 
/*
 * Word classes (Cword..I3) index wfreq[]; character classes
 * (Clatin..Cutf) index cfreq[] and start at 128 so that they do
 * not collide with the slots used for plain ASCII codes.
 */
enum
{
	Cword,		/* C keyword */
	Fword,		/* presumably a Fortran keyword; no classifier here reads it */
	Aword,		/* assembler keyword, counted by isas() */
	Alword,		/* alef keyword, counted by isc() */
	Lword,		/* limbo keyword, counted by islimbo() */
	I1,		/* the word "include" */
	I2,		/* base name of a common header */
	I3,		/* the "h" of a header suffix */
	Clatin	= 128,	/* rune in 0xA0-0xFF */
	Cbinary,	/* rune in 0x81-0x9F */
	Cnull,		/* NUL */
	Ceascii,	/* ASCII control character */
	Cutf,		/* rune 0x80 or above 0xFF */
};

/*
 * Keyword dictionary.  wordfreq() looks words up with a binary
 * search on strcmp(), so the entries MUST stay in strcmp (byte)
 * order; upper case sorts before lower case.  "bio" used to sit
 * after "extern", which made it unreachable and could hide its
 * neighbours from the search.
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	"PATH",		Lword,
	"TEXT",		Aword,
	"adt",		Alword,
	"aggr",		Alword,
	"alef",		Alword,
	"array",	Lword,
	"bio",		I2,
	"block",	Fword,
	"chan",		Alword,
	"char",		Cword,
	"common",	Fword,
	"con",		Lword,
	"data",		Fword,
	"dimension",	Fword,
	"double",	Cword,
	"extern",	Cword,
	"float",	Cword,
	"fn",		Lword,
	"function",	Fword,
	"h",		I3,
	"implement",	Lword,
	"import",	Lword,
	"include",	I1,
	"int",		Cword,
	"integer",	Fword,
	"iota",		Lword,
	"libc",		I2,
	"long",		Cword,
	"module",	Lword,
	"real",		Fword,
	"ref",		Lword,
	"register",	Cword,
	"self",		Lword,
	"short",	Cword,
	"static",	Cword,
	"stdio",	I2,
	"struct",	Cword,
	"subroutine",	Fword,
	"u",		I2,
	"void",		Cword,
};
84 
/* values of the 'mode' field in the language table */
enum	{
		Normal	= 0,	/* language covered by a single range */
		First	= 1,	/* first entry for language spanning several ranges */
		Multi	= 2,	/* later entries of such a language */
		Shared	= 3,	/* codes used in several languages */
	};
92 
93 struct
94 {
95 	int	mode;		/* see enum above */
96 	int 	count;
97 	int	low;
98 	int	high;
99 	char	*name;
100 
101 } language[] =
102 {
103 	Normal, 0,	0x0080, 0x0080,	"Extended Latin",
104 	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
105 	Normal,	0,	0x0370,	0x03FF,	"Greek",
106 	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
107 	Normal,	0,	0x0530,	0x058F,	"Armenian",
108 	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
109 	Normal,	0,	0x0600,	0x06FF,	"Arabic",
110 	Normal,	0,	0x0900,	0x097F,	"Devanagari",
111 	Normal,	0,	0x0980,	0x09FF,	"Bengali",
112 	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
113 	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
114 	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
115 	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
116 	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
117 	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
118 	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
119 	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
120 	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
121 	Normal,	0,	0x1000,	0x105F,	"Tibetan",
122 	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
123 	Normal,	0,	0x3040,	0x30FF,	"Japanese",
124 	Normal,	0,	0x3100,	0x312F,	"Chinese",
125 	First,	0,	0x3130,	0x318F,	"Korean",
126 	Multi,	0,	0x3400,	0x3D2F,	"Korean",
127 	Shared,	0,	0x4e00,	0x9fff,	"CJK",
128 	Normal,	0,	0,	0,	0,		/* terminal entry */
129 };
130 
131 
/* gross classification of the buffer, chosen by filetype() */
enum
{
	Fascii	= 0,	/* printable ASCII */
	Flatin	= 1,	/* Latin-1 */
	Futf	= 2,	/* UTF character set */
	Fbinary	= 3,	/* binary */
	Feascii	= 4,	/* ASCII with control characters */
	Fnull	= 5,	/* NUL bytes in file */
} guess;
141 
142 void	bump_utf_count(Rune);
143 int	cistrncmp(char*, char*, int);
144 void	filetype(int);
145 int	getfontnum(uchar*, uchar**);
146 int	isas(void);
147 int	isc(void);
148 int	iscint(void);
149 int	isenglish(void);
150 int	ishp(void);
151 int	ishtml(void);
152 int	isrfc822(void);
153 int	islimbo(void);
154 int	ismung(void);
155 int	isp9bit(void);
156 int	isp9font(void);
157 int	istring(void);
158 int	long0(void);
159 int	p9bitnum(uchar*);
160 int	p9subfont(uchar*);
161 void	print_utf(void);
162 void	type(char*, int);
163 int	utf_count(void);
164 void	wordfreq(void);
165 
166 int	(*call[])(void) =
167 {
168 	long0,		/* recognizable by first 4 bytes */
169 	istring,	/* recognizable by first string */
170 	ishtml,		/* html keywords */
171 	isrfc822,	/* email file */
172 	iscint,		/* compiler/assembler intermediate */
173 	islimbo,	/* limbo source */
174 	isc,		/* c & alef compiler key words */
175 	isas,		/* assembler key words */
176 	ismung,		/* entropy compressed/encrypted */
177 	isp9font,	/* plan 9 font */
178 	isp9bit,	/* plan 9 image (as from /dev/window) */
179 	isenglish,	/* char frequency English */
180 	ishp,		/* HP Job Control Language - Postscript */
181 	0
182 };
183 
/* MIME answers shared by several classifiers */
#define PLAIN	"text/plain\n"
#define OCTET	"application/octet-stream\n"

int mime;	/* set by -m: print MIME types instead of descriptions */
188 
189 void
190 main(int argc, char *argv[])
191 {
192 	int i, j, maxlen;
193 	char *cp;
194 	Rune r;
195 
196 	ARGBEGIN{
197 	case 'm':
198 		mime = 1;
199 		break;
200 	default:
201 		fprint(2, "usage: file [-m] [file...]\n");
202 		exits("usage");
203 	}ARGEND;
204 
205 	maxlen = 0;
206 	if(mime == 0 || argc > 1){
207 		for(i = 0; i < argc; i++) {
208 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
209 					;
210 			if(j > maxlen)
211 				maxlen = j;
212 		}
213 	}
214 	if (argc <= 0) {
215 		if(!mime)
216 			print ("stdin: ");
217 		filetype(0);
218 	}
219 	else {
220 		for(i = 0; i < argc; i++)
221 			type(argv[i], maxlen);
222 	}
223 	exits(0);
224 }
225 
226 void
227 type(char *file, int nlen)
228 {
229 	Rune r;
230 	int i;
231 	char *p;
232 
233 	if(nlen > 0){
234 		slash = 0;
235 		for (i = 0, p = file; *p; i++) {
236 			if (*p == '/')			/* find rightmost slash */
237 				slash = p;
238 			p += chartorune(&r, p);		/* count runes */
239 		}
240 		print("%s:%*s",file, nlen-i+1, "");
241 	}
242 	fname = file;
243 	if ((fd = open(file, OREAD)) < 0) {
244 		print("cannot open\n");
245 		return;
246 	}
247 	filetype(fd);
248 	close(fd);
249 }
250 
251 void
252 filetype(int fd)
253 {
254 	Rune r;
255 	int i, f, n;
256 	char *p, *eob;
257 
258 	free(mbuf);
259 	mbuf = dirfstat(fd);
260 	if(mbuf == nil){
261 		print("cannot stat: %r\n");
262 		return;
263 	}
264 	if(mbuf->mode & DMDIR) {
265 		print(mime ? "text/directory\n" : "directory\n");
266 		return;
267 	}
268 	if(mbuf->type != 'M' && mbuf->type != '|') {
269 		print(mime ? OCTET : "special file #%c/%s\n",
270 			mbuf->type, mbuf->name);
271 		return;
272 	}
273 	nbuf = read(fd, buf, sizeof(buf)-1);
274 
275 	if(nbuf < 0) {
276 		print("cannot read\n");
277 		return;
278 	}
279 	if(nbuf == 0) {
280 		print(mime ? PLAIN : "empty file\n");
281 		return;
282 	}
283 	buf[nbuf] = 0;
284 
285 	/*
286 	 * build histogram table
287 	 */
288 	memset(cfreq, 0, sizeof(cfreq));
289 	for (i = 0; language[i].name; i++)
290 		language[i].count = 0;
291 	eob = (char *)buf+nbuf;
292 	for(n = 0, p = (char *)buf; p < eob; n++) {
293 		if (!fullrune(p, eob-p) && eob-p < UTFmax)
294 			break;
295 		p += chartorune(&r, p);
296 		if (r == 0)
297 			f = Cnull;
298 		else if (r <= 0x7f) {
299 			if (!isprint(r) && !isspace(r))
300 				f = Ceascii;	/* ASCII control char */
301 			else f = r;
302 		} else if (r == 0x080) {
303 			bump_utf_count(r);
304 			f = Cutf;
305 		} else if (r < 0xA0)
306 				f = Cbinary;	/* Invalid Runes */
307 		else if (r <= 0xff)
308 				f = Clatin;	/* Latin 1 */
309 		else {
310 			bump_utf_count(r);
311 			f = Cutf;		/* UTF extension */
312 		}
313 		cfreq[f]++;			/* ASCII chars peg directly */
314 	}
315 	/*
316 	 * gross classify
317 	 */
318 	if (cfreq[Cbinary])
319 		guess = Fbinary;
320 	else if (cfreq[Cutf])
321 		guess = Futf;
322 	else if (cfreq[Clatin])
323 		guess = Flatin;
324 	else if (cfreq[Ceascii])
325 		guess = Feascii;
326 	else if (cfreq[Cnull] == n) {
327 		print(mime ? OCTET : "first block all null bytes\n");
328 		return;
329 	}
330 	else guess = Fascii;
331 	/*
332 	 * lookup dictionary words
333 	 */
334 	memset(wfreq, 0, sizeof(wfreq));
335 	if(guess == Fascii || guess == Flatin || guess == Futf)
336 		wordfreq();
337 	/*
338 	 * call individual classify routines
339 	 */
340 	for(i=0; call[i]; i++)
341 		if((*call[i])())
342 			return;
343 
344 	/*
345 	 * if all else fails,
346 	 * print out gross classification
347 	 */
348 	if (nbuf < 100 && !mime)
349 		print(mime ? PLAIN : "short ");
350 	if (guess == Fascii)
351 		print(mime ? PLAIN : "Ascii\n");
352 	else if (guess == Feascii)
353 		print(mime ? PLAIN : "extended ascii\n");
354 	else if (guess == Flatin)
355 		print(mime ? PLAIN : "latin ascii\n");
356 	else if (guess == Futf && utf_count() < 4)
357 		print_utf();
358 	else print(mime ? OCTET : "binary\n");
359 }
360 
361 void
362 bump_utf_count(Rune r)
363 {
364 	int low, high, mid;
365 
366 	high = sizeof(language)/sizeof(language[0])-1;
367 	for (low = 0; low < high;) {
368 		mid = (low+high)/2;
369 		if (r >=language[mid].low) {
370 			if (r <= language[mid].high) {
371 				language[mid].count++;
372 				break;
373 			} else low = mid+1;
374 		} else high = mid;
375 	}
376 }
377 
378 int
379 utf_count(void)
380 {
381 	int i, count;
382 
383 	count = 0;
384 	for (i = 0; language[i].name; i++)
385 		if (language[i].count > 0)
386 			switch (language[i].mode) {
387 			case Normal:
388 			case First:
389 				count++;
390 				break;
391 			default:
392 				break;
393 			}
394 	return count;
395 }
396 
397 int
398 chkascii(void)
399 {
400 	int i;
401 
402 	for (i = 'a'; i < 'z'; i++)
403 		if (cfreq[i])
404 			return 1;
405 	for (i = 'A'; i < 'Z'; i++)
406 		if (cfreq[i])
407 			return 1;
408 	return 0;
409 }
410 
411 int
412 find_first(char *name)
413 {
414 	int i;
415 
416 	for (i = 0; language[i].name != 0; i++)
417 		if (language[i].mode == First
418 			&& strcmp(language[i].name, name) == 0)
419 			return i;
420 	return -1;
421 }
422 
423 void
424 print_utf(void)
425 {
426 	int i, printed, j;
427 
428 	if(mime){
429 		print(PLAIN);
430 		return;
431 	}
432 	if (chkascii()) {
433 		printed = 1;
434 		print("Ascii");
435 	} else
436 		printed = 0;
437 	for (i = 0; language[i].name; i++)
438 		if (language[i].count) {
439 			switch(language[i].mode) {
440 			case Multi:
441 				j = find_first(language[i].name);
442 				if (j < 0)
443 					break;
444 				if (language[j].count > 0)
445 					break;
446 				/* Fall through */
447 			case Normal:
448 			case First:
449 				if (printed)
450 					print(" & ");
451 				else printed = 1;
452 				print("%s", language[i].name);
453 				break;
454 			case Shared:
455 			default:
456 				break;
457 			}
458 		}
459 	if(!printed)
460 		print("UTF");
461 	print(" text\n");
462 }
463 
464 void
465 wordfreq(void)
466 {
467 	int low, high, mid, r;
468 	uchar *p, *p2, c;
469 
470 	p = buf;
471 	for(;;) {
472 		while (p < buf+nbuf && !isalpha(*p))
473 			p++;
474 		if (p >= buf+nbuf)
475 			return;
476 		p2 = p;
477 		while(p < buf+nbuf && isalpha(*p))
478 			p++;
479 		c = *p;
480 		*p = 0;
481 		high = sizeof(dict)/sizeof(dict[0]);
482 		for(low = 0;low < high;) {
483 			mid = (low+high)/2;
484 			r = strcmp(dict[mid].word, (char*)p2);
485 			if(r == 0) {
486 				wfreq[dict[mid].class]++;
487 				break;
488 			}
489 			if(r < 0)
490 				low = mid+1;
491 			else
492 				high = mid;
493 		}
494 		*p++ = c;
495 	}
496 }
497 
498 typedef struct Filemagic Filemagic;
499 struct Filemagic {
500 	ulong x;
501 	ulong mask;
502 	char *desc;
503 	char *mime;
504 };
505 
506 Filemagic long0tab[] = {
507 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
508 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
509 	0x32636170,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
510 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
511 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
512 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", OCTET,
513 	070707,		0xFFFF,		"cpio archive\n", OCTET,
514 	0x2F7,		0xFFFF,		"tex dvi\n", OCTET,
515 };
516 
517 int
518 filemagic(Filemagic *tab, int ntab, ulong x)
519 {
520 	int i;
521 
522 	for(i=0; i<ntab; i++)
523 		if((x&tab[i].mask) == tab[i].x){
524 			print(mime ? tab[i].mime : tab[i].desc);
525 			return 1;
526 		}
527 	return 0;
528 }
529 
530 int
531 long0(void)
532 {
533 	Fhdr f;
534 	long x;
535 
536 	seek(fd, 0, 0);		/* reposition to start of file */
537 	if(crackhdr(fd, &f)) {
538 		print(mime ? OCTET : "%s\n", f.name);
539 		return 1;
540 	}
541 	x = LENDIAN(buf);
542 	if(filemagic(long0tab, nelem(long0tab), x))
543 		return 1;
544 	return 0;
545 }
546 
/*
 * Leading strings that identify a file.  istring() compares the
 * first 'length' bytes of the buffer with 'key' using memcmp, so
 * keys may contain NUL bytes and 'length' may be shorter than the
 * key.  Rows are tried in order; a nil key ends the table.
 */
struct	FILE_STRING
{
	char 	*key;		/* bytes expected at the start of the file */
	char	*filetype;	/* description printed in normal mode */
	int	length;		/* number of bytes of key to compare */
	char	*mime;		/* MIME type printed in -m mode */
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "#!/bin/rc",		"rc executable file",		9,	"text/plain" },
	{ "#!/bin/sh",		"sh executable file",		9,	"text/plain" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "From ",		"mail box",			5,	"text/plain" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream" },
	{ 0, 0, 0, 0 }
};
585 
586 int
587 istring(void)
588 {
589 	int i;
590 	struct FILE_STRING *p;
591 
592 	for(p = file_string; p->key; p++) {
593 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
594 			if(mime)
595 				print("%s\n", p->mime);
596 			else
597 				print("%s\n", p->filetype);
598 			return 1;
599 		}
600 	}
601 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
602 		for(i = 5; i < nbuf; i++)
603 			if(buf[i] == '\n')
604 				break;
605 		if(mime)
606 			print(OCTET);
607 		else
608 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
609 		return 1;
610 	}
611 	return 0;
612 }
613 
/* tag names counted by ishtml(); a nil pointer ends the list */
char*	html_string[] =
{
	"title", "body", "head", "strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl", "br", "em",
	0,
};
633 
634 int
635 ishtml(void)
636 {
637 	uchar *p, *q;
638 	int i, count;
639 
640 		/* compare strings between '<' and '>' to html table */
641 	count = 0;
642 	p = buf;
643 	for(;;) {
644 		while (p < buf+nbuf && *p != '<')
645 			p++;
646 		p++;
647 		if (p >= buf+nbuf)
648 			break;
649 		if(*p == '/')
650 			p++;
651 		q = p;
652 		while(p < buf+nbuf && *p != '>')
653 			p++;
654 		if (p >= buf+nbuf)
655 			break;
656 		for(i = 0; html_string[i]; i++) {
657 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
658 				if(count++ > 4) {
659 					print(mime ? "text/html\n" : "HTML file\n");
660 					return 1;
661 				}
662 				break;
663 			}
664 		}
665 		p++;
666 	}
667 	return 0;
668 }
669 
/* header names counted by isrfc822(); a nil pointer ends the list */
char*	rfc822_string[] =
{
	"from:", "date:", "to:", "subject:", "received:",
	0,
};
679 
680 int
681 isrfc822(void)
682 {
683 
684 	char *p, *q, *r;
685 	int i, count;
686 
687 	count = 0;
688 	p = (char*)buf;
689 	for(;;) {
690 		q = strchr(p, '\n');
691 		if(q == nil)
692 			break;
693 		if(*p != '\t' && *p != ' '){
694 			r = strchr(p, ':');
695 			if(r == 0 || r > q)
696 				break;
697 			for(i = 0; rfc822_string[i]; i++) {
698 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
699 					count++;
700 					break;
701 				}
702 			}
703 		}
704 		p = q+1;
705 	}
706 	if(count >= 3){
707 		print(mime ? "message/rfc822\n" : "email file\n");
708 		return 1;
709 	}
710 	return 0;
711 }
712 
713 int
714 iscint(void)
715 {
716 	int type;
717 	char *name;
718 	Biobuf b;
719 
720 	if(Binit(&b, fd, OREAD) == Beof)
721 		return 0;
722 	seek(fd, 0, 0);
723 	type = objtype(&b, &name);
724 	if(type < 0)
725 		return 0;
726 	if(mime)
727 		print(OCTET);
728 	else
729 		print("%s intermediate\n", name);
730 	return 1;
731 }
732 
733 int
734 isc(void)
735 {
736 	int n;
737 
738 	n = wfreq[I1];
739 	/*
740 	 * includes
741 	 */
742 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
743 		goto yes;
744 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
745 		goto yes;
746 	/*
747 	 * declarations
748 	 */
749 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
750 		goto yes;
751 	/*
752 	 * assignments
753 	 */
754 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
755 		goto yes;
756 	return 0;
757 
758 yes:
759 	if(mime){
760 		print(PLAIN);
761 		return 1;
762 	}
763 	if(wfreq[Alword] > 0)
764 		print("alef program\n");
765 	else
766 		print("c program\n");
767 	return 1;
768 }
769 
770 int
771 islimbo(void)
772 {
773 
774 	/*
775 	 * includes
776 	 */
777 	if(wfreq[Lword] < 4)
778 		return 0;
779 	print(mime ? PLAIN : "limbo program\n");
780 	return 1;
781 }
782 
783 int
784 isas(void)
785 {
786 
787 	/*
788 	 * includes
789 	 */
790 	if(wfreq[Aword] < 2)
791 		return 0;
792 	print(mime ? PLAIN : "as program\n");
793 	return 1;
794 }
795 
796 /*
797  * low entropy means encrypted
798  */
799 int
800 ismung(void)
801 {
802 	int i, bucket[8];
803 	float cs;
804 
805 	if(nbuf < 64)
806 		return 0;
807 	memset(bucket, 0, sizeof(bucket));
808 	for(i=0; i<64; i++)
809 		bucket[(buf[i]>>5)&07] += 1;
810 
811 	cs = 0.;
812 	for(i=0; i<8; i++)
813 		cs += (bucket[i]-8)*(bucket[i]-8);
814 	cs /= 8.;
815 	if(cs <= 24.322) {
816 		if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
817 			print(mime ? OCTET : "compressed\n");
818 		else
819 			print(mime ? OCTET : "encrypted\n");
820 		return 1;
821 	}
822 	return 0;
823 }
824 
825 /*
826  * english by punctuation and frequencies
827  */
828 int
829 isenglish(void)
830 {
831 	int vow, comm, rare, badpun, punct;
832 	char *p;
833 
834 	if(guess != Fascii && guess != Feascii)
835 		return 0;
836 	badpun = 0;
837 	punct = 0;
838 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
839 		switch(*p) {
840 		case '.':
841 		case ',':
842 		case ')':
843 		case '%':
844 		case ';':
845 		case ':':
846 		case '?':
847 			punct++;
848 			if(p[1] != ' ' && p[1] != '\n')
849 				badpun++;
850 		}
851 	if(badpun*5 > punct)
852 		return 0;
853 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
854 		return 0;
855 	if(2*cfreq[';'] > cfreq['e'])
856 		return 0;
857 
858 	vow = 0;
859 	for(p="AEIOU"; *p; p++) {
860 		vow += cfreq[*p];
861 		vow += cfreq[tolower(*p)];
862 	}
863 	comm = 0;
864 	for(p="ETAION"; *p; p++) {
865 		comm += cfreq[*p];
866 		comm += cfreq[tolower(*p)];
867 	}
868 	rare = 0;
869 	for(p="VJKQXZ"; *p; p++) {
870 		rare += cfreq[*p];
871 		rare += cfreq[tolower(*p)];
872 	}
873 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
874 		print(mime ? PLAIN : "English text\n");
875 		return 1;
876 	}
877 	return 0;
878 }
879 
880 /*
881  * pick up a number with
882  * syntax _*[0-9]+_
883  */
884 #define	P9BITLEN	12
885 int
886 p9bitnum(uchar *bp)
887 {
888 	int n, c, len;
889 
890 	len = P9BITLEN;
891 	while(*bp == ' ') {
892 		bp++;
893 		len--;
894 		if(len <= 0)
895 			return -1;
896 	}
897 	n = 0;
898 	while(len > 1) {
899 		c = *bp++;
900 		if(!isdigit(c))
901 			return -1;
902 		n = n*10 + c-'0';
903 		len--;
904 	}
905 	if(*bp != ' ')
906 		return -1;
907 	return n;
908 }
909 
/*
 * Decode the first 12-byte field of an image header.
 *
 * s:    start of the field.
 * newp: set to 0 when the field holds a number (old format),
 *       to 1 when it holds a channel descriptor (new format).
 *
 * Old format: the field is a number n and the depth is 1<<n.
 * The number is range-checked before shifting, because shifting
 * by the width of int or more is undefined; such fields are
 * rejected.
 * New format: letter/number pairs such as r8g8b8; the depth is
 * the sum of the numbers and must be 8, 16, 24 or 32.
 *
 * Returns the depth in bits, or -1 if the field is not valid.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d;
	unsigned long ld;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtoul(s, 0, 10);
		if(ld >= 8*sizeof(int)-1)	/* 1<<ld would overflow int */
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;	/* skip letter */
		d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
941 
942 int
943 isp9bit(void)
944 {
945 	int dep, lox, loy, hix, hiy, px, new;
946 	ulong t;
947 	long len;
948 	char *newlabel;
949 
950 	newlabel = "old ";
951 
952 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
953 	if(new)
954 		newlabel = "";
955 	lox = p9bitnum(buf + 1*P9BITLEN);
956 	loy = p9bitnum(buf + 2*P9BITLEN);
957 	hix = p9bitnum(buf + 3*P9BITLEN);
958 	hiy = p9bitnum(buf + 4*P9BITLEN);
959 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
960 		return 0;
961 
962 	if(dep < 8){
963 		px = 8/dep;	/* pixels per byte */
964 		/* set l to number of bytes of data per scan line */
965 		if(lox >= 0)
966 			len = (hix+px-1)/px - lox/px;
967 		else{	/* make positive before divide */
968 			t = (-lox)+px-1;
969 			t = (t/px)*px;
970 			len = (t+hix+px-1)/px;
971 		}
972 	}else
973 		len = (hix-lox)*dep/8;
974 	len *= (hiy-loy);		/* col length */
975 	len += 5*P9BITLEN;		/* size of initial ascii */
976 
977 	/*
978 	 * for image file, length is non-zero and must match calculation above
979 	 * for /dev/window and /dev/screen the length is always zero
980 	 * for subfont, the subfont header should follow immediately.
981 	 */
982 	if (len != 0 && mbuf->length == 0) {
983 		print("%splan 9 image\n", newlabel);
984 		return 1;
985 	}
986 	if (mbuf->length == len) {
987 		print("%splan 9 image\n", newlabel);
988 		return 1;
989 	}
990 	/* Ghostscript sometimes produces a little extra on the end */
991 	if (mbuf->length < len+P9BITLEN) {
992 		print("%splan 9 image\n", newlabel);
993 		return 1;
994 	}
995 	if (p9subfont(buf+len)) {
996 		print("%ssubfont file\n", newlabel);
997 		return 1;
998 	}
999 	return 0;
1000 }
1001 
1002 int
1003 p9subfont(uchar *p)
1004 {
1005 	int n, h, a;
1006 
1007 		/* if image too big, assume it's a subfont */
1008 	if (p+3*P9BITLEN > buf+sizeof(buf))
1009 		return 1;
1010 
1011 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
1012 	if (n < 0)
1013 		return 0;
1014 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
1015 	if (h < 0)
1016 		return 0;
1017 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
1018 	if (a < 0)
1019 		return 0;
1020 	return 1;
1021 }
1022 
/* true for the separators of a font file: newline, tab and space */
#define	WHITESPACE(c)		((c) == '\n' || (c) == '\t' || (c) == ' ')
1024 
1025 int
1026 isp9font(void)
1027 {
1028 	uchar *cp, *p;
1029 	int i, n;
1030 	char pathname[1024];
1031 
1032 	cp = buf;
1033 	if (!getfontnum(cp, &cp))	/* height */
1034 		return 0;
1035 	if (!getfontnum(cp, &cp))	/* ascent */
1036 		return 0;
1037 	for (i = 0; 1; i++) {
1038 		if (!getfontnum(cp, &cp))	/* min */
1039 			break;
1040 		if (!getfontnum(cp, &cp))	/* max */
1041 			return 0;
1042 		while (WHITESPACE(*cp))
1043 			cp++;
1044 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1045 				;
1046 			/* construct a path name, if needed */
1047 		n = 0;
1048 		if (*p != '/' && slash) {
1049 			n = slash-fname+1;
1050 			if (n < sizeof(pathname))
1051 				memcpy(pathname, fname, n);
1052 			else n = 0;
1053 		}
1054 		if (n+cp-p < sizeof(pathname)) {
1055 			memcpy(pathname+n, p, cp-p);
1056 			n += cp-p;
1057 			pathname[n] = 0;
1058 			if (access(pathname, AEXIST) < 0)
1059 				return 0;
1060 		}
1061 	}
1062 	if (i) {
1063 		print("font file\n");
1064 		return 1;
1065 	}
1066 	return 0;
1067 }
1068 
1069 int
1070 getfontnum(uchar *cp, uchar **rp)
1071 {
1072 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1073 		cp++;
1074 	if (*cp < '0' || *cp > '9')
1075 		return 0;
1076 	strtoul((char *)cp, (char **)rp, 0);
1077 	if (!WHITESPACE(**rp))
1078 		return 0;
1079 	return 1;
1080 }
1081 
1082 int
1083 ishp(void)
1084 {
1085 	if (strncmp("\033%-12345X", (char *)buf, 9)==0) {
1086 		print("HPJCL file\n");
1087 		return 1;
1088 	}
1089 	return 0;
1090 }
1091