xref: /plan9-contrib/sys/src/cmd/file.c (revision 7dd7cddf99dd7472612f1413b4da293630e6b1bc)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
6 
7 /*
8  * file - determine type of file
9  */
/* the first four bytes at p combined as a little-endian integer */
10 #define	LENDIAN(p)	((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24))
11 
12 uchar	buf[6000];	/* leading bytes of the file being examined */
13 short	cfreq[140];	/* rune histogram: ASCII indexes directly, the rest by C* class */
14 short	wfreq[50];	/* dictionary word hits, indexed by dict[].class */
15 int	nbuf;		/* byte count returned by the read into buf */
16 Dir	mbuf;		/* directory entry of the current file, filled by dirfstat */
17 int	fd;		/* descriptor of the current file; set by type() */
18 char 	*fname;		/* name of the current file; set by type() */
19 char	*slash;		/* rightmost '/' in fname, or 0; set by type() */
20 
/*
 * word classes (values of dict[].class, indices into wfreq) followed
 * by the character classes that index cfreq above the ASCII range
 */
21 enum
22 {
23 	Cword,		/* C keyword; counted by isc */
24 	Fword,		/* presumably Fortran keyword; no classifier in this file reads it */
25 	Aword,		/* assembler keyword; counted by isas */
26 	Alword,		/* alef keyword; makes isc report "alef program" */
27 	Lword,		/* limbo keyword; counted by islimbo */
28 	I1,		/* the word "include" */
29 	I2,		/* header base names: bio, libc, stdio, u */
30 	I3,		/* the header suffix "h" */
31 	Clatin	= 128,	/* rune in 0xA0-0xFF */
32 	Cbinary,	/* rune in 0x81-0x9F, treated as invalid */
33 	Cnull,		/* NUL */
34 	Ceascii,	/* ASCII that is neither printable nor white space */
35 	Cutf,		/* rune 0x80 or any rune above 0xFF */
36 };
37 struct
38 {
39 	char*	word;
40 	int	class;
41 } dict[] =
42 {
43 	"PATH",		Lword,
44 	"TEXT",		Aword,
45 	"adt",		Alword,
46 	"aggr",		Alword,
47 	"alef",		Alword,
48 	"array",	Lword,
49 	"block",	Fword,
50 	"chan",		Alword,
51 	"char",		Cword,
52 	"common",	Fword,
53 	"con",		Lword,
54 	"data",		Fword,
55 	"dimension",	Fword,
56 	"double",	Cword,
57 	"extern",	Cword,
58 	"bio",		I2,
59 	"float",	Cword,
60 	"fn",		Lword,
61 	"function",	Fword,
62 	"h",		I3,
63 	"implement",	Lword,
64 	"import",	Lword,
65 	"include",	I1,
66 	"int",		Cword,
67 	"integer",	Fword,
68 	"iota",		Lword,
69 	"libc",		I2,
70 	"long",		Cword,
71 	"module",	Lword,
72 	"real",		Fword,
73 	"ref",		Lword,
74 	"register",	Cword,
75 	"self",		Lword,
76 	"short",	Cword,
77 	"static",	Cword,
78 	"stdio",	I2,
79 	"struct",	Cword,
80 	"subroutine",	Fword,
81 	"u",		I2,
82 	"void",		Cword,
83 };
84 
85 /* codes for the 'mode' field of the language table entries */
86 enum	{
87 		Normal	= 0,	/* language described by this one entry */
88 		First,		/* first entry for language spanning several ranges */
89 		Multi,		/* later entry for a language spanning several ranges */
90 		Shared,		/* codes used in several languages; never counted or printed by name */
91 	};
92 
/*
 * rune ranges and the language each one suggests.
 * bump_utf_count() binary-searches this table on low/high, so the
 * entries are kept in ascending order of range; count is cleared
 * for every file by filetype().  The entry with a zero name ends
 * the table for the loops that scan it linearly.
 */
93 struct
94 {
95 	int	mode;		/* see enum above */
96 	int 	count;		/* runes of the current file that fell in this range */
97 	int	low;		/* first rune of the range */
98 	int	high;		/* last rune of the range, inclusive */
99 	char	*name;		/* language name printed by print_utf */
100 
101 } language[] =
102 {
103 	Normal, 0,	0x0080, 0x0080,	"Extended Latin",
104 	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
105 	Normal,	0,	0x0370,	0x03FF,	"Greek",
106 	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
107 	Normal,	0,	0x0530,	0x058F,	"Armenian",
108 	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
109 	Normal,	0,	0x0600,	0x06FF,	"Arabic",
110 	Normal,	0,	0x0900,	0x097F,	"Devanagari",
111 	Normal,	0,	0x0980,	0x09FF,	"Bengali",
112 	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
113 	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
114 	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
115 	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
116 	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
117 	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
118 	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
119 	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
120 	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
121 	Normal,	0,	0x1000,	0x105F,	"Tibetan",
122 	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
123 	Normal,	0,	0x3040,	0x30FF,	"Japanese",
124 	Normal,	0,	0x3100,	0x312F,	"Chinese",
125 	First,	0,	0x3130,	0x318F,	"Korean",
126 	Multi,	0,	0x3400,	0x3D2F,	"Korean",
127 	Shared,	0,	0x4e00,	0x9fff,	"CJK",
128 	Normal,	0,	0,	0,	0,		/* terminal entry */
129 };
130 
131 
/* gross classification of the current file, chosen by filetype() from cfreq */
132 enum
133 {
134 	Fascii,		/* printable ascii */
135 	Flatin,		/* latin 1 */
136 	Futf,		/* UTF character set */
137 	Fbinary,	/* binary */
138 	Feascii,	/* ASCII with control chars */
139 	Fnull,		/* NULL in file */
140 } guess;
141 
/*
 * forward declarations; the is*, long0, short0 and istring
 * routines are the classifiers listed in call[] below
 */
142 void	bump_utf_count(Rune);
143 int	cistrncmp(char*, char*, int);
144 void	filetype(int);
145 int	getfontnum(uchar*, uchar**);
146 int	isas(void);
147 int	isc(void);
148 int	iscint(void);
149 int	isenglish(void);
150 int	ishp(void);
151 int	ishtml(void);
152 int	islimbo(void);
153 int	ismung(void);
154 int	isp9bit(void);
155 int	isp9font(void);
156 int	istring(void);
157 int	long0(void);
158 int	p9bitnum(uchar*);
159 int	p9subfont(uchar*);
160 void	print_utf(void);
161 int	short0(void);
162 void	type(char*, int);
163 int	utf_count(void);
164 void	wordfreq(void);
165 
/*
 * classifiers tried in this order by filetype(); the first one to
 * return non-zero ends the search.  The zero entry ends the list.
 */
166 int	(*call[])(void) =
167 {
168 	long0,		/* recognizable by first 4 bytes */
169 	short0,		/* recognizable by first 2 bytes */
170 	istring,	/* recognizable by first string */
171 	ishtml,		/* html keywords */
172 	iscint,		/* compiler/assembler intermediate */
173 	islimbo,	/* limbo source */
174 	isc,		/* c & alef compiler key words */
175 	isas,		/* assembler key words */
176 	ismung,		/* entropy compressed/encrypted */
177 	isp9font,	/* plan 9 font */
178 	isp9bit,	/* plan 9 image (as from /dev/window) */
179 	isenglish,	/* char frequency English */
180 	ishp,		/* HP Job Control Language - Postscript */
181 	0
182 };
183 
184 int mime;	/* set by the -m option: print MIME types instead of descriptions */
185 
/* complete output lines used for the two generic MIME answers */
186 #define OCTET	"application/octet-stream\n"
187 #define PLAIN	"text/plain\n"
188 
189 void
190 main(int argc, char *argv[])
191 {
192 	int i, j, maxlen;
193 	char *cp;
194 	Rune r;
195 
196 	ARGBEGIN{
197 	case 'm':
198 		mime = 1;
199 		break;
200 	default:
201 		fprint(2, "usage: file [-m] [file...]\n");
202 		exits("usage");
203 	}ARGEND;
204 
205 	maxlen = 0;
206 	if(mime == 0 || argc > 1){
207 		for(i = 0; i < argc; i++) {
208 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
209 					;
210 			if(j > maxlen)
211 				maxlen = j;
212 		}
213 	}
214 	if (argc <= 0) {
215 		if(!mime)
216 			print ("stdin: ");
217 		filetype(0);
218 	}
219 	else {
220 		for(i = 0; i < argc; i++)
221 			type(argv[i], maxlen);
222 	}
223 	exits(0);
224 }
225 
226 void
227 type(char *file, int nlen)
228 {
229 	Rune r;
230 	int i;
231 	char *p;
232 
233 	if(nlen > 0){
234 		slash = 0;
235 		for (i = 0, p = file; *p; i++) {
236 			if (*p == '/')			/* find rightmost slash */
237 				slash = p;
238 			p += chartorune(&r, p);		/* count runes */
239 		}
240 		print("%s:%*s",file, nlen-i+1, "");
241 	}
242 	fname = file;
243 	if ((fd = open(file, OREAD)) < 0) {
244 		print("cannot open\n");
245 		return;
246 	}
247 	filetype(fd);
248 	close(fd);
249 }
250 
251 void
252 filetype(int fd)
253 {
254 	Rune r;
255 	int i, f, n;
256 	char *p, *eob;
257 
258 	if(dirfstat(fd, &mbuf) < 0) {
259 		print("cannot stat\n");
260 		return;
261 	}
262 	if(mbuf.mode & CHDIR) {
263 		print(mime ? "text/directory\n" : "directory\n");
264 		return;
265 	}
266 	if(mbuf.type != 'M' && mbuf.type != '|') {
267 		print(mime ? OCTET : "special file #%c/%s\n",
268 			mbuf.type, mbuf.name);
269 		return;
270 	}
271 	nbuf = read(fd, buf, sizeof(buf));
272 
273 	if(nbuf < 0) {
274 		print("cannot read\n");
275 		return;
276 	}
277 	if(nbuf == 0) {
278 		print(mime ? PLAIN : "empty file\n");
279 		return;
280 	}
281 
282 	/*
283 	 * build histogram table
284 	 */
285 	memset(cfreq, 0, sizeof(cfreq));
286 	for (i = 0; language[i].name; i++)
287 		language[i].count = 0;
288 	eob = (char *)buf+nbuf;
289 	for(n = 0, p = (char *)buf; p < eob; n++) {
290 		if (!fullrune(p, eob-p) && eob-p < UTFmax)
291 			break;
292 		p += chartorune(&r, p);
293 		if (r == 0)
294 			f = Cnull;
295 		else if (r <= 0x7f) {
296 			if (!isprint(r) && !isspace(r))
297 				f = Ceascii;	/* ASCII control char */
298 			else f = r;
299 		} else if (r == 0x080) {
300 			bump_utf_count(r);
301 			f = Cutf;
302 		} else if (r < 0xA0)
303 				f = Cbinary;	/* Invalid Runes */
304 		else if (r <= 0xff)
305 				f = Clatin;	/* Latin 1 */
306 		else {
307 			bump_utf_count(r);
308 			f = Cutf;		/* UTF extension */
309 		}
310 		cfreq[f]++;			/* ASCII chars peg directly */
311 	}
312 	/*
313 	 * gross classify
314 	 */
315 	if (cfreq[Cbinary])
316 		guess = Fbinary;
317 	else if (cfreq[Cutf])
318 		guess = Futf;
319 	else if (cfreq[Clatin])
320 		guess = Flatin;
321 	else if (cfreq[Ceascii])
322 		guess = Feascii;
323 	else if (cfreq[Cnull] == n) {
324 		print(mime ? OCTET : "all null bytes\n");
325 		return;
326 	}
327 	else guess = Fascii;
328 	/*
329 	 * lookup dictionary words
330 	 */
331 	memset(wfreq, 0, sizeof(wfreq));
332 	if(guess == Fascii || guess == Flatin || guess == Futf)
333 		wordfreq();
334 	/*
335 	 * call individual classify routines
336 	 */
337 	for(i=0; call[i]; i++)
338 		if((*call[i])())
339 			return;
340 
341 	/*
342 	 * if all else fails,
343 	 * print out gross classification
344 	 */
345 	if (nbuf < 100)
346 		print(mime ? PLAIN : "short ");
347 	if (guess == Fascii)
348 		print(mime ? PLAIN : "Ascii\n");
349 	else if (guess == Feascii)
350 		print(mime ? PLAIN : "extended ascii\n");
351 	else if (guess == Flatin)
352 		print(mime ? PLAIN : "latin ascii\n");
353 	else if (guess == Futf && utf_count() < 4)
354 		print_utf();
355 	else print(mime ? OCTET : "binary\n");
356 }
357 
358 void
359 bump_utf_count(Rune r)
360 {
361 	int low, high, mid;
362 
363 	high = sizeof(language)/sizeof(language[0])-1;
364 	for (low = 0; low < high;) {
365 		mid = (low+high)/2;
366 		if (r >=language[mid].low) {
367 			if (r <= language[mid].high) {
368 				language[mid].count++;
369 				break;
370 			} else low = mid+1;
371 		} else high = mid;
372 	}
373 }
374 
375 int
376 utf_count(void)
377 {
378 	int i, count;
379 
380 	count = 0;
381 	for (i = 0; language[i].name; i++)
382 		if (language[i].count > 0)
383 			switch (language[i].mode) {
384 			case Normal:
385 			case First:
386 				count++;
387 				break;
388 			default:
389 				break;
390 			}
391 	return count;
392 }
393 
394 int
395 chkascii(void)
396 {
397 	int i;
398 
399 	for (i = 'a'; i < 'z'; i++)
400 		if (cfreq[i])
401 			return 1;
402 	for (i = 'A'; i < 'Z'; i++)
403 		if (cfreq[i])
404 			return 1;
405 	return 0;
406 }
407 
408 int
409 find_first(char *name)
410 {
411 	int i;
412 
413 	for (i = 0; language[i].name != 0; i++)
414 		if (language[i].mode == First
415 			&& strcmp(language[i].name, name) == 0)
416 			return i;
417 	return -1;
418 }
419 
420 void
421 print_utf(void)
422 {
423 	int i, printed, j;
424 
425 	if(mime){
426 		print(PLAIN);
427 		return;
428 	}
429 	if (chkascii()) {
430 		printed = 1;
431 		print("Ascii");
432 	} else
433 		printed = 0;
434 	for (i = 0; language[i].name; i++)
435 		if (language[i].count) {
436 			switch(language[i].mode) {
437 			case Multi:
438 				j = find_first(language[i].name);
439 				if (j < 0)
440 					break;
441 				if (language[j].count > 0)
442 					break;
443 				/* Fall through */
444 			case Normal:
445 			case First:
446 				if (printed)
447 					print(" & ");
448 				else printed = 1;
449 				print("%s", language[i].name);
450 				break;
451 			case Shared:
452 			default:
453 				break;
454 			}
455 		}
456 	if(!printed)
457 		print("UTF");
458 	print(" text\n");
459 }
460 
461 void
462 wordfreq(void)
463 {
464 	int low, high, mid, r;
465 	uchar *p, *p2, c;
466 
467 	p = buf;
468 	for(;;) {
469 		while (p < buf+nbuf && !isalpha(*p))
470 			p++;
471 		if (p >= buf+nbuf)
472 			return;
473 		p2 = p;
474 		while(p < buf+nbuf && isalpha(*p))
475 			p++;
476 		c = *p;
477 		*p = 0;
478 		high = sizeof(dict)/sizeof(dict[0]);
479 		for(low = 0;low < high;) {
480 			mid = (low+high)/2;
481 			r = strcmp(dict[mid].word, (char*)p2);
482 			if(r == 0) {
483 				wfreq[dict[mid].class]++;
484 				break;
485 			}
486 			if(r < 0)
487 				low = mid+1;
488 			else
489 				high = mid;
490 		}
491 		*p++ = c;
492 	}
493 }
494 
495 int
496 long0(void)
497 {
498 	Fhdr f;
499 	long x;
500 
501 	seek(fd, 0, 0);		/* reposition to start of file */
502 	if(crackhdr(fd, &f)) {
503 		print(mime ? OCTET : "%s\n", f.name);
504 		return 1;
505 	}
506 	x = LENDIAN(buf);
507 	switch(x) {
508 	case 0xf16df16d:
509 		print(mime ? OCTET : "pac1 audio file\n");
510 		return 1;
511 	case 0x31636170:
512 		print(mime ? OCTET : "pac3 audio file\n");
513 		return 1;
514 	case 0xba010000:
515 		print(mime ? OCTET : "mpeg system stream\n");
516 		return 1;
517 	case 0x30800cc0:
518 		print(mime ? OCTET : "inferno .dis executable\n");
519 		return 1;
520 	}
521 	if(((x ^ 0x32636170) & 0xffff00ff) == 0) {
522 		print(mime ? OCTET : "pac4 audio file\n");
523 		return 1;
524 	}
525 	return 0;
526 }
527 
528 int
529 short0(void)
530 {
531 
532 	switch(LENDIAN(buf) & 0xffff) {
533 	case 070707:
534 		print(mime ? OCTET : "cpio archive\n");
535 		break;
536 
537 	case 0x02f7:
538 		print(mime ? OCTET : "tex dvi\n");
539 		break;
540 	default:
541 		return 0;
542 	}
543 	return 1;
544 }
545 
/*
 * initial byte strings that identify a file.
 * istring() compares the first 'length' bytes of the file with
 * 'key' and takes the first entry that matches, so a longer key
 * must come before any shorter key that is a prefix of it.
 * 'length' is given explicitly because keys may contain NULs;
 * it must be the full length of the key.
 * 'mime' is printed in -m mode, 'filetype' otherwise.
 */
struct	FILE_STRING
{
	char 	*key;
	char	*filetype;
	int	length;
	char	*mime;
} file_string[] =
{
	"!<arch>\n__.SYMDEF",	"archive random library",	17,	"application/octet-stream",
	"!<arch>\n",		"archive",			8,	"application/octet-stream",
	"070707",		"cpio archive - ascii header",	6,	"application/octet-stream",
	"#!/bin/rc",		"rc executable file",		9,	"text/plain",
	"#!/bin/sh",		"sh executable file",		9,	"text/plain",
	"%!",			"postscript",			2,	"application/postscript",
	"\004%!",		"postscript",			3,	"application/postscript",
	"x T post",		"troff output for post",	8,	"application/troff",
	"x T Latin1",		"troff output for Latin1",	10,	"application/troff",
	"x T utf",		"troff output for UTF",		7,	"application/troff",
	"x T 202",		"troff output for 202",		7,	"application/troff",
	"x T aps",		"troff output for aps",		7,	"application/troff",
	"GIF",			"GIF image", 			3,	"image/gif",
	"\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript",
	"%PDF",			"PDF",				4,	"application/pdf",
	"<html>\n",		"HTML file",			7,	"text/html",
	"<HTML>\n",		"HTML file",			7,	"text/html",
	"compressed\n",		"Compressed image or subfont",	11,	"application/octet-stream",
	"\111\111\052\000",	"tiff",				4,	"image/tiff",
	"\115\115\000\052",	"tiff",				4,	"image/tiff",
	"\377\330\377\340",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\341",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\333",	"jpeg",				4,	"image/jpeg",
	"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream",

	0,0,0,0
};
584 
585 int
586 istring(void)
587 {
588 	int i;
589 	struct FILE_STRING *p;
590 
591 	for(p = file_string; p->key; p++) {
592 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
593 			if(mime)
594 				print("%s\n", p->mime);
595 			else
596 				print("%s\n", p->filetype);
597 			return 1;
598 		}
599 	}
600 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
601 		for(i = 5; i < nbuf; i++)
602 			if(buf[i] == '\n')
603 				break;
604 		if(mime)
605 			print(OCTET);
606 		else
607 			print("%.*s picture\n", i-5, (char*)buf+5);
608 		return 1;
609 	}
610 	return 0;
611 }
612 
/*
 * tag names that ishtml() looks for between '<' and '>';
 * the zero entry ends the list
 */
613 char*	html_string[] =
614 {
615 	"title",
616 	"body",
617 	"head",
618 	"strong",
619 	"h1",
620 	"h2",
621 	"h3",
622 	"h4",
623 	"h5",
624 	"h6",
625 	"ul",
626 	"li",
627 	"dl",
628 	"br",
629 	"em",
630 	0,
631 };
632 
633 int
634 ishtml(void)
635 {
636 	uchar *p, *q;
637 	int i, count;
638 
639 		/* compare strings between '<' and '>' to html table */
640 	count = 0;
641 	p = buf;
642 	for(;;) {
643 		while (p < buf+nbuf && *p != '<')
644 			p++;
645 		p++;
646 		if (p >= buf+nbuf)
647 			break;
648 		if(*p == '/')
649 			p++;
650 		q = p;
651 		while(p < buf+nbuf && *p != '>')
652 			p++;
653 		if (p >= buf+nbuf)
654 			break;
655 		for(i = 0; html_string[i]; i++) {
656 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
657 				if(count++ > 4) {
658 					print(mime ? "text/html\n" : "HTML file\n");
659 					return 1;
660 				}
661 				break;
662 			}
663 		}
664 		p++;
665 	}
666 	return 0;
667 }
668 
/*
 *  case independent string compare of at most n characters.
 *  Returns 0 if the strings are equal, ignoring case, over their
 *  first n characters or up to the NUL that ends both of them,
 *  whichever comes first; returns 1 otherwise.
 */
int
cistrncmp(char *s1, char *s2, int n)
{
	int c1, c2;

	for(; n > 0; n--){
		/* go through unsigned char so the ctype tests see 0..255 */
		c1 = (unsigned char)*s1++;
		c2 = (unsigned char)*s2++;
		if(isupper(c1))
			c1 = tolower(c1);
		if(isupper(c2))
			c2 = tolower(c2);
		if(c2 != c1)
			return 1;
		if(c1 == 0)
			return 0;
	}
	/* all n characters matched */
	return 0;
}
691 
692 int
693 iscint(void)
694 {
695 	int type;
696 	char *name;
697 	Biobuf b;
698 
699 	if(Binit(&b, fd, OREAD) == Beof)
700 		return 0;
701 	seek(fd, 0, 0);
702 	type = objtype(&b, &name);
703 	if(type < 0)
704 		return 0;
705 	if(mime)
706 		print(OCTET);
707 	else
708 		print("%s intermediate\n", name);
709 	return 1;
710 }
711 
712 int
713 isc(void)
714 {
715 	int n;
716 
717 	n = wfreq[I1];
718 	/*
719 	 * includes
720 	 */
721 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
722 		goto yes;
723 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
724 		goto yes;
725 	/*
726 	 * declarations
727 	 */
728 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
729 		goto yes;
730 	/*
731 	 * assignments
732 	 */
733 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
734 		goto yes;
735 	return 0;
736 
737 yes:
738 	if(mime){
739 		print(PLAIN);
740 		return 1;
741 	}
742 	if(wfreq[Alword] > 0)
743 		print("alef program\n");
744 	else
745 		print("c program\n");
746 	return 1;
747 }
748 
749 int
750 islimbo(void)
751 {
752 
753 	/*
754 	 * includes
755 	 */
756 	if(wfreq[Lword] < 4)
757 		return 0;
758 	print(mime ? PLAIN : "limbo program\n");
759 	return 1;
760 }
761 
762 int
763 isas(void)
764 {
765 
766 	/*
767 	 * includes
768 	 */
769 	if(wfreq[Aword] < 2)
770 		return 0;
771 	print(mime ? PLAIN : "as program\n");
772 	return 1;
773 }
774 
775 /*
776  * low entropy means encrypted
777  */
778 int
779 ismung(void)
780 {
781 	int i, bucket[8];
782 	float cs;
783 
784 	if(nbuf < 64)
785 		return 0;
786 	memset(bucket, 0, sizeof(bucket));
787 	for(i=0; i<64; i++)
788 		bucket[(buf[i]>>5)&07] += 1;
789 
790 	cs = 0.;
791 	for(i=0; i<8; i++)
792 		cs += (bucket[i]-8)*(bucket[i]-8);
793 	cs /= 8.;
794 	if(cs <= 24.322) {
795 		if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
796 			print(mime ? OCTET : "compressed\n");
797 		else
798 			print(mime ? OCTET : "encrypted\n");
799 		return 1;
800 	}
801 	return 0;
802 }
803 
804 /*
805  * english by punctuation and frequencies
806  */
807 int
808 isenglish(void)
809 {
810 	int vow, comm, rare, badpun, punct;
811 	char *p;
812 
813 	if(guess != Fascii && guess != Feascii)
814 		return 0;
815 	badpun = 0;
816 	punct = 0;
817 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
818 		switch(*p) {
819 		case '.':
820 		case ',':
821 		case ')':
822 		case '%':
823 		case ';':
824 		case ':':
825 		case '?':
826 			punct++;
827 			if(p[1] != ' ' && p[1] != '\n')
828 				badpun++;
829 		}
830 	if(badpun*5 > punct)
831 		return 0;
832 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
833 		return 0;
834 	if(2*cfreq[';'] > cfreq['e'])
835 		return 0;
836 
837 	vow = 0;
838 	for(p="AEIOU"; *p; p++) {
839 		vow += cfreq[*p];
840 		vow += cfreq[tolower(*p)];
841 	}
842 	comm = 0;
843 	for(p="ETAION"; *p; p++) {
844 		comm += cfreq[*p];
845 		comm += cfreq[tolower(*p)];
846 	}
847 	rare = 0;
848 	for(p="VJKQXZ"; *p; p++) {
849 		rare += cfreq[*p];
850 		rare += cfreq[tolower(*p)];
851 	}
852 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
853 		print(mime ? PLAIN : "English text\n");
854 		return 1;
855 	}
856 	return 0;
857 }
858 
859 /*
860  * pick up a number with
861  * syntax _*[0-9]+_
862  */
863 #define	P9BITLEN	12
864 int
865 p9bitnum(uchar *bp)
866 {
867 	int n, c, len;
868 
869 	len = P9BITLEN;
870 	while(*bp == ' ') {
871 		bp++;
872 		len--;
873 		if(len <= 0)
874 			return -1;
875 	}
876 	n = 0;
877 	while(len > 1) {
878 		c = *bp++;
879 		if(!isdigit(c))
880 			return -1;
881 		n = n*10 + c-'0';
882 		len--;
883 	}
884 	if(*bp != ' ')
885 		return -1;
886 	return n;
887 }
888 
/*
 * work out the pixel depth from the first 12-byte field of an
 * image header at s.
 *
 * If the field starts (after blanks) with a digit it is taken as a
 * log2 depth, *newp is set to 0, and 1<<value is returned; a value
 * too large to shift an int by is rejected.  Otherwise the field is
 * taken as a channel descriptor made of letter/number pairs,
 * *newp is set to 1, and the sum of the numbers is returned if it
 * is 8, 16, 24 or 32.
 * Returns -1 for a blank field or an unacceptable depth.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d;
	unsigned long ld;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtoul(s, 0, 10);
		if(ld > 30)		/* 1<<ld would not fit in an int */
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;	/* skip letter */
		d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
920 
921 int
922 isp9bit(void)
923 {
924 	int dep, lox, loy, hix, hiy, px, new;
925 	ulong t;
926 	long len;
927 	char *newlabel;
928 
929 	newlabel = "old ";
930 
931 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
932 	if(new)
933 		newlabel = "";
934 	lox = p9bitnum(buf + 1*P9BITLEN);
935 	loy = p9bitnum(buf + 2*P9BITLEN);
936 	hix = p9bitnum(buf + 3*P9BITLEN);
937 	hiy = p9bitnum(buf + 4*P9BITLEN);
938 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
939 		return 0;
940 
941 	if(dep < 8){
942 		px = 8/dep;	/* pixels per byte */
943 		/* set l to number of bytes of data per scan line */
944 		if(lox >= 0)
945 			len = (hix+px-1)/px - lox/px;
946 		else{	/* make positive before divide */
947 			t = (-lox)+px-1;
948 			t = (t/px)*px;
949 			len = (t+hix+px-1)/px;
950 		}
951 	}else
952 		len = (hix-lox)*dep/8;
953 	len *= (hiy-loy);		/* col length */
954 	len += 5*P9BITLEN;		/* size of initial ascii */
955 
956 	/*
957 	 * for image file, length is non-zero and must match calculation above
958 	 * for /dev/window and /dev/screen the length is always zero
959 	 * for subfont, the subfont header should follow immediately.
960 	 */
961 	if (len != 0 && mbuf.length == 0) {
962 		print("%splan 9 image\n", newlabel);
963 		return 1;
964 	}
965 	if (mbuf.length == len) {
966 		print("%splan 9 image\n", newlabel);
967 		return 1;
968 	}
969 	/* Ghostscript sometimes produces a little extra on the end */
970 	if (mbuf.length < len+P9BITLEN) {
971 		print("%splan 9 image\n", newlabel);
972 		return 1;
973 	}
974 	if (p9subfont(buf+len)) {
975 		print("%ssubfont file\n", newlabel);
976 		return 1;
977 	}
978 	return 0;
979 }
980 
981 int
982 p9subfont(uchar *p)
983 {
984 	int n, h, a;
985 
986 		/* if image too big, assume it's a subfont */
987 	if (p+3*P9BITLEN > buf+sizeof(buf))
988 		return 1;
989 
990 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
991 	if (n < 0)
992 		return 0;
993 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
994 	if (h < 0)
995 		return 0;
996 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
997 	if (a < 0)
998 		return 0;
999 	return 1;
1000 }
1001 
1002 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
1003 
1004 int
1005 isp9font(void)
1006 {
1007 	uchar *cp, *p;
1008 	int i, n;
1009 	char dbuf[DIRLEN];
1010 	char pathname[1024];
1011 
1012 	cp = buf;
1013 	if (!getfontnum(cp, &cp))	/* height */
1014 		return 0;
1015 	if (!getfontnum(cp, &cp))	/* ascent */
1016 		return 0;
1017 	for (i = 0; 1; i++) {
1018 		if (!getfontnum(cp, &cp))	/* min */
1019 			break;
1020 		if (!getfontnum(cp, &cp))	/* max */
1021 			return 0;
1022 		while (WHITESPACE(*cp))
1023 			cp++;
1024 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1025 				;
1026 			/* construct a path name, if needed */
1027 		n = 0;
1028 		if (*p != '/' && slash) {
1029 			n = slash-fname+1;
1030 			if (n < sizeof(pathname))
1031 				memcpy(pathname, fname, n);
1032 			else n = 0;
1033 		}
1034 		if (n+cp-p < sizeof(pathname)) {
1035 			memcpy(pathname+n, p, cp-p);
1036 			n += cp-p;
1037 			pathname[n] = 0;
1038 			if (stat(pathname, dbuf) < 0)
1039 				return 0;
1040 		}
1041 	}
1042 	if (i) {
1043 		print("font file\n");
1044 		return 1;
1045 	}
1046 	return 0;
1047 }
1048 
1049 int
1050 getfontnum(uchar *cp, uchar **rp)
1051 {
1052 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
1053 		cp++;
1054 	if (*cp < '0' || *cp > '9')
1055 		return 0;
1056 	strtoul((char *)cp, (char **)rp, 0);
1057 	if (!WHITESPACE(**rp))
1058 		return 0;
1059 	return 1;
1060 }
1061 
1062 int
1063 ishp(void)
1064 {
1065 	if (strncmp("\033%-12345X", (char *)buf, 9)==0) {
1066 		print("HPJCL file\n");
1067 		return 1;
1068 	}
1069 	return 0;
1070 }
1071