1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6000]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 I1, 28 I2, 29 I3, 30 Clatin = 128, 31 Cbinary, 32 Cnull, 33 Ceascii, 34 Cutf, 35 }; 36 struct 37 { 38 char* word; 39 int class; 40 } dict[] = 41 { 42 "TEXT", Aword, 43 "adt", Alword, 44 "aggr", Alword, 45 "alef", Alword, 46 "block", Fword, 47 "chan", Alword, 48 "char", Cword, 49 "common", Fword, 50 "data", Fword, 51 "dimension", Fword, 52 "double", Cword, 53 "extern", Cword, 54 "bio", I2, 55 "float", Cword, 56 "function", Fword, 57 "h", I3, 58 "include", I1, 59 "int", Cword, 60 "integer", Fword, 61 "libc", I2, 62 "long", Cword, 63 "real", Fword, 64 "register", Cword, 65 "short", Cword, 66 "static", Cword, 67 "stdio", I2, 68 "struct", Cword, 69 "subroutine", Fword, 70 "u", I2, 71 "void", Cword, 72 }; 73 74 /* codes for 'mode' field in language structure */ 75 enum { 76 Normal = 0, 77 First, /* first entry for language spanning several ranges */ 78 Multi, /* later entries " " " ... */ 79 Shared, /* codes used in several languages */ 80 }; 81 82 struct 83 { 84 int mode; /* see enum above */ 85 int count; 86 int low; 87 int high; 88 char *name; 89 90 } language[] = 91 { 92 Normal, 0, 0x0080, 0x0080, "Extended Latin", 93 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 94 Normal, 0, 0x0370, 0x03FF, "Greek", 95 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 96 Normal, 0, 0x0530, 0x058F, "Armenian", 97 Normal, 0, 0x0590, 0x05FF, "Hebrew", 98 Normal, 0, 0x0600, 0x06FF, "Arabic", 99 Normal, 0, 0x0900, 0x097F, "Devanagari", 100 Normal, 0, 0x0980, 0x09FF, "Bengali", 101 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 102 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 103 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 104 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 105 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 106 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 107 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 108 Normal, 0, 0x0E00, 0x0E7F, "Thai", 109 Normal, 0, 0x0E80, 0x0EFF, "Lao", 110 Normal, 0, 0x1000, 0x105F, "Tibetan", 111 Normal, 0, 0x10A0, 0x10FF, "Georgian", 112 Normal, 0, 0x3040, 0x30FF, "Japanese", 113 Normal, 0, 0x3100, 0x312F, "Chinese", 114 First, 0, 0x3130, 0x318F, "Korean", 115 Multi, 0, 0x3400, 0x3D2F, "Korean", 116 Shared, 0, 0x4e00, 0x9fff, "CJK", 117 Normal, 0, 0, 0, 0, /* terminal entry */ 118 }; 119 120 121 enum 122 { 123 Fascii, /* printable ascii */ 124 Flatin, /* latin 1*/ 125 Futf, /* UTf character set */ 126 Fbinary, /* binary */ 127 Feascii, /* ASCII with control chars */ 128 Fnull, /* NULL in file */ 129 } guess; 130 131 void bump_utf_count(Rune); 132 void filetype(int); 133 int getfontnum(uchar *, uchar **); 134 int isas(void); 135 int isc(void); 136 int iscint(void); 137 int isenglish(void); 138 int ismung(void); 139 int isp9bit(void); 140 int isp9font(void); 141 int istring(void); 142 int long0(void); 143 int p9bitnum(uchar *); 144 int p9subfont(uchar *); 145 void print_utf(void); 146 int short0(void); 147 void type(char*, int); 148 int utf_count(void); 149 void wordfreq(void); 150 151 int (*call[])(void) = 152 { 153 long0, /* recognizable by first 4 bytes */ 154 short0, /* recognizable by first 2 bytes */ 155 istring, /* recognizable by first string */ 156 iscint, /* compiler/assembler intermediate */ 157 isc, /* c & alef compiler key words */ 158 isas, /* assembler key words */ 159 ismung, /* entropy compressed/encrypted */ 160 isenglish, /* char frequency English */ 161 isp9font, /* plan 9 font */ 162 isp9bit, /* plan 9 bitmap (as from /dev/window) */ 163 0 164 }; 165 166 void 167 main(int argc, char *argv[]) 168 { 169 int i, j, maxlen; 170 char *cp; 171 Rune r; 172 173 maxlen = 0; 174 for(i = 1; i < argc; i++) { 175 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 176 ; 177 if(j > maxlen) 178 maxlen = j; 179 } 180 if (argc <= 1) { 181 print ("stdin: "); 182 filetype(0); 183 } 184 else { 185 for(i = 1; i < argc; i++) 186 type(argv[i], maxlen); 187 } 188 exits(0); 189 } 190 191 void 192 type(char *file, int nlen) 193 { 194 Rune r; 195 int i; 196 char *p; 197 198 slash = 0; 199 for (i = 0, p = file; *p; i++) { 200 if (*p == '/') /* find rightmost slash */ 201 slash = p; 202 p += chartorune(&r, p); /* count runes */ 203 } 204 print("%s:%*s",file, nlen-i+1, ""); 205 fname = file; 206 if ((fd = open(file, OREAD)) < 0) { 207 print("cannot open\n"); 208 return; 209 } 210 filetype(fd); 211 close(fd); 212 } 213 214 void 215 filetype(int fd) 216 { 217 Rune r; 218 int i, f, n; 219 char *p, *eob; 220 221 if(dirfstat(fd, &mbuf) < 0) { 222 print("cannot stat\n"); 223 return; 224 } 225 if(mbuf.mode & CHDIR) { 226 print("directory\n"); 227 return; 228 } 229 if(mbuf.type != 'M' && mbuf.type != '|') { 230 print("special file #%c\n", mbuf.type); 231 return; 232 } 233 nbuf = read(fd, buf, sizeof(buf)); 234 235 if(nbuf < 0) { 236 print("cannot read\n"); 237 return; 238 } 239 if(nbuf == 0) { 240 print("empty\n"); 241 return; 242 } 243 244 /* 245 * build histogram table 246 */ 247 memset(cfreq, 0, sizeof(cfreq)); 248 for (i = 0; language[i].name; i++) 249 language[i].count = 0; 250 eob = (char *)buf+nbuf; 251 for(n = 0, p = (char *)buf; p < eob; n++) { 252 if (!fullrune(p, eob-p) && eob-p < UTFmax) 253 break; 254 p += chartorune(&r, p); 255 if (r == 0) 256 f = Cnull; 257 else if (r <= 0x7f) { 258 if (!isprint(r) && !isspace(r)) 259 f = Ceascii; /* ASCII control char */ 260 else f = r; 261 } else if (r == 0x080) { 262 bump_utf_count(r); 263 f = Cutf; 264 } else if (r < 0xA0) 265 f = Cbinary; /* Invalid Runes */ 266 else if (r <= 0xff) 267 f = Clatin; /* Latin 1 */ 268 else { 269 bump_utf_count(r); 270 f = Cutf; /* UTF extension */ 271 } 272 cfreq[f]++; /* ASCII chars peg directly */ 273 } 274 /* 275 * gross classify 276 */ 277 if (cfreq[Cbinary]) 278 guess = Fbinary; 279 else if (cfreq[Cutf]) 280 guess = Futf; 281 else if (cfreq[Clatin]) 282 guess = Flatin; 283 else if (cfreq[Ceascii]) 284 guess = Feascii; 285 else if (cfreq[Cnull] == n) { 286 print("all null bytes\n"); 287 return; 288 } 289 else guess = Fascii; 290 /* 291 * lookup dictionary words 292 */ 293 memset(wfreq, 0, sizeof(wfreq)); 294 if(guess == Fascii || guess == Flatin) 295 wordfreq(); 296 /* 297 * call individual classify routines 298 */ 299 for(i=0; call[i]; i++) 300 if((*call[i])()) 301 return; 302 303 /* 304 * if all else fails, 305 * print out gross classification 306 */ 307 if (nbuf < 100) 308 print("short "); 309 if (guess == Fascii) 310 print("Ascii\n"); 311 else if (guess == Feascii) 312 print("extended ascii\n"); 313 else if (guess == Flatin) 314 print("latin ascii\n"); 315 else if (guess == Futf && utf_count() < 4) 316 print_utf(); 317 else print("binary\n"); 318 } 319 320 void 321 bump_utf_count(Rune r) 322 { 323 int low, high, mid; 324 325 high = sizeof(language)/sizeof(language[0])-1; 326 for (low = 0; low < high;) { 327 mid = (low+high)/2; 328 if (r >=language[mid].low) { 329 if (r <= language[mid].high) { 330 language[mid].count++; 331 break; 332 } else low = mid+1; 333 } else high = mid; 334 } 335 } 336 337 int 338 utf_count(void) 339 { 340 int i, count; 341 342 count = 0; 343 for (i = 0; language[i].name; i++) 344 if (language[i].count > 0) 345 switch (language[i].mode) { 346 case Normal: 347 case First: 348 count++; 349 break; 350 default: 351 break; 352 } 353 return count; 354 } 355 356 int 357 chkascii(void) 358 { 359 int i; 360 361 for (i = 'a'; i < 'z'; i++) 362 if (cfreq[i]) 363 return 1; 364 for (i = 'A'; i < 'Z'; i++) 365 if (cfreq[i]) 366 return 1; 367 return 0; 368 } 369 370 int 371 find_first(char *name) 372 { 373 int i; 374 375 for (i = 0; language[i].name != 0; i++) 376 if (language[i].mode == First 377 && strcmp(language[i].name, name) == 0) 378 return i; 379 return -1; 380 } 381 382 void 383 print_utf(void) 384 { 385 int i, printed, j; 386 387 if (chkascii()) { 388 printed = 1; 389 print("Ascii"); 390 } else 391 printed = 0; 392 for (i = 0; language[i].name; i++) 393 if (language[i].count) { 394 switch(language[i].mode) { 395 case Multi: 396 j = find_first(language[i].name); 397 if (j < 0) 398 break; 399 if (language[j].count > 0) 400 break; 401 /* Fall through */ 402 case Normal: 403 case First: 404 if (printed) 405 print(" & "); 406 else printed = 1; 407 print("%s", language[i].name); 408 break; 409 case Shared: 410 default: 411 break; 412 } 413 } 414 if(!printed) 415 print("UTF"); 416 print(" text\n"); 417 } 418 419 void 420 wordfreq(void) 421 { 422 int low, high, mid, r; 423 uchar *p, *p2, c; 424 425 p = buf; 426 for(;;) { 427 while (p < buf+nbuf && !isalpha(*p)) 428 p++; 429 if (p >= buf+nbuf) 430 return; 431 p2 = p; 432 while(p < buf+nbuf && isalpha(*p)) 433 p++; 434 c = *p; 435 *p = 0; 436 high = sizeof(dict)/sizeof(dict[0]); 437 for(low = 0;low < high;) { 438 mid = (low+high)/2; 439 r = strcmp(dict[mid].word, (char*)p2); 440 if(r == 0) { 441 wfreq[dict[mid].class]++; 442 break; 443 } 444 if(r < 0) 445 low = mid+1; 446 else 447 high = mid; 448 } 449 *p++ = c; 450 } 451 } 452 453 int 454 long0(void) 455 { 456 Fhdr f; 457 458 seek(fd, 0, 0); /* reposition to start of file */ 459 if (crackhdr(fd, &f)) { 460 print("%s\n", f.name); 461 return 1; 462 } 463 switch(LENDIAN(buf)) { 464 case 0xf16df16d: 465 print("pac1 audio file\n"); 466 return 1; 467 case 0x31636170: 468 print("pac3 audio file\n"); 469 return 1; 470 case 0x32636170: 471 print("pac4 audio file\n"); 472 return 1; 473 default: 474 return 0; 475 } 476 return 1; 477 } 478 479 int 480 short0(void) 481 { 482 483 switch(LENDIAN(buf) & 0xffff) { 484 case 070707: 485 print("cpio archive\n"); 486 break; 487 488 case 0x02f7: 489 print("tex dvi\n"); 490 break; 491 default: 492 return 0; 493 } 494 return 1; 495 } 496 497 /* 498 * initial words to classify file 499 */ 500 struct FILE_STRING 501 { 502 char *key; 503 char *filetype; 504 int length; 505 } file_string[] = 506 { 507 "!<arch>\n__.SYMDEF", "archive random library", 16, 508 "!<arch>\n", "archive", 8, 509 "070707", "cpio archive - ascii header", 6, 510 "#!/bin/rc", "rc executable file", 9, 511 "#!/bin/sh", "sh executable file", 9, 512 "%!", "postscript", 2, 513 "x T post", "troff output for post", 8, 514 "x T Latin1", "troff output for Latin1", 10, 515 "x T utf", "troff output for UTF", 7, 516 "x T 202", "troff output for 202", 7, 517 "x T aps", "troff output for aps", 7, 518 "GIF", "GIF image", 3, 519 "\0PC Research, Inc", "ghostscript fax file", 23, 520 0,0,0 521 }; 522 523 int 524 istring(void) 525 { 526 int i; 527 struct FILE_STRING *p; 528 529 for(p = file_string; p->key; p++) { 530 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) { 531 print("%s\n", p->filetype); 532 return 1; 533 } 534 } 535 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 536 for(i = 5; i < nbuf; i++) 537 if(buf[i] == '\n') 538 break; 539 print("%.*s picture\n", i-5, buf+5); 540 return 1; 541 } 542 return 0; 543 } 544 545 int 546 iscint(void) 547 { 548 int type; 549 char *name; 550 Biobuf b; 551 552 if(Binit(&b, fd, OREAD) == Beof) 553 return 0; 554 seek(fd, 0, 0); 555 type = objtype(&b, &name); 556 if(type < 0) 557 return 0; 558 print("%s intermediate\n", name); 559 return 1; 560 } 561 562 int 563 isc(void) 564 { 565 int n; 566 567 n = wfreq[I1]; 568 /* 569 * includes 570 */ 571 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 572 goto yes; 573 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 574 goto yes; 575 /* 576 * declarations 577 */ 578 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 579 goto yes; 580 /* 581 * assignments 582 */ 583 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 584 goto yes; 585 return 0; 586 587 yes: 588 if(wfreq[Alword] > 0) 589 print("alef program\n"); 590 else 591 print("c program\n"); 592 return 1; 593 } 594 595 int 596 isas(void) 597 { 598 599 /* 600 * includes 601 */ 602 if(wfreq[Aword] < 2) 603 return 0; 604 print("as program\n"); 605 return 1; 606 } 607 608 /* 609 * low entropy means encrypted 610 */ 611 int 612 ismung(void) 613 { 614 int i, bucket[8]; 615 float cs; 616 617 if(nbuf < 64) 618 return 0; 619 memset(bucket, 0, sizeof(bucket)); 620 for(i=0; i<64; i++) 621 bucket[(buf[i]>>5)&07] += 1; 622 623 cs = 0.; 624 for(i=0; i<8; i++) 625 cs += (bucket[i]-8)*(bucket[i]-8); 626 cs /= 8.; 627 if(cs <= 24.322) { 628 if(buf[0]==037 && buf[1]==0235) 629 print("compressed\n"); 630 else 631 print("encrypted\n"); 632 return 1; 633 } 634 return 0; 635 } 636 637 /* 638 * english by punctuation and frequencies 639 */ 640 int 641 isenglish(void) 642 { 643 int vow, comm, rare, badpun, punct; 644 char *p; 645 646 if(guess != Fascii && guess != Feascii) 647 return 0; 648 badpun = 0; 649 punct = 0; 650 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 651 switch(*p) { 652 case '.': 653 case ',': 654 case ')': 655 case '%': 656 case ';': 657 case ':': 658 case '?': 659 punct++; 660 if(p[1] != ' ' && p[1] != '\n') 661 badpun++; 662 } 663 if(badpun*5 > punct) 664 return 0; 665 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 666 return 0; 667 if(2*cfreq[';'] > cfreq['e']) 668 return 0; 669 670 vow = 0; 671 for(p="AEIOU"; *p; p++) { 672 vow += cfreq[*p]; 673 vow += cfreq[tolower(*p)]; 674 } 675 comm = 0; 676 for(p="ETAION"; *p; p++) { 677 comm += cfreq[*p]; 678 comm += cfreq[tolower(*p)]; 679 } 680 rare = 0; 681 for(p="VJKQXZ"; *p; p++) { 682 rare += cfreq[*p]; 683 rare += cfreq[tolower(*p)]; 684 } 685 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 686 print("English text\n"); 687 return 1; 688 } 689 return 0; 690 } 691 692 /* 693 * pick up a number with 694 * syntax _*[0-9]+_ 695 */ 696 #define P9BITLEN 12 697 int 698 p9bitnum(uchar *bp) 699 { 700 int n, c, len; 701 702 len = P9BITLEN; 703 while(*bp == ' ') { 704 bp++; 705 len--; 706 if(len <= 0) 707 return -1; 708 } 709 n = 0; 710 while(len > 1) { 711 c = *bp++; 712 if(!isdigit(c)) 713 return -1; 714 n = n*10 + c-'0'; 715 len--; 716 } 717 if(*bp != ' ') 718 return -1; 719 return n; 720 } 721 722 int 723 isp9bit(void) 724 { 725 int ldep, lox, loy, hix, hiy, px; 726 ulong t; 727 long len; 728 729 ldep = p9bitnum(buf + 0*P9BITLEN); 730 lox = p9bitnum(buf + 1*P9BITLEN); 731 loy = p9bitnum(buf + 2*P9BITLEN); 732 hix = p9bitnum(buf + 3*P9BITLEN); 733 hiy = p9bitnum(buf + 4*P9BITLEN); 734 735 if(ldep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 736 return 0; 737 738 px = 1<<(3-ldep); /* pixels per byte */ 739 /* set l to number of bytes of data per scan line */ 740 if(lox >= 0) 741 len = (hix+px-1)/px - lox/px; 742 else{ /* make positive before divide */ 743 t = (-lox)+px-1; 744 t = (t/px)*px; 745 len = (t+hix+px-1)/px; 746 } 747 len *= (hiy-loy); /* col length */ 748 len += 5*P9BITLEN; /* size of initial ascii */ 749 750 /* 751 * for bitmap file, length is non-zero and must match calculation above 752 * for /dev/window and /dev/screen the length is always zero 753 * for subfont, the subfont header should follow immediately. 754 */ 755 if (mbuf.length == 0) 756 return 0; 757 if (mbuf.length == len) { 758 print("plan 9 bitmap\n"); 759 return 1; 760 } 761 if (p9subfont(buf+len)) { 762 print("subfont file\n"); 763 return 1; 764 } 765 return 0; 766 } 767 768 int 769 p9subfont(uchar *p) 770 { 771 int n, h, a; 772 773 /* if bitmap too big, assume it's a subfont */ 774 if (p+3*P9BITLEN > buf+sizeof(buf)) 775 return 1; 776 777 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 778 if (n < 0) 779 return 0; 780 h = p9bitnum(p + 1*P9BITLEN); /* height */ 781 if (h < 0) 782 return 0; 783 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 784 if (a < 0) 785 return 0; 786 return 1; 787 } 788 789 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 790 791 int 792 isp9font(void) 793 { 794 uchar *cp, *p; 795 int i, n; 796 char dbuf[DIRLEN]; 797 char pathname[1024]; 798 799 cp = buf; 800 if (!getfontnum(cp, &cp)) /* height */ 801 return 0; 802 if (!getfontnum(cp, &cp)) /* ascent */ 803 return 0; 804 for (i = 0; 1; i++) { 805 if (!getfontnum(cp, &cp)) /* min */ 806 break; 807 if (!getfontnum(cp, &cp)) /* max */ 808 return 0; 809 while (WHITESPACE(*cp)) 810 cp++; 811 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 812 ; 813 /* construct a path name, if needed */ 814 n = 0; 815 if (*p != '/' && slash) { 816 n = slash-fname+1; 817 if (n < sizeof(pathname)) 818 memcpy(pathname, fname, n); 819 else n = 0; 820 } 821 if (n+cp-p < sizeof(pathname)) { 822 memcpy(pathname+n, p, cp-p); 823 n += cp-p; 824 pathname[n] = 0; 825 if (stat(pathname, dbuf) < 0) 826 return 0; 827 } 828 } 829 if (i) { 830 print("font file\n"); 831 return 1; 832 } 833 return 0; 834 } 835 836 int 837 getfontnum(uchar *cp, uchar **rp) 838 { 839 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 840 cp++; 841 if (*cp < '0' || *cp > '9') 842 return 0; 843 strtoul((char *)cp, (char **)rp, 0); 844 if (!WHITESPACE(**rp)) 845 return 0; 846 return 1; 847 } 848