1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "chan", Alword, 51 "char", Cword, 52 "common", Fword, 53 "con", Lword, 54 "data", Fword, 55 "dimension", Fword, 56 "double", Cword, 57 "extern", Cword, 58 "bio", I2, 59 "float", Cword, 60 "fn", Lword, 61 "function", Fword, 62 "h", I3, 63 "implement", Lword, 64 "import", Lword, 65 "include", I1, 66 "int", Cword, 67 "integer", Fword, 68 "iota", Lword, 69 "libc", I2, 70 "long", Cword, 71 "module", Lword, 72 "real", Fword, 73 "ref", Lword, 74 "register", Cword, 75 "self", Lword, 76 "short", Cword, 77 "static", Cword, 78 "stdio", I2, 79 "struct", Cword, 80 "subroutine", Fword, 81 "u", I2, 82 "void", Cword, 83 }; 84 85 /* codes for 'mode' field in language structure */ 86 enum { 87 Normal = 0, 88 First, /* first entry for language spanning several ranges */ 89 Multi, /* later entries " " " ... */ 90 Shared, /* codes used in several languages */ 91 }; 92 93 struct 94 { 95 int mode; /* see enum above */ 96 int count; 97 int low; 98 int high; 99 char *name; 100 101 } language[] = 102 { 103 Normal, 0, 0x0080, 0x0080, "Extended Latin", 104 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 105 Normal, 0, 0x0370, 0x03FF, "Greek", 106 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 107 Normal, 0, 0x0530, 0x058F, "Armenian", 108 Normal, 0, 0x0590, 0x05FF, "Hebrew", 109 Normal, 0, 0x0600, 0x06FF, "Arabic", 110 Normal, 0, 0x0900, 0x097F, "Devanagari", 111 Normal, 0, 0x0980, 0x09FF, "Bengali", 112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 114 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 115 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 116 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 117 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 119 Normal, 0, 0x0E00, 0x0E7F, "Thai", 120 Normal, 0, 0x0E80, 0x0EFF, "Lao", 121 Normal, 0, 0x1000, 0x105F, "Tibetan", 122 Normal, 0, 0x10A0, 0x10FF, "Georgian", 123 Normal, 0, 0x3040, 0x30FF, "Japanese", 124 Normal, 0, 0x3100, 0x312F, "Chinese", 125 First, 0, 0x3130, 0x318F, "Korean", 126 Multi, 0, 0x3400, 0x3D2F, "Korean", 127 Shared, 0, 0x4e00, 0x9fff, "CJK", 128 Normal, 0, 0, 0, 0, /* terminal entry */ 129 }; 130 131 132 enum 133 { 134 Fascii, /* printable ascii */ 135 Flatin, /* latin 1*/ 136 Futf, /* UTf character set */ 137 Fbinary, /* binary */ 138 Feascii, /* ASCII with control chars */ 139 Fnull, /* NULL in file */ 140 } guess; 141 142 void bump_utf_count(Rune); 143 int cistrncmp(char*, char*, int); 144 void filetype(int); 145 int getfontnum(uchar*, uchar**); 146 int isas(void); 147 int isc(void); 148 int iscint(void); 149 int isenglish(void); 150 int ishp(void); 151 int ishtml(void); 152 int isrfc822(void); 153 int ismbox(void); 154 int islimbo(void); 155 int ismung(void); 156 int isp9bit(void); 157 int isp9font(void); 158 int isrtf(void); 159 int ismsdos(void); 160 int iself(void); 161 int istring(void); 162 int iff(void); 163 int long0(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 isrfc822, /* email file */ 182 ismbox, /* mail box */ 183 istar, /* recognizable by tar checksum */ 184 ishtml, /* html keywords */ 185 iscint, /* compiler/assembler intermediate */ 186 islimbo, /* limbo source */ 187 isc, /* c & alef compiler key words */ 188 isas, /* assembler key words */ 189 ismung, /* entropy compressed/encrypted */ 190 isp9font, /* plan 9 font */ 191 isp9bit, /* plan 9 image (as from /dev/window) */ 192 isenglish, /* char frequency English */ 193 isrtf, /* rich text format */ 194 ismsdos, /* msdos exe (virus file attachement) */ 195 isface, /* ascii face file */ 196 0 197 }; 198 199 int mime; 200 201 #define OCTET "application/octet-stream\n" 202 #define PLAIN "text/plain\n" 203 204 void 205 main(int argc, char *argv[]) 206 { 207 int i, j, maxlen; 208 char *cp; 209 Rune r; 210 211 ARGBEGIN{ 212 case 'm': 213 mime = 1; 214 break; 215 default: 216 fprint(2, "usage: file [-m] [file...]\n"); 217 exits("usage"); 218 }ARGEND; 219 220 maxlen = 0; 221 if(mime == 0 || argc > 1){ 222 for(i = 0; i < argc; i++) { 223 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 224 ; 225 if(j > maxlen) 226 maxlen = j; 227 } 228 } 229 if (argc <= 0) { 230 if(!mime) 231 print ("stdin: "); 232 filetype(0); 233 } 234 else { 235 for(i = 0; i < argc; i++) 236 type(argv[i], maxlen); 237 } 238 exits(0); 239 } 240 241 void 242 type(char *file, int nlen) 243 { 244 Rune r; 245 int i; 246 char *p; 247 248 if(nlen > 0){ 249 slash = 0; 250 for (i = 0, p = file; *p; i++) { 251 if (*p == '/') /* find rightmost slash */ 252 slash = p; 253 p += chartorune(&r, p); /* count runes */ 254 } 255 print("%s:%*s",file, nlen-i+1, ""); 256 } 257 fname = file; 258 if ((fd = open(file, OREAD)) < 0) { 259 print("cannot open\n"); 260 return; 261 } 262 filetype(fd); 263 close(fd); 264 } 265 266 void 267 filetype(int fd) 268 { 269 Rune r; 270 int i, f, n; 271 char *p, *eob; 272 273 free(mbuf); 274 mbuf = dirfstat(fd); 275 if(mbuf == nil){ 276 print("cannot stat: %r\n"); 277 return; 278 } 279 if(mbuf->mode & DMDIR) { 280 print(mime ? "text/directory\n" : "directory\n"); 281 return; 282 } 283 if(mbuf->type != 'M' && mbuf->type != '|') { 284 print(mime ? OCTET : "special file #%c/%s\n", 285 mbuf->type, mbuf->name); 286 return; 287 } 288 nbuf = read(fd, buf, sizeof(buf)-1); 289 290 if(nbuf < 0) { 291 print("cannot read\n"); 292 return; 293 } 294 if(nbuf == 0) { 295 print(mime ? PLAIN : "empty file\n"); 296 return; 297 } 298 buf[nbuf] = 0; 299 300 /* 301 * build histogram table 302 */ 303 memset(cfreq, 0, sizeof(cfreq)); 304 for (i = 0; language[i].name; i++) 305 language[i].count = 0; 306 eob = (char *)buf+nbuf; 307 for(n = 0, p = (char *)buf; p < eob; n++) { 308 if (!fullrune(p, eob-p) && eob-p < UTFmax) 309 break; 310 p += chartorune(&r, p); 311 if (r == 0) 312 f = Cnull; 313 else if (r <= 0x7f) { 314 if (!isprint(r) && !isspace(r)) 315 f = Ceascii; /* ASCII control char */ 316 else f = r; 317 } else if (r == 0x080) { 318 bump_utf_count(r); 319 f = Cutf; 320 } else if (r < 0xA0) 321 f = Cbinary; /* Invalid Runes */ 322 else if (r <= 0xff) 323 f = Clatin; /* Latin 1 */ 324 else { 325 bump_utf_count(r); 326 f = Cutf; /* UTF extension */ 327 } 328 cfreq[f]++; /* ASCII chars peg directly */ 329 } 330 /* 331 * gross classify 332 */ 333 if (cfreq[Cbinary]) 334 guess = Fbinary; 335 else if (cfreq[Cutf]) 336 guess = Futf; 337 else if (cfreq[Clatin]) 338 guess = Flatin; 339 else if (cfreq[Ceascii]) 340 guess = Feascii; 341 else if (cfreq[Cnull] == n) { 342 print(mime ? OCTET : "first block all null bytes\n"); 343 return; 344 } 345 else guess = Fascii; 346 /* 347 * lookup dictionary words 348 */ 349 memset(wfreq, 0, sizeof(wfreq)); 350 if(guess == Fascii || guess == Flatin || guess == Futf) 351 wordfreq(); 352 /* 353 * call individual classify routines 354 */ 355 for(i=0; call[i]; i++) 356 if((*call[i])()) 357 return; 358 359 /* 360 * if all else fails, 361 * print out gross classification 362 */ 363 if (nbuf < 100 && !mime) 364 print(mime ? PLAIN : "short "); 365 if (guess == Fascii) 366 print(mime ? PLAIN : "Ascii\n"); 367 else if (guess == Feascii) 368 print(mime ? PLAIN : "extended ascii\n"); 369 else if (guess == Flatin) 370 print(mime ? PLAIN : "latin ascii\n"); 371 else if (guess == Futf && utf_count() < 4) 372 print_utf(); 373 else print(mime ? OCTET : "binary\n"); 374 } 375 376 void 377 bump_utf_count(Rune r) 378 { 379 int low, high, mid; 380 381 high = sizeof(language)/sizeof(language[0])-1; 382 for (low = 0; low < high;) { 383 mid = (low+high)/2; 384 if (r >=language[mid].low) { 385 if (r <= language[mid].high) { 386 language[mid].count++; 387 break; 388 } else low = mid+1; 389 } else high = mid; 390 } 391 } 392 393 int 394 utf_count(void) 395 { 396 int i, count; 397 398 count = 0; 399 for (i = 0; language[i].name; i++) 400 if (language[i].count > 0) 401 switch (language[i].mode) { 402 case Normal: 403 case First: 404 count++; 405 break; 406 default: 407 break; 408 } 409 return count; 410 } 411 412 int 413 chkascii(void) 414 { 415 int i; 416 417 for (i = 'a'; i < 'z'; i++) 418 if (cfreq[i]) 419 return 1; 420 for (i = 'A'; i < 'Z'; i++) 421 if (cfreq[i]) 422 return 1; 423 return 0; 424 } 425 426 int 427 find_first(char *name) 428 { 429 int i; 430 431 for (i = 0; language[i].name != 0; i++) 432 if (language[i].mode == First 433 && strcmp(language[i].name, name) == 0) 434 return i; 435 return -1; 436 } 437 438 void 439 print_utf(void) 440 { 441 int i, printed, j; 442 443 if(mime){ 444 print(PLAIN); 445 return; 446 } 447 if (chkascii()) { 448 printed = 1; 449 print("Ascii"); 450 } else 451 printed = 0; 452 for (i = 0; language[i].name; i++) 453 if (language[i].count) { 454 switch(language[i].mode) { 455 case Multi: 456 j = find_first(language[i].name); 457 if (j < 0) 458 break; 459 if (language[j].count > 0) 460 break; 461 /* Fall through */ 462 case Normal: 463 case First: 464 if (printed) 465 print(" & "); 466 else printed = 1; 467 print("%s", language[i].name); 468 break; 469 case Shared: 470 default: 471 break; 472 } 473 } 474 if(!printed) 475 print("UTF"); 476 print(" text\n"); 477 } 478 479 void 480 wordfreq(void) 481 { 482 int low, high, mid, r; 483 uchar *p, *p2, c; 484 485 p = buf; 486 for(;;) { 487 while (p < buf+nbuf && !isalpha(*p)) 488 p++; 489 if (p >= buf+nbuf) 490 return; 491 p2 = p; 492 while(p < buf+nbuf && isalpha(*p)) 493 p++; 494 c = *p; 495 *p = 0; 496 high = sizeof(dict)/sizeof(dict[0]); 497 for(low = 0;low < high;) { 498 mid = (low+high)/2; 499 r = strcmp(dict[mid].word, (char*)p2); 500 if(r == 0) { 501 wfreq[dict[mid].class]++; 502 break; 503 } 504 if(r < 0) 505 low = mid+1; 506 else 507 high = mid; 508 } 509 *p++ = c; 510 } 511 } 512 513 typedef struct Filemagic Filemagic; 514 struct Filemagic { 515 ulong x; 516 ulong mask; 517 char *desc; 518 char *mime; 519 }; 520 521 Filemagic long0tab[] = { 522 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 523 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 524 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET, 525 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 526 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 527 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 528 070707, 0xFFFF, "cpio archive\n", OCTET, 529 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 530 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 531 }; 532 533 int 534 filemagic(Filemagic *tab, int ntab, ulong x) 535 { 536 int i; 537 538 for(i=0; i<ntab; i++) 539 if((x&tab[i].mask) == tab[i].x){ 540 print(mime ? tab[i].mime : tab[i].desc); 541 return 1; 542 } 543 return 0; 544 } 545 546 int 547 long0(void) 548 { 549 long x; 550 551 x = LENDIAN(buf); 552 if(filemagic(long0tab, nelem(long0tab), x)) 553 return 1; 554 return 0; 555 } 556 557 int 558 isexec(void) 559 { 560 Fhdr f; 561 562 seek(fd, 0, 0); /* reposition to start of file */ 563 if(crackhdr(fd, &f)) { 564 print(mime ? OCTET : "%s\n", f.name); 565 return 1; 566 } 567 return 0; 568 } 569 570 571 /* from tar.c */ 572 enum { NAMSIZ = 100, TBLOCK = 512 }; 573 574 union hblock 575 { 576 char dummy[TBLOCK]; 577 struct header 578 { 579 char name[NAMSIZ]; 580 char mode[8]; 581 char uid[8]; 582 char gid[8]; 583 char size[12]; 584 char mtime[12]; 585 char chksum[8]; 586 char linkflag; 587 char linkname[NAMSIZ]; 588 /* rest are defined by POSIX's ustar format; see p1003.2b */ 589 char magic[6]; /* "ustar" */ 590 char version[2]; 591 char uname[32]; 592 char gname[32]; 593 char devmajor[8]; 594 char devminor[8]; 595 char prefix[155]; /* if non-null, path = prefix "/" name */ 596 } dbuf; 597 }; 598 599 int 600 checksum(union hblock *hp) 601 { 602 int i; 603 char *cp; 604 struct header *hdr = &hp->dbuf; 605 606 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 607 *cp = ' '; 608 i = 0; 609 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 610 i += *cp & 0xff; 611 return i; 612 } 613 614 int 615 istar(void) 616 { 617 int chksum; 618 char tblock[TBLOCK]; 619 union hblock *hp = (union hblock *)tblock; 620 struct header *hdr = &hp->dbuf; 621 622 seek(fd, 0, 0); /* reposition to start of file */ 623 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 624 return 0; 625 chksum = strtol(hdr->chksum, 0, 8); 626 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 627 if (strcmp(hdr->magic, "ustar") == 0) 628 print(mime? "application/x-ustar\n": 629 "posix tar archive\n"); 630 else 631 print(mime? "application/x-tar\n": "tar archive\n"); 632 return 1; 633 } 634 return 0; 635 } 636 637 /* 638 * initial words to classify file 639 */ 640 struct FILE_STRING 641 { 642 char *key; 643 char *filetype; 644 int length; 645 char *mime; 646 } file_string[] = 647 { 648 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 649 "!<arch>\n", "archive", 8, "application/octet-stream", 650 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 651 "#!/bin/rc", "rc executable file", 9, "text/plain", 652 "#!/bin/sh", "sh executable file", 9, "text/plain", 653 "%!", "postscript", 2, "application/postscript", 654 "\004%!", "postscript", 3, "application/postscript", 655 "x T post", "troff output for post", 8, "application/troff", 656 "x T Latin1", "troff output for Latin1", 10, "application/troff", 657 "x T utf", "troff output for UTF", 7, "application/troff", 658 "x T 202", "troff output for 202", 7, "application/troff", 659 "x T aps", "troff output for aps", 7, "application/troff", 660 "GIF", "GIF image", 3, "image/gif", 661 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 662 "%PDF", "PDF", 4, "application/pdf", 663 "<html>\n", "HTML file", 7, "text/html", 664 "<HTML>\n", "HTML file", 7, "text/html", 665 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream", 666 "\111\111\052\000", "tiff", 4, "image/tiff", 667 "\115\115\000\052", "tiff", 4, "image/tiff", 668 "\377\330\377\340", "jpeg", 4, "image/jpeg", 669 "\377\330\377\341", "jpeg", 4, "image/jpeg", 670 "\377\330\377\333", "jpeg", 4, "image/jpeg", 671 "BM", "bmp", 2, "image/bmp", 672 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 673 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 674 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 675 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 676 "\211PNG", "PNG image", 4, "image/png", 677 "P3\n", "ppm", 3, "image/ppm", 678 "P6\n", "ppm", 3, "image/ppm", 679 "/* XPM */\n", "xbm", 10, "image/xbm", 680 0,0,0,0 681 }; 682 683 int 684 istring(void) 685 { 686 int i; 687 struct FILE_STRING *p; 688 689 for(p = file_string; p->key; p++) { 690 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) { 691 if(mime) 692 print("%s\n", p->mime); 693 else 694 print("%s\n", p->filetype); 695 return 1; 696 } 697 } 698 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 699 for(i = 5; i < nbuf; i++) 700 if(buf[i] == '\n') 701 break; 702 if(mime) 703 print(OCTET); 704 else 705 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 706 return 1; 707 } 708 return 0; 709 } 710 711 int 712 iff(void) 713 { 714 if (strncmp((char*)buf, "FORM", 4) == 0 && 715 strncmp((char*)buf+8, "AIFF", 4) == 0) { 716 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 717 return 1; 718 } 719 return 0; 720 } 721 722 char* html_string[] = 723 { 724 "title", 725 "body", 726 "head", 727 "strong", 728 "h1", 729 "h2", 730 "h3", 731 "h4", 732 "h5", 733 "h6", 734 "ul", 735 "li", 736 "dl", 737 "br", 738 "em", 739 0, 740 }; 741 742 int 743 ishtml(void) 744 { 745 uchar *p, *q; 746 int i, count; 747 748 /* compare strings between '<' and '>' to html table */ 749 count = 0; 750 p = buf; 751 for(;;) { 752 while (p < buf+nbuf && *p != '<') 753 p++; 754 p++; 755 if (p >= buf+nbuf) 756 break; 757 if(*p == '/') 758 p++; 759 q = p; 760 while(p < buf+nbuf && *p != '>') 761 p++; 762 if (p >= buf+nbuf) 763 break; 764 for(i = 0; html_string[i]; i++) { 765 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 766 if(count++ > 4) { 767 print(mime ? "text/html\n" : "HTML file\n"); 768 return 1; 769 } 770 break; 771 } 772 } 773 p++; 774 } 775 return 0; 776 } 777 778 char* rfc822_string[] = 779 { 780 "from:", 781 "date:", 782 "to:", 783 "subject:", 784 "received:", 785 "reply to:", 786 "sender:", 787 0, 788 }; 789 790 int 791 isrfc822(void) 792 { 793 794 char *p, *q, *r; 795 int i, count; 796 797 count = 0; 798 p = (char*)buf; 799 for(;;) { 800 q = strchr(p, '\n'); 801 if(q == nil) 802 break; 803 *q = 0; 804 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 805 count++; 806 *q = '\n'; 807 p = q+1; 808 continue; 809 } 810 *q = '\n'; 811 if(*p != '\t' && *p != ' '){ 812 r = strchr(p, ':'); 813 if(r == 0 || r > q) 814 break; 815 for(i = 0; rfc822_string[i]; i++) { 816 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 817 count++; 818 break; 819 } 820 } 821 } 822 p = q+1; 823 } 824 if(count >= 3){ 825 print(mime ? "message/rfc822\n" : "email file\n"); 826 return 1; 827 } 828 return 0; 829 } 830 831 int 832 ismbox(void) 833 { 834 char *p, *q; 835 836 p = (char*)buf; 837 q = strchr(p, '\n'); 838 if(q == nil) 839 return 0; 840 *q = 0; 841 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 842 print(mime ? "text/plain\n" : "mail box\n"); 843 return 1; 844 } 845 *q = '\n'; 846 return 0; 847 } 848 849 int 850 iscint(void) 851 { 852 int type; 853 char *name; 854 Biobuf b; 855 856 if(Binit(&b, fd, OREAD) == Beof) 857 return 0; 858 seek(fd, 0, 0); 859 type = objtype(&b, &name); 860 if(type < 0) 861 return 0; 862 if(mime) 863 print(OCTET); 864 else 865 print("%s intermediate\n", name); 866 return 1; 867 } 868 869 int 870 isc(void) 871 { 872 int n; 873 874 n = wfreq[I1]; 875 /* 876 * includes 877 */ 878 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 879 goto yes; 880 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 881 goto yes; 882 /* 883 * declarations 884 */ 885 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 886 goto yes; 887 /* 888 * assignments 889 */ 890 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 891 goto yes; 892 return 0; 893 894 yes: 895 if(mime){ 896 print(PLAIN); 897 return 1; 898 } 899 if(wfreq[Alword] > 0) 900 print("alef program\n"); 901 else 902 print("c program\n"); 903 return 1; 904 } 905 906 int 907 islimbo(void) 908 { 909 910 /* 911 * includes 912 */ 913 if(wfreq[Lword] < 4) 914 return 0; 915 print(mime ? PLAIN : "limbo program\n"); 916 return 1; 917 } 918 919 int 920 isas(void) 921 { 922 923 /* 924 * includes 925 */ 926 if(wfreq[Aword] < 2) 927 return 0; 928 print(mime ? PLAIN : "as program\n"); 929 return 1; 930 } 931 932 /* 933 * low entropy means encrypted 934 */ 935 int 936 ismung(void) 937 { 938 int i, bucket[8]; 939 float cs; 940 941 if(nbuf < 64) 942 return 0; 943 memset(bucket, 0, sizeof(bucket)); 944 for(i=nbuf-64; i<nbuf; i++) 945 bucket[(buf[i]>>5)&07] += 1; 946 947 cs = 0.; 948 for(i=0; i<8; i++) 949 cs += (bucket[i]-8)*(bucket[i]-8); 950 cs /= 8.; 951 if(cs <= 24.322) { 952 if(buf[0]==0x1f && buf[1]==0x9d) 953 print(mime ? OCTET : "compressed\n"); 954 else 955 if(buf[0]==0x1f && buf[1]==0x8b) 956 print(mime ? OCTET : "gzip compressed\n"); 957 else 958 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 959 print(mime ? OCTET : "bzip2 compressed\n"); 960 else 961 print(mime ? OCTET : "encrypted\n"); 962 return 1; 963 } 964 return 0; 965 } 966 967 /* 968 * english by punctuation and frequencies 969 */ 970 int 971 isenglish(void) 972 { 973 int vow, comm, rare, badpun, punct; 974 char *p; 975 976 if(guess != Fascii && guess != Feascii) 977 return 0; 978 badpun = 0; 979 punct = 0; 980 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 981 switch(*p) { 982 case '.': 983 case ',': 984 case ')': 985 case '%': 986 case ';': 987 case ':': 988 case '?': 989 punct++; 990 if(p[1] != ' ' && p[1] != '\n') 991 badpun++; 992 } 993 if(badpun*5 > punct) 994 return 0; 995 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 996 return 0; 997 if(2*cfreq[';'] > cfreq['e']) 998 return 0; 999 1000 vow = 0; 1001 for(p="AEIOU"; *p; p++) { 1002 vow += cfreq[*p]; 1003 vow += cfreq[tolower(*p)]; 1004 } 1005 comm = 0; 1006 for(p="ETAION"; *p; p++) { 1007 comm += cfreq[*p]; 1008 comm += cfreq[tolower(*p)]; 1009 } 1010 rare = 0; 1011 for(p="VJKQXZ"; *p; p++) { 1012 rare += cfreq[*p]; 1013 rare += cfreq[tolower(*p)]; 1014 } 1015 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1016 print(mime ? PLAIN : "English text\n"); 1017 return 1; 1018 } 1019 return 0; 1020 } 1021 1022 /* 1023 * pick up a number with 1024 * syntax _*[0-9]+_ 1025 */ 1026 #define P9BITLEN 12 1027 int 1028 p9bitnum(uchar *bp) 1029 { 1030 int n, c, len; 1031 1032 len = P9BITLEN; 1033 while(*bp == ' ') { 1034 bp++; 1035 len--; 1036 if(len <= 0) 1037 return -1; 1038 } 1039 n = 0; 1040 while(len > 1) { 1041 c = *bp++; 1042 if(!isdigit(c)) 1043 return -1; 1044 n = n*10 + c-'0'; 1045 len--; 1046 } 1047 if(*bp != ' ') 1048 return -1; 1049 return n; 1050 } 1051 1052 int 1053 depthof(char *s, int *newp) 1054 { 1055 char *es; 1056 int d; 1057 1058 *newp = 0; 1059 es = s+12; 1060 while(s<es && *s==' ') 1061 s++; 1062 if(s == es) 1063 return -1; 1064 if('0'<=*s && *s<='9') 1065 return 1<<strtol(s, 0, 0); 1066 1067 *newp = 1; 1068 d = 0; 1069 while(s<es && *s!=' '){ 1070 s++; /* skip letter */ 1071 d += strtoul(s, &s, 10); 1072 } 1073 1074 switch(d){ 1075 case 32: 1076 case 24: 1077 case 16: 1078 case 8: 1079 return d; 1080 } 1081 return -1; 1082 } 1083 1084 int 1085 isp9bit(void) 1086 { 1087 int dep, lox, loy, hix, hiy, px, new; 1088 ulong t; 1089 long len; 1090 char *newlabel; 1091 1092 newlabel = "old "; 1093 1094 dep = depthof((char*)buf + 0*P9BITLEN, &new); 1095 if(new) 1096 newlabel = ""; 1097 lox = p9bitnum(buf + 1*P9BITLEN); 1098 loy = p9bitnum(buf + 2*P9BITLEN); 1099 hix = p9bitnum(buf + 3*P9BITLEN); 1100 hiy = p9bitnum(buf + 4*P9BITLEN); 1101 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1102 return 0; 1103 1104 if(dep < 8){ 1105 px = 8/dep; /* pixels per byte */ 1106 /* set l to number of bytes of data per scan line */ 1107 if(lox >= 0) 1108 len = (hix+px-1)/px - lox/px; 1109 else{ /* make positive before divide */ 1110 t = (-lox)+px-1; 1111 t = (t/px)*px; 1112 len = (t+hix+px-1)/px; 1113 } 1114 }else 1115 len = (hix-lox)*dep/8; 1116 len *= (hiy-loy); /* col length */ 1117 len += 5*P9BITLEN; /* size of initial ascii */ 1118 1119 /* 1120 * for image file, length is non-zero and must match calculation above 1121 * for /dev/window and /dev/screen the length is always zero 1122 * for subfont, the subfont header should follow immediately. 1123 */ 1124 if (len != 0 && mbuf->length == 0) { 1125 print("%splan 9 image\n", newlabel); 1126 return 1; 1127 } 1128 if (mbuf->length == len) { 1129 print("%splan 9 image\n", newlabel); 1130 return 1; 1131 } 1132 /* Ghostscript sometimes produces a little extra on the end */ 1133 if (mbuf->length < len+P9BITLEN) { 1134 print("%splan 9 image\n", newlabel); 1135 return 1; 1136 } 1137 if (p9subfont(buf+len)) { 1138 print("%ssubfont file\n", newlabel); 1139 return 1; 1140 } 1141 return 0; 1142 } 1143 1144 int 1145 p9subfont(uchar *p) 1146 { 1147 int n, h, a; 1148 1149 /* if image too big, assume it's a subfont */ 1150 if (p+3*P9BITLEN > buf+sizeof(buf)) 1151 return 1; 1152 1153 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1154 if (n < 0) 1155 return 0; 1156 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1157 if (h < 0) 1158 return 0; 1159 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1160 if (a < 0) 1161 return 0; 1162 return 1; 1163 } 1164 1165 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1166 1167 int 1168 isp9font(void) 1169 { 1170 uchar *cp, *p; 1171 int i, n; 1172 char pathname[1024]; 1173 1174 cp = buf; 1175 if (!getfontnum(cp, &cp)) /* height */ 1176 return 0; 1177 if (!getfontnum(cp, &cp)) /* ascent */ 1178 return 0; 1179 for (i = 0;; i++) { 1180 if (!getfontnum(cp, &cp)) /* min */ 1181 break; 1182 if (!getfontnum(cp, &cp)) /* max */ 1183 return 0; 1184 while (WHITESPACE(*cp)) 1185 cp++; 1186 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1187 ; 1188 /* construct a path name, if needed */ 1189 n = 0; 1190 if (*p != '/' && slash) { 1191 n = slash-fname+1; 1192 if (n < sizeof(pathname)) 1193 memcpy(pathname, fname, n); 1194 else n = 0; 1195 } 1196 if (n+cp-p < sizeof(pathname)) { 1197 memcpy(pathname+n, p, cp-p); 1198 n += cp-p; 1199 pathname[n] = 0; 1200 if (access(pathname, AEXIST) < 0) 1201 return 0; 1202 } 1203 } 1204 if (i) { 1205 print(mime ? "text/plain\n" : "font file\n"); 1206 return 1; 1207 } 1208 return 0; 1209 } 1210 1211 int 1212 getfontnum(uchar *cp, uchar **rp) 1213 { 1214 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1215 cp++; 1216 if (*cp < '0' || *cp > '9') 1217 return 0; 1218 strtoul((char *)cp, (char **)rp, 0); 1219 if (!WHITESPACE(**rp)) 1220 return 0; 1221 return 1; 1222 } 1223 1224 int 1225 isrtf(void) 1226 { 1227 if(strstr((char *)buf, "\\rtf1")){ 1228 print(mime ? "application/rtf\n" : "rich text format\n"); 1229 return 1; 1230 } 1231 return 0; 1232 } 1233 1234 int 1235 ismsdos(void) 1236 { 1237 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1238 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1239 return 1; 1240 } 1241 return 0; 1242 } 1243 1244 int 1245 iself(void) 1246 { 1247 char *cpu[] = { /* NB: incomplete and arbitary list */ 1248 [1] "WE32100", 1249 [2] "SPARC", 1250 [3] "i386", 1251 [4] "M68000", 1252 [5] "M88000", 1253 [6] "i486", 1254 [7] "i860", 1255 [8] "R3000", 1256 [9] "S370", 1257 [10] "R4000", 1258 [15] "HP-PA", 1259 [18] "sparc v8+", 1260 [19] "i960", 1261 [20] "PPC-32", 1262 [21] "PPC-64", 1263 [40] "ARM", 1264 [41] "Alpha", 1265 [43] "sparc v9", 1266 [50] "IA-46", 1267 [62] "AMD64", 1268 [75] "VAX", 1269 }; 1270 1271 1272 if (memcmp(buf, "\x7fELF", 4) == 0){ 1273 if (!mime){ 1274 int n = (buf[19] << 8) | buf[18]; 1275 char *p = "unknown"; 1276 1277 if (n > 0 && n < nelem(cpu) && cpu[n]) 1278 p = cpu[n]; 1279 else { 1280 /* try the other byte order */ 1281 n = (buf[18] << 8) | buf[19]; 1282 if (n > 0 && n < nelem(cpu) && cpu[n]) 1283 p = cpu[n]; 1284 } 1285 print("%s ELF executable\n", p); 1286 } 1287 else 1288 print("application/x-elf-executable"); 1289 return 1; 1290 } 1291 1292 return 0; 1293 } 1294 1295 int 1296 isface(void) 1297 { 1298 int i, j, ldepth, l; 1299 char *p; 1300 1301 ldepth = -1; 1302 for(j = 0; j < 3; j++){ 1303 for(p = (char*)buf, i=0; i<3; i++){ 1304 if(p[0] != '0' || p[1] != 'x') 1305 return 0; 1306 if(buf[2+8] == ',') 1307 l = 2; 1308 else if(buf[2+4] == ',') 1309 l = 1; 1310 else 1311 return 0; 1312 if(ldepth == -1) 1313 ldepth = l; 1314 if(l != ldepth) 1315 return 0; 1316 strtoul(p, &p, 16); 1317 if(*p++ != ',') 1318 return 0; 1319 while(*p == ' ' || *p == '\t') 1320 p++; 1321 } 1322 if (*p++ != '\n') 1323 return 0; 1324 } 1325 1326 if(mime) 1327 print("application/x-face\n"); 1328 else 1329 print("face image depth %d\n", ldepth); 1330 return 1; 1331 } 1332 1333