1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "chan", Alword, 51 "char", Cword, 52 "common", Fword, 53 "con", Lword, 54 "data", Fword, 55 "dimension", Fword, 56 "double", Cword, 57 "extern", Cword, 58 "bio", I2, 59 "float", Cword, 60 "fn", Lword, 61 "function", Fword, 62 "h", I3, 63 "implement", Lword, 64 "import", Lword, 65 "include", I1, 66 "int", Cword, 67 "integer", Fword, 68 "iota", Lword, 69 "libc", I2, 70 "long", Cword, 71 "module", Lword, 72 "real", Fword, 73 "ref", Lword, 74 "register", Cword, 75 "self", Lword, 76 "short", Cword, 77 "static", Cword, 78 "stdio", I2, 79 "struct", Cword, 80 "subroutine", Fword, 81 "u", I2, 82 "void", Cword, 83 }; 84 85 /* codes for 'mode' field in language structure */ 86 enum { 87 Normal = 0, 88 First, /* first entry for language spanning several ranges */ 89 Multi, /* later entries " " " ... */ 90 Shared, /* codes used in several languages */ 91 }; 92 93 struct 94 { 95 int mode; /* see enum above */ 96 int count; 97 int low; 98 int high; 99 char *name; 100 101 } language[] = 102 { 103 Normal, 0, 0x0080, 0x0080, "Extended Latin", 104 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 105 Normal, 0, 0x0370, 0x03FF, "Greek", 106 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 107 Normal, 0, 0x0530, 0x058F, "Armenian", 108 Normal, 0, 0x0590, 0x05FF, "Hebrew", 109 Normal, 0, 0x0600, 0x06FF, "Arabic", 110 Normal, 0, 0x0900, 0x097F, "Devanagari", 111 Normal, 0, 0x0980, 0x09FF, "Bengali", 112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 114 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 115 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 116 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 117 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 119 Normal, 0, 0x0E00, 0x0E7F, "Thai", 120 Normal, 0, 0x0E80, 0x0EFF, "Lao", 121 Normal, 0, 0x1000, 0x105F, "Tibetan", 122 Normal, 0, 0x10A0, 0x10FF, "Georgian", 123 Normal, 0, 0x3040, 0x30FF, "Japanese", 124 Normal, 0, 0x3100, 0x312F, "Chinese", 125 First, 0, 0x3130, 0x318F, "Korean", 126 Multi, 0, 0x3400, 0x3D2F, "Korean", 127 Shared, 0, 0x4e00, 0x9fff, "CJK", 128 Normal, 0, 0, 0, 0, /* terminal entry */ 129 }; 130 131 132 enum 133 { 134 Fascii, /* printable ascii */ 135 Flatin, /* latin 1*/ 136 Futf, /* UTf character set */ 137 Fbinary, /* binary */ 138 Feascii, /* ASCII with control chars */ 139 Fnull, /* NULL in file */ 140 } guess; 141 142 void bump_utf_count(Rune); 143 int cistrncmp(char*, char*, int); 144 void filetype(int); 145 int getfontnum(uchar*, uchar**); 146 int isas(void); 147 int isc(void); 148 int iscint(void); 149 int isenglish(void); 150 int ishp(void); 151 int ishtml(void); 152 int isrfc822(void); 153 int ismbox(void); 154 int islimbo(void); 155 int ismung(void); 156 int isp9bit(void); 157 int isp9font(void); 158 int isrtf(void); 159 int ismsdos(void); 160 int iself(void); 161 int istring(void); 162 int long0(void); 163 int istar(void); 164 int p9bitnum(uchar*); 165 int p9subfont(uchar*); 166 void print_utf(void); 167 void type(char*, int); 168 int utf_count(void); 169 void wordfreq(void); 170 171 int (*call[])(void) = 172 { 173 long0, /* recognizable by first 4 bytes */ 174 istring, /* recognizable by first string */ 175 isrfc822, /* email file */ 176 ismbox, /* mail box */ 177 istar, /* recognizable by tar checksum */ 178 ishtml, /* html keywords */ 179 iscint, /* compiler/assembler intermediate */ 180 islimbo, /* limbo source */ 181 isc, /* c & alef compiler key words */ 182 isas, /* assembler key words */ 183 ismung, /* entropy compressed/encrypted */ 184 isp9font, /* plan 9 font */ 185 isp9bit, /* plan 9 image (as from /dev/window) */ 186 isenglish, /* char frequency English */ 187 isrtf, /* rich text format */ 188 ismsdos, /* msdos exe (virus file attachement) */ 189 iself, /* ELF (foreign) executable */ 190 0 191 }; 192 193 int mime; 194 195 #define OCTET "application/octet-stream\n" 196 #define PLAIN "text/plain\n" 197 198 void 199 main(int argc, char *argv[]) 200 { 201 int i, j, maxlen; 202 char *cp; 203 Rune r; 204 205 ARGBEGIN{ 206 case 'm': 207 mime = 1; 208 break; 209 default: 210 fprint(2, "usage: file [-m] [file...]\n"); 211 exits("usage"); 212 }ARGEND; 213 214 maxlen = 0; 215 if(mime == 0 || argc > 1){ 216 for(i = 0; i < argc; i++) { 217 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 218 ; 219 if(j > maxlen) 220 maxlen = j; 221 } 222 } 223 if (argc <= 0) { 224 if(!mime) 225 print ("stdin: "); 226 filetype(0); 227 } 228 else { 229 for(i = 0; i < argc; i++) 230 type(argv[i], maxlen); 231 } 232 exits(0); 233 } 234 235 void 236 type(char *file, int nlen) 237 { 238 Rune r; 239 int i; 240 char *p; 241 242 if(nlen > 0){ 243 slash = 0; 244 for (i = 0, p = file; *p; i++) { 245 if (*p == '/') /* find rightmost slash */ 246 slash = p; 247 p += chartorune(&r, p); /* count runes */ 248 } 249 print("%s:%*s",file, nlen-i+1, ""); 250 } 251 fname = file; 252 if ((fd = open(file, OREAD)) < 0) { 253 print("cannot open\n"); 254 return; 255 } 256 filetype(fd); 257 close(fd); 258 } 259 260 void 261 filetype(int fd) 262 { 263 Rune r; 264 int i, f, n; 265 char *p, *eob; 266 267 free(mbuf); 268 mbuf = dirfstat(fd); 269 if(mbuf == nil){ 270 print("cannot stat: %r\n"); 271 return; 272 } 273 if(mbuf->mode & DMDIR) { 274 print(mime ? "text/directory\n" : "directory\n"); 275 return; 276 } 277 if(mbuf->type != 'M' && mbuf->type != '|') { 278 print(mime ? OCTET : "special file #%c/%s\n", 279 mbuf->type, mbuf->name); 280 return; 281 } 282 nbuf = read(fd, buf, sizeof(buf)-1); 283 284 if(nbuf < 0) { 285 print("cannot read\n"); 286 return; 287 } 288 if(nbuf == 0) { 289 print(mime ? PLAIN : "empty file\n"); 290 return; 291 } 292 buf[nbuf] = 0; 293 294 /* 295 * build histogram table 296 */ 297 memset(cfreq, 0, sizeof(cfreq)); 298 for (i = 0; language[i].name; i++) 299 language[i].count = 0; 300 eob = (char *)buf+nbuf; 301 for(n = 0, p = (char *)buf; p < eob; n++) { 302 if (!fullrune(p, eob-p) && eob-p < UTFmax) 303 break; 304 p += chartorune(&r, p); 305 if (r == 0) 306 f = Cnull; 307 else if (r <= 0x7f) { 308 if (!isprint(r) && !isspace(r)) 309 f = Ceascii; /* ASCII control char */ 310 else f = r; 311 } else if (r == 0x080) { 312 bump_utf_count(r); 313 f = Cutf; 314 } else if (r < 0xA0) 315 f = Cbinary; /* Invalid Runes */ 316 else if (r <= 0xff) 317 f = Clatin; /* Latin 1 */ 318 else { 319 bump_utf_count(r); 320 f = Cutf; /* UTF extension */ 321 } 322 cfreq[f]++; /* ASCII chars peg directly */ 323 } 324 /* 325 * gross classify 326 */ 327 if (cfreq[Cbinary]) 328 guess = Fbinary; 329 else if (cfreq[Cutf]) 330 guess = Futf; 331 else if (cfreq[Clatin]) 332 guess = Flatin; 333 else if (cfreq[Ceascii]) 334 guess = Feascii; 335 else if (cfreq[Cnull] == n) { 336 print(mime ? OCTET : "first block all null bytes\n"); 337 return; 338 } 339 else guess = Fascii; 340 /* 341 * lookup dictionary words 342 */ 343 memset(wfreq, 0, sizeof(wfreq)); 344 if(guess == Fascii || guess == Flatin || guess == Futf) 345 wordfreq(); 346 /* 347 * call individual classify routines 348 */ 349 for(i=0; call[i]; i++) 350 if((*call[i])()) 351 return; 352 353 /* 354 * if all else fails, 355 * print out gross classification 356 */ 357 if (nbuf < 100 && !mime) 358 print(mime ? PLAIN : "short "); 359 if (guess == Fascii) 360 print(mime ? PLAIN : "Ascii\n"); 361 else if (guess == Feascii) 362 print(mime ? PLAIN : "extended ascii\n"); 363 else if (guess == Flatin) 364 print(mime ? PLAIN : "latin ascii\n"); 365 else if (guess == Futf && utf_count() < 4) 366 print_utf(); 367 else print(mime ? OCTET : "binary\n"); 368 } 369 370 void 371 bump_utf_count(Rune r) 372 { 373 int low, high, mid; 374 375 high = sizeof(language)/sizeof(language[0])-1; 376 for (low = 0; low < high;) { 377 mid = (low+high)/2; 378 if (r >=language[mid].low) { 379 if (r <= language[mid].high) { 380 language[mid].count++; 381 break; 382 } else low = mid+1; 383 } else high = mid; 384 } 385 } 386 387 int 388 utf_count(void) 389 { 390 int i, count; 391 392 count = 0; 393 for (i = 0; language[i].name; i++) 394 if (language[i].count > 0) 395 switch (language[i].mode) { 396 case Normal: 397 case First: 398 count++; 399 break; 400 default: 401 break; 402 } 403 return count; 404 } 405 406 int 407 chkascii(void) 408 { 409 int i; 410 411 for (i = 'a'; i < 'z'; i++) 412 if (cfreq[i]) 413 return 1; 414 for (i = 'A'; i < 'Z'; i++) 415 if (cfreq[i]) 416 return 1; 417 return 0; 418 } 419 420 int 421 find_first(char *name) 422 { 423 int i; 424 425 for (i = 0; language[i].name != 0; i++) 426 if (language[i].mode == First 427 && strcmp(language[i].name, name) == 0) 428 return i; 429 return -1; 430 } 431 432 void 433 print_utf(void) 434 { 435 int i, printed, j; 436 437 if(mime){ 438 print(PLAIN); 439 return; 440 } 441 if (chkascii()) { 442 printed = 1; 443 print("Ascii"); 444 } else 445 printed = 0; 446 for (i = 0; language[i].name; i++) 447 if (language[i].count) { 448 switch(language[i].mode) { 449 case Multi: 450 j = find_first(language[i].name); 451 if (j < 0) 452 break; 453 if (language[j].count > 0) 454 break; 455 /* Fall through */ 456 case Normal: 457 case First: 458 if (printed) 459 print(" & "); 460 else printed = 1; 461 print("%s", language[i].name); 462 break; 463 case Shared: 464 default: 465 break; 466 } 467 } 468 if(!printed) 469 print("UTF"); 470 print(" text\n"); 471 } 472 473 void 474 wordfreq(void) 475 { 476 int low, high, mid, r; 477 uchar *p, *p2, c; 478 479 p = buf; 480 for(;;) { 481 while (p < buf+nbuf && !isalpha(*p)) 482 p++; 483 if (p >= buf+nbuf) 484 return; 485 p2 = p; 486 while(p < buf+nbuf && isalpha(*p)) 487 p++; 488 c = *p; 489 *p = 0; 490 high = sizeof(dict)/sizeof(dict[0]); 491 for(low = 0;low < high;) { 492 mid = (low+high)/2; 493 r = strcmp(dict[mid].word, (char*)p2); 494 if(r == 0) { 495 wfreq[dict[mid].class]++; 496 break; 497 } 498 if(r < 0) 499 low = mid+1; 500 else 501 high = mid; 502 } 503 *p++ = c; 504 } 505 } 506 507 typedef struct Filemagic Filemagic; 508 struct Filemagic { 509 ulong x; 510 ulong mask; 511 char *desc; 512 char *mime; 513 }; 514 515 Filemagic long0tab[] = { 516 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 517 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 518 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET, 519 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 520 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 521 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 522 070707, 0xFFFF, "cpio archive\n", OCTET, 523 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 524 }; 525 526 int 527 filemagic(Filemagic *tab, int ntab, ulong x) 528 { 529 int i; 530 531 for(i=0; i<ntab; i++) 532 if((x&tab[i].mask) == tab[i].x){ 533 print(mime ? tab[i].mime : tab[i].desc); 534 return 1; 535 } 536 return 0; 537 } 538 539 int 540 long0(void) 541 { 542 Fhdr f; 543 long x; 544 545 seek(fd, 0, 0); /* reposition to start of file */ 546 if(crackhdr(fd, &f)) { 547 print(mime ? OCTET : "%s\n", f.name); 548 return 1; 549 } 550 x = LENDIAN(buf); 551 if(filemagic(long0tab, nelem(long0tab), x)) 552 return 1; 553 return 0; 554 } 555 556 /* from tar.c */ 557 enum { NAMSIZ = 100, TBLOCK = 512 }; 558 559 union hblock 560 { 561 char dummy[TBLOCK]; 562 struct header 563 { 564 char name[NAMSIZ]; 565 char mode[8]; 566 char uid[8]; 567 char gid[8]; 568 char size[12]; 569 char mtime[12]; 570 char chksum[8]; 571 char linkflag; 572 char linkname[NAMSIZ]; 573 /* rest are defined by POSIX's ustar format; see p1003.2b */ 574 char magic[6]; /* "ustar" */ 575 char version[2]; 576 char uname[32]; 577 char gname[32]; 578 char devmajor[8]; 579 char devminor[8]; 580 char prefix[155]; /* if non-null, path = prefix "/" name */ 581 } dbuf; 582 }; 583 584 int 585 checksum(union hblock *hp) 586 { 587 int i; 588 char *cp; 589 struct header *hdr = &hp->dbuf; 590 591 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 592 *cp = ' '; 593 i = 0; 594 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 595 i += *cp & 0xff; 596 return i; 597 } 598 599 int 600 istar(void) 601 { 602 int chksum; 603 char tblock[TBLOCK]; 604 union hblock *hp = (union hblock *)tblock; 605 struct header *hdr = &hp->dbuf; 606 607 seek(fd, 0, 0); /* reposition to start of file */ 608 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 609 return 0; 610 chksum = strtol(hdr->chksum, 0, 8); 611 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 612 if (strcmp(hdr->magic, "ustar") == 0) 613 print(mime? "application/x-ustar\n": 614 "posix tar archive\n"); 615 else 616 print(mime? "application/x-tar\n": "tar archive\n"); 617 return 1; 618 } 619 return 0; 620 } 621 622 /* 623 * initial words to classify file 624 */ 625 struct FILE_STRING 626 { 627 char *key; 628 char *filetype; 629 int length; 630 char *mime; 631 } file_string[] = 632 { 633 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 634 "!<arch>\n", "archive", 8, "application/octet-stream", 635 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 636 "#!/bin/rc", "rc executable file", 9, "text/plain", 637 "#!/bin/sh", "sh executable file", 9, "text/plain", 638 "%!", "postscript", 2, "application/postscript", 639 "\004%!", "postscript", 3, "application/postscript", 640 "x T post", "troff output for post", 8, "application/troff", 641 "x T Latin1", "troff output for Latin1", 10, "application/troff", 642 "x T utf", "troff output for UTF", 7, "application/troff", 643 "x T 202", "troff output for 202", 7, "application/troff", 644 "x T aps", "troff output for aps", 7, "application/troff", 645 "GIF", "GIF image", 3, "image/gif", 646 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 647 "%PDF", "PDF", 4, "application/pdf", 648 "<html>\n", "HTML file", 7, "text/html", 649 "<HTML>\n", "HTML file", 7, "text/html", 650 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream", 651 "\111\111\052\000", "tiff", 4, "image/tiff", 652 "\115\115\000\052", "tiff", 4, "image/tiff", 653 "\377\330\377\340", "jpeg", 4, "image/jpeg", 654 "\377\330\377\341", "jpeg", 4, "image/jpeg", 655 "\377\330\377\333", "jpeg", 4, "image/jpeg", 656 "BM", "bmp", 2, "image/bmp", 657 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 658 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 659 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 660 0,0,0,0 661 }; 662 663 int 664 istring(void) 665 { 666 int i; 667 struct FILE_STRING *p; 668 669 for(p = file_string; p->key; p++) { 670 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) { 671 if(mime) 672 print("%s\n", p->mime); 673 else 674 print("%s\n", p->filetype); 675 return 1; 676 } 677 } 678 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 679 for(i = 5; i < nbuf; i++) 680 if(buf[i] == '\n') 681 break; 682 if(mime) 683 print(OCTET); 684 else 685 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 686 return 1; 687 } 688 return 0; 689 } 690 691 char* html_string[] = 692 { 693 "title", 694 "body", 695 "head", 696 "strong", 697 "h1", 698 "h2", 699 "h3", 700 "h4", 701 "h5", 702 "h6", 703 "ul", 704 "li", 705 "dl", 706 "br", 707 "em", 708 0, 709 }; 710 711 int 712 ishtml(void) 713 { 714 uchar *p, *q; 715 int i, count; 716 717 /* compare strings between '<' and '>' to html table */ 718 count = 0; 719 p = buf; 720 for(;;) { 721 while (p < buf+nbuf && *p != '<') 722 p++; 723 p++; 724 if (p >= buf+nbuf) 725 break; 726 if(*p == '/') 727 p++; 728 q = p; 729 while(p < buf+nbuf && *p != '>') 730 p++; 731 if (p >= buf+nbuf) 732 break; 733 for(i = 0; html_string[i]; i++) { 734 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 735 if(count++ > 4) { 736 print(mime ? "text/html\n" : "HTML file\n"); 737 return 1; 738 } 739 break; 740 } 741 } 742 p++; 743 } 744 return 0; 745 } 746 747 char* rfc822_string[] = 748 { 749 "from:", 750 "date:", 751 "to:", 752 "subject:", 753 "received:", 754 "reply to:", 755 "sender:", 756 0, 757 }; 758 759 int 760 isrfc822(void) 761 { 762 763 char *p, *q, *r; 764 int i, count; 765 766 count = 0; 767 p = (char*)buf; 768 for(;;) { 769 q = strchr(p, '\n'); 770 if(q == nil) 771 break; 772 *q = 0; 773 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 774 count++; 775 *q = '\n'; 776 p = q+1; 777 continue; 778 } 779 *q = '\n'; 780 if(*p != '\t' && *p != ' '){ 781 r = strchr(p, ':'); 782 if(r == 0 || r > q) 783 break; 784 for(i = 0; rfc822_string[i]; i++) { 785 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 786 count++; 787 break; 788 } 789 } 790 } 791 p = q+1; 792 } 793 if(count >= 3){ 794 print(mime ? "message/rfc822\n" : "email file\n"); 795 return 1; 796 } 797 return 0; 798 } 799 800 int 801 ismbox(void) 802 { 803 char *p, *q; 804 805 p = (char*)buf; 806 q = strchr(p, '\n'); 807 if(q == nil) 808 return 0; 809 *q = 0; 810 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 811 print(mime ? "text/plain\n" : "mail box\n"); 812 return 1; 813 } 814 *q = '\n'; 815 return 0; 816 } 817 818 int 819 iscint(void) 820 { 821 int type; 822 char *name; 823 Biobuf b; 824 825 if(Binit(&b, fd, OREAD) == Beof) 826 return 0; 827 seek(fd, 0, 0); 828 type = objtype(&b, &name); 829 if(type < 0) 830 return 0; 831 if(mime) 832 print(OCTET); 833 else 834 print("%s intermediate\n", name); 835 return 1; 836 } 837 838 int 839 isc(void) 840 { 841 int n; 842 843 n = wfreq[I1]; 844 /* 845 * includes 846 */ 847 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 848 goto yes; 849 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 850 goto yes; 851 /* 852 * declarations 853 */ 854 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 855 goto yes; 856 /* 857 * assignments 858 */ 859 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 860 goto yes; 861 return 0; 862 863 yes: 864 if(mime){ 865 print(PLAIN); 866 return 1; 867 } 868 if(wfreq[Alword] > 0) 869 print("alef program\n"); 870 else 871 print("c program\n"); 872 return 1; 873 } 874 875 int 876 islimbo(void) 877 { 878 879 /* 880 * includes 881 */ 882 if(wfreq[Lword] < 4) 883 return 0; 884 print(mime ? PLAIN : "limbo program\n"); 885 return 1; 886 } 887 888 int 889 isas(void) 890 { 891 892 /* 893 * includes 894 */ 895 if(wfreq[Aword] < 2) 896 return 0; 897 print(mime ? PLAIN : "as program\n"); 898 return 1; 899 } 900 901 /* 902 * low entropy means encrypted 903 */ 904 int 905 ismung(void) 906 { 907 int i, bucket[8]; 908 float cs; 909 910 if(nbuf < 64) 911 return 0; 912 memset(bucket, 0, sizeof(bucket)); 913 for(i=0; i<64; i++) 914 bucket[(buf[i]>>5)&07] += 1; 915 916 cs = 0.; 917 for(i=0; i<8; i++) 918 cs += (bucket[i]-8)*(bucket[i]-8); 919 cs /= 8.; 920 if(cs <= 24.322) { 921 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d)) 922 print(mime ? OCTET : "compressed\n"); 923 else 924 print(mime ? OCTET : "encrypted\n"); 925 return 1; 926 } 927 return 0; 928 } 929 930 /* 931 * english by punctuation and frequencies 932 */ 933 int 934 isenglish(void) 935 { 936 int vow, comm, rare, badpun, punct; 937 char *p; 938 939 if(guess != Fascii && guess != Feascii) 940 return 0; 941 badpun = 0; 942 punct = 0; 943 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 944 switch(*p) { 945 case '.': 946 case ',': 947 case ')': 948 case '%': 949 case ';': 950 case ':': 951 case '?': 952 punct++; 953 if(p[1] != ' ' && p[1] != '\n') 954 badpun++; 955 } 956 if(badpun*5 > punct) 957 return 0; 958 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 959 return 0; 960 if(2*cfreq[';'] > cfreq['e']) 961 return 0; 962 963 vow = 0; 964 for(p="AEIOU"; *p; p++) { 965 vow += cfreq[*p]; 966 vow += cfreq[tolower(*p)]; 967 } 968 comm = 0; 969 for(p="ETAION"; *p; p++) { 970 comm += cfreq[*p]; 971 comm += cfreq[tolower(*p)]; 972 } 973 rare = 0; 974 for(p="VJKQXZ"; *p; p++) { 975 rare += cfreq[*p]; 976 rare += cfreq[tolower(*p)]; 977 } 978 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 979 print(mime ? PLAIN : "English text\n"); 980 return 1; 981 } 982 return 0; 983 } 984 985 /* 986 * pick up a number with 987 * syntax _*[0-9]+_ 988 */ 989 #define P9BITLEN 12 990 int 991 p9bitnum(uchar *bp) 992 { 993 int n, c, len; 994 995 len = P9BITLEN; 996 while(*bp == ' ') { 997 bp++; 998 len--; 999 if(len <= 0) 1000 return -1; 1001 } 1002 n = 0; 1003 while(len > 1) { 1004 c = *bp++; 1005 if(!isdigit(c)) 1006 return -1; 1007 n = n*10 + c-'0'; 1008 len--; 1009 } 1010 if(*bp != ' ') 1011 return -1; 1012 return n; 1013 } 1014 1015 int 1016 depthof(char *s, int *newp) 1017 { 1018 char *es; 1019 int d; 1020 1021 *newp = 0; 1022 es = s+12; 1023 while(s<es && *s==' ') 1024 s++; 1025 if(s == es) 1026 return -1; 1027 if('0'<=*s && *s<='9') 1028 return 1<<atoi(s); 1029 1030 *newp = 1; 1031 d = 0; 1032 while(s<es && *s!=' '){ 1033 s++; /* skip letter */ 1034 d += strtoul(s, &s, 10); 1035 } 1036 1037 switch(d){ 1038 case 32: 1039 case 24: 1040 case 16: 1041 case 8: 1042 return d; 1043 } 1044 return -1; 1045 } 1046 1047 int 1048 isp9bit(void) 1049 { 1050 int dep, lox, loy, hix, hiy, px, new; 1051 ulong t; 1052 long len; 1053 char *newlabel; 1054 1055 newlabel = "old "; 1056 1057 dep = depthof((char*)buf + 0*P9BITLEN, &new); 1058 if(new) 1059 newlabel = ""; 1060 lox = p9bitnum(buf + 1*P9BITLEN); 1061 loy = p9bitnum(buf + 2*P9BITLEN); 1062 hix = p9bitnum(buf + 3*P9BITLEN); 1063 hiy = p9bitnum(buf + 4*P9BITLEN); 1064 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1065 return 0; 1066 1067 if(dep < 8){ 1068 px = 8/dep; /* pixels per byte */ 1069 /* set l to number of bytes of data per scan line */ 1070 if(lox >= 0) 1071 len = (hix+px-1)/px - lox/px; 1072 else{ /* make positive before divide */ 1073 t = (-lox)+px-1; 1074 t = (t/px)*px; 1075 len = (t+hix+px-1)/px; 1076 } 1077 }else 1078 len = (hix-lox)*dep/8; 1079 len *= (hiy-loy); /* col length */ 1080 len += 5*P9BITLEN; /* size of initial ascii */ 1081 1082 /* 1083 * for image file, length is non-zero and must match calculation above 1084 * for /dev/window and /dev/screen the length is always zero 1085 * for subfont, the subfont header should follow immediately. 1086 */ 1087 if (len != 0 && mbuf->length == 0) { 1088 print("%splan 9 image\n", newlabel); 1089 return 1; 1090 } 1091 if (mbuf->length == len) { 1092 print("%splan 9 image\n", newlabel); 1093 return 1; 1094 } 1095 /* Ghostscript sometimes produces a little extra on the end */ 1096 if (mbuf->length < len+P9BITLEN) { 1097 print("%splan 9 image\n", newlabel); 1098 return 1; 1099 } 1100 if (p9subfont(buf+len)) { 1101 print("%ssubfont file\n", newlabel); 1102 return 1; 1103 } 1104 return 0; 1105 } 1106 1107 int 1108 p9subfont(uchar *p) 1109 { 1110 int n, h, a; 1111 1112 /* if image too big, assume it's a subfont */ 1113 if (p+3*P9BITLEN > buf+sizeof(buf)) 1114 return 1; 1115 1116 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1117 if (n < 0) 1118 return 0; 1119 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1120 if (h < 0) 1121 return 0; 1122 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1123 if (a < 0) 1124 return 0; 1125 return 1; 1126 } 1127 1128 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1129 1130 int 1131 isp9font(void) 1132 { 1133 uchar *cp, *p; 1134 int i, n; 1135 char pathname[1024]; 1136 1137 cp = buf; 1138 if (!getfontnum(cp, &cp)) /* height */ 1139 return 0; 1140 if (!getfontnum(cp, &cp)) /* ascent */ 1141 return 0; 1142 for (i = 0; 1; i++) { 1143 if (!getfontnum(cp, &cp)) /* min */ 1144 break; 1145 if (!getfontnum(cp, &cp)) /* max */ 1146 return 0; 1147 while (WHITESPACE(*cp)) 1148 cp++; 1149 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1150 ; 1151 /* construct a path name, if needed */ 1152 n = 0; 1153 if (*p != '/' && slash) { 1154 n = slash-fname+1; 1155 if (n < sizeof(pathname)) 1156 memcpy(pathname, fname, n); 1157 else n = 0; 1158 } 1159 if (n+cp-p < sizeof(pathname)) { 1160 memcpy(pathname+n, p, cp-p); 1161 n += cp-p; 1162 pathname[n] = 0; 1163 if (access(pathname, AEXIST) < 0) 1164 return 0; 1165 } 1166 } 1167 if (i) { 1168 print(mime ? "text/plain\n" : "font file\n"); 1169 return 1; 1170 } 1171 return 0; 1172 } 1173 1174 int 1175 getfontnum(uchar *cp, uchar **rp) 1176 { 1177 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1178 cp++; 1179 if (*cp < '0' || *cp > '9') 1180 return 0; 1181 strtoul((char *)cp, (char **)rp, 0); 1182 if (!WHITESPACE(**rp)) 1183 return 0; 1184 return 1; 1185 } 1186 1187 int 1188 isrtf(void) 1189 { 1190 if(strstr((char *)buf, "\\rtf1")){ 1191 print(mime ? "application/rtf\n" : "rich text format\n"); 1192 return 1; 1193 } 1194 return 0; 1195 } 1196 1197 int 1198 ismsdos(void) 1199 { 1200 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1201 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1202 return 1; 1203 } 1204 return 0; 1205 } 1206 1207 int 1208 iself(void) 1209 { 1210 char *cpu[] = { /* NB: incomplete and arbitary list */ 1211 [1] "WE32100", 1212 [2] "SPARC", 1213 [3] "i386", 1214 [4] "M68000", 1215 [5] "M88000", 1216 [6] "i486", 1217 [7] "i860", 1218 [8] "R3000", 1219 [9] "S370", 1220 [10] "R4000", 1221 [15] "HP-PA", 1222 [18] "sparc v8+", 1223 [19] "i960", 1224 [20] "PPC-32", 1225 [21] "PPC-64", 1226 [40] "ARM", 1227 [41] "Alpha", 1228 [43] "sparc v9", 1229 [50] "IA-46", 1230 [62] "AMD x86-64", 1231 [75] "VAX", 1232 }; 1233 1234 1235 if (memcmp(buf, "\x7fELF", 4) == 0){ 1236 if (!mime){ 1237 int n = (buf[19] << 8) | buf[18]; 1238 char *p = "unknown"; 1239 1240 if (n > 0 && n < nelem(cpu) && cpu[n]) 1241 p = cpu[n]; 1242 else { 1243 /* try the other byte order */ 1244 n = (buf[18] << 8) | buf[19]; 1245 if (n > 0 && n < nelem(cpu) && cpu[n]) 1246 p = cpu[n]; 1247 } 1248 print("%s ELF executable\n", p); 1249 } 1250 else 1251 print("application/x-elf-executable"); 1252 return 1; 1253 } 1254 1255 return 0; 1256 } 1257