1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "chan", Alword, 51 "char", Cword, 52 "common", Fword, 53 "con", Lword, 54 "data", Fword, 55 "dimension", Fword, 56 "double", Cword, 57 "extern", Cword, 58 "bio", I2, 59 "float", Cword, 60 "fn", Lword, 61 "function", Fword, 62 "h", I3, 63 "implement", Lword, 64 "import", Lword, 65 "include", I1, 66 "int", Cword, 67 "integer", Fword, 68 "iota", Lword, 69 "libc", I2, 70 "long", Cword, 71 "module", Lword, 72 "real", Fword, 73 "ref", Lword, 74 "register", Cword, 75 "self", Lword, 76 "short", Cword, 77 "static", Cword, 78 "stdio", I2, 79 "struct", Cword, 80 "subroutine", Fword, 81 "u", I2, 82 "void", Cword, 83 }; 84 85 /* codes for 'mode' field in language structure */ 86 enum { 87 Normal = 0, 88 First, /* first entry for language spanning several ranges */ 89 Multi, /* later entries " " " ... */ 90 Shared, /* codes used in several languages */ 91 }; 92 93 struct 94 { 95 int mode; /* see enum above */ 96 int count; 97 int low; 98 int high; 99 char *name; 100 101 } language[] = 102 { 103 Normal, 0, 0x0080, 0x0080, "Extended Latin", 104 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 105 Normal, 0, 0x0370, 0x03FF, "Greek", 106 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 107 Normal, 0, 0x0530, 0x058F, "Armenian", 108 Normal, 0, 0x0590, 0x05FF, "Hebrew", 109 Normal, 0, 0x0600, 0x06FF, "Arabic", 110 Normal, 0, 0x0900, 0x097F, "Devanagari", 111 Normal, 0, 0x0980, 0x09FF, "Bengali", 112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 114 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 115 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 116 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 117 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 119 Normal, 0, 0x0E00, 0x0E7F, "Thai", 120 Normal, 0, 0x0E80, 0x0EFF, "Lao", 121 Normal, 0, 0x1000, 0x105F, "Tibetan", 122 Normal, 0, 0x10A0, 0x10FF, "Georgian", 123 Normal, 0, 0x3040, 0x30FF, "Japanese", 124 Normal, 0, 0x3100, 0x312F, "Chinese", 125 First, 0, 0x3130, 0x318F, "Korean", 126 Multi, 0, 0x3400, 0x3D2F, "Korean", 127 Shared, 0, 0x4e00, 0x9fff, "CJK", 128 Normal, 0, 0, 0, 0, /* terminal entry */ 129 }; 130 131 132 enum 133 { 134 Fascii, /* printable ascii */ 135 Flatin, /* latin 1*/ 136 Futf, /* UTf character set */ 137 Fbinary, /* binary */ 138 Feascii, /* ASCII with control chars */ 139 Fnull, /* NULL in file */ 140 } guess; 141 142 void bump_utf_count(Rune); 143 int cistrncmp(char*, char*, int); 144 void filetype(int); 145 int getfontnum(uchar*, uchar**); 146 int isas(void); 147 int isc(void); 148 int iscint(void); 149 int isenglish(void); 150 int ishp(void); 151 int ishtml(void); 152 int isrfc822(void); 153 int ismbox(void); 154 int islimbo(void); 155 int ismung(void); 156 int isp9bit(void); 157 int isp9font(void); 158 int isrtf(void); 159 int ismsdos(void); 160 int iself(void); 161 int istring(void); 162 int iff(void); 163 int long0(void); 164 int istar(void); 165 int p9bitnum(uchar*); 166 int p9subfont(uchar*); 167 void print_utf(void); 168 void type(char*, int); 169 int utf_count(void); 170 void wordfreq(void); 171 172 int (*call[])(void) = 173 { 174 long0, /* recognizable by first 4 bytes */ 175 istring, /* recognizable by first string */ 176 iff, /* interchange file format (strings) */ 177 isrfc822, /* email file */ 178 ismbox, /* mail box */ 179 istar, /* recognizable by tar checksum */ 180 ishtml, /* html keywords */ 181 iscint, /* compiler/assembler intermediate */ 182 islimbo, /* limbo source */ 183 isc, /* c & alef compiler key words */ 184 isas, /* assembler key words */ 185 ismung, /* entropy compressed/encrypted */ 186 isp9font, /* plan 9 font */ 187 isp9bit, /* plan 9 image (as from /dev/window) */ 188 isenglish, /* char frequency English */ 189 isrtf, /* rich text format */ 190 ismsdos, /* msdos exe (virus file attachement) */ 191 iself, /* ELF (foreign) executable */ 192 0 193 }; 194 195 int mime; 196 197 #define OCTET "application/octet-stream\n" 198 #define PLAIN "text/plain\n" 199 200 void 201 main(int argc, char *argv[]) 202 { 203 int i, j, maxlen; 204 char *cp; 205 Rune r; 206 207 ARGBEGIN{ 208 case 'm': 209 mime = 1; 210 break; 211 default: 212 fprint(2, "usage: file [-m] [file...]\n"); 213 exits("usage"); 214 }ARGEND; 215 216 maxlen = 0; 217 if(mime == 0 || argc > 1){ 218 for(i = 0; i < argc; i++) { 219 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 220 ; 221 if(j > maxlen) 222 maxlen = j; 223 } 224 } 225 if (argc <= 0) { 226 if(!mime) 227 print ("stdin: "); 228 filetype(0); 229 } 230 else { 231 for(i = 0; i < argc; i++) 232 type(argv[i], maxlen); 233 } 234 exits(0); 235 } 236 237 void 238 type(char *file, int nlen) 239 { 240 Rune r; 241 int i; 242 char *p; 243 244 if(nlen > 0){ 245 slash = 0; 246 for (i = 0, p = file; *p; i++) { 247 if (*p == '/') /* find rightmost slash */ 248 slash = p; 249 p += chartorune(&r, p); /* count runes */ 250 } 251 print("%s:%*s",file, nlen-i+1, ""); 252 } 253 fname = file; 254 if ((fd = open(file, OREAD)) < 0) { 255 print("cannot open\n"); 256 return; 257 } 258 filetype(fd); 259 close(fd); 260 } 261 262 void 263 filetype(int fd) 264 { 265 Rune r; 266 int i, f, n; 267 char *p, *eob; 268 269 free(mbuf); 270 mbuf = dirfstat(fd); 271 if(mbuf == nil){ 272 print("cannot stat: %r\n"); 273 return; 274 } 275 if(mbuf->mode & DMDIR) { 276 print(mime ? "text/directory\n" : "directory\n"); 277 return; 278 } 279 if(mbuf->type != 'M' && mbuf->type != '|') { 280 print(mime ? OCTET : "special file #%c/%s\n", 281 mbuf->type, mbuf->name); 282 return; 283 } 284 nbuf = read(fd, buf, sizeof(buf)-1); 285 286 if(nbuf < 0) { 287 print("cannot read\n"); 288 return; 289 } 290 if(nbuf == 0) { 291 print(mime ? PLAIN : "empty file\n"); 292 return; 293 } 294 buf[nbuf] = 0; 295 296 /* 297 * build histogram table 298 */ 299 memset(cfreq, 0, sizeof(cfreq)); 300 for (i = 0; language[i].name; i++) 301 language[i].count = 0; 302 eob = (char *)buf+nbuf; 303 for(n = 0, p = (char *)buf; p < eob; n++) { 304 if (!fullrune(p, eob-p) && eob-p < UTFmax) 305 break; 306 p += chartorune(&r, p); 307 if (r == 0) 308 f = Cnull; 309 else if (r <= 0x7f) { 310 if (!isprint(r) && !isspace(r)) 311 f = Ceascii; /* ASCII control char */ 312 else f = r; 313 } else if (r == 0x080) { 314 bump_utf_count(r); 315 f = Cutf; 316 } else if (r < 0xA0) 317 f = Cbinary; /* Invalid Runes */ 318 else if (r <= 0xff) 319 f = Clatin; /* Latin 1 */ 320 else { 321 bump_utf_count(r); 322 f = Cutf; /* UTF extension */ 323 } 324 cfreq[f]++; /* ASCII chars peg directly */ 325 } 326 /* 327 * gross classify 328 */ 329 if (cfreq[Cbinary]) 330 guess = Fbinary; 331 else if (cfreq[Cutf]) 332 guess = Futf; 333 else if (cfreq[Clatin]) 334 guess = Flatin; 335 else if (cfreq[Ceascii]) 336 guess = Feascii; 337 else if (cfreq[Cnull] == n) { 338 print(mime ? OCTET : "first block all null bytes\n"); 339 return; 340 } 341 else guess = Fascii; 342 /* 343 * lookup dictionary words 344 */ 345 memset(wfreq, 0, sizeof(wfreq)); 346 if(guess == Fascii || guess == Flatin || guess == Futf) 347 wordfreq(); 348 /* 349 * call individual classify routines 350 */ 351 for(i=0; call[i]; i++) 352 if((*call[i])()) 353 return; 354 355 /* 356 * if all else fails, 357 * print out gross classification 358 */ 359 if (nbuf < 100 && !mime) 360 print(mime ? PLAIN : "short "); 361 if (guess == Fascii) 362 print(mime ? PLAIN : "Ascii\n"); 363 else if (guess == Feascii) 364 print(mime ? PLAIN : "extended ascii\n"); 365 else if (guess == Flatin) 366 print(mime ? PLAIN : "latin ascii\n"); 367 else if (guess == Futf && utf_count() < 4) 368 print_utf(); 369 else print(mime ? OCTET : "binary\n"); 370 } 371 372 void 373 bump_utf_count(Rune r) 374 { 375 int low, high, mid; 376 377 high = sizeof(language)/sizeof(language[0])-1; 378 for (low = 0; low < high;) { 379 mid = (low+high)/2; 380 if (r >=language[mid].low) { 381 if (r <= language[mid].high) { 382 language[mid].count++; 383 break; 384 } else low = mid+1; 385 } else high = mid; 386 } 387 } 388 389 int 390 utf_count(void) 391 { 392 int i, count; 393 394 count = 0; 395 for (i = 0; language[i].name; i++) 396 if (language[i].count > 0) 397 switch (language[i].mode) { 398 case Normal: 399 case First: 400 count++; 401 break; 402 default: 403 break; 404 } 405 return count; 406 } 407 408 int 409 chkascii(void) 410 { 411 int i; 412 413 for (i = 'a'; i < 'z'; i++) 414 if (cfreq[i]) 415 return 1; 416 for (i = 'A'; i < 'Z'; i++) 417 if (cfreq[i]) 418 return 1; 419 return 0; 420 } 421 422 int 423 find_first(char *name) 424 { 425 int i; 426 427 for (i = 0; language[i].name != 0; i++) 428 if (language[i].mode == First 429 && strcmp(language[i].name, name) == 0) 430 return i; 431 return -1; 432 } 433 434 void 435 print_utf(void) 436 { 437 int i, printed, j; 438 439 if(mime){ 440 print(PLAIN); 441 return; 442 } 443 if (chkascii()) { 444 printed = 1; 445 print("Ascii"); 446 } else 447 printed = 0; 448 for (i = 0; language[i].name; i++) 449 if (language[i].count) { 450 switch(language[i].mode) { 451 case Multi: 452 j = find_first(language[i].name); 453 if (j < 0) 454 break; 455 if (language[j].count > 0) 456 break; 457 /* Fall through */ 458 case Normal: 459 case First: 460 if (printed) 461 print(" & "); 462 else printed = 1; 463 print("%s", language[i].name); 464 break; 465 case Shared: 466 default: 467 break; 468 } 469 } 470 if(!printed) 471 print("UTF"); 472 print(" text\n"); 473 } 474 475 void 476 wordfreq(void) 477 { 478 int low, high, mid, r; 479 uchar *p, *p2, c; 480 481 p = buf; 482 for(;;) { 483 while (p < buf+nbuf && !isalpha(*p)) 484 p++; 485 if (p >= buf+nbuf) 486 return; 487 p2 = p; 488 while(p < buf+nbuf && isalpha(*p)) 489 p++; 490 c = *p; 491 *p = 0; 492 high = sizeof(dict)/sizeof(dict[0]); 493 for(low = 0;low < high;) { 494 mid = (low+high)/2; 495 r = strcmp(dict[mid].word, (char*)p2); 496 if(r == 0) { 497 wfreq[dict[mid].class]++; 498 break; 499 } 500 if(r < 0) 501 low = mid+1; 502 else 503 high = mid; 504 } 505 *p++ = c; 506 } 507 } 508 509 typedef struct Filemagic Filemagic; 510 struct Filemagic { 511 ulong x; 512 ulong mask; 513 char *desc; 514 char *mime; 515 }; 516 517 Filemagic long0tab[] = { 518 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 519 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 520 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET, 521 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 522 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 523 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 524 070707, 0xFFFF, "cpio archive\n", OCTET, 525 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 526 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 527 }; 528 529 int 530 filemagic(Filemagic *tab, int ntab, ulong x) 531 { 532 int i; 533 534 for(i=0; i<ntab; i++) 535 if((x&tab[i].mask) == tab[i].x){ 536 print(mime ? tab[i].mime : tab[i].desc); 537 return 1; 538 } 539 return 0; 540 } 541 542 int 543 long0(void) 544 { 545 Fhdr f; 546 long x; 547 548 seek(fd, 0, 0); /* reposition to start of file */ 549 if(crackhdr(fd, &f)) { 550 print(mime ? OCTET : "%s\n", f.name); 551 return 1; 552 } 553 x = LENDIAN(buf); 554 if(filemagic(long0tab, nelem(long0tab), x)) 555 return 1; 556 return 0; 557 } 558 559 /* from tar.c */ 560 enum { NAMSIZ = 100, TBLOCK = 512 }; 561 562 union hblock 563 { 564 char dummy[TBLOCK]; 565 struct header 566 { 567 char name[NAMSIZ]; 568 char mode[8]; 569 char uid[8]; 570 char gid[8]; 571 char size[12]; 572 char mtime[12]; 573 char chksum[8]; 574 char linkflag; 575 char linkname[NAMSIZ]; 576 /* rest are defined by POSIX's ustar format; see p1003.2b */ 577 char magic[6]; /* "ustar" */ 578 char version[2]; 579 char uname[32]; 580 char gname[32]; 581 char devmajor[8]; 582 char devminor[8]; 583 char prefix[155]; /* if non-null, path = prefix "/" name */ 584 } dbuf; 585 }; 586 587 int 588 checksum(union hblock *hp) 589 { 590 int i; 591 char *cp; 592 struct header *hdr = &hp->dbuf; 593 594 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 595 *cp = ' '; 596 i = 0; 597 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 598 i += *cp & 0xff; 599 return i; 600 } 601 602 int 603 istar(void) 604 { 605 int chksum; 606 char tblock[TBLOCK]; 607 union hblock *hp = (union hblock *)tblock; 608 struct header *hdr = &hp->dbuf; 609 610 seek(fd, 0, 0); /* reposition to start of file */ 611 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 612 return 0; 613 chksum = strtol(hdr->chksum, 0, 8); 614 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 615 if (strcmp(hdr->magic, "ustar") == 0) 616 print(mime? "application/x-ustar\n": 617 "posix tar archive\n"); 618 else 619 print(mime? "application/x-tar\n": "tar archive\n"); 620 return 1; 621 } 622 return 0; 623 } 624 625 /* 626 * initial words to classify file 627 */ 628 struct FILE_STRING 629 { 630 char *key; 631 char *filetype; 632 int length; 633 char *mime; 634 } file_string[] = 635 { 636 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 637 "!<arch>\n", "archive", 8, "application/octet-stream", 638 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 639 "#!/bin/rc", "rc executable file", 9, "text/plain", 640 "#!/bin/sh", "sh executable file", 9, "text/plain", 641 "%!", "postscript", 2, "application/postscript", 642 "\004%!", "postscript", 3, "application/postscript", 643 "x T post", "troff output for post", 8, "application/troff", 644 "x T Latin1", "troff output for Latin1", 10, "application/troff", 645 "x T utf", "troff output for UTF", 7, "application/troff", 646 "x T 202", "troff output for 202", 7, "application/troff", 647 "x T aps", "troff output for aps", 7, "application/troff", 648 "GIF", "GIF image", 3, "image/gif", 649 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 650 "%PDF", "PDF", 4, "application/pdf", 651 "<html>\n", "HTML file", 7, "text/html", 652 "<HTML>\n", "HTML file", 7, "text/html", 653 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream", 654 "\111\111\052\000", "tiff", 4, "image/tiff", 655 "\115\115\000\052", "tiff", 4, "image/tiff", 656 "\377\330\377\340", "jpeg", 4, "image/jpeg", 657 "\377\330\377\341", "jpeg", 4, "image/jpeg", 658 "\377\330\377\333", "jpeg", 4, "image/jpeg", 659 "BM", "bmp", 2, "image/bmp", 660 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 661 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 662 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 663 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 664 "\211PNG", "PNG image", 4, "image/png", 665 0,0,0,0 666 }; 667 668 int 669 istring(void) 670 { 671 int i; 672 struct FILE_STRING *p; 673 674 for(p = file_string; p->key; p++) { 675 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) { 676 if(mime) 677 print("%s\n", p->mime); 678 else 679 print("%s\n", p->filetype); 680 return 1; 681 } 682 } 683 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 684 for(i = 5; i < nbuf; i++) 685 if(buf[i] == '\n') 686 break; 687 if(mime) 688 print(OCTET); 689 else 690 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 691 return 1; 692 } 693 return 0; 694 } 695 696 int 697 iff(void) 698 { 699 if (strncmp((char*)buf, "FORM", 4) == 0 && 700 strncmp((char*)buf+8, "AIFF", 4) == 0) { 701 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 702 return 1; 703 } 704 return 0; 705 } 706 707 char* html_string[] = 708 { 709 "title", 710 "body", 711 "head", 712 "strong", 713 "h1", 714 "h2", 715 "h3", 716 "h4", 717 "h5", 718 "h6", 719 "ul", 720 "li", 721 "dl", 722 "br", 723 "em", 724 0, 725 }; 726 727 int 728 ishtml(void) 729 { 730 uchar *p, *q; 731 int i, count; 732 733 /* compare strings between '<' and '>' to html table */ 734 count = 0; 735 p = buf; 736 for(;;) { 737 while (p < buf+nbuf && *p != '<') 738 p++; 739 p++; 740 if (p >= buf+nbuf) 741 break; 742 if(*p == '/') 743 p++; 744 q = p; 745 while(p < buf+nbuf && *p != '>') 746 p++; 747 if (p >= buf+nbuf) 748 break; 749 for(i = 0; html_string[i]; i++) { 750 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 751 if(count++ > 4) { 752 print(mime ? "text/html\n" : "HTML file\n"); 753 return 1; 754 } 755 break; 756 } 757 } 758 p++; 759 } 760 return 0; 761 } 762 763 char* rfc822_string[] = 764 { 765 "from:", 766 "date:", 767 "to:", 768 "subject:", 769 "received:", 770 "reply to:", 771 "sender:", 772 0, 773 }; 774 775 int 776 isrfc822(void) 777 { 778 779 char *p, *q, *r; 780 int i, count; 781 782 count = 0; 783 p = (char*)buf; 784 for(;;) { 785 q = strchr(p, '\n'); 786 if(q == nil) 787 break; 788 *q = 0; 789 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 790 count++; 791 *q = '\n'; 792 p = q+1; 793 continue; 794 } 795 *q = '\n'; 796 if(*p != '\t' && *p != ' '){ 797 r = strchr(p, ':'); 798 if(r == 0 || r > q) 799 break; 800 for(i = 0; rfc822_string[i]; i++) { 801 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 802 count++; 803 break; 804 } 805 } 806 } 807 p = q+1; 808 } 809 if(count >= 3){ 810 print(mime ? "message/rfc822\n" : "email file\n"); 811 return 1; 812 } 813 return 0; 814 } 815 816 int 817 ismbox(void) 818 { 819 char *p, *q; 820 821 p = (char*)buf; 822 q = strchr(p, '\n'); 823 if(q == nil) 824 return 0; 825 *q = 0; 826 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 827 print(mime ? "text/plain\n" : "mail box\n"); 828 return 1; 829 } 830 *q = '\n'; 831 return 0; 832 } 833 834 int 835 iscint(void) 836 { 837 int type; 838 char *name; 839 Biobuf b; 840 841 if(Binit(&b, fd, OREAD) == Beof) 842 return 0; 843 seek(fd, 0, 0); 844 type = objtype(&b, &name); 845 if(type < 0) 846 return 0; 847 if(mime) 848 print(OCTET); 849 else 850 print("%s intermediate\n", name); 851 return 1; 852 } 853 854 int 855 isc(void) 856 { 857 int n; 858 859 n = wfreq[I1]; 860 /* 861 * includes 862 */ 863 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 864 goto yes; 865 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 866 goto yes; 867 /* 868 * declarations 869 */ 870 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 871 goto yes; 872 /* 873 * assignments 874 */ 875 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 876 goto yes; 877 return 0; 878 879 yes: 880 if(mime){ 881 print(PLAIN); 882 return 1; 883 } 884 if(wfreq[Alword] > 0) 885 print("alef program\n"); 886 else 887 print("c program\n"); 888 return 1; 889 } 890 891 int 892 islimbo(void) 893 { 894 895 /* 896 * includes 897 */ 898 if(wfreq[Lword] < 4) 899 return 0; 900 print(mime ? PLAIN : "limbo program\n"); 901 return 1; 902 } 903 904 int 905 isas(void) 906 { 907 908 /* 909 * includes 910 */ 911 if(wfreq[Aword] < 2) 912 return 0; 913 print(mime ? PLAIN : "as program\n"); 914 return 1; 915 } 916 917 /* 918 * low entropy means encrypted 919 */ 920 int 921 ismung(void) 922 { 923 int i, bucket[8]; 924 float cs; 925 926 if(nbuf < 64) 927 return 0; 928 memset(bucket, 0, sizeof(bucket)); 929 for(i=0; i<64; i++) 930 bucket[(buf[i]>>5)&07] += 1; 931 932 cs = 0.; 933 for(i=0; i<8; i++) 934 cs += (bucket[i]-8)*(bucket[i]-8); 935 cs /= 8.; 936 if(cs <= 24.322) { 937 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d)) 938 print(mime ? OCTET : "compressed\n"); 939 else 940 print(mime ? OCTET : "encrypted\n"); 941 return 1; 942 } 943 return 0; 944 } 945 946 /* 947 * english by punctuation and frequencies 948 */ 949 int 950 isenglish(void) 951 { 952 int vow, comm, rare, badpun, punct; 953 char *p; 954 955 if(guess != Fascii && guess != Feascii) 956 return 0; 957 badpun = 0; 958 punct = 0; 959 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 960 switch(*p) { 961 case '.': 962 case ',': 963 case ')': 964 case '%': 965 case ';': 966 case ':': 967 case '?': 968 punct++; 969 if(p[1] != ' ' && p[1] != '\n') 970 badpun++; 971 } 972 if(badpun*5 > punct) 973 return 0; 974 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 975 return 0; 976 if(2*cfreq[';'] > cfreq['e']) 977 return 0; 978 979 vow = 0; 980 for(p="AEIOU"; *p; p++) { 981 vow += cfreq[*p]; 982 vow += cfreq[tolower(*p)]; 983 } 984 comm = 0; 985 for(p="ETAION"; *p; p++) { 986 comm += cfreq[*p]; 987 comm += cfreq[tolower(*p)]; 988 } 989 rare = 0; 990 for(p="VJKQXZ"; *p; p++) { 991 rare += cfreq[*p]; 992 rare += cfreq[tolower(*p)]; 993 } 994 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 995 print(mime ? PLAIN : "English text\n"); 996 return 1; 997 } 998 return 0; 999 } 1000 1001 /* 1002 * pick up a number with 1003 * syntax _*[0-9]+_ 1004 */ 1005 #define P9BITLEN 12 1006 int 1007 p9bitnum(uchar *bp) 1008 { 1009 int n, c, len; 1010 1011 len = P9BITLEN; 1012 while(*bp == ' ') { 1013 bp++; 1014 len--; 1015 if(len <= 0) 1016 return -1; 1017 } 1018 n = 0; 1019 while(len > 1) { 1020 c = *bp++; 1021 if(!isdigit(c)) 1022 return -1; 1023 n = n*10 + c-'0'; 1024 len--; 1025 } 1026 if(*bp != ' ') 1027 return -1; 1028 return n; 1029 } 1030 1031 int 1032 depthof(char *s, int *newp) 1033 { 1034 char *es; 1035 int d; 1036 1037 *newp = 0; 1038 es = s+12; 1039 while(s<es && *s==' ') 1040 s++; 1041 if(s == es) 1042 return -1; 1043 if('0'<=*s && *s<='9') 1044 return 1<<atoi(s); 1045 1046 *newp = 1; 1047 d = 0; 1048 while(s<es && *s!=' '){ 1049 s++; /* skip letter */ 1050 d += strtoul(s, &s, 10); 1051 } 1052 1053 switch(d){ 1054 case 32: 1055 case 24: 1056 case 16: 1057 case 8: 1058 return d; 1059 } 1060 return -1; 1061 } 1062 1063 int 1064 isp9bit(void) 1065 { 1066 int dep, lox, loy, hix, hiy, px, new; 1067 ulong t; 1068 long len; 1069 char *newlabel; 1070 1071 newlabel = "old "; 1072 1073 dep = depthof((char*)buf + 0*P9BITLEN, &new); 1074 if(new) 1075 newlabel = ""; 1076 lox = p9bitnum(buf + 1*P9BITLEN); 1077 loy = p9bitnum(buf + 2*P9BITLEN); 1078 hix = p9bitnum(buf + 3*P9BITLEN); 1079 hiy = p9bitnum(buf + 4*P9BITLEN); 1080 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1081 return 0; 1082 1083 if(dep < 8){ 1084 px = 8/dep; /* pixels per byte */ 1085 /* set l to number of bytes of data per scan line */ 1086 if(lox >= 0) 1087 len = (hix+px-1)/px - lox/px; 1088 else{ /* make positive before divide */ 1089 t = (-lox)+px-1; 1090 t = (t/px)*px; 1091 len = (t+hix+px-1)/px; 1092 } 1093 }else 1094 len = (hix-lox)*dep/8; 1095 len *= (hiy-loy); /* col length */ 1096 len += 5*P9BITLEN; /* size of initial ascii */ 1097 1098 /* 1099 * for image file, length is non-zero and must match calculation above 1100 * for /dev/window and /dev/screen the length is always zero 1101 * for subfont, the subfont header should follow immediately. 1102 */ 1103 if (len != 0 && mbuf->length == 0) { 1104 print("%splan 9 image\n", newlabel); 1105 return 1; 1106 } 1107 if (mbuf->length == len) { 1108 print("%splan 9 image\n", newlabel); 1109 return 1; 1110 } 1111 /* Ghostscript sometimes produces a little extra on the end */ 1112 if (mbuf->length < len+P9BITLEN) { 1113 print("%splan 9 image\n", newlabel); 1114 return 1; 1115 } 1116 if (p9subfont(buf+len)) { 1117 print("%ssubfont file\n", newlabel); 1118 return 1; 1119 } 1120 return 0; 1121 } 1122 1123 int 1124 p9subfont(uchar *p) 1125 { 1126 int n, h, a; 1127 1128 /* if image too big, assume it's a subfont */ 1129 if (p+3*P9BITLEN > buf+sizeof(buf)) 1130 return 1; 1131 1132 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1133 if (n < 0) 1134 return 0; 1135 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1136 if (h < 0) 1137 return 0; 1138 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1139 if (a < 0) 1140 return 0; 1141 return 1; 1142 } 1143 1144 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1145 1146 int 1147 isp9font(void) 1148 { 1149 uchar *cp, *p; 1150 int i, n; 1151 char pathname[1024]; 1152 1153 cp = buf; 1154 if (!getfontnum(cp, &cp)) /* height */ 1155 return 0; 1156 if (!getfontnum(cp, &cp)) /* ascent */ 1157 return 0; 1158 for (i = 0; 1; i++) { 1159 if (!getfontnum(cp, &cp)) /* min */ 1160 break; 1161 if (!getfontnum(cp, &cp)) /* max */ 1162 return 0; 1163 while (WHITESPACE(*cp)) 1164 cp++; 1165 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1166 ; 1167 /* construct a path name, if needed */ 1168 n = 0; 1169 if (*p != '/' && slash) { 1170 n = slash-fname+1; 1171 if (n < sizeof(pathname)) 1172 memcpy(pathname, fname, n); 1173 else n = 0; 1174 } 1175 if (n+cp-p < sizeof(pathname)) { 1176 memcpy(pathname+n, p, cp-p); 1177 n += cp-p; 1178 pathname[n] = 0; 1179 if (access(pathname, AEXIST) < 0) 1180 return 0; 1181 } 1182 } 1183 if (i) { 1184 print(mime ? "text/plain\n" : "font file\n"); 1185 return 1; 1186 } 1187 return 0; 1188 } 1189 1190 int 1191 getfontnum(uchar *cp, uchar **rp) 1192 { 1193 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1194 cp++; 1195 if (*cp < '0' || *cp > '9') 1196 return 0; 1197 strtoul((char *)cp, (char **)rp, 0); 1198 if (!WHITESPACE(**rp)) 1199 return 0; 1200 return 1; 1201 } 1202 1203 int 1204 isrtf(void) 1205 { 1206 if(strstr((char *)buf, "\\rtf1")){ 1207 print(mime ? "application/rtf\n" : "rich text format\n"); 1208 return 1; 1209 } 1210 return 0; 1211 } 1212 1213 int 1214 ismsdos(void) 1215 { 1216 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1217 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1218 return 1; 1219 } 1220 return 0; 1221 } 1222 1223 int 1224 iself(void) 1225 { 1226 char *cpu[] = { /* NB: incomplete and arbitary list */ 1227 [1] "WE32100", 1228 [2] "SPARC", 1229 [3] "i386", 1230 [4] "M68000", 1231 [5] "M88000", 1232 [6] "i486", 1233 [7] "i860", 1234 [8] "R3000", 1235 [9] "S370", 1236 [10] "R4000", 1237 [15] "HP-PA", 1238 [18] "sparc v8+", 1239 [19] "i960", 1240 [20] "PPC-32", 1241 [21] "PPC-64", 1242 [40] "ARM", 1243 [41] "Alpha", 1244 [43] "sparc v9", 1245 [50] "IA-46", 1246 [62] "AMD64", 1247 [75] "VAX", 1248 }; 1249 1250 1251 if (memcmp(buf, "\x7fELF", 4) == 0){ 1252 if (!mime){ 1253 int n = (buf[19] << 8) | buf[18]; 1254 char *p = "unknown"; 1255 1256 if (n > 0 && n < nelem(cpu) && cpu[n]) 1257 p = cpu[n]; 1258 else { 1259 /* try the other byte order */ 1260 n = (buf[18] << 8) | buf[19]; 1261 if (n > 0 && n < nelem(cpu) && cpu[n]) 1262 p = cpu[n]; 1263 } 1264 print("%s ELF executable\n", p); 1265 } 1266 else 1267 print("application/x-elf-executable"); 1268 return 1; 1269 } 1270 1271 return 0; 1272 } 1273