1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "chan", Alword, 51 "char", Cword, 52 "common", Fword, 53 "con", Lword, 54 "data", Fword, 55 "dimension", Fword, 56 "double", Cword, 57 "extern", Cword, 58 "bio", I2, 59 "float", Cword, 60 "fn", Lword, 61 "function", Fword, 62 "h", I3, 63 "implement", Lword, 64 "import", Lword, 65 "include", I1, 66 "int", Cword, 67 "integer", Fword, 68 "iota", Lword, 69 "libc", I2, 70 "long", Cword, 71 "module", Lword, 72 "real", Fword, 73 "ref", Lword, 74 "register", Cword, 75 "self", Lword, 76 "short", Cword, 77 "static", Cword, 78 "stdio", I2, 79 "struct", Cword, 80 "subroutine", Fword, 81 "u", I2, 82 "void", Cword, 83 }; 84 85 /* codes for 'mode' field in language structure */ 86 enum { 87 Normal = 0, 88 First, /* first entry for language spanning several ranges */ 89 Multi, /* later entries " " " ... */ 90 Shared, /* codes used in several languages */ 91 }; 92 93 struct 94 { 95 int mode; /* see enum above */ 96 int count; 97 int low; 98 int high; 99 char *name; 100 101 } language[] = 102 { 103 Normal, 0, 0x0080, 0x0080, "Extended Latin", 104 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 105 Normal, 0, 0x0370, 0x03FF, "Greek", 106 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 107 Normal, 0, 0x0530, 0x058F, "Armenian", 108 Normal, 0, 0x0590, 0x05FF, "Hebrew", 109 Normal, 0, 0x0600, 0x06FF, "Arabic", 110 Normal, 0, 0x0900, 0x097F, "Devanagari", 111 Normal, 0, 0x0980, 0x09FF, "Bengali", 112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 114 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 115 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 116 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 117 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 119 Normal, 0, 0x0E00, 0x0E7F, "Thai", 120 Normal, 0, 0x0E80, 0x0EFF, "Lao", 121 Normal, 0, 0x1000, 0x105F, "Tibetan", 122 Normal, 0, 0x10A0, 0x10FF, "Georgian", 123 Normal, 0, 0x3040, 0x30FF, "Japanese", 124 Normal, 0, 0x3100, 0x312F, "Chinese", 125 First, 0, 0x3130, 0x318F, "Korean", 126 Multi, 0, 0x3400, 0x3D2F, "Korean", 127 Shared, 0, 0x4e00, 0x9fff, "CJK", 128 Normal, 0, 0, 0, 0, /* terminal entry */ 129 }; 130 131 132 enum 133 { 134 Fascii, /* printable ascii */ 135 Flatin, /* latin 1*/ 136 Futf, /* UTf character set */ 137 Fbinary, /* binary */ 138 Feascii, /* ASCII with control chars */ 139 Fnull, /* NULL in file */ 140 } guess; 141 142 void bump_utf_count(Rune); 143 int cistrncmp(char*, char*, int); 144 void filetype(int); 145 int getfontnum(uchar*, uchar**); 146 int isas(void); 147 int isc(void); 148 int iscint(void); 149 int isenglish(void); 150 int ishp(void); 151 int ishtml(void); 152 int isrfc822(void); 153 int ismbox(void); 154 int islimbo(void); 155 int ismung(void); 156 int isp9bit(void); 157 int isp9font(void); 158 int isrtf(void); 159 int ismsdos(void); 160 int iself(void); 161 int istring(void); 162 int iff(void); 163 int long0(void); 164 int istar(void); 165 int p9bitnum(uchar*); 166 int p9subfont(uchar*); 167 void print_utf(void); 168 void type(char*, int); 169 int utf_count(void); 170 void wordfreq(void); 171 172 int (*call[])(void) = 173 { 174 long0, /* recognizable by first 4 bytes */ 175 istring, /* recognizable by first string */ 176 iff, /* interchange file format (strings) */ 177 isrfc822, /* email file */ 178 ismbox, /* mail box */ 179 istar, /* recognizable by tar checksum */ 180 ishtml, /* html keywords */ 181 iscint, /* compiler/assembler intermediate */ 182 islimbo, /* limbo source */ 183 isc, /* c & alef compiler key words */ 184 isas, /* assembler key words */ 185 ismung, /* entropy compressed/encrypted */ 186 isp9font, /* plan 9 font */ 187 isp9bit, /* plan 9 image (as from /dev/window) */ 188 isenglish, /* char frequency English */ 189 isrtf, /* rich text format */ 190 ismsdos, /* msdos exe (virus file attachement) */ 191 iself, /* ELF (foreign) executable */ 192 0 193 }; 194 195 int mime; 196 197 #define OCTET "application/octet-stream\n" 198 #define PLAIN "text/plain\n" 199 200 void 201 main(int argc, char *argv[]) 202 { 203 int i, j, maxlen; 204 char *cp; 205 Rune r; 206 207 ARGBEGIN{ 208 case 'm': 209 mime = 1; 210 break; 211 default: 212 fprint(2, "usage: file [-m] [file...]\n"); 213 exits("usage"); 214 }ARGEND; 215 216 maxlen = 0; 217 if(mime == 0 || argc > 1){ 218 for(i = 0; i < argc; i++) { 219 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 220 ; 221 if(j > maxlen) 222 maxlen = j; 223 } 224 } 225 if (argc <= 0) { 226 if(!mime) 227 print ("stdin: "); 228 filetype(0); 229 } 230 else { 231 for(i = 0; i < argc; i++) 232 type(argv[i], maxlen); 233 } 234 exits(0); 235 } 236 237 void 238 type(char *file, int nlen) 239 { 240 Rune r; 241 int i; 242 char *p; 243 244 if(nlen > 0){ 245 slash = 0; 246 for (i = 0, p = file; *p; i++) { 247 if (*p == '/') /* find rightmost slash */ 248 slash = p; 249 p += chartorune(&r, p); /* count runes */ 250 } 251 print("%s:%*s",file, nlen-i+1, ""); 252 } 253 fname = file; 254 if ((fd = open(file, OREAD)) < 0) { 255 print("cannot open\n"); 256 return; 257 } 258 filetype(fd); 259 close(fd); 260 } 261 262 void 263 filetype(int fd) 264 { 265 Rune r; 266 int i, f, n; 267 char *p, *eob; 268 269 free(mbuf); 270 mbuf = dirfstat(fd); 271 if(mbuf == nil){ 272 print("cannot stat: %r\n"); 273 return; 274 } 275 if(mbuf->mode & DMDIR) { 276 print(mime ? "text/directory\n" : "directory\n"); 277 return; 278 } 279 if(mbuf->type != 'M' && mbuf->type != '|') { 280 print(mime ? OCTET : "special file #%c/%s\n", 281 mbuf->type, mbuf->name); 282 return; 283 } 284 nbuf = read(fd, buf, sizeof(buf)-1); 285 286 if(nbuf < 0) { 287 print("cannot read\n"); 288 return; 289 } 290 if(nbuf == 0) { 291 print(mime ? PLAIN : "empty file\n"); 292 return; 293 } 294 buf[nbuf] = 0; 295 296 /* 297 * build histogram table 298 */ 299 memset(cfreq, 0, sizeof(cfreq)); 300 for (i = 0; language[i].name; i++) 301 language[i].count = 0; 302 eob = (char *)buf+nbuf; 303 for(n = 0, p = (char *)buf; p < eob; n++) { 304 if (!fullrune(p, eob-p) && eob-p < UTFmax) 305 break; 306 p += chartorune(&r, p); 307 if (r == 0) 308 f = Cnull; 309 else if (r <= 0x7f) { 310 if (!isprint(r) && !isspace(r)) 311 f = Ceascii; /* ASCII control char */ 312 else f = r; 313 } else if (r == 0x080) { 314 bump_utf_count(r); 315 f = Cutf; 316 } else if (r < 0xA0) 317 f = Cbinary; /* Invalid Runes */ 318 else if (r <= 0xff) 319 f = Clatin; /* Latin 1 */ 320 else { 321 bump_utf_count(r); 322 f = Cutf; /* UTF extension */ 323 } 324 cfreq[f]++; /* ASCII chars peg directly */ 325 } 326 /* 327 * gross classify 328 */ 329 if (cfreq[Cbinary]) 330 guess = Fbinary; 331 else if (cfreq[Cutf]) 332 guess = Futf; 333 else if (cfreq[Clatin]) 334 guess = Flatin; 335 else if (cfreq[Ceascii]) 336 guess = Feascii; 337 else if (cfreq[Cnull] == n) { 338 print(mime ? OCTET : "first block all null bytes\n"); 339 return; 340 } 341 else guess = Fascii; 342 /* 343 * lookup dictionary words 344 */ 345 memset(wfreq, 0, sizeof(wfreq)); 346 if(guess == Fascii || guess == Flatin || guess == Futf) 347 wordfreq(); 348 /* 349 * call individual classify routines 350 */ 351 for(i=0; call[i]; i++) 352 if((*call[i])()) 353 return; 354 355 /* 356 * if all else fails, 357 * print out gross classification 358 */ 359 if (nbuf < 100 && !mime) 360 print(mime ? PLAIN : "short "); 361 if (guess == Fascii) 362 print(mime ? PLAIN : "Ascii\n"); 363 else if (guess == Feascii) 364 print(mime ? PLAIN : "extended ascii\n"); 365 else if (guess == Flatin) 366 print(mime ? PLAIN : "latin ascii\n"); 367 else if (guess == Futf && utf_count() < 4) 368 print_utf(); 369 else print(mime ? OCTET : "binary\n"); 370 } 371 372 void 373 bump_utf_count(Rune r) 374 { 375 int low, high, mid; 376 377 high = sizeof(language)/sizeof(language[0])-1; 378 for (low = 0; low < high;) { 379 mid = (low+high)/2; 380 if (r >=language[mid].low) { 381 if (r <= language[mid].high) { 382 language[mid].count++; 383 break; 384 } else low = mid+1; 385 } else high = mid; 386 } 387 } 388 389 int 390 utf_count(void) 391 { 392 int i, count; 393 394 count = 0; 395 for (i = 0; language[i].name; i++) 396 if (language[i].count > 0) 397 switch (language[i].mode) { 398 case Normal: 399 case First: 400 count++; 401 break; 402 default: 403 break; 404 } 405 return count; 406 } 407 408 int 409 chkascii(void) 410 { 411 int i; 412 413 for (i = 'a'; i < 'z'; i++) 414 if (cfreq[i]) 415 return 1; 416 for (i = 'A'; i < 'Z'; i++) 417 if (cfreq[i]) 418 return 1; 419 return 0; 420 } 421 422 int 423 find_first(char *name) 424 { 425 int i; 426 427 for (i = 0; language[i].name != 0; i++) 428 if (language[i].mode == First 429 && strcmp(language[i].name, name) == 0) 430 return i; 431 return -1; 432 } 433 434 void 435 print_utf(void) 436 { 437 int i, printed, j; 438 439 if(mime){ 440 print(PLAIN); 441 return; 442 } 443 if (chkascii()) { 444 printed = 1; 445 print("Ascii"); 446 } else 447 printed = 0; 448 for (i = 0; language[i].name; i++) 449 if (language[i].count) { 450 switch(language[i].mode) { 451 case Multi: 452 j = find_first(language[i].name); 453 if (j < 0) 454 break; 455 if (language[j].count > 0) 456 break; 457 /* Fall through */ 458 case Normal: 459 case First: 460 if (printed) 461 print(" & "); 462 else printed = 1; 463 print("%s", language[i].name); 464 break; 465 case Shared: 466 default: 467 break; 468 } 469 } 470 if(!printed) 471 print("UTF"); 472 print(" text\n"); 473 } 474 475 void 476 wordfreq(void) 477 { 478 int low, high, mid, r; 479 uchar *p, *p2, c; 480 481 p = buf; 482 for(;;) { 483 while (p < buf+nbuf && !isalpha(*p)) 484 p++; 485 if (p >= buf+nbuf) 486 return; 487 p2 = p; 488 while(p < buf+nbuf && isalpha(*p)) 489 p++; 490 c = *p; 491 *p = 0; 492 high = sizeof(dict)/sizeof(dict[0]); 493 for(low = 0;low < high;) { 494 mid = (low+high)/2; 495 r = strcmp(dict[mid].word, (char*)p2); 496 if(r == 0) { 497 wfreq[dict[mid].class]++; 498 break; 499 } 500 if(r < 0) 501 low = mid+1; 502 else 503 high = mid; 504 } 505 *p++ = c; 506 } 507 } 508 509 typedef struct Filemagic Filemagic; 510 struct Filemagic { 511 ulong x; 512 ulong mask; 513 char *desc; 514 char *mime; 515 }; 516 517 Filemagic long0tab[] = { 518 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 519 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 520 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET, 521 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 522 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 523 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 524 070707, 0xFFFF, "cpio archive\n", OCTET, 525 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 526 0xfffa0000, 0xfffe0000, "mp3 audio", "audio/mpeg", 527 }; 528 529 int 530 filemagic(Filemagic *tab, int ntab, ulong x) 531 { 532 int i; 533 534 for(i=0; i<ntab; i++) 535 if((x&tab[i].mask) == tab[i].x){ 536 print(mime ? tab[i].mime : tab[i].desc); 537 return 1; 538 } 539 return 0; 540 } 541 542 int 543 long0(void) 544 { 545 Fhdr f; 546 long x; 547 548 seek(fd, 0, 0); /* reposition to start of file */ 549 if(crackhdr(fd, &f)) { 550 print(mime ? OCTET : "%s\n", f.name); 551 return 1; 552 } 553 x = LENDIAN(buf); 554 if(filemagic(long0tab, nelem(long0tab), x)) 555 return 1; 556 return 0; 557 } 558 559 /* from tar.c */ 560 enum { NAMSIZ = 100, TBLOCK = 512 }; 561 562 union hblock 563 { 564 char dummy[TBLOCK]; 565 struct header 566 { 567 char name[NAMSIZ]; 568 char mode[8]; 569 char uid[8]; 570 char gid[8]; 571 char size[12]; 572 char mtime[12]; 573 char chksum[8]; 574 char linkflag; 575 char linkname[NAMSIZ]; 576 /* rest are defined by POSIX's ustar format; see p1003.2b */ 577 char magic[6]; /* "ustar" */ 578 char version[2]; 579 char uname[32]; 580 char gname[32]; 581 char devmajor[8]; 582 char devminor[8]; 583 char prefix[155]; /* if non-null, path = prefix "/" name */ 584 } dbuf; 585 }; 586 587 int 588 checksum(union hblock *hp) 589 { 590 int i; 591 char *cp; 592 struct header *hdr = &hp->dbuf; 593 594 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 595 *cp = ' '; 596 i = 0; 597 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 598 i += *cp & 0xff; 599 return i; 600 } 601 602 int 603 istar(void) 604 { 605 int chksum; 606 char tblock[TBLOCK]; 607 union hblock *hp = (union hblock *)tblock; 608 struct header *hdr = &hp->dbuf; 609 610 seek(fd, 0, 0); /* reposition to start of file */ 611 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 612 return 0; 613 chksum = strtol(hdr->chksum, 0, 8); 614 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 615 if (strcmp(hdr->magic, "ustar") == 0) 616 print(mime? "application/x-ustar\n": 617 "posix tar archive\n"); 618 else 619 print(mime? "application/x-tar\n": "tar archive\n"); 620 return 1; 621 } 622 return 0; 623 } 624 625 /* 626 * initial words to classify file 627 */ 628 struct FILE_STRING 629 { 630 char *key; 631 char *filetype; 632 int length; 633 char *mime; 634 } file_string[] = 635 { 636 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 637 "!<arch>\n", "archive", 8, "application/octet-stream", 638 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 639 "#!/bin/rc", "rc executable file", 9, "text/plain", 640 "#!/bin/sh", "sh executable file", 9, "text/plain", 641 "%!", "postscript", 2, "application/postscript", 642 "\004%!", "postscript", 3, "application/postscript", 643 "x T post", "troff output for post", 8, "application/troff", 644 "x T Latin1", "troff output for Latin1", 10, "application/troff", 645 "x T utf", "troff output for UTF", 7, "application/troff", 646 "x T 202", "troff output for 202", 7, "application/troff", 647 "x T aps", "troff output for aps", 7, "application/troff", 648 "GIF", "GIF image", 3, "image/gif", 649 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 650 "%PDF", "PDF", 4, "application/pdf", 651 "<html>\n", "HTML file", 7, "text/html", 652 "<HTML>\n", "HTML file", 7, "text/html", 653 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream", 654 "\111\111\052\000", "tiff", 4, "image/tiff", 655 "\115\115\000\052", "tiff", 4, "image/tiff", 656 "\377\330\377\340", "jpeg", 4, "image/jpeg", 657 "\377\330\377\341", "jpeg", 4, "image/jpeg", 658 "\377\330\377\333", "jpeg", 4, "image/jpeg", 659 "BM", "bmp", 2, "image/bmp", 660 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 661 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 662 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 663 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 664 0,0,0,0 665 }; 666 667 int 668 istring(void) 669 { 670 int i; 671 struct FILE_STRING *p; 672 673 for(p = file_string; p->key; p++) { 674 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) { 675 if(mime) 676 print("%s\n", p->mime); 677 else 678 print("%s\n", p->filetype); 679 return 1; 680 } 681 } 682 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 683 for(i = 5; i < nbuf; i++) 684 if(buf[i] == '\n') 685 break; 686 if(mime) 687 print(OCTET); 688 else 689 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 690 return 1; 691 } 692 return 0; 693 } 694 695 int 696 iff(void) 697 { 698 if (strncmp((char*)buf, "FORM", 4) == 0 && 699 strncmp((char*)buf+8, "AIFF", 4) == 0) { 700 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 701 return 1; 702 } 703 return 0; 704 } 705 706 char* html_string[] = 707 { 708 "title", 709 "body", 710 "head", 711 "strong", 712 "h1", 713 "h2", 714 "h3", 715 "h4", 716 "h5", 717 "h6", 718 "ul", 719 "li", 720 "dl", 721 "br", 722 "em", 723 0, 724 }; 725 726 int 727 ishtml(void) 728 { 729 uchar *p, *q; 730 int i, count; 731 732 /* compare strings between '<' and '>' to html table */ 733 count = 0; 734 p = buf; 735 for(;;) { 736 while (p < buf+nbuf && *p != '<') 737 p++; 738 p++; 739 if (p >= buf+nbuf) 740 break; 741 if(*p == '/') 742 p++; 743 q = p; 744 while(p < buf+nbuf && *p != '>') 745 p++; 746 if (p >= buf+nbuf) 747 break; 748 for(i = 0; html_string[i]; i++) { 749 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 750 if(count++ > 4) { 751 print(mime ? "text/html\n" : "HTML file\n"); 752 return 1; 753 } 754 break; 755 } 756 } 757 p++; 758 } 759 return 0; 760 } 761 762 char* rfc822_string[] = 763 { 764 "from:", 765 "date:", 766 "to:", 767 "subject:", 768 "received:", 769 "reply to:", 770 "sender:", 771 0, 772 }; 773 774 int 775 isrfc822(void) 776 { 777 778 char *p, *q, *r; 779 int i, count; 780 781 count = 0; 782 p = (char*)buf; 783 for(;;) { 784 q = strchr(p, '\n'); 785 if(q == nil) 786 break; 787 *q = 0; 788 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 789 count++; 790 *q = '\n'; 791 p = q+1; 792 continue; 793 } 794 *q = '\n'; 795 if(*p != '\t' && *p != ' '){ 796 r = strchr(p, ':'); 797 if(r == 0 || r > q) 798 break; 799 for(i = 0; rfc822_string[i]; i++) { 800 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 801 count++; 802 break; 803 } 804 } 805 } 806 p = q+1; 807 } 808 if(count >= 3){ 809 print(mime ? "message/rfc822\n" : "email file\n"); 810 return 1; 811 } 812 return 0; 813 } 814 815 int 816 ismbox(void) 817 { 818 char *p, *q; 819 820 p = (char*)buf; 821 q = strchr(p, '\n'); 822 if(q == nil) 823 return 0; 824 *q = 0; 825 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 826 print(mime ? "text/plain\n" : "mail box\n"); 827 return 1; 828 } 829 *q = '\n'; 830 return 0; 831 } 832 833 int 834 iscint(void) 835 { 836 int type; 837 char *name; 838 Biobuf b; 839 840 if(Binit(&b, fd, OREAD) == Beof) 841 return 0; 842 seek(fd, 0, 0); 843 type = objtype(&b, &name); 844 if(type < 0) 845 return 0; 846 if(mime) 847 print(OCTET); 848 else 849 print("%s intermediate\n", name); 850 return 1; 851 } 852 853 int 854 isc(void) 855 { 856 int n; 857 858 n = wfreq[I1]; 859 /* 860 * includes 861 */ 862 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 863 goto yes; 864 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 865 goto yes; 866 /* 867 * declarations 868 */ 869 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 870 goto yes; 871 /* 872 * assignments 873 */ 874 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 875 goto yes; 876 return 0; 877 878 yes: 879 if(mime){ 880 print(PLAIN); 881 return 1; 882 } 883 if(wfreq[Alword] > 0) 884 print("alef program\n"); 885 else 886 print("c program\n"); 887 return 1; 888 } 889 890 int 891 islimbo(void) 892 { 893 894 /* 895 * includes 896 */ 897 if(wfreq[Lword] < 4) 898 return 0; 899 print(mime ? PLAIN : "limbo program\n"); 900 return 1; 901 } 902 903 int 904 isas(void) 905 { 906 907 /* 908 * includes 909 */ 910 if(wfreq[Aword] < 2) 911 return 0; 912 print(mime ? PLAIN : "as program\n"); 913 return 1; 914 } 915 916 /* 917 * low entropy means encrypted 918 */ 919 int 920 ismung(void) 921 { 922 int i, bucket[8]; 923 float cs; 924 925 if(nbuf < 64) 926 return 0; 927 memset(bucket, 0, sizeof(bucket)); 928 for(i=0; i<64; i++) 929 bucket[(buf[i]>>5)&07] += 1; 930 931 cs = 0.; 932 for(i=0; i<8; i++) 933 cs += (bucket[i]-8)*(bucket[i]-8); 934 cs /= 8.; 935 if(cs <= 24.322) { 936 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d)) 937 print(mime ? OCTET : "compressed\n"); 938 else 939 print(mime ? OCTET : "encrypted\n"); 940 return 1; 941 } 942 return 0; 943 } 944 945 /* 946 * english by punctuation and frequencies 947 */ 948 int 949 isenglish(void) 950 { 951 int vow, comm, rare, badpun, punct; 952 char *p; 953 954 if(guess != Fascii && guess != Feascii) 955 return 0; 956 badpun = 0; 957 punct = 0; 958 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 959 switch(*p) { 960 case '.': 961 case ',': 962 case ')': 963 case '%': 964 case ';': 965 case ':': 966 case '?': 967 punct++; 968 if(p[1] != ' ' && p[1] != '\n') 969 badpun++; 970 } 971 if(badpun*5 > punct) 972 return 0; 973 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 974 return 0; 975 if(2*cfreq[';'] > cfreq['e']) 976 return 0; 977 978 vow = 0; 979 for(p="AEIOU"; *p; p++) { 980 vow += cfreq[*p]; 981 vow += cfreq[tolower(*p)]; 982 } 983 comm = 0; 984 for(p="ETAION"; *p; p++) { 985 comm += cfreq[*p]; 986 comm += cfreq[tolower(*p)]; 987 } 988 rare = 0; 989 for(p="VJKQXZ"; *p; p++) { 990 rare += cfreq[*p]; 991 rare += cfreq[tolower(*p)]; 992 } 993 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 994 print(mime ? PLAIN : "English text\n"); 995 return 1; 996 } 997 return 0; 998 } 999 1000 /* 1001 * pick up a number with 1002 * syntax _*[0-9]+_ 1003 */ 1004 #define P9BITLEN 12 1005 int 1006 p9bitnum(uchar *bp) 1007 { 1008 int n, c, len; 1009 1010 len = P9BITLEN; 1011 while(*bp == ' ') { 1012 bp++; 1013 len--; 1014 if(len <= 0) 1015 return -1; 1016 } 1017 n = 0; 1018 while(len > 1) { 1019 c = *bp++; 1020 if(!isdigit(c)) 1021 return -1; 1022 n = n*10 + c-'0'; 1023 len--; 1024 } 1025 if(*bp != ' ') 1026 return -1; 1027 return n; 1028 } 1029 1030 int 1031 depthof(char *s, int *newp) 1032 { 1033 char *es; 1034 int d; 1035 1036 *newp = 0; 1037 es = s+12; 1038 while(s<es && *s==' ') 1039 s++; 1040 if(s == es) 1041 return -1; 1042 if('0'<=*s && *s<='9') 1043 return 1<<atoi(s); 1044 1045 *newp = 1; 1046 d = 0; 1047 while(s<es && *s!=' '){ 1048 s++; /* skip letter */ 1049 d += strtoul(s, &s, 10); 1050 } 1051 1052 switch(d){ 1053 case 32: 1054 case 24: 1055 case 16: 1056 case 8: 1057 return d; 1058 } 1059 return -1; 1060 } 1061 1062 int 1063 isp9bit(void) 1064 { 1065 int dep, lox, loy, hix, hiy, px, new; 1066 ulong t; 1067 long len; 1068 char *newlabel; 1069 1070 newlabel = "old "; 1071 1072 dep = depthof((char*)buf + 0*P9BITLEN, &new); 1073 if(new) 1074 newlabel = ""; 1075 lox = p9bitnum(buf + 1*P9BITLEN); 1076 loy = p9bitnum(buf + 2*P9BITLEN); 1077 hix = p9bitnum(buf + 3*P9BITLEN); 1078 hiy = p9bitnum(buf + 4*P9BITLEN); 1079 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1080 return 0; 1081 1082 if(dep < 8){ 1083 px = 8/dep; /* pixels per byte */ 1084 /* set l to number of bytes of data per scan line */ 1085 if(lox >= 0) 1086 len = (hix+px-1)/px - lox/px; 1087 else{ /* make positive before divide */ 1088 t = (-lox)+px-1; 1089 t = (t/px)*px; 1090 len = (t+hix+px-1)/px; 1091 } 1092 }else 1093 len = (hix-lox)*dep/8; 1094 len *= (hiy-loy); /* col length */ 1095 len += 5*P9BITLEN; /* size of initial ascii */ 1096 1097 /* 1098 * for image file, length is non-zero and must match calculation above 1099 * for /dev/window and /dev/screen the length is always zero 1100 * for subfont, the subfont header should follow immediately. 1101 */ 1102 if (len != 0 && mbuf->length == 0) { 1103 print("%splan 9 image\n", newlabel); 1104 return 1; 1105 } 1106 if (mbuf->length == len) { 1107 print("%splan 9 image\n", newlabel); 1108 return 1; 1109 } 1110 /* Ghostscript sometimes produces a little extra on the end */ 1111 if (mbuf->length < len+P9BITLEN) { 1112 print("%splan 9 image\n", newlabel); 1113 return 1; 1114 } 1115 if (p9subfont(buf+len)) { 1116 print("%ssubfont file\n", newlabel); 1117 return 1; 1118 } 1119 return 0; 1120 } 1121 1122 int 1123 p9subfont(uchar *p) 1124 { 1125 int n, h, a; 1126 1127 /* if image too big, assume it's a subfont */ 1128 if (p+3*P9BITLEN > buf+sizeof(buf)) 1129 return 1; 1130 1131 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1132 if (n < 0) 1133 return 0; 1134 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1135 if (h < 0) 1136 return 0; 1137 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1138 if (a < 0) 1139 return 0; 1140 return 1; 1141 } 1142 1143 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1144 1145 int 1146 isp9font(void) 1147 { 1148 uchar *cp, *p; 1149 int i, n; 1150 char pathname[1024]; 1151 1152 cp = buf; 1153 if (!getfontnum(cp, &cp)) /* height */ 1154 return 0; 1155 if (!getfontnum(cp, &cp)) /* ascent */ 1156 return 0; 1157 for (i = 0; 1; i++) { 1158 if (!getfontnum(cp, &cp)) /* min */ 1159 break; 1160 if (!getfontnum(cp, &cp)) /* max */ 1161 return 0; 1162 while (WHITESPACE(*cp)) 1163 cp++; 1164 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1165 ; 1166 /* construct a path name, if needed */ 1167 n = 0; 1168 if (*p != '/' && slash) { 1169 n = slash-fname+1; 1170 if (n < sizeof(pathname)) 1171 memcpy(pathname, fname, n); 1172 else n = 0; 1173 } 1174 if (n+cp-p < sizeof(pathname)) { 1175 memcpy(pathname+n, p, cp-p); 1176 n += cp-p; 1177 pathname[n] = 0; 1178 if (access(pathname, AEXIST) < 0) 1179 return 0; 1180 } 1181 } 1182 if (i) { 1183 print(mime ? "text/plain\n" : "font file\n"); 1184 return 1; 1185 } 1186 return 0; 1187 } 1188 1189 int 1190 getfontnum(uchar *cp, uchar **rp) 1191 { 1192 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1193 cp++; 1194 if (*cp < '0' || *cp > '9') 1195 return 0; 1196 strtoul((char *)cp, (char **)rp, 0); 1197 if (!WHITESPACE(**rp)) 1198 return 0; 1199 return 1; 1200 } 1201 1202 int 1203 isrtf(void) 1204 { 1205 if(strstr((char *)buf, "\\rtf1")){ 1206 print(mime ? "application/rtf\n" : "rich text format\n"); 1207 return 1; 1208 } 1209 return 0; 1210 } 1211 1212 int 1213 ismsdos(void) 1214 { 1215 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1216 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1217 return 1; 1218 } 1219 return 0; 1220 } 1221 1222 int 1223 iself(void) 1224 { 1225 char *cpu[] = { /* NB: incomplete and arbitary list */ 1226 [1] "WE32100", 1227 [2] "SPARC", 1228 [3] "i386", 1229 [4] "M68000", 1230 [5] "M88000", 1231 [6] "i486", 1232 [7] "i860", 1233 [8] "R3000", 1234 [9] "S370", 1235 [10] "R4000", 1236 [15] "HP-PA", 1237 [18] "sparc v8+", 1238 [19] "i960", 1239 [20] "PPC-32", 1240 [21] "PPC-64", 1241 [40] "ARM", 1242 [41] "Alpha", 1243 [43] "sparc v9", 1244 [50] "IA-46", 1245 [62] "AMD64", 1246 [75] "VAX", 1247 }; 1248 1249 1250 if (memcmp(buf, "\x7fELF", 4) == 0){ 1251 if (!mime){ 1252 int n = (buf[19] << 8) | buf[18]; 1253 char *p = "unknown"; 1254 1255 if (n > 0 && n < nelem(cpu) && cpu[n]) 1256 p = cpu[n]; 1257 else { 1258 /* try the other byte order */ 1259 n = (buf[18] << 8) | buf[19]; 1260 if (n > 0 && n < nelem(cpu) && cpu[n]) 1261 p = cpu[n]; 1262 } 1263 print("%s ELF executable\n", p); 1264 } 1265 else 1266 print("application/x-elf-executable"); 1267 return 1; 1268 } 1269 1270 return 0; 1271 } 1272