1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "chan", Alword, 51 "char", Cword, 52 "common", Fword, 53 "con", Lword, 54 "data", Fword, 55 "dimension", Fword, 56 "double", Cword, 57 "extern", Cword, 58 "bio", I2, 59 "float", Cword, 60 "fn", Lword, 61 "function", Fword, 62 "h", I3, 63 "implement", Lword, 64 "import", Lword, 65 "include", I1, 66 "int", Cword, 67 "integer", Fword, 68 "iota", Lword, 69 "libc", I2, 70 "long", Cword, 71 "module", Lword, 72 "real", Fword, 73 "ref", Lword, 74 "register", Cword, 75 "self", Lword, 76 "short", Cword, 77 "static", Cword, 78 "stdio", I2, 79 "struct", Cword, 80 "subroutine", Fword, 81 "u", I2, 82 "void", Cword, 83 }; 84 85 /* codes for 'mode' field in language structure */ 86 enum { 87 Normal = 0, 88 First, /* first entry for language spanning several ranges */ 89 Multi, /* later entries " " " ... */ 90 Shared, /* codes used in several languages */ 91 }; 92 93 struct 94 { 95 int mode; /* see enum above */ 96 int count; 97 int low; 98 int high; 99 char *name; 100 101 } language[] = 102 { 103 Normal, 0, 0x0080, 0x0080, "Extended Latin", 104 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 105 Normal, 0, 0x0370, 0x03FF, "Greek", 106 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 107 Normal, 0, 0x0530, 0x058F, "Armenian", 108 Normal, 0, 0x0590, 0x05FF, "Hebrew", 109 Normal, 0, 0x0600, 0x06FF, "Arabic", 110 Normal, 0, 0x0900, 0x097F, "Devanagari", 111 Normal, 0, 0x0980, 0x09FF, "Bengali", 112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 114 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 115 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 116 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 117 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 119 Normal, 0, 0x0E00, 0x0E7F, "Thai", 120 Normal, 0, 0x0E80, 0x0EFF, "Lao", 121 Normal, 0, 0x1000, 0x105F, "Tibetan", 122 Normal, 0, 0x10A0, 0x10FF, "Georgian", 123 Normal, 0, 0x3040, 0x30FF, "Japanese", 124 Normal, 0, 0x3100, 0x312F, "Chinese", 125 First, 0, 0x3130, 0x318F, "Korean", 126 Multi, 0, 0x3400, 0x3D2F, "Korean", 127 Shared, 0, 0x4e00, 0x9fff, "CJK", 128 Normal, 0, 0, 0, 0, /* terminal entry */ 129 }; 130 131 132 enum 133 { 134 Fascii, /* printable ascii */ 135 Flatin, /* latin 1*/ 136 Futf, /* UTf character set */ 137 Fbinary, /* binary */ 138 Feascii, /* ASCII with control chars */ 139 Fnull, /* NULL in file */ 140 } guess; 141 142 void bump_utf_count(Rune); 143 int cistrncmp(char*, char*, int); 144 void filetype(int); 145 int getfontnum(uchar*, uchar**); 146 int isas(void); 147 int isc(void); 148 int iscint(void); 149 int isenglish(void); 150 int ishp(void); 151 int ishtml(void); 152 int isrfc822(void); 153 int ismbox(void); 154 int islimbo(void); 155 int ismung(void); 156 int isp9bit(void); 157 int isp9font(void); 158 int isrtf(void); 159 int ismsdos(void); 160 int iself(void); 161 int istring(void); 162 int long0(void); 163 int p9bitnum(uchar*); 164 int p9subfont(uchar*); 165 void print_utf(void); 166 void type(char*, int); 167 int utf_count(void); 168 void wordfreq(void); 169 170 int (*call[])(void) = 171 { 172 long0, /* recognizable by first 4 bytes */ 173 istring, /* recognizable by first string */ 174 isrfc822, /* email file */ 175 ismbox, /* mail box */ 176 ishtml, /* html keywords */ 177 iscint, /* compiler/assembler intermediate */ 178 islimbo, /* limbo source */ 179 isc, /* c & alef compiler key words */ 180 isas, /* assembler key words */ 181 ismung, /* entropy compressed/encrypted */ 182 isp9font, /* plan 9 font */ 183 isp9bit, /* plan 9 image (as from /dev/window) */ 184 isenglish, /* char frequency English */ 185 isrtf, /* rich text format */ 186 ismsdos, /* msdos exe (virus file attachement) */ 187 iself, /* ELF (foreign) executable */ 188 0 189 }; 190 191 int mime; 192 193 #define OCTET "application/octet-stream\n" 194 #define PLAIN "text/plain\n" 195 196 void 197 main(int argc, char *argv[]) 198 { 199 int i, j, maxlen; 200 char *cp; 201 Rune r; 202 203 ARGBEGIN{ 204 case 'm': 205 mime = 1; 206 break; 207 default: 208 fprint(2, "usage: file [-m] [file...]\n"); 209 exits("usage"); 210 }ARGEND; 211 212 maxlen = 0; 213 if(mime == 0 || argc > 1){ 214 for(i = 0; i < argc; i++) { 215 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 216 ; 217 if(j > maxlen) 218 maxlen = j; 219 } 220 } 221 if (argc <= 0) { 222 if(!mime) 223 print ("stdin: "); 224 filetype(0); 225 } 226 else { 227 for(i = 0; i < argc; i++) 228 type(argv[i], maxlen); 229 } 230 exits(0); 231 } 232 233 void 234 type(char *file, int nlen) 235 { 236 Rune r; 237 int i; 238 char *p; 239 240 if(nlen > 0){ 241 slash = 0; 242 for (i = 0, p = file; *p; i++) { 243 if (*p == '/') /* find rightmost slash */ 244 slash = p; 245 p += chartorune(&r, p); /* count runes */ 246 } 247 print("%s:%*s",file, nlen-i+1, ""); 248 } 249 fname = file; 250 if ((fd = open(file, OREAD)) < 0) { 251 print("cannot open\n"); 252 return; 253 } 254 filetype(fd); 255 close(fd); 256 } 257 258 void 259 filetype(int fd) 260 { 261 Rune r; 262 int i, f, n; 263 char *p, *eob; 264 265 free(mbuf); 266 mbuf = dirfstat(fd); 267 if(mbuf == nil){ 268 print("cannot stat: %r\n"); 269 return; 270 } 271 if(mbuf->mode & DMDIR) { 272 print(mime ? "text/directory\n" : "directory\n"); 273 return; 274 } 275 if(mbuf->type != 'M' && mbuf->type != '|') { 276 print(mime ? OCTET : "special file #%c/%s\n", 277 mbuf->type, mbuf->name); 278 return; 279 } 280 nbuf = read(fd, buf, sizeof(buf)-1); 281 282 if(nbuf < 0) { 283 print("cannot read\n"); 284 return; 285 } 286 if(nbuf == 0) { 287 print(mime ? PLAIN : "empty file\n"); 288 return; 289 } 290 buf[nbuf] = 0; 291 292 /* 293 * build histogram table 294 */ 295 memset(cfreq, 0, sizeof(cfreq)); 296 for (i = 0; language[i].name; i++) 297 language[i].count = 0; 298 eob = (char *)buf+nbuf; 299 for(n = 0, p = (char *)buf; p < eob; n++) { 300 if (!fullrune(p, eob-p) && eob-p < UTFmax) 301 break; 302 p += chartorune(&r, p); 303 if (r == 0) 304 f = Cnull; 305 else if (r <= 0x7f) { 306 if (!isprint(r) && !isspace(r)) 307 f = Ceascii; /* ASCII control char */ 308 else f = r; 309 } else if (r == 0x080) { 310 bump_utf_count(r); 311 f = Cutf; 312 } else if (r < 0xA0) 313 f = Cbinary; /* Invalid Runes */ 314 else if (r <= 0xff) 315 f = Clatin; /* Latin 1 */ 316 else { 317 bump_utf_count(r); 318 f = Cutf; /* UTF extension */ 319 } 320 cfreq[f]++; /* ASCII chars peg directly */ 321 } 322 /* 323 * gross classify 324 */ 325 if (cfreq[Cbinary]) 326 guess = Fbinary; 327 else if (cfreq[Cutf]) 328 guess = Futf; 329 else if (cfreq[Clatin]) 330 guess = Flatin; 331 else if (cfreq[Ceascii]) 332 guess = Feascii; 333 else if (cfreq[Cnull] == n) { 334 print(mime ? OCTET : "first block all null bytes\n"); 335 return; 336 } 337 else guess = Fascii; 338 /* 339 * lookup dictionary words 340 */ 341 memset(wfreq, 0, sizeof(wfreq)); 342 if(guess == Fascii || guess == Flatin || guess == Futf) 343 wordfreq(); 344 /* 345 * call individual classify routines 346 */ 347 for(i=0; call[i]; i++) 348 if((*call[i])()) 349 return; 350 351 /* 352 * if all else fails, 353 * print out gross classification 354 */ 355 if (nbuf < 100 && !mime) 356 print(mime ? PLAIN : "short "); 357 if (guess == Fascii) 358 print(mime ? PLAIN : "Ascii\n"); 359 else if (guess == Feascii) 360 print(mime ? PLAIN : "extended ascii\n"); 361 else if (guess == Flatin) 362 print(mime ? PLAIN : "latin ascii\n"); 363 else if (guess == Futf && utf_count() < 4) 364 print_utf(); 365 else print(mime ? OCTET : "binary\n"); 366 } 367 368 void 369 bump_utf_count(Rune r) 370 { 371 int low, high, mid; 372 373 high = sizeof(language)/sizeof(language[0])-1; 374 for (low = 0; low < high;) { 375 mid = (low+high)/2; 376 if (r >=language[mid].low) { 377 if (r <= language[mid].high) { 378 language[mid].count++; 379 break; 380 } else low = mid+1; 381 } else high = mid; 382 } 383 } 384 385 int 386 utf_count(void) 387 { 388 int i, count; 389 390 count = 0; 391 for (i = 0; language[i].name; i++) 392 if (language[i].count > 0) 393 switch (language[i].mode) { 394 case Normal: 395 case First: 396 count++; 397 break; 398 default: 399 break; 400 } 401 return count; 402 } 403 404 int 405 chkascii(void) 406 { 407 int i; 408 409 for (i = 'a'; i < 'z'; i++) 410 if (cfreq[i]) 411 return 1; 412 for (i = 'A'; i < 'Z'; i++) 413 if (cfreq[i]) 414 return 1; 415 return 0; 416 } 417 418 int 419 find_first(char *name) 420 { 421 int i; 422 423 for (i = 0; language[i].name != 0; i++) 424 if (language[i].mode == First 425 && strcmp(language[i].name, name) == 0) 426 return i; 427 return -1; 428 } 429 430 void 431 print_utf(void) 432 { 433 int i, printed, j; 434 435 if(mime){ 436 print(PLAIN); 437 return; 438 } 439 if (chkascii()) { 440 printed = 1; 441 print("Ascii"); 442 } else 443 printed = 0; 444 for (i = 0; language[i].name; i++) 445 if (language[i].count) { 446 switch(language[i].mode) { 447 case Multi: 448 j = find_first(language[i].name); 449 if (j < 0) 450 break; 451 if (language[j].count > 0) 452 break; 453 /* Fall through */ 454 case Normal: 455 case First: 456 if (printed) 457 print(" & "); 458 else printed = 1; 459 print("%s", language[i].name); 460 break; 461 case Shared: 462 default: 463 break; 464 } 465 } 466 if(!printed) 467 print("UTF"); 468 print(" text\n"); 469 } 470 471 void 472 wordfreq(void) 473 { 474 int low, high, mid, r; 475 uchar *p, *p2, c; 476 477 p = buf; 478 for(;;) { 479 while (p < buf+nbuf && !isalpha(*p)) 480 p++; 481 if (p >= buf+nbuf) 482 return; 483 p2 = p; 484 while(p < buf+nbuf && isalpha(*p)) 485 p++; 486 c = *p; 487 *p = 0; 488 high = sizeof(dict)/sizeof(dict[0]); 489 for(low = 0;low < high;) { 490 mid = (low+high)/2; 491 r = strcmp(dict[mid].word, (char*)p2); 492 if(r == 0) { 493 wfreq[dict[mid].class]++; 494 break; 495 } 496 if(r < 0) 497 low = mid+1; 498 else 499 high = mid; 500 } 501 *p++ = c; 502 } 503 } 504 505 typedef struct Filemagic Filemagic; 506 struct Filemagic { 507 ulong x; 508 ulong mask; 509 char *desc; 510 char *mime; 511 }; 512 513 Filemagic long0tab[] = { 514 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 515 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 516 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET, 517 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 518 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 519 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 520 070707, 0xFFFF, "cpio archive\n", OCTET, 521 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 522 }; 523 524 int 525 filemagic(Filemagic *tab, int ntab, ulong x) 526 { 527 int i; 528 529 for(i=0; i<ntab; i++) 530 if((x&tab[i].mask) == tab[i].x){ 531 print(mime ? tab[i].mime : tab[i].desc); 532 return 1; 533 } 534 return 0; 535 } 536 537 int 538 long0(void) 539 { 540 Fhdr f; 541 long x; 542 543 seek(fd, 0, 0); /* reposition to start of file */ 544 if(crackhdr(fd, &f)) { 545 print(mime ? OCTET : "%s\n", f.name); 546 return 1; 547 } 548 x = LENDIAN(buf); 549 if(filemagic(long0tab, nelem(long0tab), x)) 550 return 1; 551 return 0; 552 } 553 554 /* 555 * initial words to classify file 556 */ 557 struct FILE_STRING 558 { 559 char *key; 560 char *filetype; 561 int length; 562 char *mime; 563 } file_string[] = 564 { 565 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 566 "!<arch>\n", "archive", 8, "application/octet-stream", 567 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 568 "#!/bin/rc", "rc executable file", 9, "text/plain", 569 "#!/bin/sh", "sh executable file", 9, "text/plain", 570 "%!", "postscript", 2, "application/postscript", 571 "\004%!", "postscript", 3, "application/postscript", 572 "x T post", "troff output for post", 8, "application/troff", 573 "x T Latin1", "troff output for Latin1", 10, "application/troff", 574 "x T utf", "troff output for UTF", 7, "application/troff", 575 "x T 202", "troff output for 202", 7, "application/troff", 576 "x T aps", "troff output for aps", 7, "application/troff", 577 "GIF", "GIF image", 3, "image/gif", 578 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 579 "%PDF", "PDF", 4, "application/pdf", 580 "<html>\n", "HTML file", 7, "text/html", 581 "<HTML>\n", "HTML file", 7, "text/html", 582 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream", 583 "\111\111\052\000", "tiff", 4, "image/tiff", 584 "\115\115\000\052", "tiff", 4, "image/tiff", 585 "\377\330\377\340", "jpeg", 4, "image/jpeg", 586 "\377\330\377\341", "jpeg", 4, "image/jpeg", 587 "\377\330\377\333", "jpeg", 4, "image/jpeg", 588 "BM", "bmp", 2, "image/bmp", 589 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 590 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 591 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 592 0,0,0,0 593 }; 594 595 int 596 istring(void) 597 { 598 int i; 599 struct FILE_STRING *p; 600 601 for(p = file_string; p->key; p++) { 602 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) { 603 if(mime) 604 print("%s\n", p->mime); 605 else 606 print("%s\n", p->filetype); 607 return 1; 608 } 609 } 610 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 611 for(i = 5; i < nbuf; i++) 612 if(buf[i] == '\n') 613 break; 614 if(mime) 615 print(OCTET); 616 else 617 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 618 return 1; 619 } 620 return 0; 621 } 622 623 char* html_string[] = 624 { 625 "title", 626 "body", 627 "head", 628 "strong", 629 "h1", 630 "h2", 631 "h3", 632 "h4", 633 "h5", 634 "h6", 635 "ul", 636 "li", 637 "dl", 638 "br", 639 "em", 640 0, 641 }; 642 643 int 644 ishtml(void) 645 { 646 uchar *p, *q; 647 int i, count; 648 649 /* compare strings between '<' and '>' to html table */ 650 count = 0; 651 p = buf; 652 for(;;) { 653 while (p < buf+nbuf && *p != '<') 654 p++; 655 p++; 656 if (p >= buf+nbuf) 657 break; 658 if(*p == '/') 659 p++; 660 q = p; 661 while(p < buf+nbuf && *p != '>') 662 p++; 663 if (p >= buf+nbuf) 664 break; 665 for(i = 0; html_string[i]; i++) { 666 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 667 if(count++ > 4) { 668 print(mime ? "text/html\n" : "HTML file\n"); 669 return 1; 670 } 671 break; 672 } 673 } 674 p++; 675 } 676 return 0; 677 } 678 679 char* rfc822_string[] = 680 { 681 "from:", 682 "date:", 683 "to:", 684 "subject:", 685 "received:", 686 "reply to:", 687 "sender:", 688 0, 689 }; 690 691 int 692 isrfc822(void) 693 { 694 695 char *p, *q, *r; 696 int i, count; 697 698 count = 0; 699 p = (char*)buf; 700 for(;;) { 701 q = strchr(p, '\n'); 702 if(q == nil) 703 break; 704 *q = 0; 705 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 706 count++; 707 *q = '\n'; 708 p = q+1; 709 continue; 710 } 711 *q = '\n'; 712 if(*p != '\t' && *p != ' '){ 713 r = strchr(p, ':'); 714 if(r == 0 || r > q) 715 break; 716 for(i = 0; rfc822_string[i]; i++) { 717 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 718 count++; 719 break; 720 } 721 } 722 } 723 p = q+1; 724 } 725 if(count >= 3){ 726 print(mime ? "message/rfc822\n" : "email file\n"); 727 return 1; 728 } 729 return 0; 730 } 731 732 int 733 ismbox(void) 734 { 735 char *p, *q; 736 737 p = (char*)buf; 738 q = strchr(p, '\n'); 739 if(q == nil) 740 return 0; 741 *q = 0; 742 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 743 print(mime ? "text/plain\n" : "mail box\n"); 744 return 1; 745 } 746 *q = '\n'; 747 return 0; 748 } 749 750 int 751 iscint(void) 752 { 753 int type; 754 char *name; 755 Biobuf b; 756 757 if(Binit(&b, fd, OREAD) == Beof) 758 return 0; 759 seek(fd, 0, 0); 760 type = objtype(&b, &name); 761 if(type < 0) 762 return 0; 763 if(mime) 764 print(OCTET); 765 else 766 print("%s intermediate\n", name); 767 return 1; 768 } 769 770 int 771 isc(void) 772 { 773 int n; 774 775 n = wfreq[I1]; 776 /* 777 * includes 778 */ 779 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 780 goto yes; 781 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 782 goto yes; 783 /* 784 * declarations 785 */ 786 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 787 goto yes; 788 /* 789 * assignments 790 */ 791 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 792 goto yes; 793 return 0; 794 795 yes: 796 if(mime){ 797 print(PLAIN); 798 return 1; 799 } 800 if(wfreq[Alword] > 0) 801 print("alef program\n"); 802 else 803 print("c program\n"); 804 return 1; 805 } 806 807 int 808 islimbo(void) 809 { 810 811 /* 812 * includes 813 */ 814 if(wfreq[Lword] < 4) 815 return 0; 816 print(mime ? PLAIN : "limbo program\n"); 817 return 1; 818 } 819 820 int 821 isas(void) 822 { 823 824 /* 825 * includes 826 */ 827 if(wfreq[Aword] < 2) 828 return 0; 829 print(mime ? PLAIN : "as program\n"); 830 return 1; 831 } 832 833 /* 834 * low entropy means encrypted 835 */ 836 int 837 ismung(void) 838 { 839 int i, bucket[8]; 840 float cs; 841 842 if(nbuf < 64) 843 return 0; 844 memset(bucket, 0, sizeof(bucket)); 845 for(i=0; i<64; i++) 846 bucket[(buf[i]>>5)&07] += 1; 847 848 cs = 0.; 849 for(i=0; i<8; i++) 850 cs += (bucket[i]-8)*(bucket[i]-8); 851 cs /= 8.; 852 if(cs <= 24.322) { 853 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d)) 854 print(mime ? OCTET : "compressed\n"); 855 else 856 print(mime ? OCTET : "encrypted\n"); 857 return 1; 858 } 859 return 0; 860 } 861 862 /* 863 * english by punctuation and frequencies 864 */ 865 int 866 isenglish(void) 867 { 868 int vow, comm, rare, badpun, punct; 869 char *p; 870 871 if(guess != Fascii && guess != Feascii) 872 return 0; 873 badpun = 0; 874 punct = 0; 875 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 876 switch(*p) { 877 case '.': 878 case ',': 879 case ')': 880 case '%': 881 case ';': 882 case ':': 883 case '?': 884 punct++; 885 if(p[1] != ' ' && p[1] != '\n') 886 badpun++; 887 } 888 if(badpun*5 > punct) 889 return 0; 890 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 891 return 0; 892 if(2*cfreq[';'] > cfreq['e']) 893 return 0; 894 895 vow = 0; 896 for(p="AEIOU"; *p; p++) { 897 vow += cfreq[*p]; 898 vow += cfreq[tolower(*p)]; 899 } 900 comm = 0; 901 for(p="ETAION"; *p; p++) { 902 comm += cfreq[*p]; 903 comm += cfreq[tolower(*p)]; 904 } 905 rare = 0; 906 for(p="VJKQXZ"; *p; p++) { 907 rare += cfreq[*p]; 908 rare += cfreq[tolower(*p)]; 909 } 910 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 911 print(mime ? PLAIN : "English text\n"); 912 return 1; 913 } 914 return 0; 915 } 916 917 /* 918 * pick up a number with 919 * syntax _*[0-9]+_ 920 */ 921 #define P9BITLEN 12 922 int 923 p9bitnum(uchar *bp) 924 { 925 int n, c, len; 926 927 len = P9BITLEN; 928 while(*bp == ' ') { 929 bp++; 930 len--; 931 if(len <= 0) 932 return -1; 933 } 934 n = 0; 935 while(len > 1) { 936 c = *bp++; 937 if(!isdigit(c)) 938 return -1; 939 n = n*10 + c-'0'; 940 len--; 941 } 942 if(*bp != ' ') 943 return -1; 944 return n; 945 } 946 947 int 948 depthof(char *s, int *newp) 949 { 950 char *es; 951 int d; 952 953 *newp = 0; 954 es = s+12; 955 while(s<es && *s==' ') 956 s++; 957 if(s == es) 958 return -1; 959 if('0'<=*s && *s<='9') 960 return 1<<atoi(s); 961 962 *newp = 1; 963 d = 0; 964 while(s<es && *s!=' '){ 965 s++; /* skip letter */ 966 d += strtoul(s, &s, 10); 967 } 968 969 switch(d){ 970 case 32: 971 case 24: 972 case 16: 973 case 8: 974 return d; 975 } 976 return -1; 977 } 978 979 int 980 isp9bit(void) 981 { 982 int dep, lox, loy, hix, hiy, px, new; 983 ulong t; 984 long len; 985 char *newlabel; 986 987 newlabel = "old "; 988 989 dep = depthof((char*)buf + 0*P9BITLEN, &new); 990 if(new) 991 newlabel = ""; 992 lox = p9bitnum(buf + 1*P9BITLEN); 993 loy = p9bitnum(buf + 2*P9BITLEN); 994 hix = p9bitnum(buf + 3*P9BITLEN); 995 hiy = p9bitnum(buf + 4*P9BITLEN); 996 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 997 return 0; 998 999 if(dep < 8){ 1000 px = 8/dep; /* pixels per byte */ 1001 /* set l to number of bytes of data per scan line */ 1002 if(lox >= 0) 1003 len = (hix+px-1)/px - lox/px; 1004 else{ /* make positive before divide */ 1005 t = (-lox)+px-1; 1006 t = (t/px)*px; 1007 len = (t+hix+px-1)/px; 1008 } 1009 }else 1010 len = (hix-lox)*dep/8; 1011 len *= (hiy-loy); /* col length */ 1012 len += 5*P9BITLEN; /* size of initial ascii */ 1013 1014 /* 1015 * for image file, length is non-zero and must match calculation above 1016 * for /dev/window and /dev/screen the length is always zero 1017 * for subfont, the subfont header should follow immediately. 1018 */ 1019 if (len != 0 && mbuf->length == 0) { 1020 print("%splan 9 image\n", newlabel); 1021 return 1; 1022 } 1023 if (mbuf->length == len) { 1024 print("%splan 9 image\n", newlabel); 1025 return 1; 1026 } 1027 /* Ghostscript sometimes produces a little extra on the end */ 1028 if (mbuf->length < len+P9BITLEN) { 1029 print("%splan 9 image\n", newlabel); 1030 return 1; 1031 } 1032 if (p9subfont(buf+len)) { 1033 print("%ssubfont file\n", newlabel); 1034 return 1; 1035 } 1036 return 0; 1037 } 1038 1039 int 1040 p9subfont(uchar *p) 1041 { 1042 int n, h, a; 1043 1044 /* if image too big, assume it's a subfont */ 1045 if (p+3*P9BITLEN > buf+sizeof(buf)) 1046 return 1; 1047 1048 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1049 if (n < 0) 1050 return 0; 1051 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1052 if (h < 0) 1053 return 0; 1054 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1055 if (a < 0) 1056 return 0; 1057 return 1; 1058 } 1059 1060 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1061 1062 int 1063 isp9font(void) 1064 { 1065 uchar *cp, *p; 1066 int i, n; 1067 char pathname[1024]; 1068 1069 cp = buf; 1070 if (!getfontnum(cp, &cp)) /* height */ 1071 return 0; 1072 if (!getfontnum(cp, &cp)) /* ascent */ 1073 return 0; 1074 for (i = 0; 1; i++) { 1075 if (!getfontnum(cp, &cp)) /* min */ 1076 break; 1077 if (!getfontnum(cp, &cp)) /* max */ 1078 return 0; 1079 while (WHITESPACE(*cp)) 1080 cp++; 1081 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1082 ; 1083 /* construct a path name, if needed */ 1084 n = 0; 1085 if (*p != '/' && slash) { 1086 n = slash-fname+1; 1087 if (n < sizeof(pathname)) 1088 memcpy(pathname, fname, n); 1089 else n = 0; 1090 } 1091 if (n+cp-p < sizeof(pathname)) { 1092 memcpy(pathname+n, p, cp-p); 1093 n += cp-p; 1094 pathname[n] = 0; 1095 if (access(pathname, AEXIST) < 0) 1096 return 0; 1097 } 1098 } 1099 if (i) { 1100 print(mime ? "text/plain\n" : "font file\n"); 1101 return 1; 1102 } 1103 return 0; 1104 } 1105 1106 int 1107 getfontnum(uchar *cp, uchar **rp) 1108 { 1109 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1110 cp++; 1111 if (*cp < '0' || *cp > '9') 1112 return 0; 1113 strtoul((char *)cp, (char **)rp, 0); 1114 if (!WHITESPACE(**rp)) 1115 return 0; 1116 return 1; 1117 } 1118 1119 int 1120 isrtf(void) 1121 { 1122 if(strstr((char *)buf, "\\rtf1")){ 1123 print(mime ? "application/rtf\n" : "rich text format\n"); 1124 return 1; 1125 } 1126 return 0; 1127 } 1128 1129 int 1130 ismsdos(void) 1131 { 1132 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1133 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1134 return 1; 1135 } 1136 return 0; 1137 } 1138 1139 int 1140 iself(void) 1141 { 1142 char *cpu[] = { /* NB: incomplete and arbitary list */ 1143 [1] "WE32100", 1144 [2] "SPARC", 1145 [3] "i386", 1146 [4] "M68000", 1147 [5] "M88000", 1148 [6] "i486", 1149 [7] "i860", 1150 [8] "R3000", 1151 [9] "S370", 1152 [10] "R4000", 1153 [15] "HP-PA", 1154 [18] "sparc v8+", 1155 [19] "i960", 1156 [20] "PPC-32", 1157 [21] "PPC-64", 1158 [40] "ARM", 1159 [41] "Alpha", 1160 [43] "sparc v9", 1161 [50] "IA-46", 1162 [62] "AMD x86-64", 1163 [75] "VAX", 1164 }; 1165 1166 1167 if (memcmp(buf, "\x7fELF", 4) == 0){ 1168 if (!mime){ 1169 int n = (buf[19] << 8) | buf[18]; 1170 char *p = (n > 0 && n < nelem(cpu) && cpu[n])? cpu[n]: "unknown"; 1171 print("%s ELF executable\n", p); 1172 } 1173 else 1174 print("application/x-elf-executable"); 1175 return 1; 1176 } 1177 1178 return 0; 1179 } 1180