1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 isp9font, /* plan 9 font */ 192 isp9bit, /* plan 9 image (as from /dev/window) */ 193 isrtf, /* rich text format */ 194 ismsdos, /* msdos exe (virus file attachement) */ 195 isface, /* ascii face file */ 196 197 /* last resorts */ 198 ismung, /* entropy compressed/encrypted */ 199 isenglish, /* char frequency English */ 200 0 201 }; 202 203 int mime; 204 205 char OCTET[] = "application/octet-stream\n"; 206 char PLAIN[] = "text/plain\n"; 207 208 void 209 main(int argc, char *argv[]) 210 { 211 int i, j, maxlen; 212 char *cp; 213 Rune r; 214 215 ARGBEGIN{ 216 case 'm': 217 mime = 1; 218 break; 219 default: 220 fprint(2, "usage: file [-m] [file...]\n"); 221 exits("usage"); 222 }ARGEND; 223 224 maxlen = 0; 225 if(mime == 0 || argc > 1){ 226 for(i = 0; i < argc; i++) { 227 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 228 ; 229 if(j > maxlen) 230 maxlen = j; 231 } 232 } 233 if (argc <= 0) { 234 if(!mime) 235 print ("stdin: "); 236 filetype(0); 237 } 238 else { 239 for(i = 0; i < argc; i++) 240 type(argv[i], maxlen); 241 } 242 exits(0); 243 } 244 245 void 246 type(char *file, int nlen) 247 { 248 Rune r; 249 int i; 250 char *p; 251 252 if(nlen > 0){ 253 slash = 0; 254 for (i = 0, p = file; *p; i++) { 255 if (*p == '/') /* find rightmost slash */ 256 slash = p; 257 p += chartorune(&r, p); /* count runes */ 258 } 259 print("%s:%*s",file, nlen-i+1, ""); 260 } 261 fname = file; 262 if ((fd = open(file, OREAD)) < 0) { 263 print("cannot open: %r\n"); 264 return; 265 } 266 filetype(fd); 267 close(fd); 268 } 269 270 /* 271 * Unicode 4.0 4-byte runes. 272 */ 273 typedef int Rune1; 274 275 enum { 276 UTFmax1 = 4, 277 }; 278 279 int 280 fullrune1(char *p, int n) 281 { 282 int c; 283 284 if(n >= 1) { 285 c = *(uchar*)p; 286 if(c < 0x80) 287 return 1; 288 if(n >= 2 && c < 0xE0) 289 return 1; 290 if(n >= 3 && c < 0xF0) 291 return 1; 292 if(n >= 4) 293 return 1; 294 } 295 return 0; 296 } 297 298 int 299 chartorune1(Rune1 *rune, char *str) 300 { 301 int c, c1, c2, c3, n; 302 Rune r; 303 304 c = *(uchar*)str; 305 if(c < 0xF0){ 306 r = 0; 307 n = chartorune(&r, str); 308 *rune = r; 309 return n; 310 } 311 c &= ~0xF0; 312 c1 = *(uchar*)(str+1) & ~0x80; 313 c2 = *(uchar*)(str+2) & ~0x80; 314 c3 = *(uchar*)(str+3) & ~0x80; 315 n = (c<<18) | (c1<<12) | (c2<<6) | c3; 316 if(n < 0x10000 || n > 0x10FFFF){ 317 *rune = Runeerror; 318 return 1; 319 } 320 *rune = n; 321 return 4; 322 } 323 324 void 325 filetype(int fd) 326 { 327 Rune1 r; 328 int i, f, n; 329 char *p, *eob; 330 331 free(mbuf); 332 mbuf = dirfstat(fd); 333 if(mbuf == nil){ 334 print("cannot stat: %r\n"); 335 return; 336 } 337 if(mbuf->mode & DMDIR) { 338 print(mime ? OCTET : "directory\n"); 339 return; 340 } 341 if(mbuf->type != 'M' && mbuf->type != '|') { 342 print(mime ? OCTET : "special file #%C/%s\n", 343 mbuf->type, mbuf->name); 344 return; 345 } 346 /* may be reading a pipe on standard input */ 347 nbuf = readn(fd, buf, sizeof(buf)-1); 348 if(nbuf < 0) { 349 print("cannot read: %r\n"); 350 return; 351 } 352 if(nbuf == 0) { 353 print(mime ? PLAIN : "empty file\n"); 354 return; 355 } 356 buf[nbuf] = 0; 357 358 /* 359 * build histogram table 360 */ 361 memset(cfreq, 0, sizeof(cfreq)); 362 for (i = 0; language[i].name; i++) 363 language[i].count = 0; 364 eob = (char *)buf+nbuf; 365 for(n = 0, p = (char *)buf; p < eob; n++) { 366 if (!fullrune1(p, eob-p) && eob-p < UTFmax1) 367 break; 368 p += chartorune1(&r, p); 369 if (r == 0) 370 f = Cnull; 371 else if (r <= 0x7f) { 372 if (!isprint(r) && !isspace(r)) 373 f = Ceascii; /* ASCII control char */ 374 else f = r; 375 } else if (r == 0x80) { 376 bump_utf_count(r); 377 f = Cutf; 378 } else if (r < 0xA0) 379 f = Cbinary; /* Invalid Runes */ 380 else if (r <= 0xff) 381 f = Clatin; /* Latin 1 */ 382 else { 383 bump_utf_count(r); 384 f = Cutf; /* UTF extension */ 385 } 386 cfreq[f]++; /* ASCII chars peg directly */ 387 } 388 /* 389 * gross classify 390 */ 391 if (cfreq[Cbinary]) 392 guess = Fbinary; 393 else if (cfreq[Cutf]) 394 guess = Futf; 395 else if (cfreq[Clatin]) 396 guess = Flatin; 397 else if (cfreq[Ceascii]) 398 guess = Feascii; 399 else if (cfreq[Cnull]) 400 guess = Fbinary; 401 else 402 guess = Fascii; 403 /* 404 * lookup dictionary words 405 */ 406 memset(wfreq, 0, sizeof(wfreq)); 407 if(guess == Fascii || guess == Flatin || guess == Futf) 408 wordfreq(); 409 /* 410 * call individual classify routines 411 */ 412 for(i=0; call[i]; i++) 413 if((*call[i])()) 414 return; 415 416 /* 417 * if all else fails, 418 * print out gross classification 419 */ 420 if (nbuf < 100 && !mime) 421 print(mime ? PLAIN : "short "); 422 if (guess == Fascii) 423 print(mime ? PLAIN : "Ascii\n"); 424 else if (guess == Feascii) 425 print(mime ? PLAIN : "extended ascii\n"); 426 else if (guess == Flatin) 427 print(mime ? PLAIN : "latin ascii\n"); 428 else if (guess == Futf && utf_count() < 4) 429 print_utf(); 430 else print(mime ? OCTET : "binary\n"); 431 } 432 433 void 434 bump_utf_count(Rune r) 435 { 436 int low, high, mid; 437 438 high = sizeof(language)/sizeof(language[0])-1; 439 for (low = 0; low < high;) { 440 mid = (low+high)/2; 441 if (r >= language[mid].low) { 442 if (r <= language[mid].high) { 443 language[mid].count++; 444 break; 445 } else low = mid+1; 446 } else high = mid; 447 } 448 } 449 450 int 451 utf_count(void) 452 { 453 int i, count; 454 455 count = 0; 456 for (i = 0; language[i].name; i++) 457 if (language[i].count > 0) 458 switch (language[i].mode) { 459 case Normal: 460 case First: 461 count++; 462 break; 463 default: 464 break; 465 } 466 return count; 467 } 468 469 int 470 chkascii(void) 471 { 472 int i; 473 474 for (i = 'a'; i < 'z'; i++) 475 if (cfreq[i]) 476 return 1; 477 for (i = 'A'; i < 'Z'; i++) 478 if (cfreq[i]) 479 return 1; 480 return 0; 481 } 482 483 int 484 find_first(char *name) 485 { 486 int i; 487 488 for (i = 0; language[i].name != 0; i++) 489 if (language[i].mode == First 490 && strcmp(language[i].name, name) == 0) 491 return i; 492 return -1; 493 } 494 495 void 496 print_utf(void) 497 { 498 int i, printed, j; 499 500 if(mime){ 501 print(PLAIN); 502 return; 503 } 504 if (chkascii()) { 505 printed = 1; 506 print("Ascii"); 507 } else 508 printed = 0; 509 for (i = 0; language[i].name; i++) 510 if (language[i].count) { 511 switch(language[i].mode) { 512 case Multi: 513 j = find_first(language[i].name); 514 if (j < 0) 515 break; 516 if (language[j].count > 0) 517 break; 518 /* Fall through */ 519 case Normal: 520 case First: 521 if (printed) 522 print(" & "); 523 else printed = 1; 524 print("%s", language[i].name); 525 break; 526 case Shared: 527 default: 528 break; 529 } 530 } 531 if(!printed) 532 print("UTF"); 533 print(" text\n"); 534 } 535 536 void 537 wordfreq(void) 538 { 539 int low, high, mid, r; 540 uchar *p, *p2, c; 541 542 p = buf; 543 for(;;) { 544 while (p < buf+nbuf && !isalpha(*p)) 545 p++; 546 if (p >= buf+nbuf) 547 return; 548 p2 = p; 549 while(p < buf+nbuf && isalpha(*p)) 550 p++; 551 c = *p; 552 *p = 0; 553 high = sizeof(dict)/sizeof(dict[0]); 554 for(low = 0;low < high;) { 555 mid = (low+high)/2; 556 r = strcmp(dict[mid].word, (char*)p2); 557 if(r == 0) { 558 wfreq[dict[mid].class]++; 559 break; 560 } 561 if(r < 0) 562 low = mid+1; 563 else 564 high = mid; 565 } 566 *p++ = c; 567 } 568 } 569 570 typedef struct Filemagic Filemagic; 571 struct Filemagic { 572 ulong x; 573 ulong mask; 574 char *desc; 575 char *mime; 576 }; 577 578 /* 579 * integers in this table must be as seen on a little-endian machine 580 * when read from a file. 581 */ 582 Filemagic long0tab[] = { 583 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 584 /* "pac1" */ 585 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 586 /* "pXc2 */ 587 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 588 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 589 0x43614c66, 0xFFFFFFFF, "FLAC audio file\n", OCTET, 590 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 591 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 592 070707, 0xFFFF, "cpio archive\n", OCTET, 593 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 594 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 595 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 596 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 597 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 598 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 599 /* 0xfeedface: this could alternately be a Next Plan 9 boot image */ 600 0xcefaedfe, 0xFFFFFFFF, "32-bit power Mach-O executable\n", OCTET, 601 /* 0xfeedfacf */ 602 0xcffaedfe, 0xFFFFFFFF, "64-bit power Mach-O executable\n", OCTET, 603 /* 0xcefaedfe */ 604 0xfeedface, 0xFFFFFFFF, "386 Mach-O executable\n", OCTET, 605 /* 0xcffaedfe */ 606 0xfeedfacf, 0xFFFFFFFF, "amd64 Mach-O executable\n", OCTET, 607 /* 0xcafebabe */ 608 0xbebafeca, 0xFFFFFFFF, "Mach-O universal executable\n", OCTET, 609 /* 610 * these magic numbers are stored big-endian on disk, 611 * thus the numbers appear reversed in this table. 612 */ 613 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 614 0x2bb19a52, 0xFFFFFFFF, "paq archive\n", OCTET, 615 }; 616 617 int 618 filemagic(Filemagic *tab, int ntab, ulong x) 619 { 620 int i; 621 622 for(i=0; i<ntab; i++) 623 if((x&tab[i].mask) == tab[i].x){ 624 print(mime ? tab[i].mime : tab[i].desc); 625 return 1; 626 } 627 return 0; 628 } 629 630 int 631 long0(void) 632 { 633 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 634 } 635 636 typedef struct Fileoffmag Fileoffmag; 637 struct Fileoffmag { 638 ulong off; 639 Filemagic; 640 }; 641 642 /* 643 * integers in this table must be as seen on a little-endian machine 644 * when read from a file. 645 */ 646 Fileoffmag longofftab[] = { 647 /* 648 * these magic numbers are stored big-endian on disk, 649 * thus the numbers appear reversed in this table. 650 */ 651 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 652 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 653 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 654 4, 0x31647542, 0xFFFFFFFF, "OS X finder properties\n", OCTET, 655 }; 656 657 int 658 fileoffmagic(Fileoffmag *tab, int ntab) 659 { 660 int i; 661 ulong x; 662 Fileoffmag *tp; 663 uchar buf[sizeof(long)]; 664 665 for(i=0; i<ntab; i++) { 666 tp = tab + i; 667 seek(fd, tp->off, 0); 668 if (readn(fd, buf, sizeof buf) != sizeof buf) 669 continue; 670 x = LENDIAN(buf); 671 if((x&tp->mask) == tp->x){ 672 print(mime? tp->mime: tp->desc); 673 return 1; 674 } 675 } 676 return 0; 677 } 678 679 int 680 longoff(void) 681 { 682 return fileoffmagic(longofftab, nelem(longofftab)); 683 } 684 685 int 686 isexec(void) 687 { 688 Fhdr f; 689 690 seek(fd, 0, 0); /* reposition to start of file */ 691 if(crackhdr(fd, &f)) { 692 print(mime ? OCTET : "%s\n", f.name); 693 return 1; 694 } 695 return 0; 696 } 697 698 699 /* from tar.c */ 700 enum { NAMSIZ = 100, TBLOCK = 512 }; 701 702 union hblock 703 { 704 char dummy[TBLOCK]; 705 struct header 706 { 707 char name[NAMSIZ]; 708 char mode[8]; 709 char uid[8]; 710 char gid[8]; 711 char size[12]; 712 char mtime[12]; 713 char chksum[8]; 714 char linkflag; 715 char linkname[NAMSIZ]; 716 /* rest are defined by POSIX's ustar format; see p1003.2b */ 717 char magic[6]; /* "ustar" */ 718 char version[2]; 719 char uname[32]; 720 char gname[32]; 721 char devmajor[8]; 722 char devminor[8]; 723 char prefix[155]; /* if non-null, path = prefix "/" name */ 724 } dbuf; 725 }; 726 727 int 728 checksum(union hblock *hp) 729 { 730 int i; 731 char *cp; 732 struct header *hdr = &hp->dbuf; 733 734 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 735 *cp = ' '; 736 i = 0; 737 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 738 i += *cp & 0xff; 739 return i; 740 } 741 742 int 743 istar(void) 744 { 745 int chksum; 746 char tblock[TBLOCK]; 747 union hblock *hp = (union hblock *)tblock; 748 struct header *hdr = &hp->dbuf; 749 750 seek(fd, 0, 0); /* reposition to start of file */ 751 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 752 return 0; 753 chksum = strtol(hdr->chksum, 0, 8); 754 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 755 if (strcmp(hdr->magic, "ustar") == 0) 756 print(mime? "application/x-ustar\n": 757 "posix tar archive\n"); 758 else 759 print(mime? "application/x-tar\n": "tar archive\n"); 760 return 1; 761 } 762 return 0; 763 } 764 765 /* 766 * initial words to classify file 767 */ 768 struct FILE_STRING 769 { 770 char *key; 771 char *filetype; 772 int length; 773 char *mime; 774 } file_string[] = 775 { 776 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 777 "!<arch>\n", "archive", 8, "application/octet-stream", 778 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 779 "#!/bin/rc", "rc executable file", 9, "text/plain", 780 "#!/bin/sh", "sh executable file", 9, "text/plain", 781 "%!", "postscript", 2, "application/postscript", 782 "\004%!", "postscript", 3, "application/postscript", 783 "x T post", "troff output for post", 8, "application/troff", 784 "x T Latin1", "troff output for Latin1", 10, "application/troff", 785 "x T utf", "troff output for UTF", 7, "application/troff", 786 "x T 202", "troff output for 202", 7, "application/troff", 787 "x T aps", "troff output for aps", 7, "application/troff", 788 "x T ", "troff output", 4, "application/troff", 789 "GIF", "GIF image", 3, "image/gif", 790 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 791 "%PDF", "PDF", 4, "application/pdf", 792 "<html>\n", "HTML file", 7, "text/html", 793 "<HTML>\n", "HTML file", 7, "text/html", 794 "\111\111\052\000", "tiff", 4, "image/tiff", 795 "\115\115\000\052", "tiff", 4, "image/tiff", 796 "\377\330\377\340", "jpeg", 4, "image/jpeg", 797 "\377\330\377\341", "jpeg", 4, "image/jpeg", 798 "\377\330\377\333", "jpeg", 4, "image/jpeg", 799 "BM", "bmp", 2, "image/bmp", 800 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 801 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 802 "\033E\033", "HP PCL printer data", 3, OCTET, 803 "\033&", "HP PCL printer data", 2, OCTET, 804 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 805 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 806 "\211PNG", "PNG image", 4, "image/png", 807 "P3\n", "ppm", 3, "image/ppm", 808 "P6\n", "ppm", 3, "image/ppm", 809 "/* XPM */\n", "xbm", 10, "image/xbm", 810 ".HTML ", "troff -ms input", 6, "text/troff", 811 ".LP", "troff -ms input", 3, "text/troff", 812 ".ND", "troff -ms input", 3, "text/troff", 813 ".PP", "troff -ms input", 3, "text/troff", 814 ".TL", "troff -ms input", 3, "text/troff", 815 ".TR", "troff -ms input", 3, "text/troff", 816 ".TH", "manual page", 3, "text/troff", 817 ".\\\"", "troff input", 3, "text/troff", 818 ".de", "troff input", 3, "text/troff", 819 ".if", "troff input", 3, "text/troff", 820 ".nr", "troff input", 3, "text/troff", 821 ".tr", "troff input", 3, "text/troff", 822 "vac:", "venti score", 4, "text/plain", 823 "-----BEGIN CERTIFICATE-----\n", 824 "pem certificate", -1, "text/plain", 825 "-----BEGIN TRUSTED CERTIFICATE-----\n", 826 "pem trusted certificate", -1, "text/plain", 827 "-----BEGIN X509 CERTIFICATE-----\n", 828 "pem x.509 certificate", -1, "text/plain", 829 "subject=/C=", "pem certificate with header", -1, "text/plain", 830 "process snapshot ", "process snapshot", -1, "application/snapfs", 831 "BEGIN:VCARD\r\n", "vCard", 13, "text/directory;profile=vcard", 832 "BEGIN:VCARD\n", "vCard", 12, "text/directory;profile=vcard", 833 0,0,0,0 834 }; 835 836 int 837 istring(void) 838 { 839 int i, l; 840 struct FILE_STRING *p; 841 842 for(p = file_string; p->key; p++) { 843 l = p->length; 844 if(l == -1) 845 l = strlen(p->key); 846 if(nbuf >= l && memcmp(buf, p->key, l) == 0) { 847 if(mime) 848 print("%s\n", p->mime); 849 else 850 print("%s\n", p->filetype); 851 return 1; 852 } 853 } 854 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 855 for(i = 5; i < nbuf; i++) 856 if(buf[i] == '\n') 857 break; 858 if(mime) 859 print(OCTET); 860 else 861 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 862 return 1; 863 } 864 return 0; 865 } 866 867 struct offstr 868 { 869 ulong off; 870 struct FILE_STRING; 871 } offstrs[] = { 872 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 873 0, 0, 0, 0, 0 874 }; 875 876 int 877 isoffstr(void) 878 { 879 int n; 880 char buf[256]; 881 struct offstr *p; 882 883 for(p = offstrs; p->key; p++) { 884 seek(fd, p->off, 0); 885 n = p->length; 886 if (n > sizeof buf) 887 n = sizeof buf; 888 if (readn(fd, buf, n) != n) 889 continue; 890 if(memcmp(buf, p->key, n) == 0) { 891 if(mime) 892 print("%s\n", p->mime); 893 else 894 print("%s\n", p->filetype); 895 return 1; 896 } 897 } 898 return 0; 899 } 900 901 int 902 iff(void) 903 { 904 if (strncmp((char*)buf, "FORM", 4) == 0 && 905 strncmp((char*)buf+8, "AIFF", 4) == 0) { 906 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 907 return 1; 908 } 909 if (strncmp((char*)buf, "RIFF", 4) == 0) { 910 if (strncmp((char*)buf+8, "WAVE", 4) == 0) 911 print("%s\n", mime? "audio/wave": "wave audio"); 912 else if (strncmp((char*)buf+8, "AVI ", 4) == 0) 913 print("%s\n", mime? "video/avi": "avi video"); 914 else 915 print("%s\n", mime? "application/octet-stream": 916 "riff file"); 917 return 1; 918 } 919 return 0; 920 } 921 922 char* html_string[] = 923 { 924 "title", 925 "body", 926 "head", 927 "strong", 928 "h1", 929 "h2", 930 "h3", 931 "h4", 932 "h5", 933 "h6", 934 "ul", 935 "li", 936 "dl", 937 "br", 938 "em", 939 0, 940 }; 941 942 int 943 ishtml(void) 944 { 945 uchar *p, *q; 946 int i, count; 947 948 /* compare strings between '<' and '>' to html table */ 949 count = 0; 950 p = buf; 951 for(;;) { 952 while (p < buf+nbuf && *p != '<') 953 p++; 954 p++; 955 if (p >= buf+nbuf) 956 break; 957 if(*p == '/') 958 p++; 959 q = p; 960 while(p < buf+nbuf && *p != '>') 961 p++; 962 if (p >= buf+nbuf) 963 break; 964 for(i = 0; html_string[i]; i++) { 965 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 966 if(count++ > 4) { 967 print(mime ? "text/html\n" : "HTML file\n"); 968 return 1; 969 } 970 break; 971 } 972 } 973 p++; 974 } 975 return 0; 976 } 977 978 char* rfc822_string[] = 979 { 980 "from:", 981 "date:", 982 "to:", 983 "subject:", 984 "received:", 985 "reply to:", 986 "sender:", 987 0, 988 }; 989 990 int 991 isrfc822(void) 992 { 993 994 char *p, *q, *r; 995 int i, count; 996 997 count = 0; 998 p = (char*)buf; 999 for(;;) { 1000 q = strchr(p, '\n'); 1001 if(q == nil) 1002 break; 1003 *q = 0; 1004 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 1005 count++; 1006 *q = '\n'; 1007 p = q+1; 1008 continue; 1009 } 1010 *q = '\n'; 1011 if(*p != '\t' && *p != ' '){ 1012 r = strchr(p, ':'); 1013 if(r == 0 || r > q) 1014 break; 1015 for(i = 0; rfc822_string[i]; i++) { 1016 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 1017 count++; 1018 break; 1019 } 1020 } 1021 } 1022 p = q+1; 1023 } 1024 if(count >= 3){ 1025 print(mime ? "message/rfc822\n" : "email file\n"); 1026 return 1; 1027 } 1028 return 0; 1029 } 1030 1031 int 1032 ismbox(void) 1033 { 1034 char *p, *q; 1035 1036 p = (char*)buf; 1037 q = strchr(p, '\n'); 1038 if(q == nil) 1039 return 0; 1040 *q = 0; 1041 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 1042 print(mime ? "text/plain\n" : "mail box\n"); 1043 return 1; 1044 } 1045 *q = '\n'; 1046 return 0; 1047 } 1048 1049 int 1050 iscint(void) 1051 { 1052 int type; 1053 char *name; 1054 Biobuf b; 1055 1056 if(Binit(&b, fd, OREAD) == Beof) 1057 return 0; 1058 seek(fd, 0, 0); 1059 type = objtype(&b, &name); 1060 if(type < 0) 1061 return 0; 1062 if(mime) 1063 print(OCTET); 1064 else 1065 print("%s intermediate\n", name); 1066 return 1; 1067 } 1068 1069 int 1070 isc(void) 1071 { 1072 int n; 1073 1074 n = wfreq[I1]; 1075 /* 1076 * includes 1077 */ 1078 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1079 goto yes; 1080 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1081 goto yes; 1082 /* 1083 * declarations 1084 */ 1085 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1086 goto yes; 1087 /* 1088 * assignments 1089 */ 1090 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1091 goto yes; 1092 return 0; 1093 1094 yes: 1095 if(mime){ 1096 print(PLAIN); 1097 return 1; 1098 } 1099 if(wfreq[Alword] > 0) 1100 print("alef program\n"); 1101 else 1102 print("c program\n"); 1103 return 1; 1104 } 1105 1106 int 1107 islimbo(void) 1108 { 1109 1110 /* 1111 * includes 1112 */ 1113 if(wfreq[Lword] < 4) 1114 return 0; 1115 print(mime ? PLAIN : "limbo program\n"); 1116 return 1; 1117 } 1118 1119 int 1120 isas(void) 1121 { 1122 1123 /* 1124 * includes 1125 */ 1126 if(wfreq[Aword] < 2) 1127 return 0; 1128 print(mime ? PLAIN : "as program\n"); 1129 return 1; 1130 } 1131 1132 /* 1133 * low entropy means encrypted 1134 */ 1135 int 1136 ismung(void) 1137 { 1138 int i, bucket[8]; 1139 float cs; 1140 1141 if(nbuf < 64) 1142 return 0; 1143 memset(bucket, 0, sizeof(bucket)); 1144 for(i=nbuf-64; i<nbuf; i++) 1145 bucket[(buf[i]>>5)&07] += 1; 1146 1147 cs = 0.; 1148 for(i=0; i<8; i++) 1149 cs += (bucket[i]-8)*(bucket[i]-8); 1150 cs /= 8.; 1151 if(cs <= 24.322) { 1152 if(buf[0]==0x1f && buf[1]==0x9d) 1153 print(mime ? OCTET : "compressed\n"); 1154 else 1155 if(buf[0]==0x1f && buf[1]==0x8b) 1156 print(mime ? OCTET : "gzip compressed\n"); 1157 else 1158 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1159 print(mime ? OCTET : "bzip2 compressed\n"); 1160 else 1161 print(mime ? OCTET : "encrypted\n"); 1162 return 1; 1163 } 1164 return 0; 1165 } 1166 1167 /* 1168 * english by punctuation and frequencies 1169 */ 1170 int 1171 isenglish(void) 1172 { 1173 int vow, comm, rare, badpun, punct; 1174 char *p; 1175 1176 if(guess != Fascii && guess != Feascii) 1177 return 0; 1178 badpun = 0; 1179 punct = 0; 1180 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1181 switch(*p) { 1182 case '.': 1183 case ',': 1184 case ')': 1185 case '%': 1186 case ';': 1187 case ':': 1188 case '?': 1189 punct++; 1190 if(p[1] != ' ' && p[1] != '\n') 1191 badpun++; 1192 } 1193 if(badpun*5 > punct) 1194 return 0; 1195 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1196 return 0; 1197 if(2*cfreq[';'] > cfreq['e']) 1198 return 0; 1199 1200 vow = 0; 1201 for(p="AEIOU"; *p; p++) { 1202 vow += cfreq[*p]; 1203 vow += cfreq[tolower(*p)]; 1204 } 1205 comm = 0; 1206 for(p="ETAION"; *p; p++) { 1207 comm += cfreq[*p]; 1208 comm += cfreq[tolower(*p)]; 1209 } 1210 rare = 0; 1211 for(p="VJKQXZ"; *p; p++) { 1212 rare += cfreq[*p]; 1213 rare += cfreq[tolower(*p)]; 1214 } 1215 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1216 print(mime ? PLAIN : "English text\n"); 1217 return 1; 1218 } 1219 return 0; 1220 } 1221 1222 /* 1223 * pick up a number with 1224 * syntax _*[0-9]+_ 1225 */ 1226 #define P9BITLEN 12 1227 int 1228 p9bitnum(uchar *bp) 1229 { 1230 int n, c, len; 1231 1232 len = P9BITLEN; 1233 while(*bp == ' ') { 1234 bp++; 1235 len--; 1236 if(len <= 0) 1237 return -1; 1238 } 1239 n = 0; 1240 while(len > 1) { 1241 c = *bp++; 1242 if(!isdigit(c)) 1243 return -1; 1244 n = n*10 + c-'0'; 1245 len--; 1246 } 1247 if(*bp != ' ') 1248 return -1; 1249 return n; 1250 } 1251 1252 int 1253 depthof(char *s, int *newp) 1254 { 1255 char *es; 1256 int d; 1257 1258 *newp = 0; 1259 es = s+12; 1260 while(s<es && *s==' ') 1261 s++; 1262 if(s == es) 1263 return -1; 1264 if('0'<=*s && *s<='9') 1265 return 1<<strtol(s, 0, 0); 1266 1267 *newp = 1; 1268 d = 0; 1269 while(s<es && *s!=' '){ 1270 s++; /* skip letter */ 1271 d += strtoul(s, &s, 10); 1272 } 1273 1274 if(d % 8 == 0 || 8 % d == 0) 1275 return d; 1276 else 1277 return -1; 1278 } 1279 1280 int 1281 isp9bit(void) 1282 { 1283 int dep, lox, loy, hix, hiy, px, new, cmpr; 1284 ulong t; 1285 long len; 1286 char *newlabel; 1287 uchar *cp; 1288 1289 cp = buf; 1290 cmpr = 0; 1291 newlabel = "old "; 1292 1293 if(memcmp(cp, "compressed\n", 11) == 0) { 1294 cmpr = 1; 1295 cp = buf + 11; 1296 } 1297 1298 dep = depthof((char*)cp + 0*P9BITLEN, &new); 1299 if(new) 1300 newlabel = ""; 1301 lox = p9bitnum(cp + 1*P9BITLEN); 1302 loy = p9bitnum(cp + 2*P9BITLEN); 1303 hix = p9bitnum(cp + 3*P9BITLEN); 1304 hiy = p9bitnum(cp + 4*P9BITLEN); 1305 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1306 return 0; 1307 1308 if(dep < 8){ 1309 px = 8/dep; /* pixels per byte */ 1310 /* set l to number of bytes of data per scan line */ 1311 if(lox >= 0) 1312 len = (hix+px-1)/px - lox/px; 1313 else{ /* make positive before divide */ 1314 t = (-lox)+px-1; 1315 t = (t/px)*px; 1316 len = (t+hix+px-1)/px; 1317 } 1318 }else 1319 len = (hix-lox)*dep/8; 1320 len *= hiy - loy; /* col length */ 1321 len += 5 * P9BITLEN; /* size of initial ascii */ 1322 1323 /* 1324 * for compressed images, don't look any further. otherwise: 1325 * for image file, length is non-zero and must match calculation above. 1326 * for /dev/window and /dev/screen the length is always zero. 1327 * for subfont, the subfont header should follow immediately. 1328 */ 1329 if (cmpr) { 1330 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n", 1331 newlabel, dep); 1332 return 1; 1333 } 1334 /* 1335 * mbuf->length == 0 probably indicates reading a pipe. 1336 * Ghostscript sometimes produces a little extra on the end. 1337 */ 1338 if (len != 0 && (mbuf->length == 0 || mbuf->length == len || 1339 mbuf->length > len && mbuf->length < len+P9BITLEN)) { 1340 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1341 return 1; 1342 } 1343 if (p9subfont(buf+len)) { 1344 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep); 1345 return 1; 1346 } 1347 return 0; 1348 } 1349 1350 int 1351 p9subfont(uchar *p) 1352 { 1353 int n, h, a; 1354 1355 /* if image too big, assume it's a subfont */ 1356 if (p+3*P9BITLEN > buf+sizeof(buf)) 1357 return 1; 1358 1359 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1360 if (n < 0) 1361 return 0; 1362 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1363 if (h < 0) 1364 return 0; 1365 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1366 if (a < 0) 1367 return 0; 1368 return 1; 1369 } 1370 1371 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1372 1373 int 1374 isp9font(void) 1375 { 1376 uchar *cp, *p; 1377 int i, n; 1378 char pathname[1024]; 1379 1380 cp = buf; 1381 if (!getfontnum(cp, &cp)) /* height */ 1382 return 0; 1383 if (!getfontnum(cp, &cp)) /* ascent */ 1384 return 0; 1385 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1386 if (!getfontnum(cp, &cp)) /* min */ 1387 break; 1388 if (!getfontnum(cp, &cp)) /* max */ 1389 return 0; 1390 getfontnum(cp, &cp); /* optional offset */ 1391 while (WHITESPACE(*cp)) 1392 cp++; 1393 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1394 ; 1395 /* construct a path name, if needed */ 1396 n = 0; 1397 if (*p != '/' && slash) { 1398 n = slash-fname+1; 1399 if (n < sizeof(pathname)) 1400 memcpy(pathname, fname, n); 1401 else n = 0; 1402 } 1403 if (n+cp-p+4 < sizeof(pathname)) { 1404 memcpy(pathname+n, p, cp-p); 1405 n += cp-p; 1406 pathname[n] = 0; 1407 if (access(pathname, AEXIST) < 0) { 1408 strcpy(pathname+n, ".0"); 1409 if (access(pathname, AEXIST) < 0) 1410 return 0; 1411 } 1412 } 1413 } 1414 if (i) { 1415 print(mime ? "text/plain\n" : "font file\n"); 1416 return 1; 1417 } 1418 return 0; 1419 } 1420 1421 int 1422 getfontnum(uchar *cp, uchar **rp) 1423 { 1424 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1425 cp++; 1426 if (*cp < '0' || *cp > '9') 1427 return 0; 1428 strtoul((char *)cp, (char **)rp, 0); 1429 if (!WHITESPACE(**rp)) { 1430 *rp = cp; 1431 return 0; 1432 } 1433 return 1; 1434 } 1435 1436 int 1437 isrtf(void) 1438 { 1439 if(strstr((char *)buf, "\\rtf1")){ 1440 print(mime ? "application/rtf\n" : "rich text format\n"); 1441 return 1; 1442 } 1443 return 0; 1444 } 1445 1446 int 1447 ismsdos(void) 1448 { 1449 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1450 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1451 return 1; 1452 } 1453 return 0; 1454 } 1455 1456 int 1457 iself(void) 1458 { 1459 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1460 [1] "WE32100", 1461 [2] "SPARC", 1462 [3] "i386", 1463 [4] "M68000", 1464 [5] "M88000", 1465 [6] "i486", 1466 [7] "i860", 1467 [8] "R3000", 1468 [9] "S370", 1469 [10] "R4000", 1470 [15] "HP-PA", 1471 [18] "sparc v8+", 1472 [19] "i960", 1473 [20] "PPC-32", 1474 [21] "PPC-64", 1475 [40] "ARM", 1476 [41] "Alpha", 1477 [43] "sparc v9", 1478 [50] "IA-64", 1479 [62] "AMD64", 1480 [75] "VAX", 1481 }; 1482 static char *type[] = { 1483 [1] "relocatable object", 1484 [2] "executable", 1485 [3] "shared library", 1486 [4] "core dump", 1487 }; 1488 1489 if (memcmp(buf, "\x7fELF", 4) == 0){ 1490 if (!mime){ 1491 int isdifend = 0; 1492 int n = (buf[19] << 8) | buf[18]; 1493 char *p = "unknown"; 1494 char *t = "unknown"; 1495 1496 if (n > 0 && n < nelem(cpu) && cpu[n]) 1497 p = cpu[n]; 1498 else { 1499 /* try the other byte order */ 1500 isdifend = 1; 1501 n = (buf[18] << 8) | buf[19]; 1502 if (n > 0 && n < nelem(cpu) && cpu[n]) 1503 p = cpu[n]; 1504 } 1505 if(isdifend) 1506 n = (buf[16]<< 8) | buf[17]; 1507 else 1508 n = (buf[17]<< 8) | buf[16]; 1509 1510 if(n>0 && n < nelem(type) && type[n]) 1511 t = type[n]; 1512 print("%s ELF %s\n", p, t); 1513 } 1514 else 1515 print("application/x-elf-executable"); 1516 return 1; 1517 } 1518 1519 return 0; 1520 } 1521 1522 int 1523 isface(void) 1524 { 1525 int i, j, ldepth, l; 1526 char *p; 1527 1528 ldepth = -1; 1529 for(j = 0; j < 3; j++){ 1530 for(p = (char*)buf, i=0; i<3; i++){ 1531 if(p[0] != '0' || p[1] != 'x') 1532 return 0; 1533 if(buf[2+8] == ',') 1534 l = 2; 1535 else if(buf[2+4] == ',') 1536 l = 1; 1537 else 1538 return 0; 1539 if(ldepth == -1) 1540 ldepth = l; 1541 if(l != ldepth) 1542 return 0; 1543 strtoul(p, &p, 16); 1544 if(*p++ != ',') 1545 return 0; 1546 while(*p == ' ' || *p == '\t') 1547 p++; 1548 } 1549 if (*p++ != '\n') 1550 return 0; 1551 } 1552 1553 if(mime) 1554 print("application/x-face\n"); 1555 else 1556 print("face image depth %d\n", ldepth); 1557 return 1; 1558 } 1559 1560