1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 isp9font, /* plan 9 font */ 192 isp9bit, /* plan 9 image (as from /dev/window) */ 193 isrtf, /* rich text format */ 194 ismsdos, /* msdos exe (virus file attachement) */ 195 isface, /* ascii face file */ 196 197 /* last resorts */ 198 ismung, /* entropy compressed/encrypted */ 199 isenglish, /* char frequency English */ 200 0 201 }; 202 203 int mime; 204 205 char OCTET[] = "application/octet-stream\n"; 206 char PLAIN[] = "text/plain\n"; 207 208 void 209 main(int argc, char *argv[]) 210 { 211 int i, j, maxlen; 212 char *cp; 213 Rune r; 214 215 ARGBEGIN{ 216 case 'm': 217 mime = 1; 218 break; 219 default: 220 fprint(2, "usage: file [-m] [file...]\n"); 221 exits("usage"); 222 }ARGEND; 223 224 maxlen = 0; 225 if(mime == 0 || argc > 1){ 226 for(i = 0; i < argc; i++) { 227 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 228 ; 229 if(j > maxlen) 230 maxlen = j; 231 } 232 } 233 if (argc <= 0) { 234 if(!mime) 235 print ("stdin: "); 236 filetype(0); 237 } 238 else { 239 for(i = 0; i < argc; i++) 240 type(argv[i], maxlen); 241 } 242 exits(0); 243 } 244 245 void 246 type(char *file, int nlen) 247 { 248 Rune r; 249 int i; 250 char *p; 251 252 if(nlen > 0){ 253 slash = 0; 254 for (i = 0, p = file; *p; i++) { 255 if (*p == '/') /* find rightmost slash */ 256 slash = p; 257 p += chartorune(&r, p); /* count runes */ 258 } 259 print("%s:%*s",file, nlen-i+1, ""); 260 } 261 fname = file; 262 if ((fd = open(file, OREAD)) < 0) { 263 print("cannot open: %r\n"); 264 return; 265 } 266 filetype(fd); 267 close(fd); 268 } 269 270 /* 271 * Unicode 4.0 4-byte runes. 272 */ 273 typedef int Rune1; 274 275 enum { 276 UTFmax1 = 4, 277 }; 278 279 int 280 fullrune1(char *p, int n) 281 { 282 int c; 283 284 if(n >= 1) { 285 c = *(uchar*)p; 286 if(c < 0x80) 287 return 1; 288 if(n >= 2 && c < 0xE0) 289 return 1; 290 if(n >= 3 && c < 0xF0) 291 return 1; 292 if(n >= 4) 293 return 1; 294 } 295 return 0; 296 } 297 298 int 299 chartorune1(Rune1 *rune, char *str) 300 { 301 int c, c1, c2, c3, n; 302 Rune r; 303 304 c = *(uchar*)str; 305 if(c < 0xF0){ 306 r = 0; 307 n = chartorune(&r, str); 308 *rune = r; 309 return n; 310 } 311 c &= ~0xF0; 312 c1 = *(uchar*)(str+1) & ~0x80; 313 c2 = *(uchar*)(str+2) & ~0x80; 314 c3 = *(uchar*)(str+3) & ~0x80; 315 n = (c<<18) | (c1<<12) | (c2<<6) | c3; 316 if(n < 0x10000 || n > 0x10FFFF){ 317 *rune = Runeerror; 318 return 1; 319 } 320 *rune = n; 321 return 4; 322 } 323 324 void 325 filetype(int fd) 326 { 327 Rune1 r; 328 int i, f, n; 329 char *p, *eob; 330 331 free(mbuf); 332 mbuf = dirfstat(fd); 333 if(mbuf == nil){ 334 print("cannot stat: %r\n"); 335 return; 336 } 337 if(mbuf->mode & DMDIR) { 338 print(mime ? OCTET : "directory\n"); 339 return; 340 } 341 if(mbuf->type != 'M' && mbuf->type != '|') { 342 print(mime ? OCTET : "special file #%C/%s\n", 343 mbuf->type, mbuf->name); 344 return; 345 } 346 /* may be reading a pipe on standard input */ 347 nbuf = readn(fd, buf, sizeof(buf)-1); 348 if(nbuf < 0) { 349 print("cannot read: %r\n"); 350 return; 351 } 352 if(nbuf == 0) { 353 print(mime ? PLAIN : "empty file\n"); 354 return; 355 } 356 buf[nbuf] = 0; 357 358 /* 359 * build histogram table 360 */ 361 memset(cfreq, 0, sizeof(cfreq)); 362 for (i = 0; language[i].name; i++) 363 language[i].count = 0; 364 eob = (char *)buf+nbuf; 365 for(n = 0, p = (char *)buf; p < eob; n++) { 366 if (!fullrune1(p, eob-p) && eob-p < UTFmax1) 367 break; 368 p += chartorune1(&r, p); 369 if (r == 0) 370 f = Cnull; 371 else if (r <= 0x7f) { 372 if (!isprint(r) && !isspace(r)) 373 f = Ceascii; /* ASCII control char */ 374 else f = r; 375 } else if (r == 0x80) { 376 bump_utf_count(r); 377 f = Cutf; 378 } else if (r < 0xA0) 379 f = Cbinary; /* Invalid Runes */ 380 else if (r <= 0xff) 381 f = Clatin; /* Latin 1 */ 382 else { 383 bump_utf_count(r); 384 f = Cutf; /* UTF extension */ 385 } 386 cfreq[f]++; /* ASCII chars peg directly */ 387 } 388 /* 389 * gross classify 390 */ 391 if (cfreq[Cbinary]) 392 guess = Fbinary; 393 else if (cfreq[Cutf]) 394 guess = Futf; 395 else if (cfreq[Clatin]) 396 guess = Flatin; 397 else if (cfreq[Ceascii]) 398 guess = Feascii; 399 else if (cfreq[Cnull]) 400 guess = Fbinary; 401 else 402 guess = Fascii; 403 /* 404 * lookup dictionary words 405 */ 406 memset(wfreq, 0, sizeof(wfreq)); 407 if(guess == Fascii || guess == Flatin || guess == Futf) 408 wordfreq(); 409 /* 410 * call individual classify routines 411 */ 412 for(i=0; call[i]; i++) 413 if((*call[i])()) 414 return; 415 416 /* 417 * if all else fails, 418 * print out gross classification 419 */ 420 if (nbuf < 100 && !mime) 421 print(mime ? PLAIN : "short "); 422 if (guess == Fascii) 423 print(mime ? PLAIN : "Ascii\n"); 424 else if (guess == Feascii) 425 print(mime ? PLAIN : "extended ascii\n"); 426 else if (guess == Flatin) 427 print(mime ? PLAIN : "latin ascii\n"); 428 else if (guess == Futf && utf_count() < 4) 429 print_utf(); 430 else print(mime ? OCTET : "binary\n"); 431 } 432 433 void 434 bump_utf_count(Rune r) 435 { 436 int low, high, mid; 437 438 high = sizeof(language)/sizeof(language[0])-1; 439 for (low = 0; low < high;) { 440 mid = (low+high)/2; 441 if (r >= language[mid].low) { 442 if (r <= language[mid].high) { 443 language[mid].count++; 444 break; 445 } else low = mid+1; 446 } else high = mid; 447 } 448 } 449 450 int 451 utf_count(void) 452 { 453 int i, count; 454 455 count = 0; 456 for (i = 0; language[i].name; i++) 457 if (language[i].count > 0) 458 switch (language[i].mode) { 459 case Normal: 460 case First: 461 count++; 462 break; 463 default: 464 break; 465 } 466 return count; 467 } 468 469 int 470 chkascii(void) 471 { 472 int i; 473 474 for (i = 'a'; i < 'z'; i++) 475 if (cfreq[i]) 476 return 1; 477 for (i = 'A'; i < 'Z'; i++) 478 if (cfreq[i]) 479 return 1; 480 return 0; 481 } 482 483 int 484 find_first(char *name) 485 { 486 int i; 487 488 for (i = 0; language[i].name != 0; i++) 489 if (language[i].mode == First 490 && strcmp(language[i].name, name) == 0) 491 return i; 492 return -1; 493 } 494 495 void 496 print_utf(void) 497 { 498 int i, printed, j; 499 500 if(mime){ 501 print(PLAIN); 502 return; 503 } 504 if (chkascii()) { 505 printed = 1; 506 print("Ascii"); 507 } else 508 printed = 0; 509 for (i = 0; language[i].name; i++) 510 if (language[i].count) { 511 switch(language[i].mode) { 512 case Multi: 513 j = find_first(language[i].name); 514 if (j < 0) 515 break; 516 if (language[j].count > 0) 517 break; 518 /* Fall through */ 519 case Normal: 520 case First: 521 if (printed) 522 print(" & "); 523 else printed = 1; 524 print("%s", language[i].name); 525 break; 526 case Shared: 527 default: 528 break; 529 } 530 } 531 if(!printed) 532 print("UTF"); 533 print(" text\n"); 534 } 535 536 void 537 wordfreq(void) 538 { 539 int low, high, mid, r; 540 uchar *p, *p2, c; 541 542 p = buf; 543 for(;;) { 544 while (p < buf+nbuf && !isalpha(*p)) 545 p++; 546 if (p >= buf+nbuf) 547 return; 548 p2 = p; 549 while(p < buf+nbuf && isalpha(*p)) 550 p++; 551 c = *p; 552 *p = 0; 553 high = sizeof(dict)/sizeof(dict[0]); 554 for(low = 0;low < high;) { 555 mid = (low+high)/2; 556 r = strcmp(dict[mid].word, (char*)p2); 557 if(r == 0) { 558 wfreq[dict[mid].class]++; 559 break; 560 } 561 if(r < 0) 562 low = mid+1; 563 else 564 high = mid; 565 } 566 *p++ = c; 567 } 568 } 569 570 typedef struct Filemagic Filemagic; 571 struct Filemagic { 572 ulong x; 573 ulong mask; 574 char *desc; 575 char *mime; 576 }; 577 578 /* 579 * integers in this table must be as seen on a little-endian machine 580 * when read from a file. 581 */ 582 Filemagic long0tab[] = { 583 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 584 /* "pac1" */ 585 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 586 /* "pXc2 */ 587 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 588 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 589 0x43614c66, 0xFFFFFFFF, "FLAC audio file\n", OCTET, 590 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 591 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 592 070707, 0xFFFF, "cpio archive\n", OCTET, 593 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 594 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 595 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 596 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 597 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 598 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 599 /* 0xfeedface: this could alternately be a Next Plan 9 boot image */ 600 0xcefaedfe, 0xFFFFFFFF, "32-bit power Mach-O executable\n", OCTET, 601 /* 0xfeedfacf */ 602 0xcffaedfe, 0xFFFFFFFF, "64-bit power Mach-O executable\n", OCTET, 603 /* 0xcefaedfe */ 604 0xfeedface, 0xFFFFFFFF, "386 Mach-O executable\n", OCTET, 605 /* 0xcffaedfe */ 606 0xfeedfacf, 0xFFFFFFFF, "amd64 Mach-O executable\n", OCTET, 607 /* 0xcafebabe */ 608 0xbebafeca, 0xFFFFFFFF, "Mach-O universal executable\n", OCTET, 609 /* 610 * venti & fossil magic numbers are stored big-endian on disk, 611 * thus the numbers appear reversed in this table. 612 */ 613 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 614 }; 615 616 int 617 filemagic(Filemagic *tab, int ntab, ulong x) 618 { 619 int i; 620 621 for(i=0; i<ntab; i++) 622 if((x&tab[i].mask) == tab[i].x){ 623 print(mime ? tab[i].mime : tab[i].desc); 624 return 1; 625 } 626 return 0; 627 } 628 629 int 630 long0(void) 631 { 632 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 633 } 634 635 typedef struct Fileoffmag Fileoffmag; 636 struct Fileoffmag { 637 ulong off; 638 Filemagic; 639 }; 640 641 /* 642 * integers in this table must be as seen on a little-endian machine 643 * when read from a file. 644 */ 645 Fileoffmag longofftab[] = { 646 /* 647 * venti & fossil magic numbers are stored big-endian on disk, 648 * thus the numbers appear reversed in this table. 649 */ 650 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 651 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 652 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 653 4, 0x31647542, 0xFFFFFFFF, "OS X finder properties\n", OCTET, 654 }; 655 656 int 657 fileoffmagic(Fileoffmag *tab, int ntab) 658 { 659 int i; 660 ulong x; 661 Fileoffmag *tp; 662 uchar buf[sizeof(long)]; 663 664 for(i=0; i<ntab; i++) { 665 tp = tab + i; 666 seek(fd, tp->off, 0); 667 if (readn(fd, buf, sizeof buf) != sizeof buf) 668 continue; 669 x = LENDIAN(buf); 670 if((x&tp->mask) == tp->x){ 671 print(mime? tp->mime: tp->desc); 672 return 1; 673 } 674 } 675 return 0; 676 } 677 678 int 679 longoff(void) 680 { 681 return fileoffmagic(longofftab, nelem(longofftab)); 682 } 683 684 int 685 isexec(void) 686 { 687 Fhdr f; 688 689 seek(fd, 0, 0); /* reposition to start of file */ 690 if(crackhdr(fd, &f)) { 691 print(mime ? OCTET : "%s\n", f.name); 692 return 1; 693 } 694 return 0; 695 } 696 697 698 /* from tar.c */ 699 enum { NAMSIZ = 100, TBLOCK = 512 }; 700 701 union hblock 702 { 703 char dummy[TBLOCK]; 704 struct header 705 { 706 char name[NAMSIZ]; 707 char mode[8]; 708 char uid[8]; 709 char gid[8]; 710 char size[12]; 711 char mtime[12]; 712 char chksum[8]; 713 char linkflag; 714 char linkname[NAMSIZ]; 715 /* rest are defined by POSIX's ustar format; see p1003.2b */ 716 char magic[6]; /* "ustar" */ 717 char version[2]; 718 char uname[32]; 719 char gname[32]; 720 char devmajor[8]; 721 char devminor[8]; 722 char prefix[155]; /* if non-null, path = prefix "/" name */ 723 } dbuf; 724 }; 725 726 int 727 checksum(union hblock *hp) 728 { 729 int i; 730 char *cp; 731 struct header *hdr = &hp->dbuf; 732 733 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 734 *cp = ' '; 735 i = 0; 736 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 737 i += *cp & 0xff; 738 return i; 739 } 740 741 int 742 istar(void) 743 { 744 int chksum; 745 char tblock[TBLOCK]; 746 union hblock *hp = (union hblock *)tblock; 747 struct header *hdr = &hp->dbuf; 748 749 seek(fd, 0, 0); /* reposition to start of file */ 750 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 751 return 0; 752 chksum = strtol(hdr->chksum, 0, 8); 753 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 754 if (strcmp(hdr->magic, "ustar") == 0) 755 print(mime? "application/x-ustar\n": 756 "posix tar archive\n"); 757 else 758 print(mime? "application/x-tar\n": "tar archive\n"); 759 return 1; 760 } 761 return 0; 762 } 763 764 /* 765 * initial words to classify file 766 */ 767 struct FILE_STRING 768 { 769 char *key; 770 char *filetype; 771 int length; 772 char *mime; 773 } file_string[] = 774 { 775 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 776 "!<arch>\n", "archive", 8, "application/octet-stream", 777 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 778 "#!/bin/rc", "rc executable file", 9, "text/plain", 779 "#!/bin/sh", "sh executable file", 9, "text/plain", 780 "%!", "postscript", 2, "application/postscript", 781 "\004%!", "postscript", 3, "application/postscript", 782 "x T post", "troff output for post", 8, "application/troff", 783 "x T Latin1", "troff output for Latin1", 10, "application/troff", 784 "x T utf", "troff output for UTF", 7, "application/troff", 785 "x T 202", "troff output for 202", 7, "application/troff", 786 "x T aps", "troff output for aps", 7, "application/troff", 787 "GIF", "GIF image", 3, "image/gif", 788 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 789 "%PDF", "PDF", 4, "application/pdf", 790 "<html>\n", "HTML file", 7, "text/html", 791 "<HTML>\n", "HTML file", 7, "text/html", 792 "\111\111\052\000", "tiff", 4, "image/tiff", 793 "\115\115\000\052", "tiff", 4, "image/tiff", 794 "\377\330\377\340", "jpeg", 4, "image/jpeg", 795 "\377\330\377\341", "jpeg", 4, "image/jpeg", 796 "\377\330\377\333", "jpeg", 4, "image/jpeg", 797 "BM", "bmp", 2, "image/bmp", 798 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 799 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 800 "\033E\033", "HP PCL printer data", 3, OCTET, 801 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 802 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 803 "\211PNG", "PNG image", 4, "image/png", 804 "P3\n", "ppm", 3, "image/ppm", 805 "P6\n", "ppm", 3, "image/ppm", 806 "/* XPM */\n", "xbm", 10, "image/xbm", 807 ".HTML ", "troff -ms input", 6, "text/troff", 808 ".LP", "troff -ms input", 3, "text/troff", 809 ".ND", "troff -ms input", 3, "text/troff", 810 ".PP", "troff -ms input", 3, "text/troff", 811 ".TL", "troff -ms input", 3, "text/troff", 812 ".TR", "troff -ms input", 3, "text/troff", 813 ".TH", "manual page", 3, "text/troff", 814 ".\\\"", "troff input", 3, "text/troff", 815 ".de", "troff input", 3, "text/troff", 816 ".if", "troff input", 3, "text/troff", 817 ".nr", "troff input", 3, "text/troff", 818 ".tr", "troff input", 3, "text/troff", 819 "vac:", "venti score", 4, "text/plain", 820 "-----BEGIN CERTIFICATE-----\n", 821 "pem certificate", -1, "text/plain", 822 "-----BEGIN TRUSTED CERTIFICATE-----\n", 823 "pem trusted certificate", -1, "text/plain", 824 "-----BEGIN X509 CERTIFICATE-----\n", 825 "pem x.509 certificate", -1, "text/plain", 826 "subject=/C=", "pem certificate with header", -1, "text/plain", 827 "process snapshot ", "process snapshot", -1, "application/snapfs", 828 0,0,0,0 829 }; 830 831 int 832 istring(void) 833 { 834 int i, l; 835 struct FILE_STRING *p; 836 837 for(p = file_string; p->key; p++) { 838 l = p->length; 839 if(l == -1) 840 l = strlen(p->key); 841 if(nbuf >= l && memcmp(buf, p->key, l) == 0) { 842 if(mime) 843 print("%s\n", p->mime); 844 else 845 print("%s\n", p->filetype); 846 return 1; 847 } 848 } 849 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 850 for(i = 5; i < nbuf; i++) 851 if(buf[i] == '\n') 852 break; 853 if(mime) 854 print(OCTET); 855 else 856 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 857 return 1; 858 } 859 return 0; 860 } 861 862 struct offstr 863 { 864 ulong off; 865 struct FILE_STRING; 866 } offstrs[] = { 867 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 868 0, 0, 0, 0, 0 869 }; 870 871 int 872 isoffstr(void) 873 { 874 int n; 875 char buf[256]; 876 struct offstr *p; 877 878 for(p = offstrs; p->key; p++) { 879 seek(fd, p->off, 0); 880 n = p->length; 881 if (n > sizeof buf) 882 n = sizeof buf; 883 if (readn(fd, buf, n) != n) 884 continue; 885 if(memcmp(buf, p->key, n) == 0) { 886 if(mime) 887 print("%s\n", p->mime); 888 else 889 print("%s\n", p->filetype); 890 return 1; 891 } 892 } 893 return 0; 894 } 895 896 int 897 iff(void) 898 { 899 if (strncmp((char*)buf, "FORM", 4) == 0 && 900 strncmp((char*)buf+8, "AIFF", 4) == 0) { 901 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 902 return 1; 903 } 904 if (strncmp((char*)buf, "RIFF", 4) == 0) { 905 if (strncmp((char*)buf+8, "WAVE", 4) == 0) 906 print("%s\n", mime? "audio/wave": "wave audio"); 907 else if (strncmp((char*)buf+8, "AVI ", 4) == 0) 908 print("%s\n", mime? "video/avi": "avi video"); 909 else 910 print("%s\n", mime? "application/octet-stream": 911 "riff file"); 912 return 1; 913 } 914 return 0; 915 } 916 917 char* html_string[] = 918 { 919 "title", 920 "body", 921 "head", 922 "strong", 923 "h1", 924 "h2", 925 "h3", 926 "h4", 927 "h5", 928 "h6", 929 "ul", 930 "li", 931 "dl", 932 "br", 933 "em", 934 0, 935 }; 936 937 int 938 ishtml(void) 939 { 940 uchar *p, *q; 941 int i, count; 942 943 /* compare strings between '<' and '>' to html table */ 944 count = 0; 945 p = buf; 946 for(;;) { 947 while (p < buf+nbuf && *p != '<') 948 p++; 949 p++; 950 if (p >= buf+nbuf) 951 break; 952 if(*p == '/') 953 p++; 954 q = p; 955 while(p < buf+nbuf && *p != '>') 956 p++; 957 if (p >= buf+nbuf) 958 break; 959 for(i = 0; html_string[i]; i++) { 960 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 961 if(count++ > 4) { 962 print(mime ? "text/html\n" : "HTML file\n"); 963 return 1; 964 } 965 break; 966 } 967 } 968 p++; 969 } 970 return 0; 971 } 972 973 char* rfc822_string[] = 974 { 975 "from:", 976 "date:", 977 "to:", 978 "subject:", 979 "received:", 980 "reply to:", 981 "sender:", 982 0, 983 }; 984 985 int 986 isrfc822(void) 987 { 988 989 char *p, *q, *r; 990 int i, count; 991 992 count = 0; 993 p = (char*)buf; 994 for(;;) { 995 q = strchr(p, '\n'); 996 if(q == nil) 997 break; 998 *q = 0; 999 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 1000 count++; 1001 *q = '\n'; 1002 p = q+1; 1003 continue; 1004 } 1005 *q = '\n'; 1006 if(*p != '\t' && *p != ' '){ 1007 r = strchr(p, ':'); 1008 if(r == 0 || r > q) 1009 break; 1010 for(i = 0; rfc822_string[i]; i++) { 1011 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 1012 count++; 1013 break; 1014 } 1015 } 1016 } 1017 p = q+1; 1018 } 1019 if(count >= 3){ 1020 print(mime ? "message/rfc822\n" : "email file\n"); 1021 return 1; 1022 } 1023 return 0; 1024 } 1025 1026 int 1027 ismbox(void) 1028 { 1029 char *p, *q; 1030 1031 p = (char*)buf; 1032 q = strchr(p, '\n'); 1033 if(q == nil) 1034 return 0; 1035 *q = 0; 1036 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 1037 print(mime ? "text/plain\n" : "mail box\n"); 1038 return 1; 1039 } 1040 *q = '\n'; 1041 return 0; 1042 } 1043 1044 int 1045 iscint(void) 1046 { 1047 int type; 1048 char *name; 1049 Biobuf b; 1050 1051 if(Binit(&b, fd, OREAD) == Beof) 1052 return 0; 1053 seek(fd, 0, 0); 1054 type = objtype(&b, &name); 1055 if(type < 0) 1056 return 0; 1057 if(mime) 1058 print(OCTET); 1059 else 1060 print("%s intermediate\n", name); 1061 return 1; 1062 } 1063 1064 int 1065 isc(void) 1066 { 1067 int n; 1068 1069 n = wfreq[I1]; 1070 /* 1071 * includes 1072 */ 1073 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1074 goto yes; 1075 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1076 goto yes; 1077 /* 1078 * declarations 1079 */ 1080 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1081 goto yes; 1082 /* 1083 * assignments 1084 */ 1085 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1086 goto yes; 1087 return 0; 1088 1089 yes: 1090 if(mime){ 1091 print(PLAIN); 1092 return 1; 1093 } 1094 if(wfreq[Alword] > 0) 1095 print("alef program\n"); 1096 else 1097 print("c program\n"); 1098 return 1; 1099 } 1100 1101 int 1102 islimbo(void) 1103 { 1104 1105 /* 1106 * includes 1107 */ 1108 if(wfreq[Lword] < 4) 1109 return 0; 1110 print(mime ? PLAIN : "limbo program\n"); 1111 return 1; 1112 } 1113 1114 int 1115 isas(void) 1116 { 1117 1118 /* 1119 * includes 1120 */ 1121 if(wfreq[Aword] < 2) 1122 return 0; 1123 print(mime ? PLAIN : "as program\n"); 1124 return 1; 1125 } 1126 1127 /* 1128 * low entropy means encrypted 1129 */ 1130 int 1131 ismung(void) 1132 { 1133 int i, bucket[8]; 1134 float cs; 1135 1136 if(nbuf < 64) 1137 return 0; 1138 memset(bucket, 0, sizeof(bucket)); 1139 for(i=nbuf-64; i<nbuf; i++) 1140 bucket[(buf[i]>>5)&07] += 1; 1141 1142 cs = 0.; 1143 for(i=0; i<8; i++) 1144 cs += (bucket[i]-8)*(bucket[i]-8); 1145 cs /= 8.; 1146 if(cs <= 24.322) { 1147 if(buf[0]==0x1f && buf[1]==0x9d) 1148 print(mime ? OCTET : "compressed\n"); 1149 else 1150 if(buf[0]==0x1f && buf[1]==0x8b) 1151 print(mime ? OCTET : "gzip compressed\n"); 1152 else 1153 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1154 print(mime ? OCTET : "bzip2 compressed\n"); 1155 else 1156 print(mime ? OCTET : "encrypted\n"); 1157 return 1; 1158 } 1159 return 0; 1160 } 1161 1162 /* 1163 * english by punctuation and frequencies 1164 */ 1165 int 1166 isenglish(void) 1167 { 1168 int vow, comm, rare, badpun, punct; 1169 char *p; 1170 1171 if(guess != Fascii && guess != Feascii) 1172 return 0; 1173 badpun = 0; 1174 punct = 0; 1175 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1176 switch(*p) { 1177 case '.': 1178 case ',': 1179 case ')': 1180 case '%': 1181 case ';': 1182 case ':': 1183 case '?': 1184 punct++; 1185 if(p[1] != ' ' && p[1] != '\n') 1186 badpun++; 1187 } 1188 if(badpun*5 > punct) 1189 return 0; 1190 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1191 return 0; 1192 if(2*cfreq[';'] > cfreq['e']) 1193 return 0; 1194 1195 vow = 0; 1196 for(p="AEIOU"; *p; p++) { 1197 vow += cfreq[*p]; 1198 vow += cfreq[tolower(*p)]; 1199 } 1200 comm = 0; 1201 for(p="ETAION"; *p; p++) { 1202 comm += cfreq[*p]; 1203 comm += cfreq[tolower(*p)]; 1204 } 1205 rare = 0; 1206 for(p="VJKQXZ"; *p; p++) { 1207 rare += cfreq[*p]; 1208 rare += cfreq[tolower(*p)]; 1209 } 1210 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1211 print(mime ? PLAIN : "English text\n"); 1212 return 1; 1213 } 1214 return 0; 1215 } 1216 1217 /* 1218 * pick up a number with 1219 * syntax _*[0-9]+_ 1220 */ 1221 #define P9BITLEN 12 1222 int 1223 p9bitnum(uchar *bp) 1224 { 1225 int n, c, len; 1226 1227 len = P9BITLEN; 1228 while(*bp == ' ') { 1229 bp++; 1230 len--; 1231 if(len <= 0) 1232 return -1; 1233 } 1234 n = 0; 1235 while(len > 1) { 1236 c = *bp++; 1237 if(!isdigit(c)) 1238 return -1; 1239 n = n*10 + c-'0'; 1240 len--; 1241 } 1242 if(*bp != ' ') 1243 return -1; 1244 return n; 1245 } 1246 1247 int 1248 depthof(char *s, int *newp) 1249 { 1250 char *es; 1251 int d; 1252 1253 *newp = 0; 1254 es = s+12; 1255 while(s<es && *s==' ') 1256 s++; 1257 if(s == es) 1258 return -1; 1259 if('0'<=*s && *s<='9') 1260 return 1<<strtol(s, 0, 0); 1261 1262 *newp = 1; 1263 d = 0; 1264 while(s<es && *s!=' '){ 1265 s++; /* skip letter */ 1266 d += strtoul(s, &s, 10); 1267 } 1268 1269 if(d % 8 == 0 || 8 % d == 0) 1270 return d; 1271 else 1272 return -1; 1273 } 1274 1275 int 1276 isp9bit(void) 1277 { 1278 int dep, lox, loy, hix, hiy, px, new, cmpr; 1279 ulong t; 1280 long len; 1281 char *newlabel; 1282 uchar *cp; 1283 1284 cp = buf; 1285 cmpr = 0; 1286 newlabel = "old "; 1287 1288 if(memcmp(cp, "compressed\n", 11) == 0) { 1289 cmpr = 1; 1290 cp = buf + 11; 1291 } 1292 1293 dep = depthof((char*)cp + 0*P9BITLEN, &new); 1294 if(new) 1295 newlabel = ""; 1296 lox = p9bitnum(cp + 1*P9BITLEN); 1297 loy = p9bitnum(cp + 2*P9BITLEN); 1298 hix = p9bitnum(cp + 3*P9BITLEN); 1299 hiy = p9bitnum(cp + 4*P9BITLEN); 1300 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1301 return 0; 1302 1303 if(dep < 8){ 1304 px = 8/dep; /* pixels per byte */ 1305 /* set l to number of bytes of data per scan line */ 1306 if(lox >= 0) 1307 len = (hix+px-1)/px - lox/px; 1308 else{ /* make positive before divide */ 1309 t = (-lox)+px-1; 1310 t = (t/px)*px; 1311 len = (t+hix+px-1)/px; 1312 } 1313 }else 1314 len = (hix-lox)*dep/8; 1315 len *= hiy - loy; /* col length */ 1316 len += 5 * P9BITLEN; /* size of initial ascii */ 1317 1318 /* 1319 * for compressed images, don't look any further. otherwise: 1320 * for image file, length is non-zero and must match calculation above. 1321 * for /dev/window and /dev/screen the length is always zero. 1322 * for subfont, the subfont header should follow immediately. 1323 */ 1324 if (cmpr) { 1325 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n", 1326 newlabel, dep); 1327 return 1; 1328 } 1329 /* 1330 * mbuf->length == 0 probably indicates reading a pipe. 1331 * Ghostscript sometimes produces a little extra on the end. 1332 */ 1333 if (len != 0 && (mbuf->length == 0 || mbuf->length == len || 1334 mbuf->length > len && mbuf->length < len+P9BITLEN)) { 1335 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1336 return 1; 1337 } 1338 if (p9subfont(buf+len)) { 1339 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep); 1340 return 1; 1341 } 1342 return 0; 1343 } 1344 1345 int 1346 p9subfont(uchar *p) 1347 { 1348 int n, h, a; 1349 1350 /* if image too big, assume it's a subfont */ 1351 if (p+3*P9BITLEN > buf+sizeof(buf)) 1352 return 1; 1353 1354 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1355 if (n < 0) 1356 return 0; 1357 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1358 if (h < 0) 1359 return 0; 1360 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1361 if (a < 0) 1362 return 0; 1363 return 1; 1364 } 1365 1366 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1367 1368 int 1369 isp9font(void) 1370 { 1371 uchar *cp, *p; 1372 int i, n; 1373 char pathname[1024]; 1374 1375 cp = buf; 1376 if (!getfontnum(cp, &cp)) /* height */ 1377 return 0; 1378 if (!getfontnum(cp, &cp)) /* ascent */ 1379 return 0; 1380 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1381 if (!getfontnum(cp, &cp)) /* min */ 1382 break; 1383 if (!getfontnum(cp, &cp)) /* max */ 1384 return 0; 1385 getfontnum(cp, &cp); /* optional offset */ 1386 while (WHITESPACE(*cp)) 1387 cp++; 1388 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1389 ; 1390 /* construct a path name, if needed */ 1391 n = 0; 1392 if (*p != '/' && slash) { 1393 n = slash-fname+1; 1394 if (n < sizeof(pathname)) 1395 memcpy(pathname, fname, n); 1396 else n = 0; 1397 } 1398 if (n+cp-p+4 < sizeof(pathname)) { 1399 memcpy(pathname+n, p, cp-p); 1400 n += cp-p; 1401 pathname[n] = 0; 1402 if (access(pathname, AEXIST) < 0) { 1403 strcpy(pathname+n, ".0"); 1404 if (access(pathname, AEXIST) < 0) 1405 return 0; 1406 } 1407 } 1408 } 1409 if (i) { 1410 print(mime ? "text/plain\n" : "font file\n"); 1411 return 1; 1412 } 1413 return 0; 1414 } 1415 1416 int 1417 getfontnum(uchar *cp, uchar **rp) 1418 { 1419 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1420 cp++; 1421 if (*cp < '0' || *cp > '9') 1422 return 0; 1423 strtoul((char *)cp, (char **)rp, 0); 1424 if (!WHITESPACE(**rp)) { 1425 *rp = cp; 1426 return 0; 1427 } 1428 return 1; 1429 } 1430 1431 int 1432 isrtf(void) 1433 { 1434 if(strstr((char *)buf, "\\rtf1")){ 1435 print(mime ? "application/rtf\n" : "rich text format\n"); 1436 return 1; 1437 } 1438 return 0; 1439 } 1440 1441 int 1442 ismsdos(void) 1443 { 1444 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1445 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1446 return 1; 1447 } 1448 return 0; 1449 } 1450 1451 int 1452 iself(void) 1453 { 1454 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1455 [1] "WE32100", 1456 [2] "SPARC", 1457 [3] "i386", 1458 [4] "M68000", 1459 [5] "M88000", 1460 [6] "i486", 1461 [7] "i860", 1462 [8] "R3000", 1463 [9] "S370", 1464 [10] "R4000", 1465 [15] "HP-PA", 1466 [18] "sparc v8+", 1467 [19] "i960", 1468 [20] "PPC-32", 1469 [21] "PPC-64", 1470 [40] "ARM", 1471 [41] "Alpha", 1472 [43] "sparc v9", 1473 [50] "IA-64", 1474 [62] "AMD64", 1475 [75] "VAX", 1476 }; 1477 static char *type[] = { 1478 [1] "relocatable object", 1479 [2] "executable", 1480 [3] "shared library", 1481 [4] "core dump", 1482 }; 1483 1484 if (memcmp(buf, "\x7fELF", 4) == 0){ 1485 if (!mime){ 1486 int isdifend = 0; 1487 int n = (buf[19] << 8) | buf[18]; 1488 char *p = "unknown"; 1489 char *t = "unknown"; 1490 1491 if (n > 0 && n < nelem(cpu) && cpu[n]) 1492 p = cpu[n]; 1493 else { 1494 /* try the other byte order */ 1495 isdifend = 1; 1496 n = (buf[18] << 8) | buf[19]; 1497 if (n > 0 && n < nelem(cpu) && cpu[n]) 1498 p = cpu[n]; 1499 } 1500 if(isdifend) 1501 n = (buf[16]<< 8) | buf[17]; 1502 else 1503 n = (buf[17]<< 8) | buf[16]; 1504 1505 if(n>0 && n < nelem(type) && type[n]) 1506 t = type[n]; 1507 print("%s ELF %s\n", p, t); 1508 } 1509 else 1510 print("application/x-elf-executable"); 1511 return 1; 1512 } 1513 1514 return 0; 1515 } 1516 1517 int 1518 isface(void) 1519 { 1520 int i, j, ldepth, l; 1521 char *p; 1522 1523 ldepth = -1; 1524 for(j = 0; j < 3; j++){ 1525 for(p = (char*)buf, i=0; i<3; i++){ 1526 if(p[0] != '0' || p[1] != 'x') 1527 return 0; 1528 if(buf[2+8] == ',') 1529 l = 2; 1530 else if(buf[2+4] == ',') 1531 l = 1; 1532 else 1533 return 0; 1534 if(ldepth == -1) 1535 ldepth = l; 1536 if(l != ldepth) 1537 return 0; 1538 strtoul(p, &p, 16); 1539 if(*p++ != ',') 1540 return 0; 1541 while(*p == ' ' || *p == '\t') 1542 p++; 1543 } 1544 if (*p++ != '\n') 1545 return 0; 1546 } 1547 1548 if(mime) 1549 print("application/x-face\n"); 1550 else 1551 print("face image depth %d\n", ldepth); 1552 return 1; 1553 } 1554 1555