1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 isp9font, /* plan 9 font */ 192 isp9bit, /* plan 9 image (as from /dev/window) */ 193 isrtf, /* rich text format */ 194 ismsdos, /* msdos exe (virus file attachement) */ 195 isface, /* ascii face file */ 196 197 /* last resorts */ 198 ismung, /* entropy compressed/encrypted */ 199 isenglish, /* char frequency English */ 200 0 201 }; 202 203 int mime; 204 205 char OCTET[] = "application/octet-stream\n"; 206 char PLAIN[] = "text/plain\n"; 207 208 void 209 main(int argc, char *argv[]) 210 { 211 int i, j, maxlen; 212 char *cp; 213 Rune r; 214 215 ARGBEGIN{ 216 case 'm': 217 mime = 1; 218 break; 219 default: 220 fprint(2, "usage: file [-m] [file...]\n"); 221 exits("usage"); 222 }ARGEND; 223 224 maxlen = 0; 225 if(mime == 0 || argc > 1){ 226 for(i = 0; i < argc; i++) { 227 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 228 ; 229 if(j > maxlen) 230 maxlen = j; 231 } 232 } 233 if (argc <= 0) { 234 if(!mime) 235 print ("stdin: "); 236 filetype(0); 237 } 238 else { 239 for(i = 0; i < argc; i++) 240 type(argv[i], maxlen); 241 } 242 exits(0); 243 } 244 245 void 246 type(char *file, int nlen) 247 { 248 Rune r; 249 int i; 250 char *p; 251 252 if(nlen > 0){ 253 slash = 0; 254 for (i = 0, p = file; *p; i++) { 255 if (*p == '/') /* find rightmost slash */ 256 slash = p; 257 p += chartorune(&r, p); /* count runes */ 258 } 259 print("%s:%*s",file, nlen-i+1, ""); 260 } 261 fname = file; 262 if ((fd = open(file, OREAD)) < 0) { 263 print("cannot open: %r\n"); 264 return; 265 } 266 filetype(fd); 267 close(fd); 268 } 269 270 /* 271 * Unicode 4.0 4-byte runes. 272 */ 273 typedef int Rune1; 274 275 enum { 276 UTFmax1 = 4, 277 }; 278 279 int 280 fullrune1(char *p, int n) 281 { 282 int c; 283 284 if(n >= 1) { 285 c = *(uchar*)p; 286 if(c < 0x80) 287 return 1; 288 if(n >= 2 && c < 0xE0) 289 return 1; 290 if(n >= 3 && c < 0xF0) 291 return 1; 292 if(n >= 4) 293 return 1; 294 } 295 return 0; 296 } 297 298 int 299 chartorune1(Rune1 *rune, char *str) 300 { 301 int c, c1, c2, c3, n; 302 Rune r; 303 304 c = *(uchar*)str; 305 if(c < 0xF0){ 306 r = 0; 307 n = chartorune(&r, str); 308 *rune = r; 309 return n; 310 } 311 c &= ~0xF0; 312 c1 = *(uchar*)(str+1) & ~0x80; 313 c2 = *(uchar*)(str+2) & ~0x80; 314 c3 = *(uchar*)(str+3) & ~0x80; 315 n = (c<<18) | (c1<<12) | (c2<<6) | c3; 316 if(n < 0x10000 || n > 0x10FFFF){ 317 *rune = Runeerror; 318 return 1; 319 } 320 *rune = n; 321 return 4; 322 } 323 324 void 325 filetype(int fd) 326 { 327 Rune1 r; 328 int i, f, n; 329 char *p, *eob; 330 331 free(mbuf); 332 mbuf = dirfstat(fd); 333 if(mbuf == nil){ 334 print("cannot stat: %r\n"); 335 return; 336 } 337 if(mbuf->mode & DMDIR) { 338 print(mime ? OCTET : "directory\n"); 339 return; 340 } 341 if(mbuf->type != 'M' && mbuf->type != '|') { 342 print(mime ? OCTET : "special file #%c/%s\n", 343 mbuf->type, mbuf->name); 344 return; 345 } 346 /* may be reading a pipe on standard input */ 347 nbuf = readn(fd, buf, sizeof(buf)-1); 348 if(nbuf < 0) { 349 print("cannot read: %r\n"); 350 return; 351 } 352 if(nbuf == 0) { 353 print(mime ? PLAIN : "empty file\n"); 354 return; 355 } 356 buf[nbuf] = 0; 357 358 /* 359 * build histogram table 360 */ 361 memset(cfreq, 0, sizeof(cfreq)); 362 for (i = 0; language[i].name; i++) 363 language[i].count = 0; 364 eob = (char *)buf+nbuf; 365 for(n = 0, p = (char *)buf; p < eob; n++) { 366 if (!fullrune1(p, eob-p) && eob-p < UTFmax1) 367 break; 368 p += chartorune1(&r, p); 369 if (r == 0) 370 f = Cnull; 371 else if (r <= 0x7f) { 372 if (!isprint(r) && !isspace(r)) 373 f = Ceascii; /* ASCII control char */ 374 else f = r; 375 } else if (r == 0x80) { 376 bump_utf_count(r); 377 f = Cutf; 378 } else if (r < 0xA0) 379 f = Cbinary; /* Invalid Runes */ 380 else if (r <= 0xff) 381 f = Clatin; /* Latin 1 */ 382 else { 383 bump_utf_count(r); 384 f = Cutf; /* UTF extension */ 385 } 386 cfreq[f]++; /* ASCII chars peg directly */ 387 } 388 /* 389 * gross classify 390 */ 391 if (cfreq[Cbinary]) 392 guess = Fbinary; 393 else if (cfreq[Cutf]) 394 guess = Futf; 395 else if (cfreq[Clatin]) 396 guess = Flatin; 397 else if (cfreq[Ceascii]) 398 guess = Feascii; 399 else if (cfreq[Cnull]) 400 guess = Fbinary; 401 else 402 guess = Fascii; 403 /* 404 * lookup dictionary words 405 */ 406 memset(wfreq, 0, sizeof(wfreq)); 407 if(guess == Fascii || guess == Flatin || guess == Futf) 408 wordfreq(); 409 /* 410 * call individual classify routines 411 */ 412 for(i=0; call[i]; i++) 413 if((*call[i])()) 414 return; 415 416 /* 417 * if all else fails, 418 * print out gross classification 419 */ 420 if (nbuf < 100 && !mime) 421 print(mime ? PLAIN : "short "); 422 if (guess == Fascii) 423 print(mime ? PLAIN : "Ascii\n"); 424 else if (guess == Feascii) 425 print(mime ? PLAIN : "extended ascii\n"); 426 else if (guess == Flatin) 427 print(mime ? PLAIN : "latin ascii\n"); 428 else if (guess == Futf && utf_count() < 4) 429 print_utf(); 430 else print(mime ? OCTET : "binary\n"); 431 } 432 433 void 434 bump_utf_count(Rune r) 435 { 436 int low, high, mid; 437 438 high = sizeof(language)/sizeof(language[0])-1; 439 for (low = 0; low < high;) { 440 mid = (low+high)/2; 441 if (r >= language[mid].low) { 442 if (r <= language[mid].high) { 443 language[mid].count++; 444 break; 445 } else low = mid+1; 446 } else high = mid; 447 } 448 } 449 450 int 451 utf_count(void) 452 { 453 int i, count; 454 455 count = 0; 456 for (i = 0; language[i].name; i++) 457 if (language[i].count > 0) 458 switch (language[i].mode) { 459 case Normal: 460 case First: 461 count++; 462 break; 463 default: 464 break; 465 } 466 return count; 467 } 468 469 int 470 chkascii(void) 471 { 472 int i; 473 474 for (i = 'a'; i < 'z'; i++) 475 if (cfreq[i]) 476 return 1; 477 for (i = 'A'; i < 'Z'; i++) 478 if (cfreq[i]) 479 return 1; 480 return 0; 481 } 482 483 int 484 find_first(char *name) 485 { 486 int i; 487 488 for (i = 0; language[i].name != 0; i++) 489 if (language[i].mode == First 490 && strcmp(language[i].name, name) == 0) 491 return i; 492 return -1; 493 } 494 495 void 496 print_utf(void) 497 { 498 int i, printed, j; 499 500 if(mime){ 501 print(PLAIN); 502 return; 503 } 504 if (chkascii()) { 505 printed = 1; 506 print("Ascii"); 507 } else 508 printed = 0; 509 for (i = 0; language[i].name; i++) 510 if (language[i].count) { 511 switch(language[i].mode) { 512 case Multi: 513 j = find_first(language[i].name); 514 if (j < 0) 515 break; 516 if (language[j].count > 0) 517 break; 518 /* Fall through */ 519 case Normal: 520 case First: 521 if (printed) 522 print(" & "); 523 else printed = 1; 524 print("%s", language[i].name); 525 break; 526 case Shared: 527 default: 528 break; 529 } 530 } 531 if(!printed) 532 print("UTF"); 533 print(" text\n"); 534 } 535 536 void 537 wordfreq(void) 538 { 539 int low, high, mid, r; 540 uchar *p, *p2, c; 541 542 p = buf; 543 for(;;) { 544 while (p < buf+nbuf && !isalpha(*p)) 545 p++; 546 if (p >= buf+nbuf) 547 return; 548 p2 = p; 549 while(p < buf+nbuf && isalpha(*p)) 550 p++; 551 c = *p; 552 *p = 0; 553 high = sizeof(dict)/sizeof(dict[0]); 554 for(low = 0;low < high;) { 555 mid = (low+high)/2; 556 r = strcmp(dict[mid].word, (char*)p2); 557 if(r == 0) { 558 wfreq[dict[mid].class]++; 559 break; 560 } 561 if(r < 0) 562 low = mid+1; 563 else 564 high = mid; 565 } 566 *p++ = c; 567 } 568 } 569 570 typedef struct Filemagic Filemagic; 571 struct Filemagic { 572 ulong x; 573 ulong mask; 574 char *desc; 575 char *mime; 576 }; 577 578 /* 579 * integers in this table must be as seen on a little-endian machine 580 * when read from a file. 581 */ 582 Filemagic long0tab[] = { 583 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 584 /* "pac1" */ 585 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 586 /* "pXc2 */ 587 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 588 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 589 0x43614c66, 0xFFFFFFFF, "FLAC audio file\n", OCTET, 590 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 591 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 592 070707, 0xFFFF, "cpio archive\n", OCTET, 593 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 594 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 595 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 596 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 597 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 598 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 599 /* 600 * venti & fossil magic numbers are stored big-endian on disk, 601 * thus the numbers appear reversed in this table. 602 */ 603 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 604 }; 605 606 int 607 filemagic(Filemagic *tab, int ntab, ulong x) 608 { 609 int i; 610 611 for(i=0; i<ntab; i++) 612 if((x&tab[i].mask) == tab[i].x){ 613 print(mime ? tab[i].mime : tab[i].desc); 614 return 1; 615 } 616 return 0; 617 } 618 619 int 620 long0(void) 621 { 622 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 623 } 624 625 typedef struct Fileoffmag Fileoffmag; 626 struct Fileoffmag { 627 ulong off; 628 Filemagic; 629 }; 630 631 /* 632 * integers in this table must be as seen on a little-endian machine 633 * when read from a file. 634 */ 635 Fileoffmag longofftab[] = { 636 /* 637 * venti & fossil magic numbers are stored big-endian on disk, 638 * thus the numbers appear reversed in this table. 639 */ 640 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 641 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 642 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 643 4, 0x31647542, 0xFFFFFFFF, "OS X finder properties\n", OCTET, 644 }; 645 646 int 647 fileoffmagic(Fileoffmag *tab, int ntab) 648 { 649 int i; 650 ulong x; 651 Fileoffmag *tp; 652 uchar buf[sizeof(long)]; 653 654 for(i=0; i<ntab; i++) { 655 tp = tab + i; 656 seek(fd, tp->off, 0); 657 if (readn(fd, buf, sizeof buf) != sizeof buf) 658 continue; 659 x = LENDIAN(buf); 660 if((x&tp->mask) == tp->x){ 661 print(mime? tp->mime: tp->desc); 662 return 1; 663 } 664 } 665 return 0; 666 } 667 668 int 669 longoff(void) 670 { 671 return fileoffmagic(longofftab, nelem(longofftab)); 672 } 673 674 int 675 isexec(void) 676 { 677 Fhdr f; 678 679 seek(fd, 0, 0); /* reposition to start of file */ 680 if(crackhdr(fd, &f)) { 681 print(mime ? OCTET : "%s\n", f.name); 682 return 1; 683 } 684 return 0; 685 } 686 687 688 /* from tar.c */ 689 enum { NAMSIZ = 100, TBLOCK = 512 }; 690 691 union hblock 692 { 693 char dummy[TBLOCK]; 694 struct header 695 { 696 char name[NAMSIZ]; 697 char mode[8]; 698 char uid[8]; 699 char gid[8]; 700 char size[12]; 701 char mtime[12]; 702 char chksum[8]; 703 char linkflag; 704 char linkname[NAMSIZ]; 705 /* rest are defined by POSIX's ustar format; see p1003.2b */ 706 char magic[6]; /* "ustar" */ 707 char version[2]; 708 char uname[32]; 709 char gname[32]; 710 char devmajor[8]; 711 char devminor[8]; 712 char prefix[155]; /* if non-null, path = prefix "/" name */ 713 } dbuf; 714 }; 715 716 int 717 checksum(union hblock *hp) 718 { 719 int i; 720 char *cp; 721 struct header *hdr = &hp->dbuf; 722 723 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 724 *cp = ' '; 725 i = 0; 726 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 727 i += *cp & 0xff; 728 return i; 729 } 730 731 int 732 istar(void) 733 { 734 int chksum; 735 char tblock[TBLOCK]; 736 union hblock *hp = (union hblock *)tblock; 737 struct header *hdr = &hp->dbuf; 738 739 seek(fd, 0, 0); /* reposition to start of file */ 740 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 741 return 0; 742 chksum = strtol(hdr->chksum, 0, 8); 743 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 744 if (strcmp(hdr->magic, "ustar") == 0) 745 print(mime? "application/x-ustar\n": 746 "posix tar archive\n"); 747 else 748 print(mime? "application/x-tar\n": "tar archive\n"); 749 return 1; 750 } 751 return 0; 752 } 753 754 /* 755 * initial words to classify file 756 */ 757 struct FILE_STRING 758 { 759 char *key; 760 char *filetype; 761 int length; 762 char *mime; 763 } file_string[] = 764 { 765 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 766 "!<arch>\n", "archive", 8, "application/octet-stream", 767 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 768 "#!/bin/rc", "rc executable file", 9, "text/plain", 769 "#!/bin/sh", "sh executable file", 9, "text/plain", 770 "%!", "postscript", 2, "application/postscript", 771 "\004%!", "postscript", 3, "application/postscript", 772 "x T post", "troff output for post", 8, "application/troff", 773 "x T Latin1", "troff output for Latin1", 10, "application/troff", 774 "x T utf", "troff output for UTF", 7, "application/troff", 775 "x T 202", "troff output for 202", 7, "application/troff", 776 "x T aps", "troff output for aps", 7, "application/troff", 777 "GIF", "GIF image", 3, "image/gif", 778 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 779 "%PDF", "PDF", 4, "application/pdf", 780 "<html>\n", "HTML file", 7, "text/html", 781 "<HTML>\n", "HTML file", 7, "text/html", 782 "\111\111\052\000", "tiff", 4, "image/tiff", 783 "\115\115\000\052", "tiff", 4, "image/tiff", 784 "\377\330\377\340", "jpeg", 4, "image/jpeg", 785 "\377\330\377\341", "jpeg", 4, "image/jpeg", 786 "\377\330\377\333", "jpeg", 4, "image/jpeg", 787 "BM", "bmp", 2, "image/bmp", 788 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 789 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 790 "\033E\033", "HP PCL printer data", 3, OCTET, 791 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 792 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 793 "\211PNG", "PNG image", 4, "image/png", 794 "P3\n", "ppm", 3, "image/ppm", 795 "P6\n", "ppm", 3, "image/ppm", 796 "/* XPM */\n", "xbm", 10, "image/xbm", 797 ".HTML ", "troff -ms input", 6, "text/troff", 798 ".LP", "troff -ms input", 3, "text/troff", 799 ".ND", "troff -ms input", 3, "text/troff", 800 ".PP", "troff -ms input", 3, "text/troff", 801 ".TL", "troff -ms input", 3, "text/troff", 802 ".TR", "troff -ms input", 3, "text/troff", 803 ".TH", "manual page", 3, "text/troff", 804 ".\\\"", "troff input", 3, "text/troff", 805 ".de", "troff input", 3, "text/troff", 806 ".if", "troff input", 3, "text/troff", 807 ".nr", "troff input", 3, "text/troff", 808 ".tr", "troff input", 3, "text/troff", 809 "vac:", "venti score", 4, "text/plain", 810 "-----BEGIN CERTIFICATE-----\n", 811 "pem certificate", -1, "text/plain", 812 "-----BEGIN TRUSTED CERTIFICATE-----\n", 813 "pem trusted certificate", -1, "text/plain", 814 "-----BEGIN X509 CERTIFICATE-----\n", 815 "pem x.509 certificate", -1, "text/plain", 816 "subject=/C=", "pem certificate with header", -1, "text/plain", 817 "process snapshot ", "process snapshot", -1, "application/snapfs", 818 0,0,0,0 819 }; 820 821 int 822 istring(void) 823 { 824 int i, l; 825 struct FILE_STRING *p; 826 827 for(p = file_string; p->key; p++) { 828 l = p->length; 829 if(l == -1) 830 l = strlen(p->key); 831 if(nbuf >= l && memcmp(buf, p->key, l) == 0) { 832 if(mime) 833 print("%s\n", p->mime); 834 else 835 print("%s\n", p->filetype); 836 return 1; 837 } 838 } 839 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 840 for(i = 5; i < nbuf; i++) 841 if(buf[i] == '\n') 842 break; 843 if(mime) 844 print(OCTET); 845 else 846 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 847 return 1; 848 } 849 return 0; 850 } 851 852 struct offstr 853 { 854 ulong off; 855 struct FILE_STRING; 856 } offstrs[] = { 857 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 858 0, 0, 0, 0, 0 859 }; 860 861 int 862 isoffstr(void) 863 { 864 int n; 865 char buf[256]; 866 struct offstr *p; 867 868 for(p = offstrs; p->key; p++) { 869 seek(fd, p->off, 0); 870 n = p->length; 871 if (n > sizeof buf) 872 n = sizeof buf; 873 if (readn(fd, buf, n) != n) 874 continue; 875 if(memcmp(buf, p->key, n) == 0) { 876 if(mime) 877 print("%s\n", p->mime); 878 else 879 print("%s\n", p->filetype); 880 return 1; 881 } 882 } 883 return 0; 884 } 885 886 int 887 iff(void) 888 { 889 if (strncmp((char*)buf, "FORM", 4) == 0 && 890 strncmp((char*)buf+8, "AIFF", 4) == 0) { 891 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 892 return 1; 893 } 894 if (strncmp((char*)buf, "RIFF", 4) == 0) { 895 if (strncmp((char*)buf+8, "WAVE", 4) == 0) 896 print("%s\n", mime? "audio/wave": "wave audio"); 897 else if (strncmp((char*)buf+8, "AVI ", 4) == 0) 898 print("%s\n", mime? "video/avi": "avi video"); 899 else 900 print("%s\n", mime? "application/octet-stream": 901 "riff file"); 902 return 1; 903 } 904 return 0; 905 } 906 907 char* html_string[] = 908 { 909 "title", 910 "body", 911 "head", 912 "strong", 913 "h1", 914 "h2", 915 "h3", 916 "h4", 917 "h5", 918 "h6", 919 "ul", 920 "li", 921 "dl", 922 "br", 923 "em", 924 0, 925 }; 926 927 int 928 ishtml(void) 929 { 930 uchar *p, *q; 931 int i, count; 932 933 /* compare strings between '<' and '>' to html table */ 934 count = 0; 935 p = buf; 936 for(;;) { 937 while (p < buf+nbuf && *p != '<') 938 p++; 939 p++; 940 if (p >= buf+nbuf) 941 break; 942 if(*p == '/') 943 p++; 944 q = p; 945 while(p < buf+nbuf && *p != '>') 946 p++; 947 if (p >= buf+nbuf) 948 break; 949 for(i = 0; html_string[i]; i++) { 950 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 951 if(count++ > 4) { 952 print(mime ? "text/html\n" : "HTML file\n"); 953 return 1; 954 } 955 break; 956 } 957 } 958 p++; 959 } 960 return 0; 961 } 962 963 char* rfc822_string[] = 964 { 965 "from:", 966 "date:", 967 "to:", 968 "subject:", 969 "received:", 970 "reply to:", 971 "sender:", 972 0, 973 }; 974 975 int 976 isrfc822(void) 977 { 978 979 char *p, *q, *r; 980 int i, count; 981 982 count = 0; 983 p = (char*)buf; 984 for(;;) { 985 q = strchr(p, '\n'); 986 if(q == nil) 987 break; 988 *q = 0; 989 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 990 count++; 991 *q = '\n'; 992 p = q+1; 993 continue; 994 } 995 *q = '\n'; 996 if(*p != '\t' && *p != ' '){ 997 r = strchr(p, ':'); 998 if(r == 0 || r > q) 999 break; 1000 for(i = 0; rfc822_string[i]; i++) { 1001 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 1002 count++; 1003 break; 1004 } 1005 } 1006 } 1007 p = q+1; 1008 } 1009 if(count >= 3){ 1010 print(mime ? "message/rfc822\n" : "email file\n"); 1011 return 1; 1012 } 1013 return 0; 1014 } 1015 1016 int 1017 ismbox(void) 1018 { 1019 char *p, *q; 1020 1021 p = (char*)buf; 1022 q = strchr(p, '\n'); 1023 if(q == nil) 1024 return 0; 1025 *q = 0; 1026 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 1027 print(mime ? "text/plain\n" : "mail box\n"); 1028 return 1; 1029 } 1030 *q = '\n'; 1031 return 0; 1032 } 1033 1034 int 1035 iscint(void) 1036 { 1037 int type; 1038 char *name; 1039 Biobuf b; 1040 1041 if(Binit(&b, fd, OREAD) == Beof) 1042 return 0; 1043 seek(fd, 0, 0); 1044 type = objtype(&b, &name); 1045 if(type < 0) 1046 return 0; 1047 if(mime) 1048 print(OCTET); 1049 else 1050 print("%s intermediate\n", name); 1051 return 1; 1052 } 1053 1054 int 1055 isc(void) 1056 { 1057 int n; 1058 1059 n = wfreq[I1]; 1060 /* 1061 * includes 1062 */ 1063 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1064 goto yes; 1065 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1066 goto yes; 1067 /* 1068 * declarations 1069 */ 1070 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1071 goto yes; 1072 /* 1073 * assignments 1074 */ 1075 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1076 goto yes; 1077 return 0; 1078 1079 yes: 1080 if(mime){ 1081 print(PLAIN); 1082 return 1; 1083 } 1084 if(wfreq[Alword] > 0) 1085 print("alef program\n"); 1086 else 1087 print("c program\n"); 1088 return 1; 1089 } 1090 1091 int 1092 islimbo(void) 1093 { 1094 1095 /* 1096 * includes 1097 */ 1098 if(wfreq[Lword] < 4) 1099 return 0; 1100 print(mime ? PLAIN : "limbo program\n"); 1101 return 1; 1102 } 1103 1104 int 1105 isas(void) 1106 { 1107 1108 /* 1109 * includes 1110 */ 1111 if(wfreq[Aword] < 2) 1112 return 0; 1113 print(mime ? PLAIN : "as program\n"); 1114 return 1; 1115 } 1116 1117 /* 1118 * low entropy means encrypted 1119 */ 1120 int 1121 ismung(void) 1122 { 1123 int i, bucket[8]; 1124 float cs; 1125 1126 if(nbuf < 64) 1127 return 0; 1128 memset(bucket, 0, sizeof(bucket)); 1129 for(i=nbuf-64; i<nbuf; i++) 1130 bucket[(buf[i]>>5)&07] += 1; 1131 1132 cs = 0.; 1133 for(i=0; i<8; i++) 1134 cs += (bucket[i]-8)*(bucket[i]-8); 1135 cs /= 8.; 1136 if(cs <= 24.322) { 1137 if(buf[0]==0x1f && buf[1]==0x9d) 1138 print(mime ? OCTET : "compressed\n"); 1139 else 1140 if(buf[0]==0x1f && buf[1]==0x8b) 1141 print(mime ? OCTET : "gzip compressed\n"); 1142 else 1143 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1144 print(mime ? OCTET : "bzip2 compressed\n"); 1145 else 1146 print(mime ? OCTET : "encrypted\n"); 1147 return 1; 1148 } 1149 return 0; 1150 } 1151 1152 /* 1153 * english by punctuation and frequencies 1154 */ 1155 int 1156 isenglish(void) 1157 { 1158 int vow, comm, rare, badpun, punct; 1159 char *p; 1160 1161 if(guess != Fascii && guess != Feascii) 1162 return 0; 1163 badpun = 0; 1164 punct = 0; 1165 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1166 switch(*p) { 1167 case '.': 1168 case ',': 1169 case ')': 1170 case '%': 1171 case ';': 1172 case ':': 1173 case '?': 1174 punct++; 1175 if(p[1] != ' ' && p[1] != '\n') 1176 badpun++; 1177 } 1178 if(badpun*5 > punct) 1179 return 0; 1180 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1181 return 0; 1182 if(2*cfreq[';'] > cfreq['e']) 1183 return 0; 1184 1185 vow = 0; 1186 for(p="AEIOU"; *p; p++) { 1187 vow += cfreq[*p]; 1188 vow += cfreq[tolower(*p)]; 1189 } 1190 comm = 0; 1191 for(p="ETAION"; *p; p++) { 1192 comm += cfreq[*p]; 1193 comm += cfreq[tolower(*p)]; 1194 } 1195 rare = 0; 1196 for(p="VJKQXZ"; *p; p++) { 1197 rare += cfreq[*p]; 1198 rare += cfreq[tolower(*p)]; 1199 } 1200 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1201 print(mime ? PLAIN : "English text\n"); 1202 return 1; 1203 } 1204 return 0; 1205 } 1206 1207 /* 1208 * pick up a number with 1209 * syntax _*[0-9]+_ 1210 */ 1211 #define P9BITLEN 12 1212 int 1213 p9bitnum(uchar *bp) 1214 { 1215 int n, c, len; 1216 1217 len = P9BITLEN; 1218 while(*bp == ' ') { 1219 bp++; 1220 len--; 1221 if(len <= 0) 1222 return -1; 1223 } 1224 n = 0; 1225 while(len > 1) { 1226 c = *bp++; 1227 if(!isdigit(c)) 1228 return -1; 1229 n = n*10 + c-'0'; 1230 len--; 1231 } 1232 if(*bp != ' ') 1233 return -1; 1234 return n; 1235 } 1236 1237 int 1238 depthof(char *s, int *newp) 1239 { 1240 char *es; 1241 int d; 1242 1243 *newp = 0; 1244 es = s+12; 1245 while(s<es && *s==' ') 1246 s++; 1247 if(s == es) 1248 return -1; 1249 if('0'<=*s && *s<='9') 1250 return 1<<strtol(s, 0, 0); 1251 1252 *newp = 1; 1253 d = 0; 1254 while(s<es && *s!=' '){ 1255 s++; /* skip letter */ 1256 d += strtoul(s, &s, 10); 1257 } 1258 1259 if(d % 8 == 0 || 8 % d == 0) 1260 return d; 1261 else 1262 return -1; 1263 } 1264 1265 int 1266 isp9bit(void) 1267 { 1268 int dep, lox, loy, hix, hiy, px, new, cmpr; 1269 ulong t; 1270 long len; 1271 char *newlabel; 1272 uchar *cp; 1273 1274 cp = buf; 1275 cmpr = 0; 1276 newlabel = "old "; 1277 1278 if(memcmp(cp, "compressed\n", 11) == 0) { 1279 cmpr = 1; 1280 cp = buf + 11; 1281 } 1282 1283 dep = depthof((char*)cp + 0*P9BITLEN, &new); 1284 if(new) 1285 newlabel = ""; 1286 lox = p9bitnum(cp + 1*P9BITLEN); 1287 loy = p9bitnum(cp + 2*P9BITLEN); 1288 hix = p9bitnum(cp + 3*P9BITLEN); 1289 hiy = p9bitnum(cp + 4*P9BITLEN); 1290 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1291 return 0; 1292 1293 if(dep < 8){ 1294 px = 8/dep; /* pixels per byte */ 1295 /* set l to number of bytes of data per scan line */ 1296 if(lox >= 0) 1297 len = (hix+px-1)/px - lox/px; 1298 else{ /* make positive before divide */ 1299 t = (-lox)+px-1; 1300 t = (t/px)*px; 1301 len = (t+hix+px-1)/px; 1302 } 1303 }else 1304 len = (hix-lox)*dep/8; 1305 len *= hiy - loy; /* col length */ 1306 len += 5 * P9BITLEN; /* size of initial ascii */ 1307 1308 /* 1309 * for compressed images, don't look any further. otherwise: 1310 * for image file, length is non-zero and must match calculation above. 1311 * for /dev/window and /dev/screen the length is always zero. 1312 * for subfont, the subfont header should follow immediately. 1313 */ 1314 if (cmpr) { 1315 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n", 1316 newlabel, dep); 1317 return 1; 1318 } 1319 /* 1320 * mbuf->length == 0 probably indicates reading a pipe. 1321 * Ghostscript sometimes produces a little extra on the end. 1322 */ 1323 if (len != 0 && (mbuf->length == 0 || mbuf->length == len || 1324 mbuf->length > len && mbuf->length < len+P9BITLEN)) { 1325 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1326 return 1; 1327 } 1328 if (p9subfont(buf+len)) { 1329 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep); 1330 return 1; 1331 } 1332 return 0; 1333 } 1334 1335 int 1336 p9subfont(uchar *p) 1337 { 1338 int n, h, a; 1339 1340 /* if image too big, assume it's a subfont */ 1341 if (p+3*P9BITLEN > buf+sizeof(buf)) 1342 return 1; 1343 1344 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1345 if (n < 0) 1346 return 0; 1347 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1348 if (h < 0) 1349 return 0; 1350 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1351 if (a < 0) 1352 return 0; 1353 return 1; 1354 } 1355 1356 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1357 1358 int 1359 isp9font(void) 1360 { 1361 uchar *cp, *p; 1362 int i, n; 1363 char pathname[1024]; 1364 1365 cp = buf; 1366 if (!getfontnum(cp, &cp)) /* height */ 1367 return 0; 1368 if (!getfontnum(cp, &cp)) /* ascent */ 1369 return 0; 1370 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1371 if (!getfontnum(cp, &cp)) /* min */ 1372 break; 1373 if (!getfontnum(cp, &cp)) /* max */ 1374 return 0; 1375 getfontnum(cp, &cp); /* optional offset */ 1376 while (WHITESPACE(*cp)) 1377 cp++; 1378 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1379 ; 1380 /* construct a path name, if needed */ 1381 n = 0; 1382 if (*p != '/' && slash) { 1383 n = slash-fname+1; 1384 if (n < sizeof(pathname)) 1385 memcpy(pathname, fname, n); 1386 else n = 0; 1387 } 1388 if (n+cp-p+4 < sizeof(pathname)) { 1389 memcpy(pathname+n, p, cp-p); 1390 n += cp-p; 1391 pathname[n] = 0; 1392 if (access(pathname, AEXIST) < 0) { 1393 strcpy(pathname+n, ".0"); 1394 if (access(pathname, AEXIST) < 0) 1395 return 0; 1396 } 1397 } 1398 } 1399 if (i) { 1400 print(mime ? "text/plain\n" : "font file\n"); 1401 return 1; 1402 } 1403 return 0; 1404 } 1405 1406 int 1407 getfontnum(uchar *cp, uchar **rp) 1408 { 1409 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1410 cp++; 1411 if (*cp < '0' || *cp > '9') 1412 return 0; 1413 strtoul((char *)cp, (char **)rp, 0); 1414 if (!WHITESPACE(**rp)) { 1415 *rp = cp; 1416 return 0; 1417 } 1418 return 1; 1419 } 1420 1421 int 1422 isrtf(void) 1423 { 1424 if(strstr((char *)buf, "\\rtf1")){ 1425 print(mime ? "application/rtf\n" : "rich text format\n"); 1426 return 1; 1427 } 1428 return 0; 1429 } 1430 1431 int 1432 ismsdos(void) 1433 { 1434 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1435 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1436 return 1; 1437 } 1438 return 0; 1439 } 1440 1441 int 1442 iself(void) 1443 { 1444 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1445 [1] "WE32100", 1446 [2] "SPARC", 1447 [3] "i386", 1448 [4] "M68000", 1449 [5] "M88000", 1450 [6] "i486", 1451 [7] "i860", 1452 [8] "R3000", 1453 [9] "S370", 1454 [10] "R4000", 1455 [15] "HP-PA", 1456 [18] "sparc v8+", 1457 [19] "i960", 1458 [20] "PPC-32", 1459 [21] "PPC-64", 1460 [40] "ARM", 1461 [41] "Alpha", 1462 [43] "sparc v9", 1463 [50] "IA-64", 1464 [62] "AMD64", 1465 [75] "VAX", 1466 }; 1467 static char *type[] = { 1468 [1] "relocatable object", 1469 [2] "executable", 1470 [3] "shared library", 1471 [4] "core dump", 1472 }; 1473 1474 if (memcmp(buf, "\x7fELF", 4) == 0){ 1475 if (!mime){ 1476 int isdifend = 0; 1477 int n = (buf[19] << 8) | buf[18]; 1478 char *p = "unknown"; 1479 char *t = "unknown"; 1480 1481 if (n > 0 && n < nelem(cpu) && cpu[n]) 1482 p = cpu[n]; 1483 else { 1484 /* try the other byte order */ 1485 isdifend = 1; 1486 n = (buf[18] << 8) | buf[19]; 1487 if (n > 0 && n < nelem(cpu) && cpu[n]) 1488 p = cpu[n]; 1489 } 1490 if(isdifend) 1491 n = (buf[16]<< 8) | buf[17]; 1492 else 1493 n = (buf[17]<< 8) | buf[16]; 1494 1495 if(n>0 && n < nelem(type) && type[n]) 1496 t = type[n]; 1497 print("%s ELF %s\n", p, t); 1498 } 1499 else 1500 print("application/x-elf-executable"); 1501 return 1; 1502 } 1503 1504 return 0; 1505 } 1506 1507 int 1508 isface(void) 1509 { 1510 int i, j, ldepth, l; 1511 char *p; 1512 1513 ldepth = -1; 1514 for(j = 0; j < 3; j++){ 1515 for(p = (char*)buf, i=0; i<3; i++){ 1516 if(p[0] != '0' || p[1] != 'x') 1517 return 0; 1518 if(buf[2+8] == ',') 1519 l = 2; 1520 else if(buf[2+4] == ',') 1521 l = 1; 1522 else 1523 return 0; 1524 if(ldepth == -1) 1525 ldepth = l; 1526 if(l != ldepth) 1527 return 0; 1528 strtoul(p, &p, 16); 1529 if(*p++ != ',') 1530 return 0; 1531 while(*p == ' ' || *p == '\t') 1532 p++; 1533 } 1534 if (*p++ != '\n') 1535 return 0; 1536 } 1537 1538 if(mime) 1539 print("application/x-face\n"); 1540 else 1541 print("face image depth %d\n", ldepth); 1542 return 1; 1543 } 1544 1545