1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 isp9font, /* plan 9 font */ 192 isp9bit, /* plan 9 image (as from /dev/window) */ 193 isrtf, /* rich text format */ 194 ismsdos, /* msdos exe (virus file attachement) */ 195 isface, /* ascii face file */ 196 197 /* last resorts */ 198 ismung, /* entropy compressed/encrypted */ 199 isenglish, /* char frequency English */ 200 0 201 }; 202 203 int mime; 204 205 char OCTET[] = "application/octet-stream\n"; 206 char PLAIN[] = "text/plain\n"; 207 208 void 209 main(int argc, char *argv[]) 210 { 211 int i, j, maxlen; 212 char *cp; 213 Rune r; 214 215 ARGBEGIN{ 216 case 'm': 217 mime = 1; 218 break; 219 default: 220 fprint(2, "usage: file [-m] [file...]\n"); 221 exits("usage"); 222 }ARGEND; 223 224 maxlen = 0; 225 if(mime == 0 || argc > 1){ 226 for(i = 0; i < argc; i++) { 227 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 228 ; 229 if(j > maxlen) 230 maxlen = j; 231 } 232 } 233 if (argc <= 0) { 234 if(!mime) 235 print ("stdin: "); 236 filetype(0); 237 } 238 else { 239 for(i = 0; i < argc; i++) 240 type(argv[i], maxlen); 241 } 242 exits(0); 243 } 244 245 void 246 type(char *file, int nlen) 247 { 248 Rune r; 249 int i; 250 char *p; 251 252 if(nlen > 0){ 253 slash = 0; 254 for (i = 0, p = file; *p; i++) { 255 if (*p == '/') /* find rightmost slash */ 256 slash = p; 257 p += chartorune(&r, p); /* count runes */ 258 } 259 print("%s:%*s",file, nlen-i+1, ""); 260 } 261 fname = file; 262 if ((fd = open(file, OREAD)) < 0) { 263 print("cannot open: %r\n"); 264 return; 265 } 266 filetype(fd); 267 close(fd); 268 } 269 270 /* 271 * Unicode 4.0 4-byte runes. 272 */ 273 typedef int Rune1; 274 275 enum { 276 UTFmax1 = 4, 277 }; 278 279 int 280 fullrune1(char *p, int n) 281 { 282 int c; 283 284 if(n >= 1) { 285 c = *(uchar*)p; 286 if(c < 0x80) 287 return 1; 288 if(n >= 2 && c < 0xE0) 289 return 1; 290 if(n >= 3 && c < 0xF0) 291 return 1; 292 if(n >= 4) 293 return 1; 294 } 295 return 0; 296 } 297 298 int 299 chartorune1(Rune1 *rune, char *str) 300 { 301 int c, c1, c2, c3, n; 302 Rune r; 303 304 c = *(uchar*)str; 305 if(c < 0xF0){ 306 r = 0; 307 n = chartorune(&r, str); 308 *rune = r; 309 return n; 310 } 311 c &= ~0xF0; 312 c1 = *(uchar*)(str+1) & ~0x80; 313 c2 = *(uchar*)(str+2) & ~0x80; 314 c3 = *(uchar*)(str+3) & ~0x80; 315 n = (c<<18) | (c1<<12) | (c2<<6) | c3; 316 if(n < 0x10000 || n > 0x10FFFF){ 317 *rune = Runeerror; 318 return 1; 319 } 320 *rune = n; 321 return 4; 322 } 323 324 void 325 filetype(int fd) 326 { 327 Rune1 r; 328 int i, f, n; 329 char *p, *eob; 330 331 free(mbuf); 332 mbuf = dirfstat(fd); 333 if(mbuf == nil){ 334 print("cannot stat: %r\n"); 335 return; 336 } 337 if(mbuf->mode & DMDIR) { 338 print(mime ? "text/directory\n" : "directory\n"); 339 return; 340 } 341 if(mbuf->type != 'M' && mbuf->type != '|') { 342 print(mime ? OCTET : "special file #%c/%s\n", 343 mbuf->type, mbuf->name); 344 return; 345 } 346 /* may be reading a pipe on standard input */ 347 nbuf = readn(fd, buf, sizeof(buf)-1); 348 if(nbuf < 0) { 349 print("cannot read: %r\n"); 350 return; 351 } 352 if(nbuf == 0) { 353 print(mime ? PLAIN : "empty file\n"); 354 return; 355 } 356 buf[nbuf] = 0; 357 358 /* 359 * build histogram table 360 */ 361 memset(cfreq, 0, sizeof(cfreq)); 362 for (i = 0; language[i].name; i++) 363 language[i].count = 0; 364 eob = (char *)buf+nbuf; 365 for(n = 0, p = (char *)buf; p < eob; n++) { 366 if (!fullrune1(p, eob-p) && eob-p < UTFmax1) 367 break; 368 p += chartorune1(&r, p); 369 if (r == 0) 370 f = Cnull; 371 else if (r <= 0x7f) { 372 if (!isprint(r) && !isspace(r)) 373 f = Ceascii; /* ASCII control char */ 374 else f = r; 375 } else if (r == 0x80) { 376 bump_utf_count(r); 377 f = Cutf; 378 } else if (r < 0xA0) 379 f = Cbinary; /* Invalid Runes */ 380 else if (r <= 0xff) 381 f = Clatin; /* Latin 1 */ 382 else { 383 bump_utf_count(r); 384 f = Cutf; /* UTF extension */ 385 } 386 cfreq[f]++; /* ASCII chars peg directly */ 387 } 388 /* 389 * gross classify 390 */ 391 if (cfreq[Cbinary]) 392 guess = Fbinary; 393 else if (cfreq[Cutf]) 394 guess = Futf; 395 else if (cfreq[Clatin]) 396 guess = Flatin; 397 else if (cfreq[Ceascii]) 398 guess = Feascii; 399 else if (cfreq[Cnull]) 400 guess = Fbinary; 401 else 402 guess = Fascii; 403 /* 404 * lookup dictionary words 405 */ 406 memset(wfreq, 0, sizeof(wfreq)); 407 if(guess == Fascii || guess == Flatin || guess == Futf) 408 wordfreq(); 409 /* 410 * call individual classify routines 411 */ 412 for(i=0; call[i]; i++) 413 if((*call[i])()) 414 return; 415 416 /* 417 * if all else fails, 418 * print out gross classification 419 */ 420 if (nbuf < 100 && !mime) 421 print(mime ? PLAIN : "short "); 422 if (guess == Fascii) 423 print(mime ? PLAIN : "Ascii\n"); 424 else if (guess == Feascii) 425 print(mime ? PLAIN : "extended ascii\n"); 426 else if (guess == Flatin) 427 print(mime ? PLAIN : "latin ascii\n"); 428 else if (guess == Futf && utf_count() < 4) 429 print_utf(); 430 else print(mime ? OCTET : "binary\n"); 431 } 432 433 void 434 bump_utf_count(Rune r) 435 { 436 int low, high, mid; 437 438 high = sizeof(language)/sizeof(language[0])-1; 439 for (low = 0; low < high;) { 440 mid = (low+high)/2; 441 if (r >= language[mid].low) { 442 if (r <= language[mid].high) { 443 language[mid].count++; 444 break; 445 } else low = mid+1; 446 } else high = mid; 447 } 448 } 449 450 int 451 utf_count(void) 452 { 453 int i, count; 454 455 count = 0; 456 for (i = 0; language[i].name; i++) 457 if (language[i].count > 0) 458 switch (language[i].mode) { 459 case Normal: 460 case First: 461 count++; 462 break; 463 default: 464 break; 465 } 466 return count; 467 } 468 469 int 470 chkascii(void) 471 { 472 int i; 473 474 for (i = 'a'; i < 'z'; i++) 475 if (cfreq[i]) 476 return 1; 477 for (i = 'A'; i < 'Z'; i++) 478 if (cfreq[i]) 479 return 1; 480 return 0; 481 } 482 483 int 484 find_first(char *name) 485 { 486 int i; 487 488 for (i = 0; language[i].name != 0; i++) 489 if (language[i].mode == First 490 && strcmp(language[i].name, name) == 0) 491 return i; 492 return -1; 493 } 494 495 void 496 print_utf(void) 497 { 498 int i, printed, j; 499 500 if(mime){ 501 print(PLAIN); 502 return; 503 } 504 if (chkascii()) { 505 printed = 1; 506 print("Ascii"); 507 } else 508 printed = 0; 509 for (i = 0; language[i].name; i++) 510 if (language[i].count) { 511 switch(language[i].mode) { 512 case Multi: 513 j = find_first(language[i].name); 514 if (j < 0) 515 break; 516 if (language[j].count > 0) 517 break; 518 /* Fall through */ 519 case Normal: 520 case First: 521 if (printed) 522 print(" & "); 523 else printed = 1; 524 print("%s", language[i].name); 525 break; 526 case Shared: 527 default: 528 break; 529 } 530 } 531 if(!printed) 532 print("UTF"); 533 print(" text\n"); 534 } 535 536 void 537 wordfreq(void) 538 { 539 int low, high, mid, r; 540 uchar *p, *p2, c; 541 542 p = buf; 543 for(;;) { 544 while (p < buf+nbuf && !isalpha(*p)) 545 p++; 546 if (p >= buf+nbuf) 547 return; 548 p2 = p; 549 while(p < buf+nbuf && isalpha(*p)) 550 p++; 551 c = *p; 552 *p = 0; 553 high = sizeof(dict)/sizeof(dict[0]); 554 for(low = 0;low < high;) { 555 mid = (low+high)/2; 556 r = strcmp(dict[mid].word, (char*)p2); 557 if(r == 0) { 558 wfreq[dict[mid].class]++; 559 break; 560 } 561 if(r < 0) 562 low = mid+1; 563 else 564 high = mid; 565 } 566 *p++ = c; 567 } 568 } 569 570 typedef struct Filemagic Filemagic; 571 struct Filemagic { 572 ulong x; 573 ulong mask; 574 char *desc; 575 char *mime; 576 }; 577 578 /* 579 * integers in this table must be as seen on a little-endian machine 580 * when read from a file. 581 */ 582 Filemagic long0tab[] = { 583 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 584 /* "pac1" */ 585 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 586 /* "pXc2 */ 587 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 588 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 589 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 590 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 591 070707, 0xFFFF, "cpio archive\n", OCTET, 592 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 593 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 594 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 595 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 596 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 597 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 598 /* 599 * venti & fossil magic numbers are stored big-endian on disk, 600 * thus the numbers appear reversed in this table. 601 */ 602 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 603 }; 604 605 int 606 filemagic(Filemagic *tab, int ntab, ulong x) 607 { 608 int i; 609 610 for(i=0; i<ntab; i++) 611 if((x&tab[i].mask) == tab[i].x){ 612 print(mime ? tab[i].mime : tab[i].desc); 613 return 1; 614 } 615 return 0; 616 } 617 618 int 619 long0(void) 620 { 621 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 622 } 623 624 typedef struct Fileoffmag Fileoffmag; 625 struct Fileoffmag { 626 ulong off; 627 Filemagic; 628 }; 629 630 /* 631 * integers in this table must be as seen on a little-endian machine 632 * when read from a file. 633 */ 634 Fileoffmag longofftab[] = { 635 /* 636 * venti & fossil magic numbers are stored big-endian on disk, 637 * thus the numbers appear reversed in this table. 638 */ 639 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 640 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 641 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 642 }; 643 644 int 645 fileoffmagic(Fileoffmag *tab, int ntab) 646 { 647 int i; 648 ulong x; 649 Fileoffmag *tp; 650 uchar buf[sizeof(long)]; 651 652 for(i=0; i<ntab; i++) { 653 tp = tab + i; 654 seek(fd, tp->off, 0); 655 if (readn(fd, buf, sizeof buf) != sizeof buf) 656 continue; 657 x = LENDIAN(buf); 658 if((x&tp->mask) == tp->x){ 659 print(mime? tp->mime: tp->desc); 660 return 1; 661 } 662 } 663 return 0; 664 } 665 666 int 667 longoff(void) 668 { 669 return fileoffmagic(longofftab, nelem(longofftab)); 670 } 671 672 int 673 isexec(void) 674 { 675 Fhdr f; 676 677 seek(fd, 0, 0); /* reposition to start of file */ 678 if(crackhdr(fd, &f)) { 679 print(mime ? OCTET : "%s\n", f.name); 680 return 1; 681 } 682 return 0; 683 } 684 685 686 /* from tar.c */ 687 enum { NAMSIZ = 100, TBLOCK = 512 }; 688 689 union hblock 690 { 691 char dummy[TBLOCK]; 692 struct header 693 { 694 char name[NAMSIZ]; 695 char mode[8]; 696 char uid[8]; 697 char gid[8]; 698 char size[12]; 699 char mtime[12]; 700 char chksum[8]; 701 char linkflag; 702 char linkname[NAMSIZ]; 703 /* rest are defined by POSIX's ustar format; see p1003.2b */ 704 char magic[6]; /* "ustar" */ 705 char version[2]; 706 char uname[32]; 707 char gname[32]; 708 char devmajor[8]; 709 char devminor[8]; 710 char prefix[155]; /* if non-null, path = prefix "/" name */ 711 } dbuf; 712 }; 713 714 int 715 checksum(union hblock *hp) 716 { 717 int i; 718 char *cp; 719 struct header *hdr = &hp->dbuf; 720 721 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 722 *cp = ' '; 723 i = 0; 724 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 725 i += *cp & 0xff; 726 return i; 727 } 728 729 int 730 istar(void) 731 { 732 int chksum; 733 char tblock[TBLOCK]; 734 union hblock *hp = (union hblock *)tblock; 735 struct header *hdr = &hp->dbuf; 736 737 seek(fd, 0, 0); /* reposition to start of file */ 738 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 739 return 0; 740 chksum = strtol(hdr->chksum, 0, 8); 741 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 742 if (strcmp(hdr->magic, "ustar") == 0) 743 print(mime? "application/x-ustar\n": 744 "posix tar archive\n"); 745 else 746 print(mime? "application/x-tar\n": "tar archive\n"); 747 return 1; 748 } 749 return 0; 750 } 751 752 /* 753 * initial words to classify file 754 */ 755 struct FILE_STRING 756 { 757 char *key; 758 char *filetype; 759 int length; 760 char *mime; 761 } file_string[] = 762 { 763 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 764 "!<arch>\n", "archive", 8, "application/octet-stream", 765 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 766 "#!/bin/rc", "rc executable file", 9, "text/plain", 767 "#!/bin/sh", "sh executable file", 9, "text/plain", 768 "%!", "postscript", 2, "application/postscript", 769 "\004%!", "postscript", 3, "application/postscript", 770 "x T post", "troff output for post", 8, "application/troff", 771 "x T Latin1", "troff output for Latin1", 10, "application/troff", 772 "x T utf", "troff output for UTF", 7, "application/troff", 773 "x T 202", "troff output for 202", 7, "application/troff", 774 "x T aps", "troff output for aps", 7, "application/troff", 775 "GIF", "GIF image", 3, "image/gif", 776 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 777 "%PDF", "PDF", 4, "application/pdf", 778 "<html>\n", "HTML file", 7, "text/html", 779 "<HTML>\n", "HTML file", 7, "text/html", 780 "\111\111\052\000", "tiff", 4, "image/tiff", 781 "\115\115\000\052", "tiff", 4, "image/tiff", 782 "\377\330\377\340", "jpeg", 4, "image/jpeg", 783 "\377\330\377\341", "jpeg", 4, "image/jpeg", 784 "\377\330\377\333", "jpeg", 4, "image/jpeg", 785 "BM", "bmp", 2, "image/bmp", 786 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 787 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 788 "\033E\033", "HP PCL printer data", 3, OCTET, 789 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 790 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 791 "\211PNG", "PNG image", 4, "image/png", 792 "P3\n", "ppm", 3, "image/ppm", 793 "P6\n", "ppm", 3, "image/ppm", 794 "/* XPM */\n", "xbm", 10, "image/xbm", 795 ".HTML ", "troff -ms input", 6, "text/troff", 796 ".LP", "troff -ms input", 3, "text/troff", 797 ".ND", "troff -ms input", 3, "text/troff", 798 ".PP", "troff -ms input", 3, "text/troff", 799 ".TL", "troff -ms input", 3, "text/troff", 800 ".TR", "troff -ms input", 3, "text/troff", 801 ".TH", "manual page", 3, "text/troff", 802 ".\\\"", "troff input", 3, "text/troff", 803 ".de", "troff input", 3, "text/troff", 804 ".if", "troff input", 3, "text/troff", 805 ".nr", "troff input", 3, "text/troff", 806 ".tr", "troff input", 3, "text/troff", 807 "vac:", "venti score", 4, "text/plain", 808 "-----BEGIN CERTIFICATE-----\n", 809 "pem certificate", -1, "text/plain", 810 "-----BEGIN TRUSTED CERTIFICATE-----\n", 811 "pem trusted certificate", -1, "text/plain", 812 "-----BEGIN X509 CERTIFICATE-----\n", 813 "pem x.509 certificate", -1, "text/plain", 814 "subject=/C=", "pem certificate with header", -1, "text/plain", 815 "process snapshot ", "process snapshot", -1, "application/snapfs", 816 0,0,0,0 817 }; 818 819 int 820 istring(void) 821 { 822 int i, l; 823 struct FILE_STRING *p; 824 825 for(p = file_string; p->key; p++) { 826 l = p->length; 827 if(l == -1) 828 l = strlen(p->key); 829 if(nbuf >= l && memcmp(buf, p->key, l) == 0) { 830 if(mime) 831 print("%s\n", p->mime); 832 else 833 print("%s\n", p->filetype); 834 return 1; 835 } 836 } 837 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 838 for(i = 5; i < nbuf; i++) 839 if(buf[i] == '\n') 840 break; 841 if(mime) 842 print(OCTET); 843 else 844 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 845 return 1; 846 } 847 return 0; 848 } 849 850 struct offstr 851 { 852 ulong off; 853 struct FILE_STRING; 854 } offstrs[] = { 855 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 856 0, 0, 0, 0, 0 857 }; 858 859 int 860 isoffstr(void) 861 { 862 int n; 863 char buf[256]; 864 struct offstr *p; 865 866 for(p = offstrs; p->key; p++) { 867 seek(fd, p->off, 0); 868 n = p->length; 869 if (n > sizeof buf) 870 n = sizeof buf; 871 if (readn(fd, buf, n) != n) 872 continue; 873 if(memcmp(buf, p->key, n) == 0) { 874 if(mime) 875 print("%s\n", p->mime); 876 else 877 print("%s\n", p->filetype); 878 return 1; 879 } 880 } 881 return 0; 882 } 883 884 int 885 iff(void) 886 { 887 if (strncmp((char*)buf, "FORM", 4) == 0 && 888 strncmp((char*)buf+8, "AIFF", 4) == 0) { 889 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 890 return 1; 891 } 892 if (strncmp((char*)buf, "RIFF", 4) == 0) { 893 if (strncmp((char*)buf+8, "WAVE", 4) == 0) 894 print("%s\n", mime? "audio/wave": "wave audio"); 895 else if (strncmp((char*)buf+8, "AVI ", 4) == 0) 896 print("%s\n", mime? "video/avi": "avi video"); 897 else 898 print("%s\n", mime? "application/octet-stream": 899 "riff file"); 900 return 1; 901 } 902 return 0; 903 } 904 905 char* html_string[] = 906 { 907 "title", 908 "body", 909 "head", 910 "strong", 911 "h1", 912 "h2", 913 "h3", 914 "h4", 915 "h5", 916 "h6", 917 "ul", 918 "li", 919 "dl", 920 "br", 921 "em", 922 0, 923 }; 924 925 int 926 ishtml(void) 927 { 928 uchar *p, *q; 929 int i, count; 930 931 /* compare strings between '<' and '>' to html table */ 932 count = 0; 933 p = buf; 934 for(;;) { 935 while (p < buf+nbuf && *p != '<') 936 p++; 937 p++; 938 if (p >= buf+nbuf) 939 break; 940 if(*p == '/') 941 p++; 942 q = p; 943 while(p < buf+nbuf && *p != '>') 944 p++; 945 if (p >= buf+nbuf) 946 break; 947 for(i = 0; html_string[i]; i++) { 948 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 949 if(count++ > 4) { 950 print(mime ? "text/html\n" : "HTML file\n"); 951 return 1; 952 } 953 break; 954 } 955 } 956 p++; 957 } 958 return 0; 959 } 960 961 char* rfc822_string[] = 962 { 963 "from:", 964 "date:", 965 "to:", 966 "subject:", 967 "received:", 968 "reply to:", 969 "sender:", 970 0, 971 }; 972 973 int 974 isrfc822(void) 975 { 976 977 char *p, *q, *r; 978 int i, count; 979 980 count = 0; 981 p = (char*)buf; 982 for(;;) { 983 q = strchr(p, '\n'); 984 if(q == nil) 985 break; 986 *q = 0; 987 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 988 count++; 989 *q = '\n'; 990 p = q+1; 991 continue; 992 } 993 *q = '\n'; 994 if(*p != '\t' && *p != ' '){ 995 r = strchr(p, ':'); 996 if(r == 0 || r > q) 997 break; 998 for(i = 0; rfc822_string[i]; i++) { 999 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 1000 count++; 1001 break; 1002 } 1003 } 1004 } 1005 p = q+1; 1006 } 1007 if(count >= 3){ 1008 print(mime ? "message/rfc822\n" : "email file\n"); 1009 return 1; 1010 } 1011 return 0; 1012 } 1013 1014 int 1015 ismbox(void) 1016 { 1017 char *p, *q; 1018 1019 p = (char*)buf; 1020 q = strchr(p, '\n'); 1021 if(q == nil) 1022 return 0; 1023 *q = 0; 1024 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 1025 print(mime ? "text/plain\n" : "mail box\n"); 1026 return 1; 1027 } 1028 *q = '\n'; 1029 return 0; 1030 } 1031 1032 int 1033 iscint(void) 1034 { 1035 int type; 1036 char *name; 1037 Biobuf b; 1038 1039 if(Binit(&b, fd, OREAD) == Beof) 1040 return 0; 1041 seek(fd, 0, 0); 1042 type = objtype(&b, &name); 1043 if(type < 0) 1044 return 0; 1045 if(mime) 1046 print(OCTET); 1047 else 1048 print("%s intermediate\n", name); 1049 return 1; 1050 } 1051 1052 int 1053 isc(void) 1054 { 1055 int n; 1056 1057 n = wfreq[I1]; 1058 /* 1059 * includes 1060 */ 1061 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1062 goto yes; 1063 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1064 goto yes; 1065 /* 1066 * declarations 1067 */ 1068 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1069 goto yes; 1070 /* 1071 * assignments 1072 */ 1073 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1074 goto yes; 1075 return 0; 1076 1077 yes: 1078 if(mime){ 1079 print(PLAIN); 1080 return 1; 1081 } 1082 if(wfreq[Alword] > 0) 1083 print("alef program\n"); 1084 else 1085 print("c program\n"); 1086 return 1; 1087 } 1088 1089 int 1090 islimbo(void) 1091 { 1092 1093 /* 1094 * includes 1095 */ 1096 if(wfreq[Lword] < 4) 1097 return 0; 1098 print(mime ? PLAIN : "limbo program\n"); 1099 return 1; 1100 } 1101 1102 int 1103 isas(void) 1104 { 1105 1106 /* 1107 * includes 1108 */ 1109 if(wfreq[Aword] < 2) 1110 return 0; 1111 print(mime ? PLAIN : "as program\n"); 1112 return 1; 1113 } 1114 1115 /* 1116 * low entropy means encrypted 1117 */ 1118 int 1119 ismung(void) 1120 { 1121 int i, bucket[8]; 1122 float cs; 1123 1124 if(nbuf < 64) 1125 return 0; 1126 memset(bucket, 0, sizeof(bucket)); 1127 for(i=nbuf-64; i<nbuf; i++) 1128 bucket[(buf[i]>>5)&07] += 1; 1129 1130 cs = 0.; 1131 for(i=0; i<8; i++) 1132 cs += (bucket[i]-8)*(bucket[i]-8); 1133 cs /= 8.; 1134 if(cs <= 24.322) { 1135 if(buf[0]==0x1f && buf[1]==0x9d) 1136 print(mime ? OCTET : "compressed\n"); 1137 else 1138 if(buf[0]==0x1f && buf[1]==0x8b) 1139 print(mime ? OCTET : "gzip compressed\n"); 1140 else 1141 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1142 print(mime ? OCTET : "bzip2 compressed\n"); 1143 else 1144 print(mime ? OCTET : "encrypted\n"); 1145 return 1; 1146 } 1147 return 0; 1148 } 1149 1150 /* 1151 * english by punctuation and frequencies 1152 */ 1153 int 1154 isenglish(void) 1155 { 1156 int vow, comm, rare, badpun, punct; 1157 char *p; 1158 1159 if(guess != Fascii && guess != Feascii) 1160 return 0; 1161 badpun = 0; 1162 punct = 0; 1163 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1164 switch(*p) { 1165 case '.': 1166 case ',': 1167 case ')': 1168 case '%': 1169 case ';': 1170 case ':': 1171 case '?': 1172 punct++; 1173 if(p[1] != ' ' && p[1] != '\n') 1174 badpun++; 1175 } 1176 if(badpun*5 > punct) 1177 return 0; 1178 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1179 return 0; 1180 if(2*cfreq[';'] > cfreq['e']) 1181 return 0; 1182 1183 vow = 0; 1184 for(p="AEIOU"; *p; p++) { 1185 vow += cfreq[*p]; 1186 vow += cfreq[tolower(*p)]; 1187 } 1188 comm = 0; 1189 for(p="ETAION"; *p; p++) { 1190 comm += cfreq[*p]; 1191 comm += cfreq[tolower(*p)]; 1192 } 1193 rare = 0; 1194 for(p="VJKQXZ"; *p; p++) { 1195 rare += cfreq[*p]; 1196 rare += cfreq[tolower(*p)]; 1197 } 1198 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1199 print(mime ? PLAIN : "English text\n"); 1200 return 1; 1201 } 1202 return 0; 1203 } 1204 1205 /* 1206 * pick up a number with 1207 * syntax _*[0-9]+_ 1208 */ 1209 #define P9BITLEN 12 1210 int 1211 p9bitnum(uchar *bp) 1212 { 1213 int n, c, len; 1214 1215 len = P9BITLEN; 1216 while(*bp == ' ') { 1217 bp++; 1218 len--; 1219 if(len <= 0) 1220 return -1; 1221 } 1222 n = 0; 1223 while(len > 1) { 1224 c = *bp++; 1225 if(!isdigit(c)) 1226 return -1; 1227 n = n*10 + c-'0'; 1228 len--; 1229 } 1230 if(*bp != ' ') 1231 return -1; 1232 return n; 1233 } 1234 1235 int 1236 depthof(char *s, int *newp) 1237 { 1238 char *es; 1239 int d; 1240 1241 *newp = 0; 1242 es = s+12; 1243 while(s<es && *s==' ') 1244 s++; 1245 if(s == es) 1246 return -1; 1247 if('0'<=*s && *s<='9') 1248 return 1<<strtol(s, 0, 0); 1249 1250 *newp = 1; 1251 d = 0; 1252 while(s<es && *s!=' '){ 1253 s++; /* skip letter */ 1254 d += strtoul(s, &s, 10); 1255 } 1256 1257 if(d % 8 == 0 || 8 % d == 0) 1258 return d; 1259 else 1260 return -1; 1261 } 1262 1263 int 1264 isp9bit(void) 1265 { 1266 int dep, lox, loy, hix, hiy, px, new, cmpr; 1267 ulong t; 1268 long len; 1269 char *newlabel; 1270 uchar *cp; 1271 1272 cp = buf; 1273 cmpr = 0; 1274 newlabel = "old "; 1275 1276 if(memcmp(cp, "compressed\n", 11) == 0) { 1277 cmpr = 1; 1278 cp = buf + 11; 1279 } 1280 1281 dep = depthof((char*)cp + 0*P9BITLEN, &new); 1282 if(new) 1283 newlabel = ""; 1284 lox = p9bitnum(cp + 1*P9BITLEN); 1285 loy = p9bitnum(cp + 2*P9BITLEN); 1286 hix = p9bitnum(cp + 3*P9BITLEN); 1287 hiy = p9bitnum(cp + 4*P9BITLEN); 1288 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1289 return 0; 1290 1291 if(dep < 8){ 1292 px = 8/dep; /* pixels per byte */ 1293 /* set l to number of bytes of data per scan line */ 1294 if(lox >= 0) 1295 len = (hix+px-1)/px - lox/px; 1296 else{ /* make positive before divide */ 1297 t = (-lox)+px-1; 1298 t = (t/px)*px; 1299 len = (t+hix+px-1)/px; 1300 } 1301 }else 1302 len = (hix-lox)*dep/8; 1303 len *= hiy - loy; /* col length */ 1304 len += 5 * P9BITLEN; /* size of initial ascii */ 1305 1306 /* 1307 * for compressed images, don't look any further. otherwise: 1308 * for image file, length is non-zero and must match calculation above. 1309 * for /dev/window and /dev/screen the length is always zero. 1310 * for subfont, the subfont header should follow immediately. 1311 */ 1312 if (cmpr) { 1313 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n", 1314 newlabel, dep); 1315 return 1; 1316 } 1317 /* 1318 * mbuf->length == 0 probably indicates reading a pipe. 1319 * Ghostscript sometimes produces a little extra on the end. 1320 */ 1321 if (len != 0 && (mbuf->length == 0 || mbuf->length == len || 1322 mbuf->length > len && mbuf->length < len+P9BITLEN)) { 1323 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1324 return 1; 1325 } 1326 if (p9subfont(buf+len)) { 1327 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep); 1328 return 1; 1329 } 1330 return 0; 1331 } 1332 1333 int 1334 p9subfont(uchar *p) 1335 { 1336 int n, h, a; 1337 1338 /* if image too big, assume it's a subfont */ 1339 if (p+3*P9BITLEN > buf+sizeof(buf)) 1340 return 1; 1341 1342 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1343 if (n < 0) 1344 return 0; 1345 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1346 if (h < 0) 1347 return 0; 1348 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1349 if (a < 0) 1350 return 0; 1351 return 1; 1352 } 1353 1354 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1355 1356 int 1357 isp9font(void) 1358 { 1359 uchar *cp, *p; 1360 int i, n; 1361 char pathname[1024]; 1362 1363 cp = buf; 1364 if (!getfontnum(cp, &cp)) /* height */ 1365 return 0; 1366 if (!getfontnum(cp, &cp)) /* ascent */ 1367 return 0; 1368 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1369 if (!getfontnum(cp, &cp)) /* min */ 1370 break; 1371 if (!getfontnum(cp, &cp)) /* max */ 1372 return 0; 1373 getfontnum(cp, &cp); /* optional offset */ 1374 while (WHITESPACE(*cp)) 1375 cp++; 1376 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1377 ; 1378 /* construct a path name, if needed */ 1379 n = 0; 1380 if (*p != '/' && slash) { 1381 n = slash-fname+1; 1382 if (n < sizeof(pathname)) 1383 memcpy(pathname, fname, n); 1384 else n = 0; 1385 } 1386 if (n+cp-p+4 < sizeof(pathname)) { 1387 memcpy(pathname+n, p, cp-p); 1388 n += cp-p; 1389 pathname[n] = 0; 1390 if (access(pathname, AEXIST) < 0) { 1391 strcpy(pathname+n, ".0"); 1392 if (access(pathname, AEXIST) < 0) 1393 return 0; 1394 } 1395 } 1396 } 1397 if (i) { 1398 print(mime ? "text/plain\n" : "font file\n"); 1399 return 1; 1400 } 1401 return 0; 1402 } 1403 1404 int 1405 getfontnum(uchar *cp, uchar **rp) 1406 { 1407 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1408 cp++; 1409 if (*cp < '0' || *cp > '9') 1410 return 0; 1411 strtoul((char *)cp, (char **)rp, 0); 1412 if (!WHITESPACE(**rp)) { 1413 *rp = cp; 1414 return 0; 1415 } 1416 return 1; 1417 } 1418 1419 int 1420 isrtf(void) 1421 { 1422 if(strstr((char *)buf, "\\rtf1")){ 1423 print(mime ? "application/rtf\n" : "rich text format\n"); 1424 return 1; 1425 } 1426 return 0; 1427 } 1428 1429 int 1430 ismsdos(void) 1431 { 1432 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1433 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1434 return 1; 1435 } 1436 return 0; 1437 } 1438 1439 int 1440 iself(void) 1441 { 1442 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1443 [1] "WE32100", 1444 [2] "SPARC", 1445 [3] "i386", 1446 [4] "M68000", 1447 [5] "M88000", 1448 [6] "i486", 1449 [7] "i860", 1450 [8] "R3000", 1451 [9] "S370", 1452 [10] "R4000", 1453 [15] "HP-PA", 1454 [18] "sparc v8+", 1455 [19] "i960", 1456 [20] "PPC-32", 1457 [21] "PPC-64", 1458 [40] "ARM", 1459 [41] "Alpha", 1460 [43] "sparc v9", 1461 [50] "IA-64", 1462 [62] "AMD64", 1463 [75] "VAX", 1464 }; 1465 static char *type[] = { 1466 [1] "relocatable object", 1467 [2] "executable", 1468 [3] "shared library", 1469 [4] "core dump", 1470 }; 1471 1472 if (memcmp(buf, "\x7fELF", 4) == 0){ 1473 if (!mime){ 1474 int isdifend = 0; 1475 int n = (buf[19] << 8) | buf[18]; 1476 char *p = "unknown"; 1477 char *t = "unknown"; 1478 1479 if (n > 0 && n < nelem(cpu) && cpu[n]) 1480 p = cpu[n]; 1481 else { 1482 /* try the other byte order */ 1483 isdifend = 1; 1484 n = (buf[18] << 8) | buf[19]; 1485 if (n > 0 && n < nelem(cpu) && cpu[n]) 1486 p = cpu[n]; 1487 } 1488 if(isdifend) 1489 n = (buf[16]<< 8) | buf[17]; 1490 else 1491 n = (buf[17]<< 8) | buf[16]; 1492 1493 if(n>0 && n < nelem(type) && type[n]) 1494 t = type[n]; 1495 print("%s ELF %s\n", p, t); 1496 } 1497 else 1498 print("application/x-elf-executable"); 1499 return 1; 1500 } 1501 1502 return 0; 1503 } 1504 1505 int 1506 isface(void) 1507 { 1508 int i, j, ldepth, l; 1509 char *p; 1510 1511 ldepth = -1; 1512 for(j = 0; j < 3; j++){ 1513 for(p = (char*)buf, i=0; i<3; i++){ 1514 if(p[0] != '0' || p[1] != 'x') 1515 return 0; 1516 if(buf[2+8] == ',') 1517 l = 2; 1518 else if(buf[2+4] == ',') 1519 l = 1; 1520 else 1521 return 0; 1522 if(ldepth == -1) 1523 ldepth = l; 1524 if(l != ldepth) 1525 return 0; 1526 strtoul(p, &p, 16); 1527 if(*p++ != ',') 1528 return 0; 1529 while(*p == ' ' || *p == '\t') 1530 p++; 1531 } 1532 if (*p++ != '\n') 1533 return 0; 1534 } 1535 1536 if(mime) 1537 print("application/x-face\n"); 1538 else 1539 print("face image depth %d\n", ldepth); 1540 return 1; 1541 } 1542 1543