1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 isp9font, /* plan 9 font */ 192 isp9bit, /* plan 9 image (as from /dev/window) */ 193 isrtf, /* rich text format */ 194 ismsdos, /* msdos exe (virus file attachement) */ 195 isface, /* ascii face file */ 196 197 /* last resorts */ 198 ismung, /* entropy compressed/encrypted */ 199 isenglish, /* char frequency English */ 200 0 201 }; 202 203 int mime; 204 205 char OCTET[] = "application/octet-stream\n"; 206 char PLAIN[] = "text/plain\n"; 207 208 void 209 main(int argc, char *argv[]) 210 { 211 int i, j, maxlen; 212 char *cp; 213 Rune r; 214 215 ARGBEGIN{ 216 case 'm': 217 mime = 1; 218 break; 219 default: 220 fprint(2, "usage: file [-m] [file...]\n"); 221 exits("usage"); 222 }ARGEND; 223 224 maxlen = 0; 225 if(mime == 0 || argc > 1){ 226 for(i = 0; i < argc; i++) { 227 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 228 ; 229 if(j > maxlen) 230 maxlen = j; 231 } 232 } 233 if (argc <= 0) { 234 if(!mime) 235 print ("stdin: "); 236 filetype(0); 237 } 238 else { 239 for(i = 0; i < argc; i++) 240 type(argv[i], maxlen); 241 } 242 exits(0); 243 } 244 245 void 246 type(char *file, int nlen) 247 { 248 Rune r; 249 int i; 250 char *p; 251 252 if(nlen > 0){ 253 slash = 0; 254 for (i = 0, p = file; *p; i++) { 255 if (*p == '/') /* find rightmost slash */ 256 slash = p; 257 p += chartorune(&r, p); /* count runes */ 258 } 259 print("%s:%*s",file, nlen-i+1, ""); 260 } 261 fname = file; 262 if ((fd = open(file, OREAD)) < 0) { 263 print("cannot open\n"); 264 return; 265 } 266 filetype(fd); 267 close(fd); 268 } 269 270 /* 271 * Unicode 4.0 4-byte runes. 272 */ 273 typedef int Rune1; 274 275 enum { 276 UTFmax1 = 4, 277 }; 278 279 int 280 fullrune1(char *p, int n) 281 { 282 int c; 283 284 if(n >= 1) { 285 c = *(uchar*)p; 286 if(c < 0x80) 287 return 1; 288 if(n >= 2 && c < 0xE0) 289 return 1; 290 if(n >= 3 && c < 0xF0) 291 return 1; 292 if(n >= 4) 293 return 1; 294 } 295 return 0; 296 } 297 298 int 299 chartorune1(Rune1 *rune, char *str) 300 { 301 int c, c1, c2, c3, n; 302 Rune r; 303 304 c = *(uchar*)str; 305 if(c < 0xF0){ 306 r = 0; 307 n = chartorune(&r, str); 308 *rune = r; 309 return n; 310 } 311 c &= ~0xF0; 312 c1 = *(uchar*)(str+1) & ~0x80; 313 c2 = *(uchar*)(str+2) & ~0x80; 314 c3 = *(uchar*)(str+3) & ~0x80; 315 n = (c<<18) | (c1<<12) | (c2<<6) | c3; 316 if(n < 0x10000 || n > 0x10FFFF){ 317 *rune = Runeerror; 318 return 1; 319 } 320 *rune = n; 321 return 4; 322 } 323 324 void 325 filetype(int fd) 326 { 327 Rune1 r; 328 int i, f, n; 329 char *p, *eob; 330 331 free(mbuf); 332 mbuf = dirfstat(fd); 333 if(mbuf == nil){ 334 print("cannot stat: %r\n"); 335 return; 336 } 337 if(mbuf->mode & DMDIR) { 338 print(mime ? "text/directory\n" : "directory\n"); 339 return; 340 } 341 if(mbuf->type != 'M' && mbuf->type != '|') { 342 print(mime ? OCTET : "special file #%c/%s\n", 343 mbuf->type, mbuf->name); 344 return; 345 } 346 /* may be reading a pipe on standard input */ 347 nbuf = readn(fd, buf, sizeof(buf)-1); 348 if(nbuf < 0) { 349 print("cannot read\n"); 350 return; 351 } 352 if(nbuf == 0) { 353 print(mime ? PLAIN : "empty file\n"); 354 return; 355 } 356 buf[nbuf] = 0; 357 358 /* 359 * build histogram table 360 */ 361 memset(cfreq, 0, sizeof(cfreq)); 362 for (i = 0; language[i].name; i++) 363 language[i].count = 0; 364 eob = (char *)buf+nbuf; 365 for(n = 0, p = (char *)buf; p < eob; n++) { 366 if (!fullrune1(p, eob-p) && eob-p < UTFmax1) 367 break; 368 p += chartorune1(&r, p); 369 if (r == 0) 370 f = Cnull; 371 else if (r <= 0x7f) { 372 if (!isprint(r) && !isspace(r)) 373 f = Ceascii; /* ASCII control char */ 374 else f = r; 375 } else if (r == 0x80) { 376 bump_utf_count(r); 377 f = Cutf; 378 } else if (r < 0xA0) 379 f = Cbinary; /* Invalid Runes */ 380 else if (r <= 0xff) 381 f = Clatin; /* Latin 1 */ 382 else { 383 bump_utf_count(r); 384 f = Cutf; /* UTF extension */ 385 } 386 cfreq[f]++; /* ASCII chars peg directly */ 387 } 388 /* 389 * gross classify 390 */ 391 if (cfreq[Cbinary]) 392 guess = Fbinary; 393 else if (cfreq[Cutf]) 394 guess = Futf; 395 else if (cfreq[Clatin]) 396 guess = Flatin; 397 else if (cfreq[Ceascii]) 398 guess = Feascii; 399 else if (cfreq[Cnull]) 400 guess = Fbinary; 401 else 402 guess = Fascii; 403 /* 404 * lookup dictionary words 405 */ 406 memset(wfreq, 0, sizeof(wfreq)); 407 if(guess == Fascii || guess == Flatin || guess == Futf) 408 wordfreq(); 409 /* 410 * call individual classify routines 411 */ 412 for(i=0; call[i]; i++) 413 if((*call[i])()) 414 return; 415 416 /* 417 * if all else fails, 418 * print out gross classification 419 */ 420 if (nbuf < 100 && !mime) 421 print(mime ? PLAIN : "short "); 422 if (guess == Fascii) 423 print(mime ? PLAIN : "Ascii\n"); 424 else if (guess == Feascii) 425 print(mime ? PLAIN : "extended ascii\n"); 426 else if (guess == Flatin) 427 print(mime ? PLAIN : "latin ascii\n"); 428 else if (guess == Futf && utf_count() < 4) 429 print_utf(); 430 else print(mime ? OCTET : "binary\n"); 431 } 432 433 void 434 bump_utf_count(Rune r) 435 { 436 int low, high, mid; 437 438 high = sizeof(language)/sizeof(language[0])-1; 439 for (low = 0; low < high;) { 440 mid = (low+high)/2; 441 if (r >= language[mid].low) { 442 if (r <= language[mid].high) { 443 language[mid].count++; 444 break; 445 } else low = mid+1; 446 } else high = mid; 447 } 448 } 449 450 int 451 utf_count(void) 452 { 453 int i, count; 454 455 count = 0; 456 for (i = 0; language[i].name; i++) 457 if (language[i].count > 0) 458 switch (language[i].mode) { 459 case Normal: 460 case First: 461 count++; 462 break; 463 default: 464 break; 465 } 466 return count; 467 } 468 469 int 470 chkascii(void) 471 { 472 int i; 473 474 for (i = 'a'; i < 'z'; i++) 475 if (cfreq[i]) 476 return 1; 477 for (i = 'A'; i < 'Z'; i++) 478 if (cfreq[i]) 479 return 1; 480 return 0; 481 } 482 483 int 484 find_first(char *name) 485 { 486 int i; 487 488 for (i = 0; language[i].name != 0; i++) 489 if (language[i].mode == First 490 && strcmp(language[i].name, name) == 0) 491 return i; 492 return -1; 493 } 494 495 void 496 print_utf(void) 497 { 498 int i, printed, j; 499 500 if(mime){ 501 print(PLAIN); 502 return; 503 } 504 if (chkascii()) { 505 printed = 1; 506 print("Ascii"); 507 } else 508 printed = 0; 509 for (i = 0; language[i].name; i++) 510 if (language[i].count) { 511 switch(language[i].mode) { 512 case Multi: 513 j = find_first(language[i].name); 514 if (j < 0) 515 break; 516 if (language[j].count > 0) 517 break; 518 /* Fall through */ 519 case Normal: 520 case First: 521 if (printed) 522 print(" & "); 523 else printed = 1; 524 print("%s", language[i].name); 525 break; 526 case Shared: 527 default: 528 break; 529 } 530 } 531 if(!printed) 532 print("UTF"); 533 print(" text\n"); 534 } 535 536 void 537 wordfreq(void) 538 { 539 int low, high, mid, r; 540 uchar *p, *p2, c; 541 542 p = buf; 543 for(;;) { 544 while (p < buf+nbuf && !isalpha(*p)) 545 p++; 546 if (p >= buf+nbuf) 547 return; 548 p2 = p; 549 while(p < buf+nbuf && isalpha(*p)) 550 p++; 551 c = *p; 552 *p = 0; 553 high = sizeof(dict)/sizeof(dict[0]); 554 for(low = 0;low < high;) { 555 mid = (low+high)/2; 556 r = strcmp(dict[mid].word, (char*)p2); 557 if(r == 0) { 558 wfreq[dict[mid].class]++; 559 break; 560 } 561 if(r < 0) 562 low = mid+1; 563 else 564 high = mid; 565 } 566 *p++ = c; 567 } 568 } 569 570 typedef struct Filemagic Filemagic; 571 struct Filemagic { 572 ulong x; 573 ulong mask; 574 char *desc; 575 char *mime; 576 }; 577 578 /* 579 * integers in this table must be as seen on a little-endian machine 580 * when read from a file. 581 */ 582 Filemagic long0tab[] = { 583 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 584 /* "pac1" */ 585 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 586 /* "pXc2 */ 587 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 588 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 589 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 590 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 591 070707, 0xFFFF, "cpio archive\n", OCTET, 592 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 593 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 594 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 595 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 596 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 597 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 598 /* 599 * venti & fossil magic numbers are stored big-endian on disk, 600 * thus the numbers appear reversed in this table. 601 */ 602 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 603 }; 604 605 int 606 filemagic(Filemagic *tab, int ntab, ulong x) 607 { 608 int i; 609 610 for(i=0; i<ntab; i++) 611 if((x&tab[i].mask) == tab[i].x){ 612 print(mime ? tab[i].mime : tab[i].desc); 613 return 1; 614 } 615 return 0; 616 } 617 618 int 619 long0(void) 620 { 621 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 622 } 623 624 typedef struct Fileoffmag Fileoffmag; 625 struct Fileoffmag { 626 ulong off; 627 Filemagic; 628 }; 629 630 /* 631 * integers in this table must be as seen on a little-endian machine 632 * when read from a file. 633 */ 634 Fileoffmag longofftab[] = { 635 /* 636 * venti & fossil magic numbers are stored big-endian on disk, 637 * thus the numbers appear reversed in this table. 638 */ 639 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 640 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 641 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 642 }; 643 644 int 645 fileoffmagic(Fileoffmag *tab, int ntab) 646 { 647 int i; 648 ulong x; 649 Fileoffmag *tp; 650 uchar buf[sizeof(long)]; 651 652 for(i=0; i<ntab; i++) { 653 tp = tab + i; 654 seek(fd, tp->off, 0); 655 if (readn(fd, buf, sizeof buf) != sizeof buf) 656 continue; 657 x = LENDIAN(buf); 658 if((x&tp->mask) == tp->x){ 659 print(mime? tp->mime: tp->desc); 660 return 1; 661 } 662 } 663 return 0; 664 } 665 666 int 667 longoff(void) 668 { 669 return fileoffmagic(longofftab, nelem(longofftab)); 670 } 671 672 int 673 isexec(void) 674 { 675 Fhdr f; 676 677 seek(fd, 0, 0); /* reposition to start of file */ 678 if(crackhdr(fd, &f)) { 679 print(mime ? OCTET : "%s\n", f.name); 680 return 1; 681 } 682 return 0; 683 } 684 685 686 /* from tar.c */ 687 enum { NAMSIZ = 100, TBLOCK = 512 }; 688 689 union hblock 690 { 691 char dummy[TBLOCK]; 692 struct header 693 { 694 char name[NAMSIZ]; 695 char mode[8]; 696 char uid[8]; 697 char gid[8]; 698 char size[12]; 699 char mtime[12]; 700 char chksum[8]; 701 char linkflag; 702 char linkname[NAMSIZ]; 703 /* rest are defined by POSIX's ustar format; see p1003.2b */ 704 char magic[6]; /* "ustar" */ 705 char version[2]; 706 char uname[32]; 707 char gname[32]; 708 char devmajor[8]; 709 char devminor[8]; 710 char prefix[155]; /* if non-null, path = prefix "/" name */ 711 } dbuf; 712 }; 713 714 int 715 checksum(union hblock *hp) 716 { 717 int i; 718 char *cp; 719 struct header *hdr = &hp->dbuf; 720 721 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 722 *cp = ' '; 723 i = 0; 724 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 725 i += *cp & 0xff; 726 return i; 727 } 728 729 int 730 istar(void) 731 { 732 int chksum; 733 char tblock[TBLOCK]; 734 union hblock *hp = (union hblock *)tblock; 735 struct header *hdr = &hp->dbuf; 736 737 seek(fd, 0, 0); /* reposition to start of file */ 738 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 739 return 0; 740 chksum = strtol(hdr->chksum, 0, 8); 741 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 742 if (strcmp(hdr->magic, "ustar") == 0) 743 print(mime? "application/x-ustar\n": 744 "posix tar archive\n"); 745 else 746 print(mime? "application/x-tar\n": "tar archive\n"); 747 return 1; 748 } 749 return 0; 750 } 751 752 /* 753 * initial words to classify file 754 */ 755 struct FILE_STRING 756 { 757 char *key; 758 char *filetype; 759 int length; 760 char *mime; 761 } file_string[] = 762 { 763 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 764 "!<arch>\n", "archive", 8, "application/octet-stream", 765 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 766 "#!/bin/rc", "rc executable file", 9, "text/plain", 767 "#!/bin/sh", "sh executable file", 9, "text/plain", 768 "%!", "postscript", 2, "application/postscript", 769 "\004%!", "postscript", 3, "application/postscript", 770 "x T post", "troff output for post", 8, "application/troff", 771 "x T Latin1", "troff output for Latin1", 10, "application/troff", 772 "x T utf", "troff output for UTF", 7, "application/troff", 773 "x T 202", "troff output for 202", 7, "application/troff", 774 "x T aps", "troff output for aps", 7, "application/troff", 775 "GIF", "GIF image", 3, "image/gif", 776 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 777 "%PDF", "PDF", 4, "application/pdf", 778 "<html>\n", "HTML file", 7, "text/html", 779 "<HTML>\n", "HTML file", 7, "text/html", 780 "\111\111\052\000", "tiff", 4, "image/tiff", 781 "\115\115\000\052", "tiff", 4, "image/tiff", 782 "\377\330\377\340", "jpeg", 4, "image/jpeg", 783 "\377\330\377\341", "jpeg", 4, "image/jpeg", 784 "\377\330\377\333", "jpeg", 4, "image/jpeg", 785 "BM", "bmp", 2, "image/bmp", 786 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 787 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 788 "\033E\033", "HP PCL printer data", 3, OCTET, 789 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 790 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 791 "\211PNG", "PNG image", 4, "image/png", 792 "P3\n", "ppm", 3, "image/ppm", 793 "P6\n", "ppm", 3, "image/ppm", 794 "/* XPM */\n", "xbm", 10, "image/xbm", 795 ".HTML ", "troff -ms input", 6, "text/troff", 796 ".LP", "troff -ms input", 3, "text/troff", 797 ".ND", "troff -ms input", 3, "text/troff", 798 ".PP", "troff -ms input", 3, "text/troff", 799 ".TL", "troff -ms input", 3, "text/troff", 800 ".TR", "troff -ms input", 3, "text/troff", 801 ".TH", "manual page", 3, "text/troff", 802 ".\\\"", "troff input", 3, "text/troff", 803 ".de", "troff input", 3, "text/troff", 804 ".if", "troff input", 3, "text/troff", 805 ".nr", "troff input", 3, "text/troff", 806 ".tr", "troff input", 3, "text/troff", 807 "vac:", "venti score", 4, "text/plain", 808 "-----BEGIN CERTIFICATE-----\n", 809 "pem certificate", -1, "text/plain", 810 "-----BEGIN TRUSTED CERTIFICATE-----\n", 811 "pem trusted certificate", -1, "text/plain", 812 "-----BEGIN X509 CERTIFICATE-----\n", 813 "pem x.509 certificate", -1, "text/plain", 814 "subject=/C=", "pem certificate with header", -1, "text/plain", 815 "process snapshot ", "process snapshot", -1, "application/snapfs", 816 0,0,0,0 817 }; 818 819 int 820 istring(void) 821 { 822 int i, l; 823 struct FILE_STRING *p; 824 825 for(p = file_string; p->key; p++) { 826 l = p->length; 827 if(l == -1) 828 l = strlen(p->key); 829 if(nbuf >= l && memcmp(buf, p->key, l) == 0) { 830 if(mime) 831 print("%s\n", p->mime); 832 else 833 print("%s\n", p->filetype); 834 return 1; 835 } 836 } 837 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 838 for(i = 5; i < nbuf; i++) 839 if(buf[i] == '\n') 840 break; 841 if(mime) 842 print(OCTET); 843 else 844 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 845 return 1; 846 } 847 return 0; 848 } 849 850 struct offstr 851 { 852 ulong off; 853 struct FILE_STRING; 854 } offstrs[] = { 855 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 856 0, 0, 0, 0, 0 857 }; 858 859 int 860 isoffstr(void) 861 { 862 int n; 863 char buf[256]; 864 struct offstr *p; 865 866 for(p = offstrs; p->key; p++) { 867 seek(fd, p->off, 0); 868 n = p->length; 869 if (n > sizeof buf) 870 n = sizeof buf; 871 if (readn(fd, buf, n) != n) 872 continue; 873 if(memcmp(buf, p->key, n) == 0) { 874 if(mime) 875 print("%s\n", p->mime); 876 else 877 print("%s\n", p->filetype); 878 return 1; 879 } 880 } 881 return 0; 882 } 883 884 int 885 iff(void) 886 { 887 if (strncmp((char*)buf, "FORM", 4) == 0 && 888 strncmp((char*)buf+8, "AIFF", 4) == 0) { 889 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 890 return 1; 891 } 892 return 0; 893 } 894 895 char* html_string[] = 896 { 897 "title", 898 "body", 899 "head", 900 "strong", 901 "h1", 902 "h2", 903 "h3", 904 "h4", 905 "h5", 906 "h6", 907 "ul", 908 "li", 909 "dl", 910 "br", 911 "em", 912 0, 913 }; 914 915 int 916 ishtml(void) 917 { 918 uchar *p, *q; 919 int i, count; 920 921 /* compare strings between '<' and '>' to html table */ 922 count = 0; 923 p = buf; 924 for(;;) { 925 while (p < buf+nbuf && *p != '<') 926 p++; 927 p++; 928 if (p >= buf+nbuf) 929 break; 930 if(*p == '/') 931 p++; 932 q = p; 933 while(p < buf+nbuf && *p != '>') 934 p++; 935 if (p >= buf+nbuf) 936 break; 937 for(i = 0; html_string[i]; i++) { 938 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 939 if(count++ > 4) { 940 print(mime ? "text/html\n" : "HTML file\n"); 941 return 1; 942 } 943 break; 944 } 945 } 946 p++; 947 } 948 return 0; 949 } 950 951 char* rfc822_string[] = 952 { 953 "from:", 954 "date:", 955 "to:", 956 "subject:", 957 "received:", 958 "reply to:", 959 "sender:", 960 0, 961 }; 962 963 int 964 isrfc822(void) 965 { 966 967 char *p, *q, *r; 968 int i, count; 969 970 count = 0; 971 p = (char*)buf; 972 for(;;) { 973 q = strchr(p, '\n'); 974 if(q == nil) 975 break; 976 *q = 0; 977 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 978 count++; 979 *q = '\n'; 980 p = q+1; 981 continue; 982 } 983 *q = '\n'; 984 if(*p != '\t' && *p != ' '){ 985 r = strchr(p, ':'); 986 if(r == 0 || r > q) 987 break; 988 for(i = 0; rfc822_string[i]; i++) { 989 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 990 count++; 991 break; 992 } 993 } 994 } 995 p = q+1; 996 } 997 if(count >= 3){ 998 print(mime ? "message/rfc822\n" : "email file\n"); 999 return 1; 1000 } 1001 return 0; 1002 } 1003 1004 int 1005 ismbox(void) 1006 { 1007 char *p, *q; 1008 1009 p = (char*)buf; 1010 q = strchr(p, '\n'); 1011 if(q == nil) 1012 return 0; 1013 *q = 0; 1014 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 1015 print(mime ? "text/plain\n" : "mail box\n"); 1016 return 1; 1017 } 1018 *q = '\n'; 1019 return 0; 1020 } 1021 1022 int 1023 iscint(void) 1024 { 1025 int type; 1026 char *name; 1027 Biobuf b; 1028 1029 if(Binit(&b, fd, OREAD) == Beof) 1030 return 0; 1031 seek(fd, 0, 0); 1032 type = objtype(&b, &name); 1033 if(type < 0) 1034 return 0; 1035 if(mime) 1036 print(OCTET); 1037 else 1038 print("%s intermediate\n", name); 1039 return 1; 1040 } 1041 1042 int 1043 isc(void) 1044 { 1045 int n; 1046 1047 n = wfreq[I1]; 1048 /* 1049 * includes 1050 */ 1051 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1052 goto yes; 1053 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1054 goto yes; 1055 /* 1056 * declarations 1057 */ 1058 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1059 goto yes; 1060 /* 1061 * assignments 1062 */ 1063 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1064 goto yes; 1065 return 0; 1066 1067 yes: 1068 if(mime){ 1069 print(PLAIN); 1070 return 1; 1071 } 1072 if(wfreq[Alword] > 0) 1073 print("alef program\n"); 1074 else 1075 print("c program\n"); 1076 return 1; 1077 } 1078 1079 int 1080 islimbo(void) 1081 { 1082 1083 /* 1084 * includes 1085 */ 1086 if(wfreq[Lword] < 4) 1087 return 0; 1088 print(mime ? PLAIN : "limbo program\n"); 1089 return 1; 1090 } 1091 1092 int 1093 isas(void) 1094 { 1095 1096 /* 1097 * includes 1098 */ 1099 if(wfreq[Aword] < 2) 1100 return 0; 1101 print(mime ? PLAIN : "as program\n"); 1102 return 1; 1103 } 1104 1105 /* 1106 * low entropy means encrypted 1107 */ 1108 int 1109 ismung(void) 1110 { 1111 int i, bucket[8]; 1112 float cs; 1113 1114 if(nbuf < 64) 1115 return 0; 1116 memset(bucket, 0, sizeof(bucket)); 1117 for(i=nbuf-64; i<nbuf; i++) 1118 bucket[(buf[i]>>5)&07] += 1; 1119 1120 cs = 0.; 1121 for(i=0; i<8; i++) 1122 cs += (bucket[i]-8)*(bucket[i]-8); 1123 cs /= 8.; 1124 if(cs <= 24.322) { 1125 if(buf[0]==0x1f && buf[1]==0x9d) 1126 print(mime ? OCTET : "compressed\n"); 1127 else 1128 if(buf[0]==0x1f && buf[1]==0x8b) 1129 print(mime ? OCTET : "gzip compressed\n"); 1130 else 1131 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1132 print(mime ? OCTET : "bzip2 compressed\n"); 1133 else 1134 print(mime ? OCTET : "encrypted\n"); 1135 return 1; 1136 } 1137 return 0; 1138 } 1139 1140 /* 1141 * english by punctuation and frequencies 1142 */ 1143 int 1144 isenglish(void) 1145 { 1146 int vow, comm, rare, badpun, punct; 1147 char *p; 1148 1149 if(guess != Fascii && guess != Feascii) 1150 return 0; 1151 badpun = 0; 1152 punct = 0; 1153 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1154 switch(*p) { 1155 case '.': 1156 case ',': 1157 case ')': 1158 case '%': 1159 case ';': 1160 case ':': 1161 case '?': 1162 punct++; 1163 if(p[1] != ' ' && p[1] != '\n') 1164 badpun++; 1165 } 1166 if(badpun*5 > punct) 1167 return 0; 1168 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1169 return 0; 1170 if(2*cfreq[';'] > cfreq['e']) 1171 return 0; 1172 1173 vow = 0; 1174 for(p="AEIOU"; *p; p++) { 1175 vow += cfreq[*p]; 1176 vow += cfreq[tolower(*p)]; 1177 } 1178 comm = 0; 1179 for(p="ETAION"; *p; p++) { 1180 comm += cfreq[*p]; 1181 comm += cfreq[tolower(*p)]; 1182 } 1183 rare = 0; 1184 for(p="VJKQXZ"; *p; p++) { 1185 rare += cfreq[*p]; 1186 rare += cfreq[tolower(*p)]; 1187 } 1188 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1189 print(mime ? PLAIN : "English text\n"); 1190 return 1; 1191 } 1192 return 0; 1193 } 1194 1195 /* 1196 * pick up a number with 1197 * syntax _*[0-9]+_ 1198 */ 1199 #define P9BITLEN 12 1200 int 1201 p9bitnum(uchar *bp) 1202 { 1203 int n, c, len; 1204 1205 len = P9BITLEN; 1206 while(*bp == ' ') { 1207 bp++; 1208 len--; 1209 if(len <= 0) 1210 return -1; 1211 } 1212 n = 0; 1213 while(len > 1) { 1214 c = *bp++; 1215 if(!isdigit(c)) 1216 return -1; 1217 n = n*10 + c-'0'; 1218 len--; 1219 } 1220 if(*bp != ' ') 1221 return -1; 1222 return n; 1223 } 1224 1225 int 1226 depthof(char *s, int *newp) 1227 { 1228 char *es; 1229 int d; 1230 1231 *newp = 0; 1232 es = s+12; 1233 while(s<es && *s==' ') 1234 s++; 1235 if(s == es) 1236 return -1; 1237 if('0'<=*s && *s<='9') 1238 return 1<<strtol(s, 0, 0); 1239 1240 *newp = 1; 1241 d = 0; 1242 while(s<es && *s!=' '){ 1243 s++; /* skip letter */ 1244 d += strtoul(s, &s, 10); 1245 } 1246 1247 if(d % 8 == 0 || 8 % d == 0) 1248 return d; 1249 else 1250 return -1; 1251 } 1252 1253 int 1254 isp9bit(void) 1255 { 1256 int dep, lox, loy, hix, hiy, px, new, cmpr; 1257 ulong t; 1258 long len; 1259 char *newlabel; 1260 uchar *cp; 1261 1262 cp = buf; 1263 cmpr = 0; 1264 newlabel = "old "; 1265 1266 if(memcmp(cp, "compressed\n", 11) == 0) { 1267 cmpr = 1; 1268 cp = buf + 11; 1269 } 1270 1271 dep = depthof((char*)cp + 0*P9BITLEN, &new); 1272 if(new) 1273 newlabel = ""; 1274 lox = p9bitnum(cp + 1*P9BITLEN); 1275 loy = p9bitnum(cp + 2*P9BITLEN); 1276 hix = p9bitnum(cp + 3*P9BITLEN); 1277 hiy = p9bitnum(cp + 4*P9BITLEN); 1278 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1279 return 0; 1280 1281 if(dep < 8){ 1282 px = 8/dep; /* pixels per byte */ 1283 /* set l to number of bytes of data per scan line */ 1284 if(lox >= 0) 1285 len = (hix+px-1)/px - lox/px; 1286 else{ /* make positive before divide */ 1287 t = (-lox)+px-1; 1288 t = (t/px)*px; 1289 len = (t+hix+px-1)/px; 1290 } 1291 }else 1292 len = (hix-lox)*dep/8; 1293 len *= hiy - loy; /* col length */ 1294 len += 5 * P9BITLEN; /* size of initial ascii */ 1295 1296 /* 1297 * for compressed images, don't look any further. otherwise: 1298 * for image file, length is non-zero and must match calculation above. 1299 * for /dev/window and /dev/screen the length is always zero. 1300 * for subfont, the subfont header should follow immediately. 1301 */ 1302 if (cmpr) { 1303 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n", 1304 newlabel, dep); 1305 return 1; 1306 } 1307 /* 1308 * mbuf->length == 0 probably indicates reading a pipe. 1309 * Ghostscript sometimes produces a little extra on the end. 1310 */ 1311 if (len != 0 && (mbuf->length == 0 || mbuf->length == len || 1312 mbuf->length > len && mbuf->length < len+P9BITLEN)) { 1313 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1314 return 1; 1315 } 1316 if (p9subfont(buf+len)) { 1317 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep); 1318 return 1; 1319 } 1320 return 0; 1321 } 1322 1323 int 1324 p9subfont(uchar *p) 1325 { 1326 int n, h, a; 1327 1328 /* if image too big, assume it's a subfont */ 1329 if (p+3*P9BITLEN > buf+sizeof(buf)) 1330 return 1; 1331 1332 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1333 if (n < 0) 1334 return 0; 1335 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1336 if (h < 0) 1337 return 0; 1338 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1339 if (a < 0) 1340 return 0; 1341 return 1; 1342 } 1343 1344 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1345 1346 int 1347 isp9font(void) 1348 { 1349 uchar *cp, *p; 1350 int i, n; 1351 char pathname[1024]; 1352 1353 cp = buf; 1354 if (!getfontnum(cp, &cp)) /* height */ 1355 return 0; 1356 if (!getfontnum(cp, &cp)) /* ascent */ 1357 return 0; 1358 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1359 if (!getfontnum(cp, &cp)) /* min */ 1360 break; 1361 if (!getfontnum(cp, &cp)) /* max */ 1362 return 0; 1363 getfontnum(cp, &cp); /* optional offset */ 1364 while (WHITESPACE(*cp)) 1365 cp++; 1366 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1367 ; 1368 /* construct a path name, if needed */ 1369 n = 0; 1370 if (*p != '/' && slash) { 1371 n = slash-fname+1; 1372 if (n < sizeof(pathname)) 1373 memcpy(pathname, fname, n); 1374 else n = 0; 1375 } 1376 if (n+cp-p+4 < sizeof(pathname)) { 1377 memcpy(pathname+n, p, cp-p); 1378 n += cp-p; 1379 pathname[n] = 0; 1380 if (access(pathname, AEXIST) < 0) { 1381 strcpy(pathname+n, ".0"); 1382 if (access(pathname, AEXIST) < 0) 1383 return 0; 1384 } 1385 } 1386 } 1387 if (i) { 1388 print(mime ? "text/plain\n" : "font file\n"); 1389 return 1; 1390 } 1391 return 0; 1392 } 1393 1394 int 1395 getfontnum(uchar *cp, uchar **rp) 1396 { 1397 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1398 cp++; 1399 if (*cp < '0' || *cp > '9') 1400 return 0; 1401 strtoul((char *)cp, (char **)rp, 0); 1402 if (!WHITESPACE(**rp)) { 1403 *rp = cp; 1404 return 0; 1405 } 1406 return 1; 1407 } 1408 1409 int 1410 isrtf(void) 1411 { 1412 if(strstr((char *)buf, "\\rtf1")){ 1413 print(mime ? "application/rtf\n" : "rich text format\n"); 1414 return 1; 1415 } 1416 return 0; 1417 } 1418 1419 int 1420 ismsdos(void) 1421 { 1422 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1423 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1424 return 1; 1425 } 1426 return 0; 1427 } 1428 1429 int 1430 iself(void) 1431 { 1432 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1433 [1] "WE32100", 1434 [2] "SPARC", 1435 [3] "i386", 1436 [4] "M68000", 1437 [5] "M88000", 1438 [6] "i486", 1439 [7] "i860", 1440 [8] "R3000", 1441 [9] "S370", 1442 [10] "R4000", 1443 [15] "HP-PA", 1444 [18] "sparc v8+", 1445 [19] "i960", 1446 [20] "PPC-32", 1447 [21] "PPC-64", 1448 [40] "ARM", 1449 [41] "Alpha", 1450 [43] "sparc v9", 1451 [50] "IA-64", 1452 [62] "AMD64", 1453 [75] "VAX", 1454 }; 1455 static char *type[] = { 1456 [1] "relocatable object", 1457 [2] "executable", 1458 [3] "shared library", 1459 [4] "core dump", 1460 }; 1461 1462 if (memcmp(buf, "\x7fELF", 4) == 0){ 1463 if (!mime){ 1464 int isdifend = 0; 1465 int n = (buf[19] << 8) | buf[18]; 1466 char *p = "unknown"; 1467 char *t = "unknown"; 1468 1469 if (n > 0 && n < nelem(cpu) && cpu[n]) 1470 p = cpu[n]; 1471 else { 1472 /* try the other byte order */ 1473 isdifend = 1; 1474 n = (buf[18] << 8) | buf[19]; 1475 if (n > 0 && n < nelem(cpu) && cpu[n]) 1476 p = cpu[n]; 1477 } 1478 if(isdifend) 1479 n = (buf[16]<< 8) | buf[17]; 1480 else 1481 n = (buf[17]<< 8) | buf[16]; 1482 1483 if(n>0 && n < nelem(type) && type[n]) 1484 t = type[n]; 1485 print("%s ELF %s\n", p, t); 1486 } 1487 else 1488 print("application/x-elf-executable"); 1489 return 1; 1490 } 1491 1492 return 0; 1493 } 1494 1495 int 1496 isface(void) 1497 { 1498 int i, j, ldepth, l; 1499 char *p; 1500 1501 ldepth = -1; 1502 for(j = 0; j < 3; j++){ 1503 for(p = (char*)buf, i=0; i<3; i++){ 1504 if(p[0] != '0' || p[1] != 'x') 1505 return 0; 1506 if(buf[2+8] == ',') 1507 l = 2; 1508 else if(buf[2+4] == ',') 1509 l = 1; 1510 else 1511 return 0; 1512 if(ldepth == -1) 1513 ldepth = l; 1514 if(l != ldepth) 1515 return 0; 1516 strtoul(p, &p, 16); 1517 if(*p++ != ',') 1518 return 0; 1519 while(*p == ' ' || *p == '\t') 1520 p++; 1521 } 1522 if (*p++ != '\n') 1523 return 0; 1524 } 1525 1526 if(mime) 1527 print("application/x-face\n"); 1528 else 1529 print("face image depth %d\n", ldepth); 1530 return 1; 1531 } 1532 1533