1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 isp9font, /* plan 9 font */ 192 isp9bit, /* plan 9 image (as from /dev/window) */ 193 ismung, /* entropy compressed/encrypted */ 194 isenglish, /* char frequency English */ 195 isrtf, /* rich text format */ 196 ismsdos, /* msdos exe (virus file attachement) */ 197 isface, /* ascii face file */ 198 0 199 }; 200 201 int mime; 202 203 #define OCTET "application/octet-stream\n" 204 #define PLAIN "text/plain\n" 205 206 void 207 main(int argc, char *argv[]) 208 { 209 int i, j, maxlen; 210 char *cp; 211 Rune r; 212 213 ARGBEGIN{ 214 case 'm': 215 mime = 1; 216 break; 217 default: 218 fprint(2, "usage: file [-m] [file...]\n"); 219 exits("usage"); 220 }ARGEND; 221 222 maxlen = 0; 223 if(mime == 0 || argc > 1){ 224 for(i = 0; i < argc; i++) { 225 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 226 ; 227 if(j > maxlen) 228 maxlen = j; 229 } 230 } 231 if (argc <= 0) { 232 if(!mime) 233 print ("stdin: "); 234 filetype(0); 235 } 236 else { 237 for(i = 0; i < argc; i++) 238 type(argv[i], maxlen); 239 } 240 exits(0); 241 } 242 243 void 244 type(char *file, int nlen) 245 { 246 Rune r; 247 int i; 248 char *p; 249 250 if(nlen > 0){ 251 slash = 0; 252 for (i = 0, p = file; *p; i++) { 253 if (*p == '/') /* find rightmost slash */ 254 slash = p; 255 p += chartorune(&r, p); /* count runes */ 256 } 257 print("%s:%*s",file, nlen-i+1, ""); 258 } 259 fname = file; 260 if ((fd = open(file, OREAD)) < 0) { 261 print("cannot open\n"); 262 return; 263 } 264 filetype(fd); 265 close(fd); 266 } 267 268 /* 269 * Unicode 4.0 4-byte runes. 270 */ 271 typedef int Rune1; 272 273 enum { 274 UTFmax1 = 4, 275 }; 276 277 int 278 fullrune1(char *p, int n) 279 { 280 int c; 281 282 if(n >= 1) { 283 c = *(uchar*)p; 284 if(c < 0x80) 285 return 1; 286 if(n >= 2 && c < 0xE0) 287 return 1; 288 if(n >= 3 && c < 0xF0) 289 return 1; 290 if(n >= 4) 291 return 1; 292 } 293 return 0; 294 } 295 296 int 297 chartorune1(Rune1 *rune, char *str) 298 { 299 int c, c1, c2, c3, n; 300 Rune r; 301 302 c = *(uchar*)str; 303 if(c < 0xF0){ 304 r = 0; 305 n = chartorune(&r, str); 306 *rune = r; 307 return n; 308 } 309 c &= ~0xF0; 310 c1 = *(uchar*)(str+1) & ~0x80; 311 c2 = *(uchar*)(str+2) & ~0x80; 312 c3 = *(uchar*)(str+3) & ~0x80; 313 n = (c<<18) | (c1<<12) | (c2<<6) | c3; 314 if(n < 0x10000 || n > 0x10FFFF){ 315 *rune = Runeerror; 316 return 1; 317 } 318 *rune = n; 319 return 4; 320 } 321 322 void 323 filetype(int fd) 324 { 325 Rune1 r; 326 int i, f, n; 327 char *p, *eob; 328 329 free(mbuf); 330 mbuf = dirfstat(fd); 331 if(mbuf == nil){ 332 print("cannot stat: %r\n"); 333 return; 334 } 335 if(mbuf->mode & DMDIR) { 336 print(mime ? "text/directory\n" : "directory\n"); 337 return; 338 } 339 if(mbuf->type != 'M' && mbuf->type != '|') { 340 print(mime ? OCTET : "special file #%c/%s\n", 341 mbuf->type, mbuf->name); 342 return; 343 } 344 /* may be reading a pipe on standard input */ 345 nbuf = readn(fd, buf, sizeof(buf)-1); 346 if(nbuf < 0) { 347 print("cannot read\n"); 348 return; 349 } 350 if(nbuf == 0) { 351 print(mime ? PLAIN : "empty file\n"); 352 return; 353 } 354 buf[nbuf] = 0; 355 356 /* 357 * build histogram table 358 */ 359 memset(cfreq, 0, sizeof(cfreq)); 360 for (i = 0; language[i].name; i++) 361 language[i].count = 0; 362 eob = (char *)buf+nbuf; 363 for(n = 0, p = (char *)buf; p < eob; n++) { 364 if (!fullrune1(p, eob-p) && eob-p < UTFmax1) 365 break; 366 p += chartorune1(&r, p); 367 if (r == 0) 368 f = Cnull; 369 else if (r <= 0x7f) { 370 if (!isprint(r) && !isspace(r)) 371 f = Ceascii; /* ASCII control char */ 372 else f = r; 373 } else if (r == 0x80) { 374 bump_utf_count(r); 375 f = Cutf; 376 } else if (r < 0xA0) 377 f = Cbinary; /* Invalid Runes */ 378 else if (r <= 0xff) 379 f = Clatin; /* Latin 1 */ 380 else { 381 bump_utf_count(r); 382 f = Cutf; /* UTF extension */ 383 } 384 cfreq[f]++; /* ASCII chars peg directly */ 385 } 386 /* 387 * gross classify 388 */ 389 if (cfreq[Cbinary]) 390 guess = Fbinary; 391 else if (cfreq[Cutf]) 392 guess = Futf; 393 else if (cfreq[Clatin]) 394 guess = Flatin; 395 else if (cfreq[Ceascii]) 396 guess = Feascii; 397 else if (cfreq[Cnull]) 398 guess = Fbinary; 399 else 400 guess = Fascii; 401 /* 402 * lookup dictionary words 403 */ 404 memset(wfreq, 0, sizeof(wfreq)); 405 if(guess == Fascii || guess == Flatin || guess == Futf) 406 wordfreq(); 407 /* 408 * call individual classify routines 409 */ 410 for(i=0; call[i]; i++) 411 if((*call[i])()) 412 return; 413 414 /* 415 * if all else fails, 416 * print out gross classification 417 */ 418 if (nbuf < 100 && !mime) 419 print(mime ? PLAIN : "short "); 420 if (guess == Fascii) 421 print(mime ? PLAIN : "Ascii\n"); 422 else if (guess == Feascii) 423 print(mime ? PLAIN : "extended ascii\n"); 424 else if (guess == Flatin) 425 print(mime ? PLAIN : "latin ascii\n"); 426 else if (guess == Futf && utf_count() < 4) 427 print_utf(); 428 else print(mime ? OCTET : "binary\n"); 429 } 430 431 void 432 bump_utf_count(Rune r) 433 { 434 int low, high, mid; 435 436 high = sizeof(language)/sizeof(language[0])-1; 437 for (low = 0; low < high;) { 438 mid = (low+high)/2; 439 if (r >= language[mid].low) { 440 if (r <= language[mid].high) { 441 language[mid].count++; 442 break; 443 } else low = mid+1; 444 } else high = mid; 445 } 446 } 447 448 int 449 utf_count(void) 450 { 451 int i, count; 452 453 count = 0; 454 for (i = 0; language[i].name; i++) 455 if (language[i].count > 0) 456 switch (language[i].mode) { 457 case Normal: 458 case First: 459 count++; 460 break; 461 default: 462 break; 463 } 464 return count; 465 } 466 467 int 468 chkascii(void) 469 { 470 int i; 471 472 for (i = 'a'; i < 'z'; i++) 473 if (cfreq[i]) 474 return 1; 475 for (i = 'A'; i < 'Z'; i++) 476 if (cfreq[i]) 477 return 1; 478 return 0; 479 } 480 481 int 482 find_first(char *name) 483 { 484 int i; 485 486 for (i = 0; language[i].name != 0; i++) 487 if (language[i].mode == First 488 && strcmp(language[i].name, name) == 0) 489 return i; 490 return -1; 491 } 492 493 void 494 print_utf(void) 495 { 496 int i, printed, j; 497 498 if(mime){ 499 print(PLAIN); 500 return; 501 } 502 if (chkascii()) { 503 printed = 1; 504 print("Ascii"); 505 } else 506 printed = 0; 507 for (i = 0; language[i].name; i++) 508 if (language[i].count) { 509 switch(language[i].mode) { 510 case Multi: 511 j = find_first(language[i].name); 512 if (j < 0) 513 break; 514 if (language[j].count > 0) 515 break; 516 /* Fall through */ 517 case Normal: 518 case First: 519 if (printed) 520 print(" & "); 521 else printed = 1; 522 print("%s", language[i].name); 523 break; 524 case Shared: 525 default: 526 break; 527 } 528 } 529 if(!printed) 530 print("UTF"); 531 print(" text\n"); 532 } 533 534 void 535 wordfreq(void) 536 { 537 int low, high, mid, r; 538 uchar *p, *p2, c; 539 540 p = buf; 541 for(;;) { 542 while (p < buf+nbuf && !isalpha(*p)) 543 p++; 544 if (p >= buf+nbuf) 545 return; 546 p2 = p; 547 while(p < buf+nbuf && isalpha(*p)) 548 p++; 549 c = *p; 550 *p = 0; 551 high = sizeof(dict)/sizeof(dict[0]); 552 for(low = 0;low < high;) { 553 mid = (low+high)/2; 554 r = strcmp(dict[mid].word, (char*)p2); 555 if(r == 0) { 556 wfreq[dict[mid].class]++; 557 break; 558 } 559 if(r < 0) 560 low = mid+1; 561 else 562 high = mid; 563 } 564 *p++ = c; 565 } 566 } 567 568 typedef struct Filemagic Filemagic; 569 struct Filemagic { 570 ulong x; 571 ulong mask; 572 char *desc; 573 char *mime; 574 }; 575 576 /* 577 * integers in this table must be as seen on a little-endian machine 578 * when read from a file. 579 */ 580 Filemagic long0tab[] = { 581 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 582 /* "pac1" */ 583 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 584 /* "pXc2 */ 585 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 586 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 587 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 588 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 589 070707, 0xFFFF, "cpio archive\n", OCTET, 590 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 591 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 592 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 593 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 594 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 595 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 596 /* 597 * venti & fossil magic numbers are stored big-endian on disk, 598 * thus the numbers appear reversed in this table. 599 */ 600 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 601 }; 602 603 int 604 filemagic(Filemagic *tab, int ntab, ulong x) 605 { 606 int i; 607 608 for(i=0; i<ntab; i++) 609 if((x&tab[i].mask) == tab[i].x){ 610 print(mime ? tab[i].mime : tab[i].desc); 611 return 1; 612 } 613 return 0; 614 } 615 616 int 617 long0(void) 618 { 619 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 620 } 621 622 typedef struct Fileoffmag Fileoffmag; 623 struct Fileoffmag { 624 ulong off; 625 Filemagic; 626 }; 627 628 /* 629 * integers in this table must be as seen on a little-endian machine 630 * when read from a file. 631 */ 632 Fileoffmag longofftab[] = { 633 /* 634 * venti & fossil magic numbers are stored big-endian on disk, 635 * thus the numbers appear reversed in this table. 636 */ 637 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 638 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 639 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 640 }; 641 642 int 643 fileoffmagic(Fileoffmag *tab, int ntab) 644 { 645 int i; 646 ulong x; 647 Fileoffmag *tp; 648 uchar buf[sizeof(long)]; 649 650 for(i=0; i<ntab; i++) { 651 tp = tab + i; 652 seek(fd, tp->off, 0); 653 if (readn(fd, buf, sizeof buf) != sizeof buf) 654 continue; 655 x = LENDIAN(buf); 656 if((x&tp->mask) == tp->x){ 657 print(mime? tp->mime: tp->desc); 658 return 1; 659 } 660 } 661 return 0; 662 } 663 664 int 665 longoff(void) 666 { 667 return fileoffmagic(longofftab, nelem(longofftab)); 668 } 669 670 int 671 isexec(void) 672 { 673 Fhdr f; 674 675 seek(fd, 0, 0); /* reposition to start of file */ 676 if(crackhdr(fd, &f)) { 677 print(mime ? OCTET : "%s\n", f.name); 678 return 1; 679 } 680 return 0; 681 } 682 683 684 /* from tar.c */ 685 enum { NAMSIZ = 100, TBLOCK = 512 }; 686 687 union hblock 688 { 689 char dummy[TBLOCK]; 690 struct header 691 { 692 char name[NAMSIZ]; 693 char mode[8]; 694 char uid[8]; 695 char gid[8]; 696 char size[12]; 697 char mtime[12]; 698 char chksum[8]; 699 char linkflag; 700 char linkname[NAMSIZ]; 701 /* rest are defined by POSIX's ustar format; see p1003.2b */ 702 char magic[6]; /* "ustar" */ 703 char version[2]; 704 char uname[32]; 705 char gname[32]; 706 char devmajor[8]; 707 char devminor[8]; 708 char prefix[155]; /* if non-null, path = prefix "/" name */ 709 } dbuf; 710 }; 711 712 int 713 checksum(union hblock *hp) 714 { 715 int i; 716 char *cp; 717 struct header *hdr = &hp->dbuf; 718 719 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 720 *cp = ' '; 721 i = 0; 722 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 723 i += *cp & 0xff; 724 return i; 725 } 726 727 int 728 istar(void) 729 { 730 int chksum; 731 char tblock[TBLOCK]; 732 union hblock *hp = (union hblock *)tblock; 733 struct header *hdr = &hp->dbuf; 734 735 seek(fd, 0, 0); /* reposition to start of file */ 736 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 737 return 0; 738 chksum = strtol(hdr->chksum, 0, 8); 739 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 740 if (strcmp(hdr->magic, "ustar") == 0) 741 print(mime? "application/x-ustar\n": 742 "posix tar archive\n"); 743 else 744 print(mime? "application/x-tar\n": "tar archive\n"); 745 return 1; 746 } 747 return 0; 748 } 749 750 /* 751 * initial words to classify file 752 */ 753 struct FILE_STRING 754 { 755 char *key; 756 char *filetype; 757 int length; 758 char *mime; 759 } file_string[] = 760 { 761 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 762 "!<arch>\n", "archive", 8, "application/octet-stream", 763 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 764 "#!/bin/rc", "rc executable file", 9, "text/plain", 765 "#!/bin/sh", "sh executable file", 9, "text/plain", 766 "%!", "postscript", 2, "application/postscript", 767 "\004%!", "postscript", 3, "application/postscript", 768 "x T post", "troff output for post", 8, "application/troff", 769 "x T Latin1", "troff output for Latin1", 10, "application/troff", 770 "x T utf", "troff output for UTF", 7, "application/troff", 771 "x T 202", "troff output for 202", 7, "application/troff", 772 "x T aps", "troff output for aps", 7, "application/troff", 773 "GIF", "GIF image", 3, "image/gif", 774 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 775 "%PDF", "PDF", 4, "application/pdf", 776 "<html>\n", "HTML file", 7, "text/html", 777 "<HTML>\n", "HTML file", 7, "text/html", 778 "\111\111\052\000", "tiff", 4, "image/tiff", 779 "\115\115\000\052", "tiff", 4, "image/tiff", 780 "\377\330\377\340", "jpeg", 4, "image/jpeg", 781 "\377\330\377\341", "jpeg", 4, "image/jpeg", 782 "\377\330\377\333", "jpeg", 4, "image/jpeg", 783 "BM", "bmp", 2, "image/bmp", 784 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 785 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 786 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 787 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 788 "\211PNG", "PNG image", 4, "image/png", 789 "P3\n", "ppm", 3, "image/ppm", 790 "P6\n", "ppm", 3, "image/ppm", 791 "/* XPM */\n", "xbm", 10, "image/xbm", 792 ".HTML ", "troff -ms input", 6, "text/troff", 793 ".LP", "troff -ms input", 3, "text/troff", 794 ".ND", "troff -ms input", 3, "text/troff", 795 ".PP", "troff -ms input", 3, "text/troff", 796 ".TL", "troff -ms input", 3, "text/troff", 797 ".TR", "troff -ms input", 3, "text/troff", 798 ".TH", "manual page", 3, "text/troff", 799 ".\\\"", "troff input", 3, "text/troff", 800 ".de", "troff input", 3, "text/troff", 801 ".if", "troff input", 3, "text/troff", 802 ".nr", "troff input", 3, "text/troff", 803 ".tr", "troff input", 3, "text/troff", 804 "vac:", "venti score", 4, "text/plain", 805 "-----BEGIN CERTIFICATE-----\n", 806 "pem certificate", -1, "text/plain", 807 "-----BEGIN TRUSTED CERTIFICATE-----\n", 808 "pem trusted certificate", -1, "text/plain", 809 "-----BEGIN X509 CERTIFICATE-----\n", 810 "pem x.509 certificate", -1, "text/plain", 811 "subject=/C=", "pem certificate with header", -1, "text/plain", 812 "process snapshot ", "process snapshot", -1, "application/snapfs", 813 0,0,0,0 814 }; 815 816 int 817 istring(void) 818 { 819 int i, l; 820 struct FILE_STRING *p; 821 822 for(p = file_string; p->key; p++) { 823 l = p->length; 824 if(l == -1) 825 l = strlen(p->key); 826 if(nbuf >= l && memcmp(buf, p->key, l) == 0) { 827 if(mime) 828 print("%s\n", p->mime); 829 else 830 print("%s\n", p->filetype); 831 return 1; 832 } 833 } 834 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 835 for(i = 5; i < nbuf; i++) 836 if(buf[i] == '\n') 837 break; 838 if(mime) 839 print(OCTET); 840 else 841 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 842 return 1; 843 } 844 return 0; 845 } 846 847 struct offstr 848 { 849 ulong off; 850 struct FILE_STRING; 851 } offstrs[] = { 852 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 853 0, 0, 0, 0, 0 854 }; 855 856 int 857 isoffstr(void) 858 { 859 int n; 860 char buf[256]; 861 struct offstr *p; 862 863 for(p = offstrs; p->key; p++) { 864 seek(fd, p->off, 0); 865 n = p->length; 866 if (n > sizeof buf) 867 n = sizeof buf; 868 if (readn(fd, buf, n) != n) 869 continue; 870 if(memcmp(buf, p->key, n) == 0) { 871 if(mime) 872 print("%s\n", p->mime); 873 else 874 print("%s\n", p->filetype); 875 return 1; 876 } 877 } 878 return 0; 879 } 880 881 int 882 iff(void) 883 { 884 if (strncmp((char*)buf, "FORM", 4) == 0 && 885 strncmp((char*)buf+8, "AIFF", 4) == 0) { 886 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 887 return 1; 888 } 889 return 0; 890 } 891 892 char* html_string[] = 893 { 894 "title", 895 "body", 896 "head", 897 "strong", 898 "h1", 899 "h2", 900 "h3", 901 "h4", 902 "h5", 903 "h6", 904 "ul", 905 "li", 906 "dl", 907 "br", 908 "em", 909 0, 910 }; 911 912 int 913 ishtml(void) 914 { 915 uchar *p, *q; 916 int i, count; 917 918 /* compare strings between '<' and '>' to html table */ 919 count = 0; 920 p = buf; 921 for(;;) { 922 while (p < buf+nbuf && *p != '<') 923 p++; 924 p++; 925 if (p >= buf+nbuf) 926 break; 927 if(*p == '/') 928 p++; 929 q = p; 930 while(p < buf+nbuf && *p != '>') 931 p++; 932 if (p >= buf+nbuf) 933 break; 934 for(i = 0; html_string[i]; i++) { 935 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 936 if(count++ > 4) { 937 print(mime ? "text/html\n" : "HTML file\n"); 938 return 1; 939 } 940 break; 941 } 942 } 943 p++; 944 } 945 return 0; 946 } 947 948 char* rfc822_string[] = 949 { 950 "from:", 951 "date:", 952 "to:", 953 "subject:", 954 "received:", 955 "reply to:", 956 "sender:", 957 0, 958 }; 959 960 int 961 isrfc822(void) 962 { 963 964 char *p, *q, *r; 965 int i, count; 966 967 count = 0; 968 p = (char*)buf; 969 for(;;) { 970 q = strchr(p, '\n'); 971 if(q == nil) 972 break; 973 *q = 0; 974 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 975 count++; 976 *q = '\n'; 977 p = q+1; 978 continue; 979 } 980 *q = '\n'; 981 if(*p != '\t' && *p != ' '){ 982 r = strchr(p, ':'); 983 if(r == 0 || r > q) 984 break; 985 for(i = 0; rfc822_string[i]; i++) { 986 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 987 count++; 988 break; 989 } 990 } 991 } 992 p = q+1; 993 } 994 if(count >= 3){ 995 print(mime ? "message/rfc822\n" : "email file\n"); 996 return 1; 997 } 998 return 0; 999 } 1000 1001 int 1002 ismbox(void) 1003 { 1004 char *p, *q; 1005 1006 p = (char*)buf; 1007 q = strchr(p, '\n'); 1008 if(q == nil) 1009 return 0; 1010 *q = 0; 1011 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 1012 print(mime ? "text/plain\n" : "mail box\n"); 1013 return 1; 1014 } 1015 *q = '\n'; 1016 return 0; 1017 } 1018 1019 int 1020 iscint(void) 1021 { 1022 int type; 1023 char *name; 1024 Biobuf b; 1025 1026 if(Binit(&b, fd, OREAD) == Beof) 1027 return 0; 1028 seek(fd, 0, 0); 1029 type = objtype(&b, &name); 1030 if(type < 0) 1031 return 0; 1032 if(mime) 1033 print(OCTET); 1034 else 1035 print("%s intermediate\n", name); 1036 return 1; 1037 } 1038 1039 int 1040 isc(void) 1041 { 1042 int n; 1043 1044 n = wfreq[I1]; 1045 /* 1046 * includes 1047 */ 1048 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1049 goto yes; 1050 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1051 goto yes; 1052 /* 1053 * declarations 1054 */ 1055 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1056 goto yes; 1057 /* 1058 * assignments 1059 */ 1060 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1061 goto yes; 1062 return 0; 1063 1064 yes: 1065 if(mime){ 1066 print(PLAIN); 1067 return 1; 1068 } 1069 if(wfreq[Alword] > 0) 1070 print("alef program\n"); 1071 else 1072 print("c program\n"); 1073 return 1; 1074 } 1075 1076 int 1077 islimbo(void) 1078 { 1079 1080 /* 1081 * includes 1082 */ 1083 if(wfreq[Lword] < 4) 1084 return 0; 1085 print(mime ? PLAIN : "limbo program\n"); 1086 return 1; 1087 } 1088 1089 int 1090 isas(void) 1091 { 1092 1093 /* 1094 * includes 1095 */ 1096 if(wfreq[Aword] < 2) 1097 return 0; 1098 print(mime ? PLAIN : "as program\n"); 1099 return 1; 1100 } 1101 1102 /* 1103 * low entropy means encrypted 1104 */ 1105 int 1106 ismung(void) 1107 { 1108 int i, bucket[8]; 1109 float cs; 1110 1111 if(nbuf < 64) 1112 return 0; 1113 memset(bucket, 0, sizeof(bucket)); 1114 for(i=nbuf-64; i<nbuf; i++) 1115 bucket[(buf[i]>>5)&07] += 1; 1116 1117 cs = 0.; 1118 for(i=0; i<8; i++) 1119 cs += (bucket[i]-8)*(bucket[i]-8); 1120 cs /= 8.; 1121 if(cs <= 24.322) { 1122 if(buf[0]==0x1f && buf[1]==0x9d) 1123 print(mime ? OCTET : "compressed\n"); 1124 else 1125 if(buf[0]==0x1f && buf[1]==0x8b) 1126 print(mime ? OCTET : "gzip compressed\n"); 1127 else 1128 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1129 print(mime ? OCTET : "bzip2 compressed\n"); 1130 else 1131 print(mime ? OCTET : "encrypted\n"); 1132 return 1; 1133 } 1134 return 0; 1135 } 1136 1137 /* 1138 * english by punctuation and frequencies 1139 */ 1140 int 1141 isenglish(void) 1142 { 1143 int vow, comm, rare, badpun, punct; 1144 char *p; 1145 1146 if(guess != Fascii && guess != Feascii) 1147 return 0; 1148 badpun = 0; 1149 punct = 0; 1150 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1151 switch(*p) { 1152 case '.': 1153 case ',': 1154 case ')': 1155 case '%': 1156 case ';': 1157 case ':': 1158 case '?': 1159 punct++; 1160 if(p[1] != ' ' && p[1] != '\n') 1161 badpun++; 1162 } 1163 if(badpun*5 > punct) 1164 return 0; 1165 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1166 return 0; 1167 if(2*cfreq[';'] > cfreq['e']) 1168 return 0; 1169 1170 vow = 0; 1171 for(p="AEIOU"; *p; p++) { 1172 vow += cfreq[*p]; 1173 vow += cfreq[tolower(*p)]; 1174 } 1175 comm = 0; 1176 for(p="ETAION"; *p; p++) { 1177 comm += cfreq[*p]; 1178 comm += cfreq[tolower(*p)]; 1179 } 1180 rare = 0; 1181 for(p="VJKQXZ"; *p; p++) { 1182 rare += cfreq[*p]; 1183 rare += cfreq[tolower(*p)]; 1184 } 1185 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1186 print(mime ? PLAIN : "English text\n"); 1187 return 1; 1188 } 1189 return 0; 1190 } 1191 1192 /* 1193 * pick up a number with 1194 * syntax _*[0-9]+_ 1195 */ 1196 #define P9BITLEN 12 1197 int 1198 p9bitnum(uchar *bp) 1199 { 1200 int n, c, len; 1201 1202 len = P9BITLEN; 1203 while(*bp == ' ') { 1204 bp++; 1205 len--; 1206 if(len <= 0) 1207 return -1; 1208 } 1209 n = 0; 1210 while(len > 1) { 1211 c = *bp++; 1212 if(!isdigit(c)) 1213 return -1; 1214 n = n*10 + c-'0'; 1215 len--; 1216 } 1217 if(*bp != ' ') 1218 return -1; 1219 return n; 1220 } 1221 1222 int 1223 depthof(char *s, int *newp) 1224 { 1225 char *es; 1226 int d; 1227 1228 *newp = 0; 1229 es = s+12; 1230 while(s<es && *s==' ') 1231 s++; 1232 if(s == es) 1233 return -1; 1234 if('0'<=*s && *s<='9') 1235 return 1<<strtol(s, 0, 0); 1236 1237 *newp = 1; 1238 d = 0; 1239 while(s<es && *s!=' '){ 1240 s++; /* skip letter */ 1241 d += strtoul(s, &s, 10); 1242 } 1243 1244 if(d % 8 == 0 || 8 % d == 0) 1245 return d; 1246 else 1247 return -1; 1248 } 1249 1250 int 1251 isp9bit(void) 1252 { 1253 int dep, lox, loy, hix, hiy, px, new, cmpr; 1254 ulong t; 1255 long len; 1256 char *newlabel; 1257 uchar *cp; 1258 1259 cp = buf; 1260 cmpr = 0; 1261 newlabel = "old "; 1262 1263 if(memcmp(cp, "compressed\n", 11) == 0) { 1264 cmpr = 1; 1265 cp = buf + 11; 1266 } 1267 1268 dep = depthof((char*)cp + 0*P9BITLEN, &new); 1269 if(new) 1270 newlabel = ""; 1271 lox = p9bitnum(cp + 1*P9BITLEN); 1272 loy = p9bitnum(cp + 2*P9BITLEN); 1273 hix = p9bitnum(cp + 3*P9BITLEN); 1274 hiy = p9bitnum(cp + 4*P9BITLEN); 1275 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1276 return 0; 1277 1278 if(dep < 8){ 1279 px = 8/dep; /* pixels per byte */ 1280 /* set l to number of bytes of data per scan line */ 1281 if(lox >= 0) 1282 len = (hix+px-1)/px - lox/px; 1283 else{ /* make positive before divide */ 1284 t = (-lox)+px-1; 1285 t = (t/px)*px; 1286 len = (t+hix+px-1)/px; 1287 } 1288 }else 1289 len = (hix-lox)*dep/8; 1290 len *= hiy - loy; /* col length */ 1291 len += 5 * P9BITLEN; /* size of initial ascii */ 1292 1293 /* 1294 * for compressed images, don't look any further. otherwise: 1295 * for image file, length is non-zero and must match calculation above. 1296 * for /dev/window and /dev/screen the length is always zero. 1297 * for subfont, the subfont header should follow immediately. 1298 */ 1299 if (cmpr) { 1300 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n", 1301 newlabel, dep); 1302 return 1; 1303 } 1304 /* 1305 * mbuf->length == 0 probably indicates reading a pipe. 1306 * Ghostscript sometimes produces a little extra on the end. 1307 */ 1308 if (len != 0 && (mbuf->length == 0 || mbuf->length == len || 1309 mbuf->length > len && mbuf->length < len+P9BITLEN)) { 1310 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1311 return 1; 1312 } 1313 if (p9subfont(buf+len)) { 1314 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep); 1315 return 1; 1316 } 1317 return 0; 1318 } 1319 1320 int 1321 p9subfont(uchar *p) 1322 { 1323 int n, h, a; 1324 1325 /* if image too big, assume it's a subfont */ 1326 if (p+3*P9BITLEN > buf+sizeof(buf)) 1327 return 1; 1328 1329 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1330 if (n < 0) 1331 return 0; 1332 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1333 if (h < 0) 1334 return 0; 1335 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1336 if (a < 0) 1337 return 0; 1338 return 1; 1339 } 1340 1341 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1342 1343 int 1344 isp9font(void) 1345 { 1346 uchar *cp, *p; 1347 int i, n; 1348 char pathname[1024]; 1349 1350 cp = buf; 1351 if (!getfontnum(cp, &cp)) /* height */ 1352 return 0; 1353 if (!getfontnum(cp, &cp)) /* ascent */ 1354 return 0; 1355 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1356 if (!getfontnum(cp, &cp)) /* min */ 1357 break; 1358 if (!getfontnum(cp, &cp)) /* max */ 1359 return 0; 1360 getfontnum(cp, &cp); /* optional offset */ 1361 while (WHITESPACE(*cp)) 1362 cp++; 1363 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1364 ; 1365 /* construct a path name, if needed */ 1366 n = 0; 1367 if (*p != '/' && slash) { 1368 n = slash-fname+1; 1369 if (n < sizeof(pathname)) 1370 memcpy(pathname, fname, n); 1371 else n = 0; 1372 } 1373 if (n+cp-p+4 < sizeof(pathname)) { 1374 memcpy(pathname+n, p, cp-p); 1375 n += cp-p; 1376 pathname[n] = 0; 1377 if (access(pathname, AEXIST) < 0) { 1378 strcpy(pathname+n, ".0"); 1379 if (access(pathname, AEXIST) < 0) 1380 return 0; 1381 } 1382 } 1383 } 1384 if (i) { 1385 print(mime ? "text/plain\n" : "font file\n"); 1386 return 1; 1387 } 1388 return 0; 1389 } 1390 1391 int 1392 getfontnum(uchar *cp, uchar **rp) 1393 { 1394 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1395 cp++; 1396 if (*cp < '0' || *cp > '9') 1397 return 0; 1398 strtoul((char *)cp, (char **)rp, 0); 1399 if (!WHITESPACE(**rp)) { 1400 *rp = cp; 1401 return 0; 1402 } 1403 return 1; 1404 } 1405 1406 int 1407 isrtf(void) 1408 { 1409 if(strstr((char *)buf, "\\rtf1")){ 1410 print(mime ? "application/rtf\n" : "rich text format\n"); 1411 return 1; 1412 } 1413 return 0; 1414 } 1415 1416 int 1417 ismsdos(void) 1418 { 1419 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1420 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1421 return 1; 1422 } 1423 return 0; 1424 } 1425 1426 int 1427 iself(void) 1428 { 1429 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1430 [1] "WE32100", 1431 [2] "SPARC", 1432 [3] "i386", 1433 [4] "M68000", 1434 [5] "M88000", 1435 [6] "i486", 1436 [7] "i860", 1437 [8] "R3000", 1438 [9] "S370", 1439 [10] "R4000", 1440 [15] "HP-PA", 1441 [18] "sparc v8+", 1442 [19] "i960", 1443 [20] "PPC-32", 1444 [21] "PPC-64", 1445 [40] "ARM", 1446 [41] "Alpha", 1447 [43] "sparc v9", 1448 [50] "IA-64", 1449 [62] "AMD64", 1450 [75] "VAX", 1451 }; 1452 static char *type[] = { 1453 [1] "relocatable object", 1454 [2] "executable", 1455 [3] "shared library", 1456 [4] "core dump", 1457 }; 1458 1459 if (memcmp(buf, "\x7fELF", 4) == 0){ 1460 if (!mime){ 1461 int isdifend = 0; 1462 int n = (buf[19] << 8) | buf[18]; 1463 char *p = "unknown"; 1464 char *t = "unknown"; 1465 1466 if (n > 0 && n < nelem(cpu) && cpu[n]) 1467 p = cpu[n]; 1468 else { 1469 /* try the other byte order */ 1470 isdifend = 1; 1471 n = (buf[18] << 8) | buf[19]; 1472 if (n > 0 && n < nelem(cpu) && cpu[n]) 1473 p = cpu[n]; 1474 } 1475 if(isdifend) 1476 n = (buf[16]<< 8) | buf[17]; 1477 else 1478 n = (buf[17]<< 8) | buf[16]; 1479 1480 if(n>0 && n < nelem(type) && type[n]) 1481 t = type[n]; 1482 print("%s ELF %s\n", p, t); 1483 } 1484 else 1485 print("application/x-elf-executable"); 1486 return 1; 1487 } 1488 1489 return 0; 1490 } 1491 1492 int 1493 isface(void) 1494 { 1495 int i, j, ldepth, l; 1496 char *p; 1497 1498 ldepth = -1; 1499 for(j = 0; j < 3; j++){ 1500 for(p = (char*)buf, i=0; i<3; i++){ 1501 if(p[0] != '0' || p[1] != 'x') 1502 return 0; 1503 if(buf[2+8] == ',') 1504 l = 2; 1505 else if(buf[2+4] == ',') 1506 l = 1; 1507 else 1508 return 0; 1509 if(ldepth == -1) 1510 ldepth = l; 1511 if(l != ldepth) 1512 return 0; 1513 strtoul(p, &p, 16); 1514 if(*p++ != ',') 1515 return 0; 1516 while(*p == ' ' || *p == '\t') 1517 p++; 1518 } 1519 if (*p++ != '\n') 1520 return 0; 1521 } 1522 1523 if(mime) 1524 print("application/x-face\n"); 1525 else 1526 print("face image depth %d\n", ldepth); 1527 return 1; 1528 } 1529 1530