1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 isp9font, /* plan 9 font */ 192 isp9bit, /* plan 9 image (as from /dev/window) */ 193 isrtf, /* rich text format */ 194 ismsdos, /* msdos exe (virus file attachement) */ 195 isface, /* ascii face file */ 196 197 /* last resorts */ 198 ismung, /* entropy compressed/encrypted */ 199 isenglish, /* char frequency English */ 200 0 201 }; 202 203 int mime; 204 205 char OCTET[] = "application/octet-stream\n"; 206 char PLAIN[] = "text/plain\n"; 207 208 void 209 main(int argc, char *argv[]) 210 { 211 int i, j, maxlen; 212 char *cp; 213 Rune r; 214 215 ARGBEGIN{ 216 case 'm': 217 mime = 1; 218 break; 219 default: 220 fprint(2, "usage: file [-m] [file...]\n"); 221 exits("usage"); 222 }ARGEND; 223 224 maxlen = 0; 225 if(mime == 0 || argc > 1){ 226 for(i = 0; i < argc; i++) { 227 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 228 ; 229 if(j > maxlen) 230 maxlen = j; 231 } 232 } 233 if (argc <= 0) { 234 if(!mime) 235 print ("stdin: "); 236 filetype(0); 237 } 238 else { 239 for(i = 0; i < argc; i++) 240 type(argv[i], maxlen); 241 } 242 exits(0); 243 } 244 245 void 246 type(char *file, int nlen) 247 { 248 Rune r; 249 int i; 250 char *p; 251 252 if(nlen > 0){ 253 slash = 0; 254 for (i = 0, p = file; *p; i++) { 255 if (*p == '/') /* find rightmost slash */ 256 slash = p; 257 p += chartorune(&r, p); /* count runes */ 258 } 259 print("%s:%*s",file, nlen-i+1, ""); 260 } 261 fname = file; 262 if ((fd = open(file, OREAD)) < 0) { 263 print("cannot open: %r\n"); 264 return; 265 } 266 filetype(fd); 267 close(fd); 268 } 269 270 void 271 filetype(int fd) 272 { 273 Rune r; 274 int i, f, n; 275 char *p, *eob; 276 277 free(mbuf); 278 mbuf = dirfstat(fd); 279 if(mbuf == nil){ 280 print("cannot stat: %r\n"); 281 return; 282 } 283 if(mbuf->mode & DMDIR) { 284 print(mime ? OCTET : "directory\n"); 285 return; 286 } 287 if(mbuf->type != 'M' && mbuf->type != '|') { 288 print(mime ? OCTET : "special file #%C/%s\n", 289 mbuf->type, mbuf->name); 290 return; 291 } 292 /* may be reading a pipe on standard input */ 293 nbuf = readn(fd, buf, sizeof(buf)-1); 294 if(nbuf < 0) { 295 print("cannot read: %r\n"); 296 return; 297 } 298 if(nbuf == 0) { 299 print(mime ? PLAIN : "empty file\n"); 300 return; 301 } 302 buf[nbuf] = 0; 303 304 /* 305 * build histogram table 306 */ 307 memset(cfreq, 0, sizeof(cfreq)); 308 for (i = 0; language[i].name; i++) 309 language[i].count = 0; 310 eob = (char *)buf+nbuf; 311 for(n = 0, p = (char *)buf; p < eob; n++) { 312 if (!fullrune(p, eob-p) && eob-p < UTFmax) 313 break; 314 p += chartorune(&r, p); 315 if (r == 0) 316 f = Cnull; 317 else if (r <= 0x7f) { 318 if (!isprint(r) && !isspace(r)) 319 f = Ceascii; /* ASCII control char */ 320 else f = r; 321 } else if (r == 0x80) { 322 bump_utf_count(r); 323 f = Cutf; 324 } else if (r < 0xA0) 325 f = Cbinary; /* Invalid Runes */ 326 else if (r <= 0xff) 327 f = Clatin; /* Latin 1 */ 328 else { 329 bump_utf_count(r); 330 f = Cutf; /* UTF extension */ 331 } 332 cfreq[f]++; /* ASCII chars peg directly */ 333 } 334 /* 335 * gross classify 336 */ 337 if (cfreq[Cbinary]) 338 guess = Fbinary; 339 else if (cfreq[Cutf]) 340 guess = Futf; 341 else if (cfreq[Clatin]) 342 guess = Flatin; 343 else if (cfreq[Ceascii]) 344 guess = Feascii; 345 else if (cfreq[Cnull]) 346 guess = Fbinary; 347 else 348 guess = Fascii; 349 /* 350 * lookup dictionary words 351 */ 352 memset(wfreq, 0, sizeof(wfreq)); 353 if(guess == Fascii || guess == Flatin || guess == Futf) 354 wordfreq(); 355 /* 356 * call individual classify routines 357 */ 358 for(i=0; call[i]; i++) 359 if((*call[i])()) 360 return; 361 362 /* 363 * if all else fails, 364 * print out gross classification 365 */ 366 if (nbuf < 100 && !mime) 367 print(mime ? PLAIN : "short "); 368 if (guess == Fascii) 369 print(mime ? PLAIN : "Ascii\n"); 370 else if (guess == Feascii) 371 print(mime ? PLAIN : "extended ascii\n"); 372 else if (guess == Flatin) 373 print(mime ? PLAIN : "latin ascii\n"); 374 else if (guess == Futf && utf_count() < 4) 375 print_utf(); 376 else print(mime ? OCTET : "binary\n"); 377 } 378 379 void 380 bump_utf_count(Rune r) 381 { 382 int low, high, mid; 383 384 high = sizeof(language)/sizeof(language[0])-1; 385 for (low = 0; low < high;) { 386 mid = (low+high)/2; 387 if (r >= language[mid].low) { 388 if (r <= language[mid].high) { 389 language[mid].count++; 390 break; 391 } else low = mid+1; 392 } else high = mid; 393 } 394 } 395 396 int 397 utf_count(void) 398 { 399 int i, count; 400 401 count = 0; 402 for (i = 0; language[i].name; i++) 403 if (language[i].count > 0) 404 switch (language[i].mode) { 405 case Normal: 406 case First: 407 count++; 408 break; 409 default: 410 break; 411 } 412 return count; 413 } 414 415 int 416 chkascii(void) 417 { 418 int i; 419 420 for (i = 'a'; i < 'z'; i++) 421 if (cfreq[i]) 422 return 1; 423 for (i = 'A'; i < 'Z'; i++) 424 if (cfreq[i]) 425 return 1; 426 return 0; 427 } 428 429 int 430 find_first(char *name) 431 { 432 int i; 433 434 for (i = 0; language[i].name != 0; i++) 435 if (language[i].mode == First 436 && strcmp(language[i].name, name) == 0) 437 return i; 438 return -1; 439 } 440 441 void 442 print_utf(void) 443 { 444 int i, printed, j; 445 446 if(mime){ 447 print(PLAIN); 448 return; 449 } 450 if (chkascii()) { 451 printed = 1; 452 print("Ascii"); 453 } else 454 printed = 0; 455 for (i = 0; language[i].name; i++) 456 if (language[i].count) { 457 switch(language[i].mode) { 458 case Multi: 459 j = find_first(language[i].name); 460 if (j < 0) 461 break; 462 if (language[j].count > 0) 463 break; 464 /* Fall through */ 465 case Normal: 466 case First: 467 if (printed) 468 print(" & "); 469 else printed = 1; 470 print("%s", language[i].name); 471 break; 472 case Shared: 473 default: 474 break; 475 } 476 } 477 if(!printed) 478 print("UTF"); 479 print(" text\n"); 480 } 481 482 void 483 wordfreq(void) 484 { 485 int low, high, mid, r; 486 uchar *p, *p2, c; 487 488 p = buf; 489 for(;;) { 490 while (p < buf+nbuf && !isalpha(*p)) 491 p++; 492 if (p >= buf+nbuf) 493 return; 494 p2 = p; 495 while(p < buf+nbuf && isalpha(*p)) 496 p++; 497 c = *p; 498 *p = 0; 499 high = sizeof(dict)/sizeof(dict[0]); 500 for(low = 0;low < high;) { 501 mid = (low+high)/2; 502 r = strcmp(dict[mid].word, (char*)p2); 503 if(r == 0) { 504 wfreq[dict[mid].class]++; 505 break; 506 } 507 if(r < 0) 508 low = mid+1; 509 else 510 high = mid; 511 } 512 *p++ = c; 513 } 514 } 515 516 typedef struct Filemagic Filemagic; 517 struct Filemagic { 518 ulong x; 519 ulong mask; 520 char *desc; 521 char *mime; 522 }; 523 524 /* 525 * integers in this table must be as seen on a little-endian machine 526 * when read from a file. 527 */ 528 Filemagic long0tab[] = { 529 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 530 /* "pac1" */ 531 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 532 /* "pXc2 */ 533 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 534 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 535 0x43614c66, 0xFFFFFFFF, "FLAC audio file\n", OCTET, 536 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 537 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 538 070707, 0xFFFF, "cpio archive\n", OCTET, 539 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 540 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 541 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 542 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 543 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 544 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 545 /* 0xfeedface: this could alternately be a Next Plan 9 boot image */ 546 0xcefaedfe, 0xFFFFFFFF, "32-bit power Mach-O executable\n", OCTET, 547 /* 0xfeedfacf */ 548 0xcffaedfe, 0xFFFFFFFF, "64-bit power Mach-O executable\n", OCTET, 549 /* 0xcefaedfe */ 550 0xfeedface, 0xFFFFFFFF, "386 Mach-O executable\n", OCTET, 551 /* 0xcffaedfe */ 552 0xfeedfacf, 0xFFFFFFFF, "amd64 Mach-O executable\n", OCTET, 553 /* 0xcafebabe */ 554 0xbebafeca, 0xFFFFFFFF, "Mach-O universal executable\n", OCTET, 555 /* 556 * these magic numbers are stored big-endian on disk, 557 * thus the numbers appear reversed in this table. 558 */ 559 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 560 0x2bb19a52, 0xFFFFFFFF, "paq archive\n", OCTET, 561 }; 562 563 int 564 filemagic(Filemagic *tab, int ntab, ulong x) 565 { 566 int i; 567 568 for(i=0; i<ntab; i++) 569 if((x&tab[i].mask) == tab[i].x){ 570 print(mime ? tab[i].mime : tab[i].desc); 571 return 1; 572 } 573 return 0; 574 } 575 576 int 577 long0(void) 578 { 579 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 580 } 581 582 typedef struct Fileoffmag Fileoffmag; 583 struct Fileoffmag { 584 ulong off; 585 Filemagic; 586 }; 587 588 /* 589 * integers in this table must be as seen on a little-endian machine 590 * when read from a file. 591 */ 592 Fileoffmag longofftab[] = { 593 /* 594 * these magic numbers are stored big-endian on disk, 595 * thus the numbers appear reversed in this table. 596 */ 597 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 598 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 599 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 600 4, 0x31647542, 0xFFFFFFFF, "OS X finder properties\n", OCTET, 601 }; 602 603 int 604 fileoffmagic(Fileoffmag *tab, int ntab) 605 { 606 int i; 607 ulong x; 608 Fileoffmag *tp; 609 uchar buf[sizeof(long)]; 610 611 for(i=0; i<ntab; i++) { 612 tp = tab + i; 613 seek(fd, tp->off, 0); 614 if (readn(fd, buf, sizeof buf) != sizeof buf) 615 continue; 616 x = LENDIAN(buf); 617 if((x&tp->mask) == tp->x){ 618 print(mime? tp->mime: tp->desc); 619 return 1; 620 } 621 } 622 return 0; 623 } 624 625 int 626 longoff(void) 627 { 628 return fileoffmagic(longofftab, nelem(longofftab)); 629 } 630 631 int 632 isexec(void) 633 { 634 Fhdr f; 635 636 seek(fd, 0, 0); /* reposition to start of file */ 637 if(crackhdr(fd, &f)) { 638 print(mime ? OCTET : "%s\n", f.name); 639 return 1; 640 } 641 return 0; 642 } 643 644 645 /* from tar.c */ 646 enum { NAMSIZ = 100, TBLOCK = 512 }; 647 648 union hblock 649 { 650 char dummy[TBLOCK]; 651 struct header 652 { 653 char name[NAMSIZ]; 654 char mode[8]; 655 char uid[8]; 656 char gid[8]; 657 char size[12]; 658 char mtime[12]; 659 char chksum[8]; 660 char linkflag; 661 char linkname[NAMSIZ]; 662 /* rest are defined by POSIX's ustar format; see p1003.2b */ 663 char magic[6]; /* "ustar" */ 664 char version[2]; 665 char uname[32]; 666 char gname[32]; 667 char devmajor[8]; 668 char devminor[8]; 669 char prefix[155]; /* if non-null, path = prefix "/" name */ 670 } dbuf; 671 }; 672 673 int 674 checksum(union hblock *hp) 675 { 676 int i; 677 char *cp; 678 struct header *hdr = &hp->dbuf; 679 680 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 681 *cp = ' '; 682 i = 0; 683 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 684 i += *cp & 0xff; 685 return i; 686 } 687 688 int 689 istar(void) 690 { 691 int chksum; 692 char tblock[TBLOCK]; 693 union hblock *hp = (union hblock *)tblock; 694 struct header *hdr = &hp->dbuf; 695 696 seek(fd, 0, 0); /* reposition to start of file */ 697 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 698 return 0; 699 chksum = strtol(hdr->chksum, 0, 8); 700 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 701 if (strcmp(hdr->magic, "ustar") == 0) 702 print(mime? "application/x-ustar\n": 703 "posix tar archive\n"); 704 else 705 print(mime? "application/x-tar\n": "tar archive\n"); 706 return 1; 707 } 708 return 0; 709 } 710 711 /* 712 * initial words to classify file 713 */ 714 struct FILE_STRING 715 { 716 char *key; 717 char *filetype; 718 int length; 719 char *mime; 720 } file_string[] = 721 { 722 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 723 "!<arch>\n", "archive", 8, "application/octet-stream", 724 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 725 "#!/bin/rc", "rc executable file", 9, "text/plain", 726 "#!/bin/sh", "sh executable file", 9, "text/plain", 727 "%!", "postscript", 2, "application/postscript", 728 "\004%!", "postscript", 3, "application/postscript", 729 "x T post", "troff output for post", 8, "application/troff", 730 "x T Latin1", "troff output for Latin1", 10, "application/troff", 731 "x T utf", "troff output for UTF", 7, "application/troff", 732 "x T 202", "troff output for 202", 7, "application/troff", 733 "x T aps", "troff output for aps", 7, "application/troff", 734 "x T ", "troff output", 4, "application/troff", 735 "GIF", "GIF image", 3, "image/gif", 736 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 737 "%PDF", "PDF", 4, "application/pdf", 738 "<html>\n", "HTML file", 7, "text/html", 739 "<HTML>\n", "HTML file", 7, "text/html", 740 "\111\111\052\000", "tiff", 4, "image/tiff", 741 "\115\115\000\052", "tiff", 4, "image/tiff", 742 "\377\330\377\340", "jpeg", 4, "image/jpeg", 743 "\377\330\377\341", "jpeg", 4, "image/jpeg", 744 "\377\330\377\333", "jpeg", 4, "image/jpeg", 745 "BM", "bmp", 2, "image/bmp", 746 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 747 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 748 "\033E\033", "HP PCL printer data", 3, OCTET, 749 "\033&", "HP PCL printer data", 2, OCTET, 750 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 751 "\033Lua", "Lua bytecode", 4, OCTET, 752 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 753 "\211PNG", "PNG image", 4, "image/png", 754 "P3\n", "ppm", 3, "image/ppm", 755 "P6\n", "ppm", 3, "image/ppm", 756 "/* XPM */\n", "xbm", 10, "image/xbm", 757 ".HTML ", "troff -ms input", 6, "text/troff", 758 ".LP", "troff -ms input", 3, "text/troff", 759 ".ND", "troff -ms input", 3, "text/troff", 760 ".PP", "troff -ms input", 3, "text/troff", 761 ".TL", "troff -ms input", 3, "text/troff", 762 ".TR", "troff -ms input", 3, "text/troff", 763 ".TH", "manual page", 3, "text/troff", 764 ".\\\"", "troff input", 3, "text/troff", 765 ".de", "troff input", 3, "text/troff", 766 ".if", "troff input", 3, "text/troff", 767 ".nr", "troff input", 3, "text/troff", 768 ".tr", "troff input", 3, "text/troff", 769 "vac:", "venti score", 4, "text/plain", 770 "-----BEGIN CERTIFICATE-----\n", 771 "pem certificate", -1, "text/plain", 772 "-----BEGIN TRUSTED CERTIFICATE-----\n", 773 "pem trusted certificate", -1, "text/plain", 774 "-----BEGIN X509 CERTIFICATE-----\n", 775 "pem x.509 certificate", -1, "text/plain", 776 "subject=/C=", "pem certificate with header", -1, "text/plain", 777 "process snapshot ", "process snapshot", -1, "application/snapfs", 778 "BEGIN:VCARD\r\n", "vCard", 13, "text/directory;profile=vcard", 779 "BEGIN:VCARD\n", "vCard", 12, "text/directory;profile=vcard", 780 0,0,0,0 781 }; 782 783 int 784 istring(void) 785 { 786 int i, l; 787 struct FILE_STRING *p; 788 789 for(p = file_string; p->key; p++) { 790 l = p->length; 791 if(l == -1) 792 l = strlen(p->key); 793 if(nbuf >= l && memcmp(buf, p->key, l) == 0) { 794 if(mime) 795 print("%s\n", p->mime); 796 else 797 print("%s\n", p->filetype); 798 return 1; 799 } 800 } 801 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 802 for(i = 5; i < nbuf; i++) 803 if(buf[i] == '\n') 804 break; 805 if(mime) 806 print(OCTET); 807 else 808 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 809 return 1; 810 } 811 return 0; 812 } 813 814 struct offstr 815 { 816 ulong off; 817 struct FILE_STRING; 818 } offstrs[] = { 819 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 820 0, 0, 0, 0, 0 821 }; 822 823 int 824 isoffstr(void) 825 { 826 int n; 827 char buf[256]; 828 struct offstr *p; 829 830 for(p = offstrs; p->key; p++) { 831 seek(fd, p->off, 0); 832 n = p->length; 833 if (n > sizeof buf) 834 n = sizeof buf; 835 if (readn(fd, buf, n) != n) 836 continue; 837 if(memcmp(buf, p->key, n) == 0) { 838 if(mime) 839 print("%s\n", p->mime); 840 else 841 print("%s\n", p->filetype); 842 return 1; 843 } 844 } 845 return 0; 846 } 847 848 int 849 iff(void) 850 { 851 if (strncmp((char*)buf, "FORM", 4) == 0 && 852 strncmp((char*)buf+8, "AIFF", 4) == 0) { 853 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 854 return 1; 855 } 856 if (strncmp((char*)buf, "RIFF", 4) == 0) { 857 if (strncmp((char*)buf+8, "WAVE", 4) == 0) 858 print("%s\n", mime? "audio/wave": "wave audio"); 859 else if (strncmp((char*)buf+8, "AVI ", 4) == 0) 860 print("%s\n", mime? "video/avi": "avi video"); 861 else 862 print("%s\n", mime? "application/octet-stream": 863 "riff file"); 864 return 1; 865 } 866 return 0; 867 } 868 869 char* html_string[] = 870 { 871 "title", 872 "body", 873 "head", 874 "strong", 875 "h1", 876 "h2", 877 "h3", 878 "h4", 879 "h5", 880 "h6", 881 "ul", 882 "li", 883 "dl", 884 "br", 885 "em", 886 0, 887 }; 888 889 int 890 ishtml(void) 891 { 892 uchar *p, *q; 893 int i, count; 894 895 /* compare strings between '<' and '>' to html table */ 896 count = 0; 897 p = buf; 898 for(;;) { 899 while (p < buf+nbuf && *p != '<') 900 p++; 901 p++; 902 if (p >= buf+nbuf) 903 break; 904 if(*p == '/') 905 p++; 906 q = p; 907 while(p < buf+nbuf && *p != '>') 908 p++; 909 if (p >= buf+nbuf) 910 break; 911 for(i = 0; html_string[i]; i++) { 912 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 913 if(count++ > 4) { 914 print(mime ? "text/html\n" : "HTML file\n"); 915 return 1; 916 } 917 break; 918 } 919 } 920 p++; 921 } 922 return 0; 923 } 924 925 char* rfc822_string[] = 926 { 927 "from:", 928 "date:", 929 "to:", 930 "subject:", 931 "received:", 932 "reply to:", 933 "sender:", 934 0, 935 }; 936 937 int 938 isrfc822(void) 939 { 940 941 char *p, *q, *r; 942 int i, count; 943 944 count = 0; 945 p = (char*)buf; 946 for(;;) { 947 q = strchr(p, '\n'); 948 if(q == nil) 949 break; 950 *q = 0; 951 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 952 count++; 953 *q = '\n'; 954 p = q+1; 955 continue; 956 } 957 *q = '\n'; 958 if(*p != '\t' && *p != ' '){ 959 r = strchr(p, ':'); 960 if(r == 0 || r > q) 961 break; 962 for(i = 0; rfc822_string[i]; i++) { 963 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 964 count++; 965 break; 966 } 967 } 968 } 969 p = q+1; 970 } 971 if(count >= 3){ 972 print(mime ? "message/rfc822\n" : "email file\n"); 973 return 1; 974 } 975 return 0; 976 } 977 978 int 979 ismbox(void) 980 { 981 char *p, *q; 982 983 p = (char*)buf; 984 q = strchr(p, '\n'); 985 if(q == nil) 986 return 0; 987 *q = 0; 988 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 989 print(mime ? "text/plain\n" : "mail box\n"); 990 return 1; 991 } 992 *q = '\n'; 993 return 0; 994 } 995 996 int 997 iscint(void) 998 { 999 int type; 1000 char *name; 1001 Biobuf b; 1002 1003 if(Binit(&b, fd, OREAD) == Beof) 1004 return 0; 1005 seek(fd, 0, 0); 1006 type = objtype(&b, &name); 1007 if(type < 0) 1008 return 0; 1009 if(mime) 1010 print(OCTET); 1011 else 1012 print("%s intermediate\n", name); 1013 return 1; 1014 } 1015 1016 int 1017 isc(void) 1018 { 1019 int n; 1020 1021 n = wfreq[I1]; 1022 /* 1023 * includes 1024 */ 1025 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1026 goto yes; 1027 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1028 goto yes; 1029 /* 1030 * declarations 1031 */ 1032 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1033 goto yes; 1034 /* 1035 * assignments 1036 */ 1037 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1038 goto yes; 1039 return 0; 1040 1041 yes: 1042 if(mime){ 1043 print(PLAIN); 1044 return 1; 1045 } 1046 if(wfreq[Alword] > 0) 1047 print("alef program\n"); 1048 else 1049 print("c program\n"); 1050 return 1; 1051 } 1052 1053 int 1054 islimbo(void) 1055 { 1056 1057 /* 1058 * includes 1059 */ 1060 if(wfreq[Lword] < 4) 1061 return 0; 1062 print(mime ? PLAIN : "limbo program\n"); 1063 return 1; 1064 } 1065 1066 int 1067 isas(void) 1068 { 1069 1070 /* 1071 * includes 1072 */ 1073 if(wfreq[Aword] < 2) 1074 return 0; 1075 print(mime ? PLAIN : "as program\n"); 1076 return 1; 1077 } 1078 1079 /* 1080 * low entropy means encrypted 1081 */ 1082 int 1083 ismung(void) 1084 { 1085 int i, bucket[8]; 1086 float cs; 1087 1088 if(nbuf < 64) 1089 return 0; 1090 memset(bucket, 0, sizeof(bucket)); 1091 for(i=nbuf-64; i<nbuf; i++) 1092 bucket[(buf[i]>>5)&07] += 1; 1093 1094 cs = 0.; 1095 for(i=0; i<8; i++) 1096 cs += (bucket[i]-8)*(bucket[i]-8); 1097 cs /= 8.; 1098 if(cs <= 24.322) { 1099 if(buf[0]==0x1f && buf[1]==0x9d) 1100 print(mime ? OCTET : "compressed\n"); 1101 else 1102 if(buf[0]==0x1f && buf[1]==0x8b) 1103 print(mime ? OCTET : "gzip compressed\n"); 1104 else 1105 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1106 print(mime ? OCTET : "bzip2 compressed\n"); 1107 else 1108 print(mime ? OCTET : "encrypted\n"); 1109 return 1; 1110 } 1111 return 0; 1112 } 1113 1114 /* 1115 * english by punctuation and frequencies 1116 */ 1117 int 1118 isenglish(void) 1119 { 1120 int vow, comm, rare, badpun, punct; 1121 char *p; 1122 1123 if(guess != Fascii && guess != Feascii) 1124 return 0; 1125 badpun = 0; 1126 punct = 0; 1127 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1128 switch(*p) { 1129 case '.': 1130 case ',': 1131 case ')': 1132 case '%': 1133 case ';': 1134 case ':': 1135 case '?': 1136 punct++; 1137 if(p[1] != ' ' && p[1] != '\n') 1138 badpun++; 1139 } 1140 if(badpun*5 > punct) 1141 return 0; 1142 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1143 return 0; 1144 if(2*cfreq[';'] > cfreq['e']) 1145 return 0; 1146 1147 vow = 0; 1148 for(p="AEIOU"; *p; p++) { 1149 vow += cfreq[*p]; 1150 vow += cfreq[tolower(*p)]; 1151 } 1152 comm = 0; 1153 for(p="ETAION"; *p; p++) { 1154 comm += cfreq[*p]; 1155 comm += cfreq[tolower(*p)]; 1156 } 1157 rare = 0; 1158 for(p="VJKQXZ"; *p; p++) { 1159 rare += cfreq[*p]; 1160 rare += cfreq[tolower(*p)]; 1161 } 1162 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1163 print(mime ? PLAIN : "English text\n"); 1164 return 1; 1165 } 1166 return 0; 1167 } 1168 1169 /* 1170 * pick up a number with 1171 * syntax _*[0-9]+_ 1172 */ 1173 #define P9BITLEN 12 1174 int 1175 p9bitnum(uchar *bp) 1176 { 1177 int n, c, len; 1178 1179 len = P9BITLEN; 1180 while(*bp == ' ') { 1181 bp++; 1182 len--; 1183 if(len <= 0) 1184 return -1; 1185 } 1186 n = 0; 1187 while(len > 1) { 1188 c = *bp++; 1189 if(!isdigit(c)) 1190 return -1; 1191 n = n*10 + c-'0'; 1192 len--; 1193 } 1194 if(*bp != ' ') 1195 return -1; 1196 return n; 1197 } 1198 1199 int 1200 depthof(char *s, int *newp) 1201 { 1202 char *es; 1203 int d; 1204 1205 *newp = 0; 1206 es = s+12; 1207 while(s<es && *s==' ') 1208 s++; 1209 if(s == es) 1210 return -1; 1211 if('0'<=*s && *s<='9') 1212 return 1<<strtol(s, 0, 0); 1213 1214 *newp = 1; 1215 d = 0; 1216 while(s<es && *s!=' '){ 1217 s++; /* skip letter */ 1218 d += strtoul(s, &s, 10); 1219 } 1220 1221 if(d % 8 == 0 || 8 % d == 0) 1222 return d; 1223 else 1224 return -1; 1225 } 1226 1227 int 1228 isp9bit(void) 1229 { 1230 int dep, lox, loy, hix, hiy, px, new, cmpr; 1231 ulong t; 1232 long len; 1233 char *newlabel; 1234 uchar *cp; 1235 1236 cp = buf; 1237 cmpr = 0; 1238 newlabel = "old "; 1239 1240 if(memcmp(cp, "compressed\n", 11) == 0) { 1241 cmpr = 1; 1242 cp = buf + 11; 1243 } 1244 1245 dep = depthof((char*)cp + 0*P9BITLEN, &new); 1246 if(new) 1247 newlabel = ""; 1248 lox = p9bitnum(cp + 1*P9BITLEN); 1249 loy = p9bitnum(cp + 2*P9BITLEN); 1250 hix = p9bitnum(cp + 3*P9BITLEN); 1251 hiy = p9bitnum(cp + 4*P9BITLEN); 1252 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1253 return 0; 1254 1255 if(dep < 8){ 1256 px = 8/dep; /* pixels per byte */ 1257 /* set l to number of bytes of data per scan line */ 1258 if(lox >= 0) 1259 len = (hix+px-1)/px - lox/px; 1260 else{ /* make positive before divide */ 1261 t = (-lox)+px-1; 1262 t = (t/px)*px; 1263 len = (t+hix+px-1)/px; 1264 } 1265 }else 1266 len = (hix-lox)*dep/8; 1267 len *= hiy - loy; /* col length */ 1268 len += 5 * P9BITLEN; /* size of initial ascii */ 1269 1270 /* 1271 * for compressed images, don't look any further. otherwise: 1272 * for image file, length is non-zero and must match calculation above. 1273 * for /dev/window and /dev/screen the length is always zero. 1274 * for subfont, the subfont header should follow immediately. 1275 */ 1276 if (cmpr) { 1277 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n", 1278 newlabel, dep); 1279 return 1; 1280 } 1281 /* 1282 * mbuf->length == 0 probably indicates reading a pipe. 1283 * Ghostscript sometimes produces a little extra on the end. 1284 */ 1285 if (len != 0 && (mbuf->length == 0 || mbuf->length == len || 1286 mbuf->length > len && mbuf->length < len+P9BITLEN)) { 1287 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1288 return 1; 1289 } 1290 if (p9subfont(buf+len)) { 1291 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep); 1292 return 1; 1293 } 1294 return 0; 1295 } 1296 1297 int 1298 p9subfont(uchar *p) 1299 { 1300 int n, h, a; 1301 1302 /* if image too big, assume it's a subfont */ 1303 if (p+3*P9BITLEN > buf+sizeof(buf)) 1304 return 1; 1305 1306 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1307 if (n < 0) 1308 return 0; 1309 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1310 if (h < 0) 1311 return 0; 1312 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1313 if (a < 0) 1314 return 0; 1315 return 1; 1316 } 1317 1318 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1319 1320 int 1321 isp9font(void) 1322 { 1323 uchar *cp, *p; 1324 int i, n; 1325 char pathname[1024]; 1326 1327 cp = buf; 1328 if (!getfontnum(cp, &cp)) /* height */ 1329 return 0; 1330 if (!getfontnum(cp, &cp)) /* ascent */ 1331 return 0; 1332 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1333 if (!getfontnum(cp, &cp)) /* min */ 1334 break; 1335 if (!getfontnum(cp, &cp)) /* max */ 1336 return 0; 1337 getfontnum(cp, &cp); /* optional offset */ 1338 while (WHITESPACE(*cp)) 1339 cp++; 1340 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1341 ; 1342 /* construct a path name, if needed */ 1343 n = 0; 1344 if (*p != '/' && slash) { 1345 n = slash-fname+1; 1346 if (n < sizeof(pathname)) 1347 memcpy(pathname, fname, n); 1348 else n = 0; 1349 } 1350 if (n+cp-p+4 < sizeof(pathname)) { 1351 memcpy(pathname+n, p, cp-p); 1352 n += cp-p; 1353 pathname[n] = 0; 1354 if (access(pathname, AEXIST) < 0) { 1355 strcpy(pathname+n, ".0"); 1356 if (access(pathname, AEXIST) < 0) 1357 return 0; 1358 } 1359 } 1360 } 1361 if (i) { 1362 print(mime ? "text/plain\n" : "font file\n"); 1363 return 1; 1364 } 1365 return 0; 1366 } 1367 1368 int 1369 getfontnum(uchar *cp, uchar **rp) 1370 { 1371 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1372 cp++; 1373 if (*cp < '0' || *cp > '9') 1374 return 0; 1375 strtoul((char *)cp, (char **)rp, 0); 1376 if (!WHITESPACE(**rp)) { 1377 *rp = cp; 1378 return 0; 1379 } 1380 return 1; 1381 } 1382 1383 int 1384 isrtf(void) 1385 { 1386 if(strstr((char *)buf, "\\rtf1")){ 1387 print(mime ? "application/rtf\n" : "rich text format\n"); 1388 return 1; 1389 } 1390 return 0; 1391 } 1392 1393 int 1394 ismsdos(void) 1395 { 1396 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1397 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1398 return 1; 1399 } 1400 return 0; 1401 } 1402 1403 int 1404 iself(void) 1405 { 1406 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1407 [1] "WE32100", 1408 [2] "SPARC", 1409 [3] "i386", 1410 [4] "M68000", 1411 [5] "M88000", 1412 [6] "i486", 1413 [7] "i860", 1414 [8] "R3000", 1415 [9] "S370", 1416 [10] "R4000", 1417 [15] "HP-PA", 1418 [18] "sparc v8+", 1419 [19] "i960", 1420 [20] "PPC-32", 1421 [21] "PPC-64", 1422 [40] "ARM", 1423 [41] "Alpha", 1424 [43] "sparc v9", 1425 [50] "IA-64", 1426 [62] "AMD64", 1427 [75] "VAX", 1428 }; 1429 static char *type[] = { 1430 [1] "relocatable object", 1431 [2] "executable", 1432 [3] "shared library", 1433 [4] "core dump", 1434 }; 1435 1436 if (memcmp(buf, "\x7fELF", 4) == 0){ 1437 if (!mime){ 1438 int isdifend = 0; 1439 int n = (buf[19] << 8) | buf[18]; 1440 char *p = "unknown"; 1441 char *t = "unknown"; 1442 1443 if (n > 0 && n < nelem(cpu) && cpu[n]) 1444 p = cpu[n]; 1445 else { 1446 /* try the other byte order */ 1447 isdifend = 1; 1448 n = (buf[18] << 8) | buf[19]; 1449 if (n > 0 && n < nelem(cpu) && cpu[n]) 1450 p = cpu[n]; 1451 } 1452 if(isdifend) 1453 n = (buf[16]<< 8) | buf[17]; 1454 else 1455 n = (buf[17]<< 8) | buf[16]; 1456 1457 if(n>0 && n < nelem(type) && type[n]) 1458 t = type[n]; 1459 print("%s ELF %s\n", p, t); 1460 } 1461 else 1462 print("application/x-elf-executable"); 1463 return 1; 1464 } 1465 1466 return 0; 1467 } 1468 1469 int 1470 isface(void) 1471 { 1472 int i, j, ldepth, l; 1473 char *p; 1474 1475 ldepth = -1; 1476 for(j = 0; j < 3; j++){ 1477 for(p = (char*)buf, i=0; i<3; i++){ 1478 if(p[0] != '0' || p[1] != 'x') 1479 return 0; 1480 if(buf[2+8] == ',') 1481 l = 2; 1482 else if(buf[2+4] == ',') 1483 l = 1; 1484 else 1485 return 0; 1486 if(ldepth == -1) 1487 ldepth = l; 1488 if(l != ldepth) 1489 return 0; 1490 strtoul(p, &p, 16); 1491 if(*p++ != ',') 1492 return 0; 1493 while(*p == ' ' || *p == '\t') 1494 p++; 1495 } 1496 if (*p++ != '\n') 1497 return 0; 1498 } 1499 1500 if(mime) 1501 print("application/x-face\n"); 1502 else 1503 print("face image depth %d\n", ldepth); 1504 return 1; 1505 } 1506 1507