1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 isp9font, /* plan 9 font */ 192 isp9bit, /* plan 9 image (as from /dev/window) */ 193 isrtf, /* rich text format */ 194 ismsdos, /* msdos exe (virus file attachement) */ 195 isface, /* ascii face file */ 196 197 /* last resorts */ 198 ismung, /* entropy compressed/encrypted */ 199 isenglish, /* char frequency English */ 200 0 201 }; 202 203 int mime; 204 205 char OCTET[] = "application/octet-stream\n"; 206 char PLAIN[] = "text/plain\n"; 207 208 void 209 main(int argc, char *argv[]) 210 { 211 int i, j, maxlen; 212 char *cp; 213 Rune r; 214 215 ARGBEGIN{ 216 case 'm': 217 mime = 1; 218 break; 219 default: 220 fprint(2, "usage: file [-m] [file...]\n"); 221 exits("usage"); 222 }ARGEND; 223 224 maxlen = 0; 225 if(mime == 0 || argc > 1){ 226 for(i = 0; i < argc; i++) { 227 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 228 ; 229 if(j > maxlen) 230 maxlen = j; 231 } 232 } 233 if (argc <= 0) { 234 if(!mime) 235 print ("stdin: "); 236 filetype(0); 237 } 238 else { 239 for(i = 0; i < argc; i++) 240 type(argv[i], maxlen); 241 } 242 exits(0); 243 } 244 245 void 246 type(char *file, int nlen) 247 { 248 Rune r; 249 int i; 250 char *p; 251 252 if(nlen > 0){ 253 slash = 0; 254 for (i = 0, p = file; *p; i++) { 255 if (*p == '/') /* find rightmost slash */ 256 slash = p; 257 p += chartorune(&r, p); /* count runes */ 258 } 259 print("%s:%*s",file, nlen-i+1, ""); 260 } 261 fname = file; 262 if ((fd = open(file, OREAD)) < 0) { 263 print("cannot open: %r\n"); 264 return; 265 } 266 filetype(fd); 267 close(fd); 268 } 269 270 void 271 filetype(int fd) 272 { 273 Rune r; 274 int i, f, n; 275 char *p, *eob; 276 277 free(mbuf); 278 mbuf = dirfstat(fd); 279 if(mbuf == nil){ 280 print("cannot stat: %r\n"); 281 return; 282 } 283 if(mbuf->mode & DMDIR) { 284 print(mime ? OCTET : "directory\n"); 285 return; 286 } 287 if(mbuf->type != 'M' && mbuf->type != '|') { 288 print(mime ? OCTET : "special file #%C/%s\n", 289 mbuf->type, mbuf->name); 290 return; 291 } 292 /* may be reading a pipe on standard input */ 293 nbuf = readn(fd, buf, sizeof(buf)-1); 294 if(nbuf < 0) { 295 print("cannot read: %r\n"); 296 return; 297 } 298 if(nbuf == 0) { 299 print(mime ? PLAIN : "empty file\n"); 300 return; 301 } 302 buf[nbuf] = 0; 303 304 /* 305 * build histogram table 306 */ 307 memset(cfreq, 0, sizeof(cfreq)); 308 for (i = 0; language[i].name; i++) 309 language[i].count = 0; 310 eob = (char *)buf+nbuf; 311 for(n = 0, p = (char *)buf; p < eob; n++) { 312 if (!fullrune(p, eob-p) && eob-p < UTFmax) 313 break; 314 p += chartorune(&r, p); 315 if (r == 0) 316 f = Cnull; 317 else if (r <= 0x7f) { 318 if (!isprint(r) && !isspace(r)) 319 f = Ceascii; /* ASCII control char */ 320 else f = r; 321 } else if (r == 0x80) { 322 bump_utf_count(r); 323 f = Cutf; 324 } else if (r < 0xA0) 325 f = Cbinary; /* Invalid Runes */ 326 else if (r <= 0xff) 327 f = Clatin; /* Latin 1 */ 328 else { 329 bump_utf_count(r); 330 f = Cutf; /* UTF extension */ 331 } 332 cfreq[f]++; /* ASCII chars peg directly */ 333 } 334 /* 335 * gross classify 336 */ 337 if (cfreq[Cbinary]) 338 guess = Fbinary; 339 else if (cfreq[Cutf]) 340 guess = Futf; 341 else if (cfreq[Clatin]) 342 guess = Flatin; 343 else if (cfreq[Ceascii]) 344 guess = Feascii; 345 else if (cfreq[Cnull]) 346 guess = Fbinary; 347 else 348 guess = Fascii; 349 /* 350 * lookup dictionary words 351 */ 352 memset(wfreq, 0, sizeof(wfreq)); 353 if(guess == Fascii || guess == Flatin || guess == Futf) 354 wordfreq(); 355 /* 356 * call individual classify routines 357 */ 358 for(i=0; call[i]; i++) 359 if((*call[i])()) 360 return; 361 362 /* 363 * if all else fails, 364 * print out gross classification 365 */ 366 if (nbuf < 100 && !mime) 367 print(mime ? PLAIN : "short "); 368 if (guess == Fascii) 369 print(mime ? PLAIN : "Ascii\n"); 370 else if (guess == Feascii) 371 print(mime ? PLAIN : "extended ascii\n"); 372 else if (guess == Flatin) 373 print(mime ? PLAIN : "latin ascii\n"); 374 else if (guess == Futf && utf_count() < 4) 375 print_utf(); 376 else print(mime ? OCTET : "binary\n"); 377 } 378 379 void 380 bump_utf_count(Rune r) 381 { 382 int low, high, mid; 383 384 high = sizeof(language)/sizeof(language[0])-1; 385 for (low = 0; low < high;) { 386 mid = (low+high)/2; 387 if (r >= language[mid].low) { 388 if (r <= language[mid].high) { 389 language[mid].count++; 390 break; 391 } else low = mid+1; 392 } else high = mid; 393 } 394 } 395 396 int 397 utf_count(void) 398 { 399 int i, count; 400 401 count = 0; 402 for (i = 0; language[i].name; i++) 403 if (language[i].count > 0) 404 switch (language[i].mode) { 405 case Normal: 406 case First: 407 count++; 408 break; 409 default: 410 break; 411 } 412 return count; 413 } 414 415 int 416 chkascii(void) 417 { 418 int i; 419 420 for (i = 'a'; i < 'z'; i++) 421 if (cfreq[i]) 422 return 1; 423 for (i = 'A'; i < 'Z'; i++) 424 if (cfreq[i]) 425 return 1; 426 return 0; 427 } 428 429 int 430 find_first(char *name) 431 { 432 int i; 433 434 for (i = 0; language[i].name != 0; i++) 435 if (language[i].mode == First 436 && strcmp(language[i].name, name) == 0) 437 return i; 438 return -1; 439 } 440 441 void 442 print_utf(void) 443 { 444 int i, printed, j; 445 446 if(mime){ 447 print(PLAIN); 448 return; 449 } 450 if (chkascii()) { 451 printed = 1; 452 print("Ascii"); 453 } else 454 printed = 0; 455 for (i = 0; language[i].name; i++) 456 if (language[i].count) { 457 switch(language[i].mode) { 458 case Multi: 459 j = find_first(language[i].name); 460 if (j < 0) 461 break; 462 if (language[j].count > 0) 463 break; 464 /* Fall through */ 465 case Normal: 466 case First: 467 if (printed) 468 print(" & "); 469 else printed = 1; 470 print("%s", language[i].name); 471 break; 472 case Shared: 473 default: 474 break; 475 } 476 } 477 if(!printed) 478 print("UTF"); 479 print(" text\n"); 480 } 481 482 void 483 wordfreq(void) 484 { 485 int low, high, mid, r; 486 uchar *p, *p2, c; 487 488 p = buf; 489 for(;;) { 490 while (p < buf+nbuf && !isalpha(*p)) 491 p++; 492 if (p >= buf+nbuf) 493 return; 494 p2 = p; 495 while(p < buf+nbuf && isalpha(*p)) 496 p++; 497 c = *p; 498 *p = 0; 499 high = sizeof(dict)/sizeof(dict[0]); 500 for(low = 0;low < high;) { 501 mid = (low+high)/2; 502 r = strcmp(dict[mid].word, (char*)p2); 503 if(r == 0) { 504 wfreq[dict[mid].class]++; 505 break; 506 } 507 if(r < 0) 508 low = mid+1; 509 else 510 high = mid; 511 } 512 *p++ = c; 513 } 514 } 515 516 typedef struct Filemagic Filemagic; 517 struct Filemagic { 518 ulong x; 519 ulong mask; 520 char *desc; 521 char *mime; 522 }; 523 524 /* 525 * integers in this table must be as seen on a little-endian machine 526 * when read from a file. 527 */ 528 Filemagic long0tab[] = { 529 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 530 /* "pac1" */ 531 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 532 /* "pXc2 */ 533 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 534 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 535 0x43614c66, 0xFFFFFFFF, "FLAC audio file\n", OCTET, 536 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 537 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 538 070707, 0xFFFF, "cpio archive\n", OCTET, 539 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 540 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 541 0xf0ff, 0xf6ff, "aac audio\n", "audio/mpeg", 542 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 543 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 544 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 545 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 546 /* 0xfeedface: this could alternately be a Next Plan 9 boot image */ 547 0xcefaedfe, 0xFFFFFFFF, "32-bit power Mach-O executable\n", OCTET, 548 /* 0xfeedfacf */ 549 0xcffaedfe, 0xFFFFFFFF, "64-bit power Mach-O executable\n", OCTET, 550 /* 0xcefaedfe */ 551 0xfeedface, 0xFFFFFFFF, "386 Mach-O executable\n", OCTET, 552 /* 0xcffaedfe */ 553 0xfeedfacf, 0xFFFFFFFF, "amd64 Mach-O executable\n", OCTET, 554 /* 0xcafebabe */ 555 0xbebafeca, 0xFFFFFFFF, "Mach-O universal executable\n", OCTET, 556 /* 557 * these magic numbers are stored big-endian on disk, 558 * thus the numbers appear reversed in this table. 559 */ 560 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 561 0x2bb19a52, 0xFFFFFFFF, "paq archive\n", OCTET, 562 }; 563 564 int 565 filemagic(Filemagic *tab, int ntab, ulong x) 566 { 567 int i; 568 569 for(i=0; i<ntab; i++) 570 if((x&tab[i].mask) == tab[i].x){ 571 print(mime ? tab[i].mime : tab[i].desc); 572 return 1; 573 } 574 return 0; 575 } 576 577 int 578 long0(void) 579 { 580 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 581 } 582 583 typedef struct Fileoffmag Fileoffmag; 584 struct Fileoffmag { 585 ulong off; 586 Filemagic; 587 }; 588 589 /* 590 * integers in this table must be as seen on a little-endian machine 591 * when read from a file. 592 */ 593 Fileoffmag longofftab[] = { 594 /* 595 * these magic numbers are stored big-endian on disk, 596 * thus the numbers appear reversed in this table. 597 */ 598 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 599 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 600 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 601 4, 0x31647542, 0xFFFFFFFF, "OS X finder properties\n", OCTET, 602 }; 603 604 int 605 fileoffmagic(Fileoffmag *tab, int ntab) 606 { 607 int i; 608 ulong x; 609 Fileoffmag *tp; 610 uchar buf[sizeof(long)]; 611 612 for(i=0; i<ntab; i++) { 613 tp = tab + i; 614 seek(fd, tp->off, 0); 615 if (readn(fd, buf, sizeof buf) != sizeof buf) 616 continue; 617 x = LENDIAN(buf); 618 if((x&tp->mask) == tp->x){ 619 print(mime? tp->mime: tp->desc); 620 return 1; 621 } 622 } 623 return 0; 624 } 625 626 int 627 longoff(void) 628 { 629 return fileoffmagic(longofftab, nelem(longofftab)); 630 } 631 632 int 633 isexec(void) 634 { 635 Fhdr f; 636 637 seek(fd, 0, 0); /* reposition to start of file */ 638 if(crackhdr(fd, &f)) { 639 print(mime ? OCTET : "%s\n", f.name); 640 return 1; 641 } 642 return 0; 643 } 644 645 646 /* from tar.c */ 647 enum { NAMSIZ = 100, TBLOCK = 512 }; 648 649 union hblock 650 { 651 char dummy[TBLOCK]; 652 struct header 653 { 654 char name[NAMSIZ]; 655 char mode[8]; 656 char uid[8]; 657 char gid[8]; 658 char size[12]; 659 char mtime[12]; 660 char chksum[8]; 661 char linkflag; 662 char linkname[NAMSIZ]; 663 /* rest are defined by POSIX's ustar format; see p1003.2b */ 664 char magic[6]; /* "ustar" */ 665 char version[2]; 666 char uname[32]; 667 char gname[32]; 668 char devmajor[8]; 669 char devminor[8]; 670 char prefix[155]; /* if non-null, path = prefix "/" name */ 671 } dbuf; 672 }; 673 674 int 675 checksum(union hblock *hp) 676 { 677 int i; 678 char *cp; 679 struct header *hdr = &hp->dbuf; 680 681 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 682 *cp = ' '; 683 i = 0; 684 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 685 i += *cp & 0xff; 686 return i; 687 } 688 689 int 690 istar(void) 691 { 692 int chksum; 693 char tblock[TBLOCK]; 694 union hblock *hp = (union hblock *)tblock; 695 struct header *hdr = &hp->dbuf; 696 697 seek(fd, 0, 0); /* reposition to start of file */ 698 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 699 return 0; 700 chksum = strtol(hdr->chksum, 0, 8); 701 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 702 if (strcmp(hdr->magic, "ustar") == 0) 703 print(mime? "application/x-ustar\n": 704 "posix tar archive\n"); 705 else 706 print(mime? "application/x-tar\n": "tar archive\n"); 707 return 1; 708 } 709 return 0; 710 } 711 712 /* 713 * initial words to classify file 714 */ 715 struct FILE_STRING 716 { 717 char *key; 718 char *filetype; 719 int length; 720 char *mime; 721 } file_string[] = 722 { 723 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 724 "!<arch>\n", "archive", 8, "application/octet-stream", 725 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 726 "#!/bin/rc", "rc executable file", 9, "text/plain", 727 "#!/bin/sh", "sh executable file", 9, "text/plain", 728 "%!", "postscript", 2, "application/postscript", 729 "\004%!", "postscript", 3, "application/postscript", 730 "x T post", "troff output for post", 8, "application/troff", 731 "x T Latin1", "troff output for Latin1", 10, "application/troff", 732 "x T utf", "troff output for UTF", 7, "application/troff", 733 "x T 202", "troff output for 202", 7, "application/troff", 734 "x T aps", "troff output for aps", 7, "application/troff", 735 "x T ", "troff output", 4, "application/troff", 736 "GIF", "GIF image", 3, "image/gif", 737 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 738 "%PDF", "PDF", 4, "application/pdf", 739 "<html>\n", "HTML file", 7, "text/html", 740 "<HTML>\n", "HTML file", 7, "text/html", 741 "\111\111\052\000", "tiff", 4, "image/tiff", 742 "\115\115\000\052", "tiff", 4, "image/tiff", 743 "\377\330\377\340", "jpeg", 4, "image/jpeg", 744 "\377\330\377\341", "jpeg", 4, "image/jpeg", 745 "\377\330\377\333", "jpeg", 4, "image/jpeg", 746 "BM", "bmp", 2, "image/bmp", 747 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 748 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 749 "\033E\033", "HP PCL printer data", 3, OCTET, 750 "\033&", "HP PCL printer data", 2, OCTET, 751 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 752 "\033Lua", "Lua bytecode", 4, OCTET, 753 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 754 "\211PNG", "PNG image", 4, "image/png", 755 "P3\n", "ppm", 3, "image/ppm", 756 "P6\n", "ppm", 3, "image/ppm", 757 "/* XPM */\n", "xbm", 10, "image/xbm", 758 ".HTML ", "troff -ms input", 6, "text/troff", 759 ".LP", "troff -ms input", 3, "text/troff", 760 ".ND", "troff -ms input", 3, "text/troff", 761 ".PP", "troff -ms input", 3, "text/troff", 762 ".TL", "troff -ms input", 3, "text/troff", 763 ".TR", "troff -ms input", 3, "text/troff", 764 ".TH", "manual page", 3, "text/troff", 765 ".\\\"", "troff input", 3, "text/troff", 766 ".de", "troff input", 3, "text/troff", 767 ".if", "troff input", 3, "text/troff", 768 ".nr", "troff input", 3, "text/troff", 769 ".tr", "troff input", 3, "text/troff", 770 "vac:", "venti score", 4, "text/plain", 771 "-----BEGIN CERTIFICATE-----\n", 772 "pem certificate", -1, "text/plain", 773 "-----BEGIN TRUSTED CERTIFICATE-----\n", 774 "pem trusted certificate", -1, "text/plain", 775 "-----BEGIN X509 CERTIFICATE-----\n", 776 "pem x.509 certificate", -1, "text/plain", 777 "subject=/C=", "pem certificate with header", -1, "text/plain", 778 "process snapshot ", "process snapshot", -1, "application/snapfs", 779 "BEGIN:VCARD\r\n", "vCard", 13, "text/directory;profile=vcard", 780 "BEGIN:VCARD\n", "vCard", 12, "text/directory;profile=vcard", 781 0,0,0,0 782 }; 783 784 int 785 istring(void) 786 { 787 int i, l; 788 struct FILE_STRING *p; 789 790 for(p = file_string; p->key; p++) { 791 l = p->length; 792 if(l == -1) 793 l = strlen(p->key); 794 if(nbuf >= l && memcmp(buf, p->key, l) == 0) { 795 if(mime) 796 print("%s\n", p->mime); 797 else 798 print("%s\n", p->filetype); 799 return 1; 800 } 801 } 802 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 803 for(i = 5; i < nbuf; i++) 804 if(buf[i] == '\n') 805 break; 806 if(mime) 807 print(OCTET); 808 else 809 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 810 return 1; 811 } 812 return 0; 813 } 814 815 struct offstr 816 { 817 ulong off; 818 struct FILE_STRING; 819 } offstrs[] = { 820 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 821 0, 0, 0, 0, 0 822 }; 823 824 int 825 isoffstr(void) 826 { 827 int n; 828 char buf[256]; 829 struct offstr *p; 830 831 for(p = offstrs; p->key; p++) { 832 seek(fd, p->off, 0); 833 n = p->length; 834 if (n > sizeof buf) 835 n = sizeof buf; 836 if (readn(fd, buf, n) != n) 837 continue; 838 if(memcmp(buf, p->key, n) == 0) { 839 if(mime) 840 print("%s\n", p->mime); 841 else 842 print("%s\n", p->filetype); 843 return 1; 844 } 845 } 846 return 0; 847 } 848 849 int 850 iff(void) 851 { 852 if (strncmp((char*)buf, "FORM", 4) == 0 && 853 strncmp((char*)buf+8, "AIFF", 4) == 0) { 854 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 855 return 1; 856 } 857 if (strncmp((char*)buf, "RIFF", 4) == 0) { 858 if (strncmp((char*)buf+8, "WAVE", 4) == 0) 859 print("%s\n", mime? "audio/wave": "wave audio"); 860 else if (strncmp((char*)buf+8, "AVI ", 4) == 0) 861 print("%s\n", mime? "video/avi": "avi video"); 862 else 863 print("%s\n", mime? "application/octet-stream": 864 "riff file"); 865 return 1; 866 } 867 return 0; 868 } 869 870 char* html_string[] = 871 { 872 "title", 873 "body", 874 "head", 875 "strong", 876 "h1", 877 "h2", 878 "h3", 879 "h4", 880 "h5", 881 "h6", 882 "ul", 883 "li", 884 "dl", 885 "br", 886 "em", 887 0, 888 }; 889 890 int 891 ishtml(void) 892 { 893 uchar *p, *q; 894 int i, count; 895 896 /* compare strings between '<' and '>' to html table */ 897 count = 0; 898 p = buf; 899 for(;;) { 900 while (p < buf+nbuf && *p != '<') 901 p++; 902 p++; 903 if (p >= buf+nbuf) 904 break; 905 if(*p == '/') 906 p++; 907 q = p; 908 while(p < buf+nbuf && *p != '>') 909 p++; 910 if (p >= buf+nbuf) 911 break; 912 for(i = 0; html_string[i]; i++) { 913 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 914 if(count++ > 4) { 915 print(mime ? "text/html\n" : "HTML file\n"); 916 return 1; 917 } 918 break; 919 } 920 } 921 p++; 922 } 923 return 0; 924 } 925 926 char* rfc822_string[] = 927 { 928 "from:", 929 "date:", 930 "to:", 931 "subject:", 932 "received:", 933 "reply to:", 934 "sender:", 935 0, 936 }; 937 938 int 939 isrfc822(void) 940 { 941 942 char *p, *q, *r; 943 int i, count; 944 945 count = 0; 946 p = (char*)buf; 947 for(;;) { 948 q = strchr(p, '\n'); 949 if(q == nil) 950 break; 951 *q = 0; 952 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 953 count++; 954 *q = '\n'; 955 p = q+1; 956 continue; 957 } 958 *q = '\n'; 959 if(*p != '\t' && *p != ' '){ 960 r = strchr(p, ':'); 961 if(r == 0 || r > q) 962 break; 963 for(i = 0; rfc822_string[i]; i++) { 964 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 965 count++; 966 break; 967 } 968 } 969 } 970 p = q+1; 971 } 972 if(count >= 3){ 973 print(mime ? "message/rfc822\n" : "email file\n"); 974 return 1; 975 } 976 return 0; 977 } 978 979 int 980 ismbox(void) 981 { 982 char *p, *q; 983 984 p = (char*)buf; 985 q = strchr(p, '\n'); 986 if(q == nil) 987 return 0; 988 *q = 0; 989 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 990 print(mime ? "text/plain\n" : "mail box\n"); 991 return 1; 992 } 993 *q = '\n'; 994 return 0; 995 } 996 997 int 998 iscint(void) 999 { 1000 int type; 1001 char *name; 1002 Biobuf b; 1003 1004 if(Binit(&b, fd, OREAD) == Beof) 1005 return 0; 1006 seek(fd, 0, 0); 1007 type = objtype(&b, &name); 1008 if(type < 0) 1009 return 0; 1010 if(mime) 1011 print(OCTET); 1012 else 1013 print("%s intermediate\n", name); 1014 return 1; 1015 } 1016 1017 int 1018 isc(void) 1019 { 1020 int n; 1021 1022 n = wfreq[I1]; 1023 /* 1024 * includes 1025 */ 1026 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1027 goto yes; 1028 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1029 goto yes; 1030 /* 1031 * declarations 1032 */ 1033 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1034 goto yes; 1035 /* 1036 * assignments 1037 */ 1038 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1039 goto yes; 1040 return 0; 1041 1042 yes: 1043 if(mime){ 1044 print(PLAIN); 1045 return 1; 1046 } 1047 if(wfreq[Alword] > 0) 1048 print("alef program\n"); 1049 else 1050 print("c program\n"); 1051 return 1; 1052 } 1053 1054 int 1055 islimbo(void) 1056 { 1057 1058 /* 1059 * includes 1060 */ 1061 if(wfreq[Lword] < 4) 1062 return 0; 1063 print(mime ? PLAIN : "limbo program\n"); 1064 return 1; 1065 } 1066 1067 int 1068 isas(void) 1069 { 1070 1071 /* 1072 * includes 1073 */ 1074 if(wfreq[Aword] < 2) 1075 return 0; 1076 print(mime ? PLAIN : "as program\n"); 1077 return 1; 1078 } 1079 1080 /* 1081 * low entropy means encrypted 1082 */ 1083 int 1084 ismung(void) 1085 { 1086 int i, bucket[8]; 1087 float cs; 1088 1089 if(nbuf < 64) 1090 return 0; 1091 memset(bucket, 0, sizeof(bucket)); 1092 for(i=nbuf-64; i<nbuf; i++) 1093 bucket[(buf[i]>>5)&07] += 1; 1094 1095 cs = 0.; 1096 for(i=0; i<8; i++) 1097 cs += (bucket[i]-8)*(bucket[i]-8); 1098 cs /= 8.; 1099 if(cs <= 24.322) { 1100 if(buf[0]==0x1f && buf[1]==0x9d) 1101 print(mime ? OCTET : "compressed\n"); 1102 else 1103 if(buf[0]==0x1f && buf[1]==0x8b) 1104 print(mime ? OCTET : "gzip compressed\n"); 1105 else 1106 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1107 print(mime ? OCTET : "bzip2 compressed\n"); 1108 else 1109 print(mime ? OCTET : "encrypted\n"); 1110 return 1; 1111 } 1112 return 0; 1113 } 1114 1115 /* 1116 * english by punctuation and frequencies 1117 */ 1118 int 1119 isenglish(void) 1120 { 1121 int vow, comm, rare, badpun, punct; 1122 char *p; 1123 1124 if(guess != Fascii && guess != Feascii) 1125 return 0; 1126 badpun = 0; 1127 punct = 0; 1128 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1129 switch(*p) { 1130 case '.': 1131 case ',': 1132 case ')': 1133 case '%': 1134 case ';': 1135 case ':': 1136 case '?': 1137 punct++; 1138 if(p[1] != ' ' && p[1] != '\n') 1139 badpun++; 1140 } 1141 if(badpun*5 > punct) 1142 return 0; 1143 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1144 return 0; 1145 if(2*cfreq[';'] > cfreq['e']) 1146 return 0; 1147 1148 vow = 0; 1149 for(p="AEIOU"; *p; p++) { 1150 vow += cfreq[*p]; 1151 vow += cfreq[tolower(*p)]; 1152 } 1153 comm = 0; 1154 for(p="ETAION"; *p; p++) { 1155 comm += cfreq[*p]; 1156 comm += cfreq[tolower(*p)]; 1157 } 1158 rare = 0; 1159 for(p="VJKQXZ"; *p; p++) { 1160 rare += cfreq[*p]; 1161 rare += cfreq[tolower(*p)]; 1162 } 1163 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1164 print(mime ? PLAIN : "English text\n"); 1165 return 1; 1166 } 1167 return 0; 1168 } 1169 1170 /* 1171 * pick up a number with 1172 * syntax _*[0-9]+_ 1173 */ 1174 #define P9BITLEN 12 1175 int 1176 p9bitnum(uchar *bp) 1177 { 1178 int n, c, len; 1179 1180 len = P9BITLEN; 1181 while(*bp == ' ') { 1182 bp++; 1183 len--; 1184 if(len <= 0) 1185 return -1; 1186 } 1187 n = 0; 1188 while(len > 1) { 1189 c = *bp++; 1190 if(!isdigit(c)) 1191 return -1; 1192 n = n*10 + c-'0'; 1193 len--; 1194 } 1195 if(*bp != ' ') 1196 return -1; 1197 return n; 1198 } 1199 1200 int 1201 depthof(char *s, int *newp) 1202 { 1203 char *es; 1204 int d; 1205 1206 *newp = 0; 1207 es = s+12; 1208 while(s<es && *s==' ') 1209 s++; 1210 if(s == es) 1211 return -1; 1212 if('0'<=*s && *s<='9') 1213 return 1<<strtol(s, 0, 0); 1214 1215 *newp = 1; 1216 d = 0; 1217 while(s<es && *s!=' '){ 1218 s++; /* skip letter */ 1219 d += strtoul(s, &s, 10); 1220 } 1221 1222 if(d % 8 == 0 || 8 % d == 0) 1223 return d; 1224 else 1225 return -1; 1226 } 1227 1228 int 1229 isp9bit(void) 1230 { 1231 int dep, lox, loy, hix, hiy, px, new, cmpr; 1232 ulong t; 1233 long len; 1234 char *newlabel; 1235 uchar *cp; 1236 1237 cp = buf; 1238 cmpr = 0; 1239 newlabel = "old "; 1240 1241 if(memcmp(cp, "compressed\n", 11) == 0) { 1242 cmpr = 1; 1243 cp = buf + 11; 1244 } 1245 1246 dep = depthof((char*)cp + 0*P9BITLEN, &new); 1247 if(new) 1248 newlabel = ""; 1249 lox = p9bitnum(cp + 1*P9BITLEN); 1250 loy = p9bitnum(cp + 2*P9BITLEN); 1251 hix = p9bitnum(cp + 3*P9BITLEN); 1252 hiy = p9bitnum(cp + 4*P9BITLEN); 1253 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1254 return 0; 1255 1256 if(dep < 8){ 1257 px = 8/dep; /* pixels per byte */ 1258 /* set l to number of bytes of data per scan line */ 1259 if(lox >= 0) 1260 len = (hix+px-1)/px - lox/px; 1261 else{ /* make positive before divide */ 1262 t = (-lox)+px-1; 1263 t = (t/px)*px; 1264 len = (t+hix+px-1)/px; 1265 } 1266 }else 1267 len = (hix-lox)*dep/8; 1268 len *= hiy - loy; /* col length */ 1269 len += 5 * P9BITLEN; /* size of initial ascii */ 1270 1271 /* 1272 * for compressed images, don't look any further. otherwise: 1273 * for image file, length is non-zero and must match calculation above. 1274 * for /dev/window and /dev/screen the length is always zero. 1275 * for subfont, the subfont header should follow immediately. 1276 */ 1277 if (cmpr) { 1278 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n", 1279 newlabel, dep); 1280 return 1; 1281 } 1282 /* 1283 * mbuf->length == 0 probably indicates reading a pipe. 1284 * Ghostscript sometimes produces a little extra on the end. 1285 */ 1286 if (len != 0 && (mbuf->length == 0 || mbuf->length == len || 1287 mbuf->length > len && mbuf->length < len+P9BITLEN)) { 1288 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1289 return 1; 1290 } 1291 if (p9subfont(buf+len)) { 1292 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep); 1293 return 1; 1294 } 1295 return 0; 1296 } 1297 1298 int 1299 p9subfont(uchar *p) 1300 { 1301 int n, h, a; 1302 1303 /* if image too big, assume it's a subfont */ 1304 if (p+3*P9BITLEN > buf+sizeof(buf)) 1305 return 1; 1306 1307 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1308 if (n < 0) 1309 return 0; 1310 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1311 if (h < 0) 1312 return 0; 1313 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1314 if (a < 0) 1315 return 0; 1316 return 1; 1317 } 1318 1319 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1320 1321 int 1322 isp9font(void) 1323 { 1324 uchar *cp, *p; 1325 int i, n; 1326 char pathname[1024]; 1327 1328 cp = buf; 1329 if (!getfontnum(cp, &cp)) /* height */ 1330 return 0; 1331 if (!getfontnum(cp, &cp)) /* ascent */ 1332 return 0; 1333 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1334 if (!getfontnum(cp, &cp)) /* min */ 1335 break; 1336 if (!getfontnum(cp, &cp)) /* max */ 1337 return 0; 1338 getfontnum(cp, &cp); /* optional offset */ 1339 while (WHITESPACE(*cp)) 1340 cp++; 1341 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1342 ; 1343 /* construct a path name, if needed */ 1344 n = 0; 1345 if (*p != '/' && slash) { 1346 n = slash-fname+1; 1347 if (n < sizeof(pathname)) 1348 memcpy(pathname, fname, n); 1349 else n = 0; 1350 } 1351 if (n+cp-p+4 < sizeof(pathname)) { 1352 memcpy(pathname+n, p, cp-p); 1353 n += cp-p; 1354 pathname[n] = 0; 1355 if (access(pathname, AEXIST) < 0) { 1356 strcpy(pathname+n, ".0"); 1357 if (access(pathname, AEXIST) < 0) 1358 return 0; 1359 } 1360 } 1361 } 1362 if (i) { 1363 print(mime ? "text/plain\n" : "font file\n"); 1364 return 1; 1365 } 1366 return 0; 1367 } 1368 1369 int 1370 getfontnum(uchar *cp, uchar **rp) 1371 { 1372 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1373 cp++; 1374 if (*cp < '0' || *cp > '9') 1375 return 0; 1376 strtoul((char *)cp, (char **)rp, 0); 1377 if (!WHITESPACE(**rp)) { 1378 *rp = cp; 1379 return 0; 1380 } 1381 return 1; 1382 } 1383 1384 int 1385 isrtf(void) 1386 { 1387 if(strstr((char *)buf, "\\rtf1")){ 1388 print(mime ? "application/rtf\n" : "rich text format\n"); 1389 return 1; 1390 } 1391 return 0; 1392 } 1393 1394 int 1395 ismsdos(void) 1396 { 1397 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1398 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1399 return 1; 1400 } 1401 return 0; 1402 } 1403 1404 int 1405 iself(void) 1406 { 1407 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1408 [1] "WE32100", 1409 [2] "SPARC", 1410 [3] "i386", 1411 [4] "M68000", 1412 [5] "M88000", 1413 [6] "i486", 1414 [7] "i860", 1415 [8] "R3000", 1416 [9] "S370", 1417 [10] "R4000", 1418 [15] "HP-PA", 1419 [18] "sparc v8+", 1420 [19] "i960", 1421 [20] "PPC-32", 1422 [21] "PPC-64", 1423 [40] "ARM", 1424 [41] "Alpha", 1425 [43] "sparc v9", 1426 [50] "IA-64", 1427 [62] "AMD64", 1428 [75] "VAX", 1429 }; 1430 static char *type[] = { 1431 [1] "relocatable object", 1432 [2] "executable", 1433 [3] "shared library", 1434 [4] "core dump", 1435 }; 1436 1437 if (memcmp(buf, "\x7fELF", 4) == 0){ 1438 if (!mime){ 1439 int isdifend = 0; 1440 int n = (buf[19] << 8) | buf[18]; 1441 char *p = "unknown"; 1442 char *t = "unknown"; 1443 1444 if (n > 0 && n < nelem(cpu) && cpu[n]) 1445 p = cpu[n]; 1446 else { 1447 /* try the other byte order */ 1448 isdifend = 1; 1449 n = (buf[18] << 8) | buf[19]; 1450 if (n > 0 && n < nelem(cpu) && cpu[n]) 1451 p = cpu[n]; 1452 } 1453 if(isdifend) 1454 n = (buf[16]<< 8) | buf[17]; 1455 else 1456 n = (buf[17]<< 8) | buf[16]; 1457 1458 if(n>0 && n < nelem(type) && type[n]) 1459 t = type[n]; 1460 print("%s ELF%s %s\n", p, (buf[4] == 2? "64": "32"), t); 1461 } 1462 else 1463 print("application/x-elf-executable"); 1464 return 1; 1465 } 1466 1467 return 0; 1468 } 1469 1470 int 1471 isface(void) 1472 { 1473 int i, j, ldepth, l; 1474 char *p; 1475 1476 ldepth = -1; 1477 for(j = 0; j < 3; j++){ 1478 for(p = (char*)buf, i=0; i<3; i++){ 1479 if(p[0] != '0' || p[1] != 'x') 1480 return 0; 1481 if(buf[2+8] == ',') 1482 l = 2; 1483 else if(buf[2+4] == ',') 1484 l = 1; 1485 else 1486 return 0; 1487 if(ldepth == -1) 1488 ldepth = l; 1489 if(l != ldepth) 1490 return 0; 1491 strtoul(p, &p, 16); 1492 if(*p++ != ',') 1493 return 0; 1494 while(*p == ' ' || *p == '\t') 1495 p++; 1496 } 1497 if (*p++ != '\n') 1498 return 0; 1499 } 1500 1501 if(mime) 1502 print("application/x-face\n"); 1503 else 1504 print("face image depth %d\n", ldepth); 1505 return 1; 1506 } 1507 1508