1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 ismung, /* entropy compressed/encrypted */ 192 isp9font, /* plan 9 font */ 193 isp9bit, /* plan 9 image (as from /dev/window) */ 194 isenglish, /* char frequency English */ 195 isrtf, /* rich text format */ 196 ismsdos, /* msdos exe (virus file attachement) */ 197 isface, /* ascii face file */ 198 0 199 }; 200 201 int mime; 202 203 #define OCTET "application/octet-stream\n" 204 #define PLAIN "text/plain\n" 205 206 void 207 main(int argc, char *argv[]) 208 { 209 int i, j, maxlen; 210 char *cp; 211 Rune r; 212 213 ARGBEGIN{ 214 case 'm': 215 mime = 1; 216 break; 217 default: 218 fprint(2, "usage: file [-m] [file...]\n"); 219 exits("usage"); 220 }ARGEND; 221 222 maxlen = 0; 223 if(mime == 0 || argc > 1){ 224 for(i = 0; i < argc; i++) { 225 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 226 ; 227 if(j > maxlen) 228 maxlen = j; 229 } 230 } 231 if (argc <= 0) { 232 if(!mime) 233 print ("stdin: "); 234 filetype(0); 235 } 236 else { 237 for(i = 0; i < argc; i++) 238 type(argv[i], maxlen); 239 } 240 exits(0); 241 } 242 243 void 244 type(char *file, int nlen) 245 { 246 Rune r; 247 int i; 248 char *p; 249 250 if(nlen > 0){ 251 slash = 0; 252 for (i = 0, p = file; *p; i++) { 253 if (*p == '/') /* find rightmost slash */ 254 slash = p; 255 p += chartorune(&r, p); /* count runes */ 256 } 257 print("%s:%*s",file, nlen-i+1, ""); 258 } 259 fname = file; 260 if ((fd = open(file, OREAD)) < 0) { 261 print("cannot open\n"); 262 return; 263 } 264 filetype(fd); 265 close(fd); 266 } 267 268 /* 269 * Unicode 4.0 4-byte runes. 270 */ 271 typedef int Rune1; 272 273 enum { 274 UTFmax1 = 4, 275 }; 276 277 int 278 fullrune1(char *p, int n) 279 { 280 int c; 281 282 if(n >= 1) { 283 c = *(uchar*)p; 284 if(c < 0x80) 285 return 1; 286 if(n >= 2 && c < 0xE0) 287 return 1; 288 if(n >= 3 && c < 0xF0) 289 return 1; 290 if(n >= 4) 291 return 1; 292 } 293 return 0; 294 } 295 296 int 297 chartorune1(Rune1 *rune, char *str) 298 { 299 int c, c1, c2, c3, n; 300 Rune r; 301 302 c = *(uchar*)str; 303 if(c < 0xF0){ 304 r = 0; 305 n = chartorune(&r, str); 306 *rune = r; 307 return n; 308 } 309 c &= ~0xF0; 310 c1 = *(uchar*)(str+1) & ~0x80; 311 c2 = *(uchar*)(str+2) & ~0x80; 312 c3 = *(uchar*)(str+3) & ~0x80; 313 n = (c<<18) | (c1<<12) | (c2<<6) | c3; 314 if(n < 0x10000 || n > 0x10FFFF){ 315 *rune = Runeerror; 316 return 1; 317 } 318 *rune = n; 319 return 4; 320 } 321 322 void 323 filetype(int fd) 324 { 325 Rune1 r; 326 int i, f, n; 327 char *p, *eob; 328 329 free(mbuf); 330 mbuf = dirfstat(fd); 331 if(mbuf == nil){ 332 print("cannot stat: %r\n"); 333 return; 334 } 335 if(mbuf->mode & DMDIR) { 336 print(mime ? "text/directory\n" : "directory\n"); 337 return; 338 } 339 if(mbuf->type != 'M' && mbuf->type != '|') { 340 print(mime ? OCTET : "special file #%c/%s\n", 341 mbuf->type, mbuf->name); 342 return; 343 } 344 nbuf = read(fd, buf, sizeof(buf)-1); 345 346 if(nbuf < 0) { 347 print("cannot read\n"); 348 return; 349 } 350 if(nbuf == 0) { 351 print(mime ? PLAIN : "empty file\n"); 352 return; 353 } 354 buf[nbuf] = 0; 355 356 /* 357 * build histogram table 358 */ 359 memset(cfreq, 0, sizeof(cfreq)); 360 for (i = 0; language[i].name; i++) 361 language[i].count = 0; 362 eob = (char *)buf+nbuf; 363 for(n = 0, p = (char *)buf; p < eob; n++) { 364 if (!fullrune1(p, eob-p) && eob-p < UTFmax1) 365 break; 366 p += chartorune1(&r, p); 367 if (r == 0) 368 f = Cnull; 369 else if (r <= 0x7f) { 370 if (!isprint(r) && !isspace(r)) 371 f = Ceascii; /* ASCII control char */ 372 else f = r; 373 } else if (r == 0x80) { 374 bump_utf_count(r); 375 f = Cutf; 376 } else if (r < 0xA0) 377 f = Cbinary; /* Invalid Runes */ 378 else if (r <= 0xff) 379 f = Clatin; /* Latin 1 */ 380 else { 381 bump_utf_count(r); 382 f = Cutf; /* UTF extension */ 383 } 384 cfreq[f]++; /* ASCII chars peg directly */ 385 } 386 /* 387 * gross classify 388 */ 389 if (cfreq[Cbinary]) 390 guess = Fbinary; 391 else if (cfreq[Cutf]) 392 guess = Futf; 393 else if (cfreq[Clatin]) 394 guess = Flatin; 395 else if (cfreq[Ceascii]) 396 guess = Feascii; 397 else if (cfreq[Cnull]) 398 guess = Fbinary; 399 else 400 guess = Fascii; 401 /* 402 * lookup dictionary words 403 */ 404 memset(wfreq, 0, sizeof(wfreq)); 405 if(guess == Fascii || guess == Flatin || guess == Futf) 406 wordfreq(); 407 /* 408 * call individual classify routines 409 */ 410 for(i=0; call[i]; i++) 411 if((*call[i])()) 412 return; 413 414 /* 415 * if all else fails, 416 * print out gross classification 417 */ 418 if (nbuf < 100 && !mime) 419 print(mime ? PLAIN : "short "); 420 if (guess == Fascii) 421 print(mime ? PLAIN : "Ascii\n"); 422 else if (guess == Feascii) 423 print(mime ? PLAIN : "extended ascii\n"); 424 else if (guess == Flatin) 425 print(mime ? PLAIN : "latin ascii\n"); 426 else if (guess == Futf && utf_count() < 4) 427 print_utf(); 428 else print(mime ? OCTET : "binary\n"); 429 } 430 431 void 432 bump_utf_count(Rune r) 433 { 434 int low, high, mid; 435 436 high = sizeof(language)/sizeof(language[0])-1; 437 for (low = 0; low < high;) { 438 mid = (low+high)/2; 439 if (r >= language[mid].low) { 440 if (r <= language[mid].high) { 441 language[mid].count++; 442 break; 443 } else low = mid+1; 444 } else high = mid; 445 } 446 } 447 448 int 449 utf_count(void) 450 { 451 int i, count; 452 453 count = 0; 454 for (i = 0; language[i].name; i++) 455 if (language[i].count > 0) 456 switch (language[i].mode) { 457 case Normal: 458 case First: 459 count++; 460 break; 461 default: 462 break; 463 } 464 return count; 465 } 466 467 int 468 chkascii(void) 469 { 470 int i; 471 472 for (i = 'a'; i < 'z'; i++) 473 if (cfreq[i]) 474 return 1; 475 for (i = 'A'; i < 'Z'; i++) 476 if (cfreq[i]) 477 return 1; 478 return 0; 479 } 480 481 int 482 find_first(char *name) 483 { 484 int i; 485 486 for (i = 0; language[i].name != 0; i++) 487 if (language[i].mode == First 488 && strcmp(language[i].name, name) == 0) 489 return i; 490 return -1; 491 } 492 493 void 494 print_utf(void) 495 { 496 int i, printed, j; 497 498 if(mime){ 499 print(PLAIN); 500 return; 501 } 502 if (chkascii()) { 503 printed = 1; 504 print("Ascii"); 505 } else 506 printed = 0; 507 for (i = 0; language[i].name; i++) 508 if (language[i].count) { 509 switch(language[i].mode) { 510 case Multi: 511 j = find_first(language[i].name); 512 if (j < 0) 513 break; 514 if (language[j].count > 0) 515 break; 516 /* Fall through */ 517 case Normal: 518 case First: 519 if (printed) 520 print(" & "); 521 else printed = 1; 522 print("%s", language[i].name); 523 break; 524 case Shared: 525 default: 526 break; 527 } 528 } 529 if(!printed) 530 print("UTF"); 531 print(" text\n"); 532 } 533 534 void 535 wordfreq(void) 536 { 537 int low, high, mid, r; 538 uchar *p, *p2, c; 539 540 p = buf; 541 for(;;) { 542 while (p < buf+nbuf && !isalpha(*p)) 543 p++; 544 if (p >= buf+nbuf) 545 return; 546 p2 = p; 547 while(p < buf+nbuf && isalpha(*p)) 548 p++; 549 c = *p; 550 *p = 0; 551 high = sizeof(dict)/sizeof(dict[0]); 552 for(low = 0;low < high;) { 553 mid = (low+high)/2; 554 r = strcmp(dict[mid].word, (char*)p2); 555 if(r == 0) { 556 wfreq[dict[mid].class]++; 557 break; 558 } 559 if(r < 0) 560 low = mid+1; 561 else 562 high = mid; 563 } 564 *p++ = c; 565 } 566 } 567 568 typedef struct Filemagic Filemagic; 569 struct Filemagic { 570 ulong x; 571 ulong mask; 572 char *desc; 573 char *mime; 574 }; 575 576 /* 577 * integers in this table must be as seen on a little-endian machine 578 * when read from a file. 579 */ 580 Filemagic long0tab[] = { 581 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 582 /* "pac1" */ 583 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 584 /* "pXc2 */ 585 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 586 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 587 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 588 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 589 070707, 0xFFFF, "cpio archive\n", OCTET, 590 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 591 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 592 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 593 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 594 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 595 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 596 /* 597 * venti & fossil magic numbers are stored big-endian on disk, 598 * thus the numbers appear reversed in this table. 599 */ 600 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 601 }; 602 603 int 604 filemagic(Filemagic *tab, int ntab, ulong x) 605 { 606 int i; 607 608 for(i=0; i<ntab; i++) 609 if((x&tab[i].mask) == tab[i].x){ 610 print(mime ? tab[i].mime : tab[i].desc); 611 return 1; 612 } 613 return 0; 614 } 615 616 int 617 long0(void) 618 { 619 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 620 } 621 622 typedef struct Fileoffmag Fileoffmag; 623 struct Fileoffmag { 624 ulong off; 625 Filemagic; 626 }; 627 628 /* 629 * integers in this table must be as seen on a little-endian machine 630 * when read from a file. 631 */ 632 Fileoffmag longofftab[] = { 633 /* 634 * venti & fossil magic numbers are stored big-endian on disk, 635 * thus the numbers appear reversed in this table. 636 */ 637 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 638 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 639 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 640 }; 641 642 int 643 fileoffmagic(Fileoffmag *tab, int ntab) 644 { 645 int i; 646 ulong x; 647 Fileoffmag *tp; 648 uchar buf[sizeof(long)]; 649 650 for(i=0; i<ntab; i++) { 651 tp = tab + i; 652 seek(fd, tp->off, 0); 653 if (read(fd, buf, sizeof buf) != sizeof buf) 654 continue; 655 x = LENDIAN(buf); 656 if((x&tp->mask) == tp->x){ 657 print(mime? tp->mime: tp->desc); 658 return 1; 659 } 660 } 661 return 0; 662 } 663 664 int 665 longoff(void) 666 { 667 return fileoffmagic(longofftab, nelem(longofftab)); 668 } 669 670 int 671 isexec(void) 672 { 673 Fhdr f; 674 675 seek(fd, 0, 0); /* reposition to start of file */ 676 if(crackhdr(fd, &f)) { 677 print(mime ? OCTET : "%s\n", f.name); 678 return 1; 679 } 680 return 0; 681 } 682 683 684 /* from tar.c */ 685 enum { NAMSIZ = 100, TBLOCK = 512 }; 686 687 union hblock 688 { 689 char dummy[TBLOCK]; 690 struct header 691 { 692 char name[NAMSIZ]; 693 char mode[8]; 694 char uid[8]; 695 char gid[8]; 696 char size[12]; 697 char mtime[12]; 698 char chksum[8]; 699 char linkflag; 700 char linkname[NAMSIZ]; 701 /* rest are defined by POSIX's ustar format; see p1003.2b */ 702 char magic[6]; /* "ustar" */ 703 char version[2]; 704 char uname[32]; 705 char gname[32]; 706 char devmajor[8]; 707 char devminor[8]; 708 char prefix[155]; /* if non-null, path = prefix "/" name */ 709 } dbuf; 710 }; 711 712 int 713 checksum(union hblock *hp) 714 { 715 int i; 716 char *cp; 717 struct header *hdr = &hp->dbuf; 718 719 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 720 *cp = ' '; 721 i = 0; 722 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 723 i += *cp & 0xff; 724 return i; 725 } 726 727 int 728 istar(void) 729 { 730 int chksum; 731 char tblock[TBLOCK]; 732 union hblock *hp = (union hblock *)tblock; 733 struct header *hdr = &hp->dbuf; 734 735 seek(fd, 0, 0); /* reposition to start of file */ 736 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 737 return 0; 738 chksum = strtol(hdr->chksum, 0, 8); 739 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 740 if (strcmp(hdr->magic, "ustar") == 0) 741 print(mime? "application/x-ustar\n": 742 "posix tar archive\n"); 743 else 744 print(mime? "application/x-tar\n": "tar archive\n"); 745 return 1; 746 } 747 return 0; 748 } 749 750 /* 751 * initial words to classify file 752 */ 753 struct FILE_STRING 754 { 755 char *key; 756 char *filetype; 757 int length; 758 char *mime; 759 } file_string[] = 760 { 761 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 762 "!<arch>\n", "archive", 8, "application/octet-stream", 763 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 764 "#!/bin/rc", "rc executable file", 9, "text/plain", 765 "#!/bin/sh", "sh executable file", 9, "text/plain", 766 "%!", "postscript", 2, "application/postscript", 767 "\004%!", "postscript", 3, "application/postscript", 768 "x T post", "troff output for post", 8, "application/troff", 769 "x T Latin1", "troff output for Latin1", 10, "application/troff", 770 "x T utf", "troff output for UTF", 7, "application/troff", 771 "x T 202", "troff output for 202", 7, "application/troff", 772 "x T aps", "troff output for aps", 7, "application/troff", 773 "GIF", "GIF image", 3, "image/gif", 774 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 775 "%PDF", "PDF", 4, "application/pdf", 776 "<html>\n", "HTML file", 7, "text/html", 777 "<HTML>\n", "HTML file", 7, "text/html", 778 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream", 779 "\111\111\052\000", "tiff", 4, "image/tiff", 780 "\115\115\000\052", "tiff", 4, "image/tiff", 781 "\377\330\377\340", "jpeg", 4, "image/jpeg", 782 "\377\330\377\341", "jpeg", 4, "image/jpeg", 783 "\377\330\377\333", "jpeg", 4, "image/jpeg", 784 "BM", "bmp", 2, "image/bmp", 785 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 786 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 787 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 788 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 789 "\211PNG", "PNG image", 4, "image/png", 790 "P3\n", "ppm", 3, "image/ppm", 791 "P6\n", "ppm", 3, "image/ppm", 792 "/* XPM */\n", "xbm", 10, "image/xbm", 793 ".HTML ", "troff -ms input", 6, "text/troff", 794 ".LP", "troff -ms input", 3, "text/troff", 795 ".ND", "troff -ms input", 3, "text/troff", 796 ".PP", "troff -ms input", 3, "text/troff", 797 ".TL", "troff -ms input", 3, "text/troff", 798 ".TR", "troff -ms input", 3, "text/troff", 799 ".TH", "manual page", 3, "text/troff", 800 ".\\\"", "troff input", 3, "text/troff", 801 ".de", "troff input", 3, "text/troff", 802 ".if", "troff input", 3, "text/troff", 803 ".nr", "troff input", 3, "text/troff", 804 ".tr", "troff input", 3, "text/troff", 805 "vac:", "venti score", 4, "text/plain", 806 0,0,0,0 807 }; 808 809 int 810 istring(void) 811 { 812 int i; 813 struct FILE_STRING *p; 814 815 for(p = file_string; p->key; p++) { 816 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) { 817 if(mime) 818 print("%s\n", p->mime); 819 else 820 print("%s\n", p->filetype); 821 return 1; 822 } 823 } 824 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 825 for(i = 5; i < nbuf; i++) 826 if(buf[i] == '\n') 827 break; 828 if(mime) 829 print(OCTET); 830 else 831 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 832 return 1; 833 } 834 return 0; 835 } 836 837 struct offstr 838 { 839 ulong off; 840 struct FILE_STRING; 841 } offstrs[] = { 842 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 843 0, 0, 0, 0, 0 844 }; 845 846 int 847 isoffstr(void) 848 { 849 int n; 850 char buf[256]; 851 struct offstr *p; 852 853 for(p = offstrs; p->key; p++) { 854 seek(fd, p->off, 0); 855 n = p->length; 856 if (n > sizeof buf) 857 n = sizeof buf; 858 if (read(fd, buf, n) != n) 859 continue; 860 if(memcmp(buf, p->key, n) == 0) { 861 if(mime) 862 print("%s\n", p->mime); 863 else 864 print("%s\n", p->filetype); 865 return 1; 866 } 867 } 868 return 0; 869 } 870 871 int 872 iff(void) 873 { 874 if (strncmp((char*)buf, "FORM", 4) == 0 && 875 strncmp((char*)buf+8, "AIFF", 4) == 0) { 876 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 877 return 1; 878 } 879 return 0; 880 } 881 882 char* html_string[] = 883 { 884 "title", 885 "body", 886 "head", 887 "strong", 888 "h1", 889 "h2", 890 "h3", 891 "h4", 892 "h5", 893 "h6", 894 "ul", 895 "li", 896 "dl", 897 "br", 898 "em", 899 0, 900 }; 901 902 int 903 ishtml(void) 904 { 905 uchar *p, *q; 906 int i, count; 907 908 /* compare strings between '<' and '>' to html table */ 909 count = 0; 910 p = buf; 911 for(;;) { 912 while (p < buf+nbuf && *p != '<') 913 p++; 914 p++; 915 if (p >= buf+nbuf) 916 break; 917 if(*p == '/') 918 p++; 919 q = p; 920 while(p < buf+nbuf && *p != '>') 921 p++; 922 if (p >= buf+nbuf) 923 break; 924 for(i = 0; html_string[i]; i++) { 925 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 926 if(count++ > 4) { 927 print(mime ? "text/html\n" : "HTML file\n"); 928 return 1; 929 } 930 break; 931 } 932 } 933 p++; 934 } 935 return 0; 936 } 937 938 char* rfc822_string[] = 939 { 940 "from:", 941 "date:", 942 "to:", 943 "subject:", 944 "received:", 945 "reply to:", 946 "sender:", 947 0, 948 }; 949 950 int 951 isrfc822(void) 952 { 953 954 char *p, *q, *r; 955 int i, count; 956 957 count = 0; 958 p = (char*)buf; 959 for(;;) { 960 q = strchr(p, '\n'); 961 if(q == nil) 962 break; 963 *q = 0; 964 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 965 count++; 966 *q = '\n'; 967 p = q+1; 968 continue; 969 } 970 *q = '\n'; 971 if(*p != '\t' && *p != ' '){ 972 r = strchr(p, ':'); 973 if(r == 0 || r > q) 974 break; 975 for(i = 0; rfc822_string[i]; i++) { 976 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 977 count++; 978 break; 979 } 980 } 981 } 982 p = q+1; 983 } 984 if(count >= 3){ 985 print(mime ? "message/rfc822\n" : "email file\n"); 986 return 1; 987 } 988 return 0; 989 } 990 991 int 992 ismbox(void) 993 { 994 char *p, *q; 995 996 p = (char*)buf; 997 q = strchr(p, '\n'); 998 if(q == nil) 999 return 0; 1000 *q = 0; 1001 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 1002 print(mime ? "text/plain\n" : "mail box\n"); 1003 return 1; 1004 } 1005 *q = '\n'; 1006 return 0; 1007 } 1008 1009 int 1010 iscint(void) 1011 { 1012 int type; 1013 char *name; 1014 Biobuf b; 1015 1016 if(Binit(&b, fd, OREAD) == Beof) 1017 return 0; 1018 seek(fd, 0, 0); 1019 type = objtype(&b, &name); 1020 if(type < 0) 1021 return 0; 1022 if(mime) 1023 print(OCTET); 1024 else 1025 print("%s intermediate\n", name); 1026 return 1; 1027 } 1028 1029 int 1030 isc(void) 1031 { 1032 int n; 1033 1034 n = wfreq[I1]; 1035 /* 1036 * includes 1037 */ 1038 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1039 goto yes; 1040 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1041 goto yes; 1042 /* 1043 * declarations 1044 */ 1045 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1046 goto yes; 1047 /* 1048 * assignments 1049 */ 1050 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1051 goto yes; 1052 return 0; 1053 1054 yes: 1055 if(mime){ 1056 print(PLAIN); 1057 return 1; 1058 } 1059 if(wfreq[Alword] > 0) 1060 print("alef program\n"); 1061 else 1062 print("c program\n"); 1063 return 1; 1064 } 1065 1066 int 1067 islimbo(void) 1068 { 1069 1070 /* 1071 * includes 1072 */ 1073 if(wfreq[Lword] < 4) 1074 return 0; 1075 print(mime ? PLAIN : "limbo program\n"); 1076 return 1; 1077 } 1078 1079 int 1080 isas(void) 1081 { 1082 1083 /* 1084 * includes 1085 */ 1086 if(wfreq[Aword] < 2) 1087 return 0; 1088 print(mime ? PLAIN : "as program\n"); 1089 return 1; 1090 } 1091 1092 /* 1093 * low entropy means encrypted 1094 */ 1095 int 1096 ismung(void) 1097 { 1098 int i, bucket[8]; 1099 float cs; 1100 1101 if(nbuf < 64) 1102 return 0; 1103 memset(bucket, 0, sizeof(bucket)); 1104 for(i=nbuf-64; i<nbuf; i++) 1105 bucket[(buf[i]>>5)&07] += 1; 1106 1107 cs = 0.; 1108 for(i=0; i<8; i++) 1109 cs += (bucket[i]-8)*(bucket[i]-8); 1110 cs /= 8.; 1111 if(cs <= 24.322) { 1112 if(buf[0]==0x1f && buf[1]==0x9d) 1113 print(mime ? OCTET : "compressed\n"); 1114 else 1115 if(buf[0]==0x1f && buf[1]==0x8b) 1116 print(mime ? OCTET : "gzip compressed\n"); 1117 else 1118 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1119 print(mime ? OCTET : "bzip2 compressed\n"); 1120 else 1121 print(mime ? OCTET : "encrypted\n"); 1122 return 1; 1123 } 1124 return 0; 1125 } 1126 1127 /* 1128 * english by punctuation and frequencies 1129 */ 1130 int 1131 isenglish(void) 1132 { 1133 int vow, comm, rare, badpun, punct; 1134 char *p; 1135 1136 if(guess != Fascii && guess != Feascii) 1137 return 0; 1138 badpun = 0; 1139 punct = 0; 1140 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1141 switch(*p) { 1142 case '.': 1143 case ',': 1144 case ')': 1145 case '%': 1146 case ';': 1147 case ':': 1148 case '?': 1149 punct++; 1150 if(p[1] != ' ' && p[1] != '\n') 1151 badpun++; 1152 } 1153 if(badpun*5 > punct) 1154 return 0; 1155 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1156 return 0; 1157 if(2*cfreq[';'] > cfreq['e']) 1158 return 0; 1159 1160 vow = 0; 1161 for(p="AEIOU"; *p; p++) { 1162 vow += cfreq[*p]; 1163 vow += cfreq[tolower(*p)]; 1164 } 1165 comm = 0; 1166 for(p="ETAION"; *p; p++) { 1167 comm += cfreq[*p]; 1168 comm += cfreq[tolower(*p)]; 1169 } 1170 rare = 0; 1171 for(p="VJKQXZ"; *p; p++) { 1172 rare += cfreq[*p]; 1173 rare += cfreq[tolower(*p)]; 1174 } 1175 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1176 print(mime ? PLAIN : "English text\n"); 1177 return 1; 1178 } 1179 return 0; 1180 } 1181 1182 /* 1183 * pick up a number with 1184 * syntax _*[0-9]+_ 1185 */ 1186 #define P9BITLEN 12 1187 int 1188 p9bitnum(uchar *bp) 1189 { 1190 int n, c, len; 1191 1192 len = P9BITLEN; 1193 while(*bp == ' ') { 1194 bp++; 1195 len--; 1196 if(len <= 0) 1197 return -1; 1198 } 1199 n = 0; 1200 while(len > 1) { 1201 c = *bp++; 1202 if(!isdigit(c)) 1203 return -1; 1204 n = n*10 + c-'0'; 1205 len--; 1206 } 1207 if(*bp != ' ') 1208 return -1; 1209 return n; 1210 } 1211 1212 int 1213 depthof(char *s, int *newp) 1214 { 1215 char *es; 1216 int d; 1217 1218 *newp = 0; 1219 es = s+12; 1220 while(s<es && *s==' ') 1221 s++; 1222 if(s == es) 1223 return -1; 1224 if('0'<=*s && *s<='9') 1225 return 1<<strtol(s, 0, 0); 1226 1227 *newp = 1; 1228 d = 0; 1229 while(s<es && *s!=' '){ 1230 s++; /* skip letter */ 1231 d += strtoul(s, &s, 10); 1232 } 1233 1234 switch(d){ 1235 case 32: 1236 case 24: 1237 case 16: 1238 case 8: 1239 return d; 1240 } 1241 return -1; 1242 } 1243 1244 int 1245 isp9bit(void) 1246 { 1247 int dep, lox, loy, hix, hiy, px, new; 1248 ulong t; 1249 long len; 1250 char *newlabel; 1251 1252 newlabel = "old "; 1253 1254 dep = depthof((char*)buf + 0*P9BITLEN, &new); 1255 if(new) 1256 newlabel = ""; 1257 lox = p9bitnum(buf + 1*P9BITLEN); 1258 loy = p9bitnum(buf + 2*P9BITLEN); 1259 hix = p9bitnum(buf + 3*P9BITLEN); 1260 hiy = p9bitnum(buf + 4*P9BITLEN); 1261 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1262 return 0; 1263 1264 if(dep < 8){ 1265 px = 8/dep; /* pixels per byte */ 1266 /* set l to number of bytes of data per scan line */ 1267 if(lox >= 0) 1268 len = (hix+px-1)/px - lox/px; 1269 else{ /* make positive before divide */ 1270 t = (-lox)+px-1; 1271 t = (t/px)*px; 1272 len = (t+hix+px-1)/px; 1273 } 1274 }else 1275 len = (hix-lox)*dep/8; 1276 len *= (hiy-loy); /* col length */ 1277 len += 5*P9BITLEN; /* size of initial ascii */ 1278 1279 /* 1280 * for image file, length is non-zero and must match calculation above 1281 * for /dev/window and /dev/screen the length is always zero 1282 * for subfont, the subfont header should follow immediately. 1283 */ 1284 if (len != 0 && mbuf->length == 0) { 1285 print("%splan 9 image\n", newlabel); 1286 return 1; 1287 } 1288 if (mbuf->length == len) { 1289 print("%splan 9 image\n", newlabel); 1290 return 1; 1291 } 1292 /* Ghostscript sometimes produces a little extra on the end */ 1293 if (mbuf->length < len+P9BITLEN) { 1294 print("%splan 9 image\n", newlabel); 1295 return 1; 1296 } 1297 if (p9subfont(buf+len)) { 1298 print("%ssubfont file\n", newlabel); 1299 return 1; 1300 } 1301 return 0; 1302 } 1303 1304 int 1305 p9subfont(uchar *p) 1306 { 1307 int n, h, a; 1308 1309 /* if image too big, assume it's a subfont */ 1310 if (p+3*P9BITLEN > buf+sizeof(buf)) 1311 return 1; 1312 1313 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1314 if (n < 0) 1315 return 0; 1316 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1317 if (h < 0) 1318 return 0; 1319 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1320 if (a < 0) 1321 return 0; 1322 return 1; 1323 } 1324 1325 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1326 1327 int 1328 isp9font(void) 1329 { 1330 uchar *cp, *p; 1331 int i, n; 1332 char pathname[1024]; 1333 1334 cp = buf; 1335 if (!getfontnum(cp, &cp)) /* height */ 1336 return 0; 1337 if (!getfontnum(cp, &cp)) /* ascent */ 1338 return 0; 1339 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1340 if (!getfontnum(cp, &cp)) /* min */ 1341 break; 1342 if (!getfontnum(cp, &cp)) /* max */ 1343 return 0; 1344 getfontnum(cp, &cp); /* optional offset */ 1345 while (WHITESPACE(*cp)) 1346 cp++; 1347 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1348 ; 1349 /* construct a path name, if needed */ 1350 n = 0; 1351 if (*p != '/' && slash) { 1352 n = slash-fname+1; 1353 if (n < sizeof(pathname)) 1354 memcpy(pathname, fname, n); 1355 else n = 0; 1356 } 1357 if (n+cp-p+4 < sizeof(pathname)) { 1358 memcpy(pathname+n, p, cp-p); 1359 n += cp-p; 1360 pathname[n] = 0; 1361 if (access(pathname, AEXIST) < 0) { 1362 strcpy(pathname+n, ".0"); 1363 if (access(pathname, AEXIST) < 0) 1364 return 0; 1365 } 1366 } 1367 } 1368 if (i) { 1369 print(mime ? "text/plain\n" : "font file\n"); 1370 return 1; 1371 } 1372 return 0; 1373 } 1374 1375 int 1376 getfontnum(uchar *cp, uchar **rp) 1377 { 1378 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1379 cp++; 1380 if (*cp < '0' || *cp > '9') 1381 return 0; 1382 strtoul((char *)cp, (char **)rp, 0); 1383 if (!WHITESPACE(**rp)) { 1384 *rp = cp; 1385 return 0; 1386 } 1387 return 1; 1388 } 1389 1390 int 1391 isrtf(void) 1392 { 1393 if(strstr((char *)buf, "\\rtf1")){ 1394 print(mime ? "application/rtf\n" : "rich text format\n"); 1395 return 1; 1396 } 1397 return 0; 1398 } 1399 1400 int 1401 ismsdos(void) 1402 { 1403 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1404 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1405 return 1; 1406 } 1407 return 0; 1408 } 1409 1410 int 1411 iself(void) 1412 { 1413 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1414 [1] "WE32100", 1415 [2] "SPARC", 1416 [3] "i386", 1417 [4] "M68000", 1418 [5] "M88000", 1419 [6] "i486", 1420 [7] "i860", 1421 [8] "R3000", 1422 [9] "S370", 1423 [10] "R4000", 1424 [15] "HP-PA", 1425 [18] "sparc v8+", 1426 [19] "i960", 1427 [20] "PPC-32", 1428 [21] "PPC-64", 1429 [40] "ARM", 1430 [41] "Alpha", 1431 [43] "sparc v9", 1432 [50] "IA-64", 1433 [62] "AMD64", 1434 [75] "VAX", 1435 }; 1436 static char *type[] = { 1437 [1] "relocatable object", 1438 [2] "executable", 1439 [3] "shared library", 1440 [4] "core dump", 1441 }; 1442 1443 if (memcmp(buf, "\x7fELF", 4) == 0){ 1444 if (!mime){ 1445 int n = (buf[19] << 8) | buf[18]; 1446 char *p = "unknown"; 1447 char *t = "unknown"; 1448 1449 if (n > 0 && n < nelem(cpu) && cpu[n]) 1450 p = cpu[n]; 1451 else { 1452 /* try the other byte order */ 1453 n = (buf[18] << 8) | buf[19]; 1454 if (n > 0 && n < nelem(cpu) && cpu[n]) 1455 p = cpu[n]; 1456 } 1457 n = buf[16]; 1458 if(n>0 && n < nelem(type) && type[n]) 1459 t = type[n]; 1460 print("%s ELF %s\n", p, t); 1461 } 1462 else 1463 print("application/x-elf-executable"); 1464 return 1; 1465 } 1466 1467 return 0; 1468 } 1469 1470 int 1471 isface(void) 1472 { 1473 int i, j, ldepth, l; 1474 char *p; 1475 1476 ldepth = -1; 1477 for(j = 0; j < 3; j++){ 1478 for(p = (char*)buf, i=0; i<3; i++){ 1479 if(p[0] != '0' || p[1] != 'x') 1480 return 0; 1481 if(buf[2+8] == ',') 1482 l = 2; 1483 else if(buf[2+4] == ',') 1484 l = 1; 1485 else 1486 return 0; 1487 if(ldepth == -1) 1488 ldepth = l; 1489 if(l != ldepth) 1490 return 0; 1491 strtoul(p, &p, 16); 1492 if(*p++ != ',') 1493 return 0; 1494 while(*p == ' ' || *p == '\t') 1495 p++; 1496 } 1497 if (*p++ != '\n') 1498 return 0; 1499 } 1500 1501 if(mime) 1502 print("application/x-face\n"); 1503 else 1504 print("face image depth %d\n", ldepth); 1505 return 1; 1506 } 1507 1508