1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "char", Cword, 51 "common", Fword, 52 "con", Lword, 53 "data", Fword, 54 "dimension", Fword, 55 "double", Cword, 56 "extern", Cword, 57 "bio", I2, 58 "float", Cword, 59 "fn", Lword, 60 "function", Fword, 61 "h", I3, 62 "implement", Lword, 63 "import", Lword, 64 "include", I1, 65 "int", Cword, 66 "integer", Fword, 67 "iota", Lword, 68 "libc", I2, 69 "long", Cword, 70 "module", Lword, 71 "real", Fword, 72 "ref", Lword, 73 "register", Cword, 74 "self", Lword, 75 "short", Cword, 76 "static", Cword, 77 "stdio", I2, 78 "struct", Cword, 79 "subroutine", Fword, 80 "u", I2, 81 "void", Cword, 82 }; 83 84 /* codes for 'mode' field in language structure */ 85 enum { 86 Normal = 0, 87 First, /* first entry for language spanning several ranges */ 88 Multi, /* later entries " " " ... */ 89 Shared, /* codes used in several languages */ 90 }; 91 92 struct 93 { 94 int mode; /* see enum above */ 95 int count; 96 int low; 97 int high; 98 char *name; 99 100 } language[] = 101 { 102 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 103 Normal, 0, 0x0370, 0x03FF, "Greek", 104 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 105 Normal, 0, 0x0530, 0x058F, "Armenian", 106 Normal, 0, 0x0590, 0x05FF, "Hebrew", 107 Normal, 0, 0x0600, 0x06FF, "Arabic", 108 Normal, 0, 0x0900, 0x097F, "Devanagari", 109 Normal, 0, 0x0980, 0x09FF, "Bengali", 110 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 111 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 112 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 113 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 114 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 115 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 116 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 117 Normal, 0, 0x0E00, 0x0E7F, "Thai", 118 Normal, 0, 0x0E80, 0x0EFF, "Lao", 119 Normal, 0, 0x1000, 0x105F, "Tibetan", 120 Normal, 0, 0x10A0, 0x10FF, "Georgian", 121 Normal, 0, 0x3040, 0x30FF, "Japanese", 122 Normal, 0, 0x3100, 0x312F, "Chinese", 123 First, 0, 0x3130, 0x318F, "Korean", 124 Multi, 0, 0x3400, 0x3D2F, "Korean", 125 Shared, 0, 0x4e00, 0x9fff, "CJK", 126 Normal, 0, 0, 0, 0, /* terminal entry */ 127 }; 128 129 130 enum 131 { 132 Fascii, /* printable ascii */ 133 Flatin, /* latin 1*/ 134 Futf, /* UTF character set */ 135 Fbinary, /* binary */ 136 Feascii, /* ASCII with control chars */ 137 Fnull, /* NULL in file */ 138 } guess; 139 140 void bump_utf_count(Rune); 141 int cistrncmp(char*, char*, int); 142 void filetype(int); 143 int getfontnum(uchar*, uchar**); 144 int isas(void); 145 int isc(void); 146 int iscint(void); 147 int isenglish(void); 148 int ishp(void); 149 int ishtml(void); 150 int isrfc822(void); 151 int ismbox(void); 152 int islimbo(void); 153 int ismung(void); 154 int isp9bit(void); 155 int isp9font(void); 156 int isrtf(void); 157 int ismsdos(void); 158 int iself(void); 159 int istring(void); 160 int isoffstr(void); 161 int iff(void); 162 int long0(void); 163 int longoff(void); 164 int istar(void); 165 int isface(void); 166 int isexec(void); 167 int p9bitnum(uchar*); 168 int p9subfont(uchar*); 169 void print_utf(void); 170 void type(char*, int); 171 int utf_count(void); 172 void wordfreq(void); 173 174 int (*call[])(void) = 175 { 176 long0, /* recognizable by first 4 bytes */ 177 istring, /* recognizable by first string */ 178 iself, /* ELF (foreign) executable */ 179 isexec, /* native executables */ 180 iff, /* interchange file format (strings) */ 181 longoff, /* recognizable by 4 bytes at some offset */ 182 isoffstr, /* recognizable by string at some offset */ 183 isrfc822, /* email file */ 184 ismbox, /* mail box */ 185 istar, /* recognizable by tar checksum */ 186 ishtml, /* html keywords */ 187 iscint, /* compiler/assembler intermediate */ 188 islimbo, /* limbo source */ 189 isc, /* c & alef compiler key words */ 190 isas, /* assembler key words */ 191 isp9font, /* plan 9 font */ 192 isp9bit, /* plan 9 image (as from /dev/window) */ 193 ismung, /* entropy compressed/encrypted */ 194 isenglish, /* char frequency English */ 195 isrtf, /* rich text format */ 196 ismsdos, /* msdos exe (virus file attachement) */ 197 isface, /* ascii face file */ 198 0 199 }; 200 201 int mime; 202 203 #define OCTET "application/octet-stream\n" 204 #define PLAIN "text/plain\n" 205 206 void 207 main(int argc, char *argv[]) 208 { 209 int i, j, maxlen; 210 char *cp; 211 Rune r; 212 213 ARGBEGIN{ 214 case 'm': 215 mime = 1; 216 break; 217 default: 218 fprint(2, "usage: file [-m] [file...]\n"); 219 exits("usage"); 220 }ARGEND; 221 222 maxlen = 0; 223 if(mime == 0 || argc > 1){ 224 for(i = 0; i < argc; i++) { 225 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 226 ; 227 if(j > maxlen) 228 maxlen = j; 229 } 230 } 231 if (argc <= 0) { 232 if(!mime) 233 print ("stdin: "); 234 filetype(0); 235 } 236 else { 237 for(i = 0; i < argc; i++) 238 type(argv[i], maxlen); 239 } 240 exits(0); 241 } 242 243 void 244 type(char *file, int nlen) 245 { 246 Rune r; 247 int i; 248 char *p; 249 250 if(nlen > 0){ 251 slash = 0; 252 for (i = 0, p = file; *p; i++) { 253 if (*p == '/') /* find rightmost slash */ 254 slash = p; 255 p += chartorune(&r, p); /* count runes */ 256 } 257 print("%s:%*s",file, nlen-i+1, ""); 258 } 259 fname = file; 260 if ((fd = open(file, OREAD)) < 0) { 261 print("cannot open\n"); 262 return; 263 } 264 filetype(fd); 265 close(fd); 266 } 267 268 /* 269 * Unicode 4.0 4-byte runes. 270 */ 271 typedef int Rune1; 272 273 enum { 274 UTFmax1 = 4, 275 }; 276 277 int 278 fullrune1(char *p, int n) 279 { 280 int c; 281 282 if(n >= 1) { 283 c = *(uchar*)p; 284 if(c < 0x80) 285 return 1; 286 if(n >= 2 && c < 0xE0) 287 return 1; 288 if(n >= 3 && c < 0xF0) 289 return 1; 290 if(n >= 4) 291 return 1; 292 } 293 return 0; 294 } 295 296 int 297 chartorune1(Rune1 *rune, char *str) 298 { 299 int c, c1, c2, c3, n; 300 Rune r; 301 302 c = *(uchar*)str; 303 if(c < 0xF0){ 304 r = 0; 305 n = chartorune(&r, str); 306 *rune = r; 307 return n; 308 } 309 c &= ~0xF0; 310 c1 = *(uchar*)(str+1) & ~0x80; 311 c2 = *(uchar*)(str+2) & ~0x80; 312 c3 = *(uchar*)(str+3) & ~0x80; 313 n = (c<<18) | (c1<<12) | (c2<<6) | c3; 314 if(n < 0x10000 || n > 0x10FFFF){ 315 *rune = Runeerror; 316 return 1; 317 } 318 *rune = n; 319 return 4; 320 } 321 322 void 323 filetype(int fd) 324 { 325 Rune1 r; 326 int i, f, n; 327 char *p, *eob; 328 329 free(mbuf); 330 mbuf = dirfstat(fd); 331 if(mbuf == nil){ 332 print("cannot stat: %r\n"); 333 return; 334 } 335 if(mbuf->mode & DMDIR) { 336 print(mime ? "text/directory\n" : "directory\n"); 337 return; 338 } 339 if(mbuf->type != 'M' && mbuf->type != '|') { 340 print(mime ? OCTET : "special file #%c/%s\n", 341 mbuf->type, mbuf->name); 342 return; 343 } 344 nbuf = read(fd, buf, sizeof(buf)-1); 345 346 if(nbuf < 0) { 347 print("cannot read\n"); 348 return; 349 } 350 if(nbuf == 0) { 351 print(mime ? PLAIN : "empty file\n"); 352 return; 353 } 354 buf[nbuf] = 0; 355 356 /* 357 * build histogram table 358 */ 359 memset(cfreq, 0, sizeof(cfreq)); 360 for (i = 0; language[i].name; i++) 361 language[i].count = 0; 362 eob = (char *)buf+nbuf; 363 for(n = 0, p = (char *)buf; p < eob; n++) { 364 if (!fullrune1(p, eob-p) && eob-p < UTFmax1) 365 break; 366 p += chartorune1(&r, p); 367 if (r == 0) 368 f = Cnull; 369 else if (r <= 0x7f) { 370 if (!isprint(r) && !isspace(r)) 371 f = Ceascii; /* ASCII control char */ 372 else f = r; 373 } else if (r == 0x80) { 374 bump_utf_count(r); 375 f = Cutf; 376 } else if (r < 0xA0) 377 f = Cbinary; /* Invalid Runes */ 378 else if (r <= 0xff) 379 f = Clatin; /* Latin 1 */ 380 else { 381 bump_utf_count(r); 382 f = Cutf; /* UTF extension */ 383 } 384 cfreq[f]++; /* ASCII chars peg directly */ 385 } 386 /* 387 * gross classify 388 */ 389 if (cfreq[Cbinary]) 390 guess = Fbinary; 391 else if (cfreq[Cutf]) 392 guess = Futf; 393 else if (cfreq[Clatin]) 394 guess = Flatin; 395 else if (cfreq[Ceascii]) 396 guess = Feascii; 397 else if (cfreq[Cnull]) 398 guess = Fbinary; 399 else 400 guess = Fascii; 401 /* 402 * lookup dictionary words 403 */ 404 memset(wfreq, 0, sizeof(wfreq)); 405 if(guess == Fascii || guess == Flatin || guess == Futf) 406 wordfreq(); 407 /* 408 * call individual classify routines 409 */ 410 for(i=0; call[i]; i++) 411 if((*call[i])()) 412 return; 413 414 /* 415 * if all else fails, 416 * print out gross classification 417 */ 418 if (nbuf < 100 && !mime) 419 print(mime ? PLAIN : "short "); 420 if (guess == Fascii) 421 print(mime ? PLAIN : "Ascii\n"); 422 else if (guess == Feascii) 423 print(mime ? PLAIN : "extended ascii\n"); 424 else if (guess == Flatin) 425 print(mime ? PLAIN : "latin ascii\n"); 426 else if (guess == Futf && utf_count() < 4) 427 print_utf(); 428 else print(mime ? OCTET : "binary\n"); 429 } 430 431 void 432 bump_utf_count(Rune r) 433 { 434 int low, high, mid; 435 436 high = sizeof(language)/sizeof(language[0])-1; 437 for (low = 0; low < high;) { 438 mid = (low+high)/2; 439 if (r >= language[mid].low) { 440 if (r <= language[mid].high) { 441 language[mid].count++; 442 break; 443 } else low = mid+1; 444 } else high = mid; 445 } 446 } 447 448 int 449 utf_count(void) 450 { 451 int i, count; 452 453 count = 0; 454 for (i = 0; language[i].name; i++) 455 if (language[i].count > 0) 456 switch (language[i].mode) { 457 case Normal: 458 case First: 459 count++; 460 break; 461 default: 462 break; 463 } 464 return count; 465 } 466 467 int 468 chkascii(void) 469 { 470 int i; 471 472 for (i = 'a'; i < 'z'; i++) 473 if (cfreq[i]) 474 return 1; 475 for (i = 'A'; i < 'Z'; i++) 476 if (cfreq[i]) 477 return 1; 478 return 0; 479 } 480 481 int 482 find_first(char *name) 483 { 484 int i; 485 486 for (i = 0; language[i].name != 0; i++) 487 if (language[i].mode == First 488 && strcmp(language[i].name, name) == 0) 489 return i; 490 return -1; 491 } 492 493 void 494 print_utf(void) 495 { 496 int i, printed, j; 497 498 if(mime){ 499 print(PLAIN); 500 return; 501 } 502 if (chkascii()) { 503 printed = 1; 504 print("Ascii"); 505 } else 506 printed = 0; 507 for (i = 0; language[i].name; i++) 508 if (language[i].count) { 509 switch(language[i].mode) { 510 case Multi: 511 j = find_first(language[i].name); 512 if (j < 0) 513 break; 514 if (language[j].count > 0) 515 break; 516 /* Fall through */ 517 case Normal: 518 case First: 519 if (printed) 520 print(" & "); 521 else printed = 1; 522 print("%s", language[i].name); 523 break; 524 case Shared: 525 default: 526 break; 527 } 528 } 529 if(!printed) 530 print("UTF"); 531 print(" text\n"); 532 } 533 534 void 535 wordfreq(void) 536 { 537 int low, high, mid, r; 538 uchar *p, *p2, c; 539 540 p = buf; 541 for(;;) { 542 while (p < buf+nbuf && !isalpha(*p)) 543 p++; 544 if (p >= buf+nbuf) 545 return; 546 p2 = p; 547 while(p < buf+nbuf && isalpha(*p)) 548 p++; 549 c = *p; 550 *p = 0; 551 high = sizeof(dict)/sizeof(dict[0]); 552 for(low = 0;low < high;) { 553 mid = (low+high)/2; 554 r = strcmp(dict[mid].word, (char*)p2); 555 if(r == 0) { 556 wfreq[dict[mid].class]++; 557 break; 558 } 559 if(r < 0) 560 low = mid+1; 561 else 562 high = mid; 563 } 564 *p++ = c; 565 } 566 } 567 568 typedef struct Filemagic Filemagic; 569 struct Filemagic { 570 ulong x; 571 ulong mask; 572 char *desc; 573 char *mime; 574 }; 575 576 /* 577 * integers in this table must be as seen on a little-endian machine 578 * when read from a file. 579 */ 580 Filemagic long0tab[] = { 581 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 582 /* "pac1" */ 583 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 584 /* "pXc2 */ 585 0x32630070, 0xFFFF00FF, "pac4 audio file\n", OCTET, 586 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 587 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 588 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip", 589 070707, 0xFFFF, "cpio archive\n", OCTET, 590 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 591 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", 592 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be", 593 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le", 594 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be", 595 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le", 596 /* 597 * venti & fossil magic numbers are stored big-endian on disk, 598 * thus the numbers appear reversed in this table. 599 */ 600 0xad4e5cd1, 0xFFFFFFFF, "venti arena\n", OCTET, 601 }; 602 603 int 604 filemagic(Filemagic *tab, int ntab, ulong x) 605 { 606 int i; 607 608 for(i=0; i<ntab; i++) 609 if((x&tab[i].mask) == tab[i].x){ 610 print(mime ? tab[i].mime : tab[i].desc); 611 return 1; 612 } 613 return 0; 614 } 615 616 int 617 long0(void) 618 { 619 return filemagic(long0tab, nelem(long0tab), LENDIAN(buf)); 620 } 621 622 typedef struct Fileoffmag Fileoffmag; 623 struct Fileoffmag { 624 ulong off; 625 Filemagic; 626 }; 627 628 /* 629 * integers in this table must be as seen on a little-endian machine 630 * when read from a file. 631 */ 632 Fileoffmag longofftab[] = { 633 /* 634 * venti & fossil magic numbers are stored big-endian on disk, 635 * thus the numbers appear reversed in this table. 636 */ 637 256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition\n", OCTET, 638 256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section\n", OCTET, 639 128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer\n", OCTET, 640 }; 641 642 int 643 fileoffmagic(Fileoffmag *tab, int ntab) 644 { 645 int i; 646 ulong x; 647 Fileoffmag *tp; 648 uchar buf[sizeof(long)]; 649 650 for(i=0; i<ntab; i++) { 651 tp = tab + i; 652 seek(fd, tp->off, 0); 653 if (read(fd, buf, sizeof buf) != sizeof buf) 654 continue; 655 x = LENDIAN(buf); 656 if((x&tp->mask) == tp->x){ 657 print(mime? tp->mime: tp->desc); 658 return 1; 659 } 660 } 661 return 0; 662 } 663 664 int 665 longoff(void) 666 { 667 return fileoffmagic(longofftab, nelem(longofftab)); 668 } 669 670 int 671 isexec(void) 672 { 673 Fhdr f; 674 675 seek(fd, 0, 0); /* reposition to start of file */ 676 if(crackhdr(fd, &f)) { 677 print(mime ? OCTET : "%s\n", f.name); 678 return 1; 679 } 680 return 0; 681 } 682 683 684 /* from tar.c */ 685 enum { NAMSIZ = 100, TBLOCK = 512 }; 686 687 union hblock 688 { 689 char dummy[TBLOCK]; 690 struct header 691 { 692 char name[NAMSIZ]; 693 char mode[8]; 694 char uid[8]; 695 char gid[8]; 696 char size[12]; 697 char mtime[12]; 698 char chksum[8]; 699 char linkflag; 700 char linkname[NAMSIZ]; 701 /* rest are defined by POSIX's ustar format; see p1003.2b */ 702 char magic[6]; /* "ustar" */ 703 char version[2]; 704 char uname[32]; 705 char gname[32]; 706 char devmajor[8]; 707 char devminor[8]; 708 char prefix[155]; /* if non-null, path = prefix "/" name */ 709 } dbuf; 710 }; 711 712 int 713 checksum(union hblock *hp) 714 { 715 int i; 716 char *cp; 717 struct header *hdr = &hp->dbuf; 718 719 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 720 *cp = ' '; 721 i = 0; 722 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 723 i += *cp & 0xff; 724 return i; 725 } 726 727 int 728 istar(void) 729 { 730 int chksum; 731 char tblock[TBLOCK]; 732 union hblock *hp = (union hblock *)tblock; 733 struct header *hdr = &hp->dbuf; 734 735 seek(fd, 0, 0); /* reposition to start of file */ 736 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 737 return 0; 738 chksum = strtol(hdr->chksum, 0, 8); 739 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 740 if (strcmp(hdr->magic, "ustar") == 0) 741 print(mime? "application/x-ustar\n": 742 "posix tar archive\n"); 743 else 744 print(mime? "application/x-tar\n": "tar archive\n"); 745 return 1; 746 } 747 return 0; 748 } 749 750 /* 751 * initial words to classify file 752 */ 753 struct FILE_STRING 754 { 755 char *key; 756 char *filetype; 757 int length; 758 char *mime; 759 } file_string[] = 760 { 761 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 762 "!<arch>\n", "archive", 8, "application/octet-stream", 763 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 764 "#!/bin/rc", "rc executable file", 9, "text/plain", 765 "#!/bin/sh", "sh executable file", 9, "text/plain", 766 "%!", "postscript", 2, "application/postscript", 767 "\004%!", "postscript", 3, "application/postscript", 768 "x T post", "troff output for post", 8, "application/troff", 769 "x T Latin1", "troff output for Latin1", 10, "application/troff", 770 "x T utf", "troff output for UTF", 7, "application/troff", 771 "x T 202", "troff output for 202", 7, "application/troff", 772 "x T aps", "troff output for aps", 7, "application/troff", 773 "GIF", "GIF image", 3, "image/gif", 774 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 775 "%PDF", "PDF", 4, "application/pdf", 776 "<html>\n", "HTML file", 7, "text/html", 777 "<HTML>\n", "HTML file", 7, "text/html", 778 "\111\111\052\000", "tiff", 4, "image/tiff", 779 "\115\115\000\052", "tiff", 4, "image/tiff", 780 "\377\330\377\340", "jpeg", 4, "image/jpeg", 781 "\377\330\377\341", "jpeg", 4, "image/jpeg", 782 "\377\330\377\333", "jpeg", 4, "image/jpeg", 783 "BM", "bmp", 2, "image/bmp", 784 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 785 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 786 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 787 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 788 "\211PNG", "PNG image", 4, "image/png", 789 "P3\n", "ppm", 3, "image/ppm", 790 "P6\n", "ppm", 3, "image/ppm", 791 "/* XPM */\n", "xbm", 10, "image/xbm", 792 ".HTML ", "troff -ms input", 6, "text/troff", 793 ".LP", "troff -ms input", 3, "text/troff", 794 ".ND", "troff -ms input", 3, "text/troff", 795 ".PP", "troff -ms input", 3, "text/troff", 796 ".TL", "troff -ms input", 3, "text/troff", 797 ".TR", "troff -ms input", 3, "text/troff", 798 ".TH", "manual page", 3, "text/troff", 799 ".\\\"", "troff input", 3, "text/troff", 800 ".de", "troff input", 3, "text/troff", 801 ".if", "troff input", 3, "text/troff", 802 ".nr", "troff input", 3, "text/troff", 803 ".tr", "troff input", 3, "text/troff", 804 "vac:", "venti score", 4, "text/plain", 805 0,0,0,0 806 }; 807 808 int 809 istring(void) 810 { 811 int i; 812 struct FILE_STRING *p; 813 814 for(p = file_string; p->key; p++) { 815 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) { 816 if(mime) 817 print("%s\n", p->mime); 818 else 819 print("%s\n", p->filetype); 820 return 1; 821 } 822 } 823 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 824 for(i = 5; i < nbuf; i++) 825 if(buf[i] == '\n') 826 break; 827 if(mime) 828 print(OCTET); 829 else 830 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 831 return 1; 832 } 833 return 0; 834 } 835 836 struct offstr 837 { 838 ulong off; 839 struct FILE_STRING; 840 } offstrs[] = { 841 32*1024, "\001CD001\001", "ISO9660 CD image", 7, OCTET, 842 0, 0, 0, 0, 0 843 }; 844 845 int 846 isoffstr(void) 847 { 848 int n; 849 char buf[256]; 850 struct offstr *p; 851 852 for(p = offstrs; p->key; p++) { 853 seek(fd, p->off, 0); 854 n = p->length; 855 if (n > sizeof buf) 856 n = sizeof buf; 857 if (read(fd, buf, n) != n) 858 continue; 859 if(memcmp(buf, p->key, n) == 0) { 860 if(mime) 861 print("%s\n", p->mime); 862 else 863 print("%s\n", p->filetype); 864 return 1; 865 } 866 } 867 return 0; 868 } 869 870 int 871 iff(void) 872 { 873 if (strncmp((char*)buf, "FORM", 4) == 0 && 874 strncmp((char*)buf+8, "AIFF", 4) == 0) { 875 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 876 return 1; 877 } 878 return 0; 879 } 880 881 char* html_string[] = 882 { 883 "title", 884 "body", 885 "head", 886 "strong", 887 "h1", 888 "h2", 889 "h3", 890 "h4", 891 "h5", 892 "h6", 893 "ul", 894 "li", 895 "dl", 896 "br", 897 "em", 898 0, 899 }; 900 901 int 902 ishtml(void) 903 { 904 uchar *p, *q; 905 int i, count; 906 907 /* compare strings between '<' and '>' to html table */ 908 count = 0; 909 p = buf; 910 for(;;) { 911 while (p < buf+nbuf && *p != '<') 912 p++; 913 p++; 914 if (p >= buf+nbuf) 915 break; 916 if(*p == '/') 917 p++; 918 q = p; 919 while(p < buf+nbuf && *p != '>') 920 p++; 921 if (p >= buf+nbuf) 922 break; 923 for(i = 0; html_string[i]; i++) { 924 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 925 if(count++ > 4) { 926 print(mime ? "text/html\n" : "HTML file\n"); 927 return 1; 928 } 929 break; 930 } 931 } 932 p++; 933 } 934 return 0; 935 } 936 937 char* rfc822_string[] = 938 { 939 "from:", 940 "date:", 941 "to:", 942 "subject:", 943 "received:", 944 "reply to:", 945 "sender:", 946 0, 947 }; 948 949 int 950 isrfc822(void) 951 { 952 953 char *p, *q, *r; 954 int i, count; 955 956 count = 0; 957 p = (char*)buf; 958 for(;;) { 959 q = strchr(p, '\n'); 960 if(q == nil) 961 break; 962 *q = 0; 963 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 964 count++; 965 *q = '\n'; 966 p = q+1; 967 continue; 968 } 969 *q = '\n'; 970 if(*p != '\t' && *p != ' '){ 971 r = strchr(p, ':'); 972 if(r == 0 || r > q) 973 break; 974 for(i = 0; rfc822_string[i]; i++) { 975 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 976 count++; 977 break; 978 } 979 } 980 } 981 p = q+1; 982 } 983 if(count >= 3){ 984 print(mime ? "message/rfc822\n" : "email file\n"); 985 return 1; 986 } 987 return 0; 988 } 989 990 int 991 ismbox(void) 992 { 993 char *p, *q; 994 995 p = (char*)buf; 996 q = strchr(p, '\n'); 997 if(q == nil) 998 return 0; 999 *q = 0; 1000 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 1001 print(mime ? "text/plain\n" : "mail box\n"); 1002 return 1; 1003 } 1004 *q = '\n'; 1005 return 0; 1006 } 1007 1008 int 1009 iscint(void) 1010 { 1011 int type; 1012 char *name; 1013 Biobuf b; 1014 1015 if(Binit(&b, fd, OREAD) == Beof) 1016 return 0; 1017 seek(fd, 0, 0); 1018 type = objtype(&b, &name); 1019 if(type < 0) 1020 return 0; 1021 if(mime) 1022 print(OCTET); 1023 else 1024 print("%s intermediate\n", name); 1025 return 1; 1026 } 1027 1028 int 1029 isc(void) 1030 { 1031 int n; 1032 1033 n = wfreq[I1]; 1034 /* 1035 * includes 1036 */ 1037 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1038 goto yes; 1039 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 1040 goto yes; 1041 /* 1042 * declarations 1043 */ 1044 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 1045 goto yes; 1046 /* 1047 * assignments 1048 */ 1049 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 1050 goto yes; 1051 return 0; 1052 1053 yes: 1054 if(mime){ 1055 print(PLAIN); 1056 return 1; 1057 } 1058 if(wfreq[Alword] > 0) 1059 print("alef program\n"); 1060 else 1061 print("c program\n"); 1062 return 1; 1063 } 1064 1065 int 1066 islimbo(void) 1067 { 1068 1069 /* 1070 * includes 1071 */ 1072 if(wfreq[Lword] < 4) 1073 return 0; 1074 print(mime ? PLAIN : "limbo program\n"); 1075 return 1; 1076 } 1077 1078 int 1079 isas(void) 1080 { 1081 1082 /* 1083 * includes 1084 */ 1085 if(wfreq[Aword] < 2) 1086 return 0; 1087 print(mime ? PLAIN : "as program\n"); 1088 return 1; 1089 } 1090 1091 /* 1092 * low entropy means encrypted 1093 */ 1094 int 1095 ismung(void) 1096 { 1097 int i, bucket[8]; 1098 float cs; 1099 1100 if(nbuf < 64) 1101 return 0; 1102 memset(bucket, 0, sizeof(bucket)); 1103 for(i=nbuf-64; i<nbuf; i++) 1104 bucket[(buf[i]>>5)&07] += 1; 1105 1106 cs = 0.; 1107 for(i=0; i<8; i++) 1108 cs += (bucket[i]-8)*(bucket[i]-8); 1109 cs /= 8.; 1110 if(cs <= 24.322) { 1111 if(buf[0]==0x1f && buf[1]==0x9d) 1112 print(mime ? OCTET : "compressed\n"); 1113 else 1114 if(buf[0]==0x1f && buf[1]==0x8b) 1115 print(mime ? OCTET : "gzip compressed\n"); 1116 else 1117 if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h') 1118 print(mime ? OCTET : "bzip2 compressed\n"); 1119 else 1120 print(mime ? OCTET : "encrypted\n"); 1121 return 1; 1122 } 1123 return 0; 1124 } 1125 1126 /* 1127 * english by punctuation and frequencies 1128 */ 1129 int 1130 isenglish(void) 1131 { 1132 int vow, comm, rare, badpun, punct; 1133 char *p; 1134 1135 if(guess != Fascii && guess != Feascii) 1136 return 0; 1137 badpun = 0; 1138 punct = 0; 1139 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 1140 switch(*p) { 1141 case '.': 1142 case ',': 1143 case ')': 1144 case '%': 1145 case ';': 1146 case ':': 1147 case '?': 1148 punct++; 1149 if(p[1] != ' ' && p[1] != '\n') 1150 badpun++; 1151 } 1152 if(badpun*5 > punct) 1153 return 0; 1154 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 1155 return 0; 1156 if(2*cfreq[';'] > cfreq['e']) 1157 return 0; 1158 1159 vow = 0; 1160 for(p="AEIOU"; *p; p++) { 1161 vow += cfreq[*p]; 1162 vow += cfreq[tolower(*p)]; 1163 } 1164 comm = 0; 1165 for(p="ETAION"; *p; p++) { 1166 comm += cfreq[*p]; 1167 comm += cfreq[tolower(*p)]; 1168 } 1169 rare = 0; 1170 for(p="VJKQXZ"; *p; p++) { 1171 rare += cfreq[*p]; 1172 rare += cfreq[tolower(*p)]; 1173 } 1174 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 1175 print(mime ? PLAIN : "English text\n"); 1176 return 1; 1177 } 1178 return 0; 1179 } 1180 1181 /* 1182 * pick up a number with 1183 * syntax _*[0-9]+_ 1184 */ 1185 #define P9BITLEN 12 1186 int 1187 p9bitnum(uchar *bp) 1188 { 1189 int n, c, len; 1190 1191 len = P9BITLEN; 1192 while(*bp == ' ') { 1193 bp++; 1194 len--; 1195 if(len <= 0) 1196 return -1; 1197 } 1198 n = 0; 1199 while(len > 1) { 1200 c = *bp++; 1201 if(!isdigit(c)) 1202 return -1; 1203 n = n*10 + c-'0'; 1204 len--; 1205 } 1206 if(*bp != ' ') 1207 return -1; 1208 return n; 1209 } 1210 1211 int 1212 depthof(char *s, int *newp) 1213 { 1214 char *es; 1215 int d; 1216 1217 *newp = 0; 1218 es = s+12; 1219 while(s<es && *s==' ') 1220 s++; 1221 if(s == es) 1222 return -1; 1223 if('0'<=*s && *s<='9') 1224 return 1<<strtol(s, 0, 0); 1225 1226 *newp = 1; 1227 d = 0; 1228 while(s<es && *s!=' '){ 1229 s++; /* skip letter */ 1230 d += strtoul(s, &s, 10); 1231 } 1232 1233 if(d % 8 == 0 || 8 % d == 0) 1234 return d; 1235 else 1236 return -1; 1237 } 1238 1239 int 1240 isp9bit(void) 1241 { 1242 int dep, lox, loy, hix, hiy, px, new, cmpr; 1243 ulong t; 1244 long len; 1245 char *newlabel; 1246 uchar *cp; 1247 1248 cp = buf; 1249 cmpr = 0; 1250 newlabel = "old "; 1251 1252 if(memcmp(cp, "compressed\n", 11) == 0) { 1253 cmpr = 1; 1254 cp = buf + 11; 1255 } 1256 1257 dep = depthof((char*)cp + 0*P9BITLEN, &new); 1258 if(new) 1259 newlabel = ""; 1260 lox = p9bitnum(cp + 1*P9BITLEN); 1261 loy = p9bitnum(cp + 2*P9BITLEN); 1262 hix = p9bitnum(cp + 3*P9BITLEN); 1263 hiy = p9bitnum(cp + 4*P9BITLEN); 1264 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1265 return 0; 1266 1267 if(dep < 8){ 1268 px = 8/dep; /* pixels per byte */ 1269 /* set l to number of bytes of data per scan line */ 1270 if(lox >= 0) 1271 len = (hix+px-1)/px - lox/px; 1272 else{ /* make positive before divide */ 1273 t = (-lox)+px-1; 1274 t = (t/px)*px; 1275 len = (t+hix+px-1)/px; 1276 } 1277 }else 1278 len = (hix-lox)*dep/8; 1279 len *= hiy - loy; /* col length */ 1280 len += 5 * P9BITLEN; /* size of initial ascii */ 1281 1282 /* 1283 * for compressed images, don't look any further. otherwise: 1284 * for image file, length is non-zero and must match calculation above 1285 * for /dev/window and /dev/screen the length is always zero 1286 * for subfont, the subfont header should follow immediately. 1287 */ 1288 if (cmpr) { 1289 print(mime ? OCTET : "Compressed %splan 9 image or subfont, depth %d\n", 1290 newlabel, dep); 1291 return 1; 1292 } 1293 if (len != 0 && mbuf->length == 0) { 1294 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1295 return 1; 1296 } 1297 if (mbuf->length == len) { 1298 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1299 return 1; 1300 } 1301 /* Ghostscript sometimes produces a little extra on the end */ 1302 if (mbuf->length < len+P9BITLEN) { 1303 print(mime ? OCTET : "%splan 9 image, depth %d\n", newlabel, dep); 1304 return 1; 1305 } 1306 if (p9subfont(buf+len)) { 1307 print(mime ? OCTET : "%ssubfont file, depth %d\n", newlabel, dep); 1308 return 1; 1309 } 1310 return 0; 1311 } 1312 1313 int 1314 p9subfont(uchar *p) 1315 { 1316 int n, h, a; 1317 1318 /* if image too big, assume it's a subfont */ 1319 if (p+3*P9BITLEN > buf+sizeof(buf)) 1320 return 1; 1321 1322 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1323 if (n < 0) 1324 return 0; 1325 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1326 if (h < 0) 1327 return 0; 1328 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1329 if (a < 0) 1330 return 0; 1331 return 1; 1332 } 1333 1334 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1335 1336 int 1337 isp9font(void) 1338 { 1339 uchar *cp, *p; 1340 int i, n; 1341 char pathname[1024]; 1342 1343 cp = buf; 1344 if (!getfontnum(cp, &cp)) /* height */ 1345 return 0; 1346 if (!getfontnum(cp, &cp)) /* ascent */ 1347 return 0; 1348 for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) { 1349 if (!getfontnum(cp, &cp)) /* min */ 1350 break; 1351 if (!getfontnum(cp, &cp)) /* max */ 1352 return 0; 1353 getfontnum(cp, &cp); /* optional offset */ 1354 while (WHITESPACE(*cp)) 1355 cp++; 1356 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1357 ; 1358 /* construct a path name, if needed */ 1359 n = 0; 1360 if (*p != '/' && slash) { 1361 n = slash-fname+1; 1362 if (n < sizeof(pathname)) 1363 memcpy(pathname, fname, n); 1364 else n = 0; 1365 } 1366 if (n+cp-p+4 < sizeof(pathname)) { 1367 memcpy(pathname+n, p, cp-p); 1368 n += cp-p; 1369 pathname[n] = 0; 1370 if (access(pathname, AEXIST) < 0) { 1371 strcpy(pathname+n, ".0"); 1372 if (access(pathname, AEXIST) < 0) 1373 return 0; 1374 } 1375 } 1376 } 1377 if (i) { 1378 print(mime ? "text/plain\n" : "font file\n"); 1379 return 1; 1380 } 1381 return 0; 1382 } 1383 1384 int 1385 getfontnum(uchar *cp, uchar **rp) 1386 { 1387 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1388 cp++; 1389 if (*cp < '0' || *cp > '9') 1390 return 0; 1391 strtoul((char *)cp, (char **)rp, 0); 1392 if (!WHITESPACE(**rp)) { 1393 *rp = cp; 1394 return 0; 1395 } 1396 return 1; 1397 } 1398 1399 int 1400 isrtf(void) 1401 { 1402 if(strstr((char *)buf, "\\rtf1")){ 1403 print(mime ? "application/rtf\n" : "rich text format\n"); 1404 return 1; 1405 } 1406 return 0; 1407 } 1408 1409 int 1410 ismsdos(void) 1411 { 1412 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1413 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1414 return 1; 1415 } 1416 return 0; 1417 } 1418 1419 int 1420 iself(void) 1421 { 1422 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1423 [1] "WE32100", 1424 [2] "SPARC", 1425 [3] "i386", 1426 [4] "M68000", 1427 [5] "M88000", 1428 [6] "i486", 1429 [7] "i860", 1430 [8] "R3000", 1431 [9] "S370", 1432 [10] "R4000", 1433 [15] "HP-PA", 1434 [18] "sparc v8+", 1435 [19] "i960", 1436 [20] "PPC-32", 1437 [21] "PPC-64", 1438 [40] "ARM", 1439 [41] "Alpha", 1440 [43] "sparc v9", 1441 [50] "IA-64", 1442 [62] "AMD64", 1443 [75] "VAX", 1444 }; 1445 static char *type[] = { 1446 [1] "relocatable object", 1447 [2] "executable", 1448 [3] "shared library", 1449 [4] "core dump", 1450 }; 1451 1452 if (memcmp(buf, "\x7fELF", 4) == 0){ 1453 if (!mime){ 1454 int isdifend = 0; 1455 int n = (buf[19] << 8) | buf[18]; 1456 char *p = "unknown"; 1457 char *t = "unknown"; 1458 1459 if (n > 0 && n < nelem(cpu) && cpu[n]) 1460 p = cpu[n]; 1461 else { 1462 /* try the other byte order */ 1463 isdifend = 1; 1464 n = (buf[18] << 8) | buf[19]; 1465 if (n > 0 && n < nelem(cpu) && cpu[n]) 1466 p = cpu[n]; 1467 } 1468 if(isdifend) 1469 n = (buf[16]<< 8) | buf[17]; 1470 else 1471 n = (buf[17]<< 8) | buf[16]; 1472 1473 if(n>0 && n < nelem(type) && type[n]) 1474 t = type[n]; 1475 print("%s ELF %s\n", p, t); 1476 } 1477 else 1478 print("application/x-elf-executable"); 1479 return 1; 1480 } 1481 1482 return 0; 1483 } 1484 1485 int 1486 isface(void) 1487 { 1488 int i, j, ldepth, l; 1489 char *p; 1490 1491 ldepth = -1; 1492 for(j = 0; j < 3; j++){ 1493 for(p = (char*)buf, i=0; i<3; i++){ 1494 if(p[0] != '0' || p[1] != 'x') 1495 return 0; 1496 if(buf[2+8] == ',') 1497 l = 2; 1498 else if(buf[2+4] == ',') 1499 l = 1; 1500 else 1501 return 0; 1502 if(ldepth == -1) 1503 ldepth = l; 1504 if(l != ldepth) 1505 return 0; 1506 strtoul(p, &p, 16); 1507 if(*p++ != ',') 1508 return 0; 1509 while(*p == ' ' || *p == '\t') 1510 p++; 1511 } 1512 if (*p++ != '\n') 1513 return 0; 1514 } 1515 1516 if(mime) 1517 print("application/x-face\n"); 1518 else 1519 print("face image depth %d\n", ldepth); 1520 return 1; 1521 } 1522 1523