1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 5 /* 6 bugs: 7 00/ff for end of file can conflict with 00/ff characters 8 */ 9 10 enum 11 { 12 Nline = 100000, /* max number of lines saved in memory */ 13 Nmerge = 10, /* max number of temporary files merged */ 14 Nfield = 20, /* max number of argument fields */ 15 16 Bflag = 1<<0, /* flags per field */ 17 B1flag = 1<<1, 18 19 Dflag = 1<<2, 20 Fflag = 1<<3, 21 Gflag = 1<<4, 22 Iflag = 1<<5, 23 Mflag = 1<<6, 24 Nflag = 1<<7, 25 Rflag = 1<<8, 26 Wflag = 1<<9, 27 28 NSstart = 0, /* states for number to key decoding */ 29 NSsign, 30 NSzero, 31 NSdigit, 32 NSpoint, 33 NSfract, 34 NSzerofract, 35 NSexp, 36 NSexpsign, 37 NSexpdigit, 38 }; 39 40 typedef struct Line Line; 41 typedef struct Key Key; 42 typedef struct Merge Merge; 43 typedef struct Field Field; 44 45 struct Line 46 { 47 Key* key; 48 int llen; /* always >= 1 */ 49 uchar line[1]; /* always ends in '\n' */ 50 }; 51 52 struct Merge 53 { 54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */ 55 Line* line; /* line at the head of a merged temp file */ 56 int fd; /* file descriptor */ 57 Biobuf b; /* iobuf for reading a temp file */ 58 }; 59 60 struct Key 61 { 62 int klen; 63 uchar key[1]; 64 }; 65 66 struct Field 67 { 68 int beg1; 69 int beg2; 70 int end1; 71 int end2; 72 73 long flags; 74 uchar mapto[256]; 75 76 void (*dokey)(Key*, uchar*, uchar*, Field*); 77 }; 78 79 struct args 80 { 81 char* ofile; 82 char* tname; 83 Rune tabchar; 84 char cflag; 85 char uflag; 86 char vflag; 87 int nfield; 88 int nfile; 89 Field field[Nfield]; 90 91 Line** linep; 92 long nline; /* number of lines in this temp file */ 93 long lineno; /* overall ordinal for -s option */ 94 int ntemp; 95 } args; 96 97 extern int latinmap[]; 98 extern Rune* month[12]; 99 100 void buildkey(Line*); 101 void doargs(int, char*[]); 102 void dofield(char*, int*, int*, int, int); 103 void dofile(Biobuf*); 104 void dokey_(Key*, uchar*, uchar*, Field*); 105 void dokey_dfi(Key*, uchar*, uchar*, Field*); 106 void dokey_gn(Key*, uchar*, uchar*, Field*); 107 void dokey_m(Key*, uchar*, uchar*, Field*); 108 void dokey_r(Key*, uchar*, uchar*, Field*); 109 void done(char*); 110 int kcmp(Key*, Key*); 111 void makemapd(Field*); 112 void makemapm(Field*); 113 void mergefiles(int, int, Biobuf*); 114 void mergeout(Biobuf*); 115 void newfield(void); 116 Line* newline(Biobuf*); 117 void nomem(void); 118 void notifyf(void*, char*); 119 void printargs(void); 120 void printout(Biobuf*); 121 void setfield(int, int); 122 uchar* skip(uchar*, int, int, int, int); 123 void sort4(void*, ulong); 124 char* tempfile(int); 125 void tempout(void); 126 void lineout(Biobuf*, Line*); 127 128 void 129 main(int argc, char *argv[]) 130 { 131 int i, f; 132 char *s; 133 Biobuf bbuf; 134 135 notify(notifyf); /**/ 136 doargs(argc, argv); 137 if(args.vflag) 138 printargs(); 139 140 for(i=1; i<argc; i++) { 141 s = argv[i]; 142 if(s == 0) 143 continue; 144 if(strcmp(s, "-") == 0) { 145 Binit(&bbuf, 0, OREAD); 146 dofile(&bbuf); 147 Bterm(&bbuf); 148 continue; 149 } 150 f = open(s, OREAD); 151 if(f < 0) { 152 fprint(2, "sort: open %s: %r\n", s); 153 done("open"); 154 } 155 Binit(&bbuf, f, OREAD); 156 dofile(&bbuf); 157 Bterm(&bbuf); 158 close(f); 159 } 160 if(args.nfile == 0) { 161 Binit(&bbuf, 0, OREAD); 162 dofile(&bbuf); 163 Bterm(&bbuf); 164 } 165 if(args.cflag) 166 done(0); 167 if(args.vflag) 168 fprint(2, "=========\n"); 169 170 f = 1; 171 if(args.ofile) { 172 f = create(args.ofile, OWRITE, 0666); 173 if(f < 0) { 174 fprint(2, "sort: create %s: %r\n", args.ofile); 175 done("create"); 176 } 177 } 178 179 Binit(&bbuf, f, OWRITE); 180 if(args.ntemp) { 181 tempout(); 182 mergeout(&bbuf); 183 } else { 184 printout(&bbuf); 185 } 186 Bterm(&bbuf); 187 done(0); 188 } 189 190 void 191 dofile(Biobuf *b) 192 { 193 Line *l, *ol; 194 int n; 195 196 if(args.cflag) { 197 ol = newline(b); 198 if(ol == 0) 199 return; 200 for(;;) { 201 l = newline(b); 202 if(l == 0) 203 break; 204 n = kcmp(ol->key, l->key); 205 if(n > 0 || (n == 0 && args.uflag)) { 206 /* no output shall be produced 207 fprint(2, "sort: -c file not in sort\n"); /**/ 208 done("order"); 209 } 210 free(ol->key); 211 free(ol); 212 ol = l; 213 } 214 return; 215 } 216 217 if(args.linep == 0) { 218 args.linep = malloc(Nline * sizeof(args.linep)); 219 if(args.linep == 0) 220 nomem(); 221 } 222 for(;;) { 223 l = newline(b); 224 if(l == 0) 225 break; 226 if(args.nline >= Nline) 227 tempout(); 228 args.linep[args.nline] = l; 229 args.nline++; 230 args.lineno++; 231 } 232 } 233 234 void 235 notifyf(void *a, char *s) 236 { 237 238 USED(a); 239 if(strcmp(s, "interrupt") == 0) 240 done(0); 241 if(strcmp(s, "hangup") == 0) 242 done(0); 243 if(strcmp(s, "kill") == 0) 244 done(0); 245 if(strcmp(s, "sys: write on closed pipe") == 0) 246 done(0); 247 fprint(2, "sort: note: %s\n", s); 248 abort(); 249 } 250 251 Line* 252 newline(Biobuf *b) 253 { 254 Line *l; 255 char *p; 256 int n, c; 257 258 p = Brdline(b, '\n'); 259 n = Blinelen(b); 260 if(p == 0) { 261 if(n == 0) 262 return 0; 263 l = 0; 264 for(n=0;;) { 265 if((n & 31) == 0) { 266 l = realloc(l, sizeof(Line) + 267 (n+31)*sizeof(l->line[0])); 268 if(l == 0) 269 nomem(); 270 } 271 c = Bgetc(b); 272 if(c < 0) { 273 fprint(2, "sort: newline added\n"); 274 c = '\n'; 275 } 276 l->line[n++] = c; 277 if(c == '\n') 278 break; 279 } 280 l->llen = n; 281 buildkey(l); 282 return l; 283 } 284 l = malloc(sizeof(Line) + 285 (n-1)*sizeof(l->line[0])); 286 if(l == 0) 287 nomem(); 288 l->llen = n; 289 memmove(l->line, p, n); 290 buildkey(l); 291 return l; 292 } 293 294 void 295 lineout(Biobuf *b, Line *l) 296 { 297 int n, m; 298 299 n = l->llen; 300 m = Bwrite(b, l->line, n); 301 if(n != m) 302 exits("write"); 303 } 304 305 void 306 tempout(void) 307 { 308 long n; 309 Line **lp, *l; 310 char *tf; 311 int f; 312 Biobuf tb; 313 314 sort4(args.linep, args.nline); 315 tf = tempfile(args.ntemp); 316 args.ntemp++; 317 f = create(tf, OWRITE, 0666); 318 if(f < 0) { 319 fprint(2, "sort: create %s: %r\n", tf); 320 done("create"); 321 } 322 323 Binit(&tb, f, OWRITE); 324 lp = args.linep; 325 for(n=args.nline; n>0; n--) { 326 l = *lp++; 327 lineout(&tb, l); 328 free(l->key); 329 free(l); 330 } 331 args.nline = 0; 332 Bterm(&tb); 333 close(f); 334 } 335 336 void 337 done(char *xs) 338 { 339 int i; 340 341 for(i=0; i<args.ntemp; i++) 342 remove(tempfile(i)); 343 exits(xs); 344 } 345 346 void 347 nomem(void) 348 { 349 fprint(2, "sort: out of memory\n"); 350 done("mem"); 351 } 352 353 char* 354 tempfile(int n) 355 { 356 static char file[20]; 357 static int pid; 358 char *dir; 359 360 dir = "/tmp"; 361 if(args.tname) 362 dir = args.tname; 363 if(pid == 0) 364 pid = getpid() % 1000; 365 if(pid == 0) 366 pid = 1; 367 sprint(file, "%s/sort.%.3d.%.3d", dir, pid, n); 368 return file; 369 } 370 371 void 372 mergeout(Biobuf *b) 373 { 374 int n, i, f; 375 char *tf; 376 Biobuf tb; 377 378 for(i=0; i<args.ntemp; i+=n) { 379 n = args.ntemp - i; 380 if(n > Nmerge) { 381 tf = tempfile(args.ntemp); 382 args.ntemp++; 383 f = create(tf, OWRITE, 0666); 384 if(f < 0) { 385 fprint(2, "sort: create %s: %r\n", tf); 386 done("create"); 387 } 388 Binit(&tb, f, OWRITE); 389 390 n = Nmerge; 391 mergefiles(i, n, &tb); 392 393 Bterm(&tb); 394 close(f); 395 } else 396 mergefiles(i, n, b); 397 } 398 } 399 400 void 401 mergefiles(int t, int n, Biobuf *b) 402 { 403 Merge *m, *mp, **mmp; 404 Key *ok; 405 Line *l; 406 char *tf; 407 int i, f, nn; 408 409 mmp = malloc(n*sizeof(*mmp)); 410 mp = malloc(n*sizeof(*mp)); 411 if(mmp == 0 || mp == 0) 412 nomem(); 413 414 nn = 0; 415 m = mp; 416 for(i=0; i<n; i++,m++) { 417 tf = tempfile(t+i); 418 f = open(tf, OREAD); 419 if(f < 0) { 420 fprint(2, "sort: reopen %s: %r\n", tf); 421 done("open"); 422 } 423 m->fd = f; 424 Binit(&m->b, f, OREAD); 425 mmp[nn] = m; 426 427 l = newline(&m->b); 428 if(l == 0) 429 continue; 430 nn++; 431 m->line = l; 432 m->key = l->key; 433 } 434 435 ok = 0; 436 for(;;) { 437 sort4(mmp, nn); 438 m = *mmp; 439 if(nn == 0) 440 break; 441 for(;;) { 442 l = m->line; 443 if(args.uflag && ok && kcmp(ok, l->key) == 0) { 444 free(l->key); 445 } else { 446 lineout(b, l); 447 if(ok) 448 free(ok); 449 ok = l->key; 450 free(l); 451 } 452 453 l = newline(&m->b); 454 if(l == 0) { 455 nn--; 456 mmp[0] = mmp[nn]; 457 break; 458 } 459 m->line = l; 460 m->key = l->key; 461 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0) 462 break; 463 } 464 } 465 if(ok) 466 free(ok); 467 468 m = mp; 469 for(i=0; i<n; i++,m++) { 470 Bterm(&m->b); 471 close(m->fd); 472 } 473 474 free(mp); 475 free(mmp); 476 } 477 478 int 479 kcmp(Key *ka, Key *kb) 480 { 481 int n, m; 482 483 /* 484 * set n to length of smaller key 485 */ 486 n = ka->klen; 487 m = kb->klen; 488 if(n > m) 489 n = m; 490 return memcmp(ka->key, kb->key, n); 491 } 492 493 void 494 printout(Biobuf *b) 495 { 496 long n; 497 Line **lp, *l; 498 Key *ok; 499 500 sort4(args.linep, args.nline); 501 lp = args.linep; 502 ok = 0; 503 for(n=args.nline; n>0; n--) { 504 l = *lp++; 505 if(args.uflag && ok && kcmp(ok, l->key) == 0) 506 continue; 507 lineout(b, l); 508 ok = l->key; 509 } 510 } 511 512 void 513 setfield(int n, int c) 514 { 515 Field *f; 516 517 f = &args.field[n]; 518 switch(c) { 519 default: 520 fprint(2, "sort: unknown option: field.%C\n", c); 521 done("option"); 522 case 'b': /* skip blanks */ 523 f->flags |= Bflag; 524 break; 525 case 'd': /* directory order */ 526 f->flags |= Dflag; 527 break; 528 case 'f': /* fold case */ 529 f->flags |= Fflag; 530 break; 531 case 'g': /* floating point -n case */ 532 f->flags |= Gflag; 533 break; 534 case 'i': /* ignore non-ascii */ 535 f->flags |= Iflag; 536 break; 537 case 'M': /* month */ 538 f->flags |= Mflag; 539 break; 540 case 'n': /* numbers */ 541 f->flags |= Nflag; 542 break; 543 case 'r': /* reverse */ 544 f->flags |= Rflag; 545 break; 546 case 'w': /* ignore white */ 547 f->flags |= Wflag; 548 break; 549 } 550 } 551 552 void 553 dofield(char *s, int *n1, int *n2, int off1, int off2) 554 { 555 int c, n; 556 557 c = *s++; 558 if(c >= '0' && c <= '9') { 559 n = 0; 560 while(c >= '0' && c <= '9') { 561 n = n*10 + (c-'0'); 562 c = *s++; 563 } 564 n -= off1; /* posix committee: rot in hell */ 565 if(n < 0) { 566 fprint(2, "sort: field offset must be positive\n"); 567 done("option"); 568 } 569 *n1 = n; 570 } 571 if(c == '.') { 572 c = *s++; 573 if(c >= '0' && c <= '9') { 574 n = 0; 575 while(c >= '0' && c <= '9') { 576 n = n*10 + (c-'0'); 577 c = *s++; 578 } 579 n -= off2; 580 if(n < 0) { 581 fprint(2, "sort: character offset must be positive\n"); 582 done("option"); 583 } 584 *n2 = n; 585 } 586 } 587 while(c != 0) { 588 setfield(args.nfield, c); 589 c = *s++; 590 } 591 } 592 593 void 594 printargs(void) 595 { 596 int i, n; 597 Field *f; 598 char *prefix; 599 600 fprint(2, "sort"); 601 for(i=0; i<=args.nfield; i++) { 602 f = &args.field[i]; 603 prefix = " -"; 604 if(i) { 605 n = f->beg1; 606 if(n >= 0) 607 fprint(2, " +%d", n); 608 else 609 fprint(2, " +*"); 610 n = f->beg2; 611 if(n >= 0) 612 fprint(2, ".%d", n); 613 else 614 fprint(2, ".*"); 615 616 if(f->flags & B1flag) 617 fprint(2, "b"); 618 619 n = f->end1; 620 if(n >= 0) 621 fprint(2, " -%d", n); 622 else 623 fprint(2, " -*"); 624 n = f->end2; 625 if(n >= 0) 626 fprint(2, ".%d", n); 627 else 628 fprint(2, ".*"); 629 prefix = ""; 630 } 631 if(f->flags & Bflag) 632 fprint(2, "%sb", prefix); 633 if(f->flags & Dflag) 634 fprint(2, "%sd", prefix); 635 if(f->flags & Fflag) 636 fprint(2, "%sf", prefix); 637 if(f->flags & Gflag) 638 fprint(2, "%sg", prefix); 639 if(f->flags & Iflag) 640 fprint(2, "%si", prefix); 641 if(f->flags & Mflag) 642 fprint(2, "%sM", prefix); 643 if(f->flags & Nflag) 644 fprint(2, "%sn", prefix); 645 if(f->flags & Rflag) 646 fprint(2, "%sr", prefix); 647 if(f->flags & Wflag) 648 fprint(2, "%sw", prefix); 649 } 650 if(args.cflag) 651 fprint(2, " -c"); 652 if(args.uflag) 653 fprint(2, " -u"); 654 if(args.ofile) 655 fprint(2, " -o %s", args.ofile); 656 fprint(2, "\n"); 657 } 658 659 void 660 newfield(void) 661 { 662 int n; 663 Field *f; 664 665 n = args.nfield + 1; 666 if(n >= Nfield) { 667 fprint(2, "sort: too many fields specified\n"); 668 done("option"); 669 } 670 args.nfield = n; 671 f = &args.field[n]; 672 f->beg1 = -1; 673 f->beg2 = -1; 674 f->end1 = -1; 675 f->end2 = -1; 676 } 677 678 void 679 doargs(int argc, char *argv[]) 680 { 681 int i, c, hadplus; 682 char *s, *p, *q; 683 Field *f; 684 685 hadplus = 0; 686 for(i=1; i<argc; i++) { 687 s = argv[i]; 688 c = *s++; 689 if(c == '-') { 690 c = *s; 691 if(c == 0) /* forced end of arg marker */ 692 break; 693 argv[i] = 0; /* clobber args processed */ 694 if(c == '.' || (c >= '0' && c <= '9')) { 695 if(!hadplus) 696 newfield(); 697 f = &args.field[args.nfield]; 698 dofield(s, &f->end1, &f->end2, 0, 0); 699 hadplus = 0; 700 continue; 701 } 702 703 while(c = *s++) 704 switch(c) { 705 case '-': /* end of options */ 706 i = argc; 707 continue; 708 case 'T': /* temp directory */ 709 if(*s == 0) { 710 i++; 711 if(i < argc) { 712 args.tname = argv[i]; 713 argv[i] = 0; 714 } 715 } else 716 args.tname = s; 717 s = strchr(s, 0); 718 break; 719 case 'o': /* output file */ 720 if(*s == 0) { 721 i++; 722 if(i < argc) { 723 args.ofile = argv[i]; 724 argv[i] = 0; 725 } 726 } else 727 args.ofile = s; 728 s = strchr(s, 0); 729 break; 730 case 'k': /* posix key (what were they thinking?) */ 731 p = 0; 732 if(*s == 0) { 733 i++; 734 if(i < argc) { 735 p = argv[i]; 736 argv[i] = 0; 737 } 738 } else 739 p = s; 740 s = strchr(s, 0); 741 if(p == 0) 742 break; 743 744 newfield(); 745 q = strchr(p, ','); 746 if(q) 747 *q++ = 0; 748 f = &args.field[args.nfield]; 749 dofield(p, &f->beg1, &f->beg2, 1, 1); 750 if(f->flags & Bflag) { 751 f->flags |= B1flag; 752 f->flags &= ~Bflag; 753 } 754 if(q) { 755 dofield(q, &f->end1, &f->end2, 1, 0); 756 if(f->end2 <= 0) 757 f->end1++; 758 } 759 hadplus = 0; 760 break; 761 case 't': /* tab character */ 762 if(*s == 0) { 763 i++; 764 if(i < argc) { 765 chartorune(&args.tabchar, argv[i]); 766 argv[i] = 0; 767 } 768 } else 769 s += chartorune(&args.tabchar, s); 770 if(args.tabchar == '\n') { 771 fprint(2, "aw come on, rob\n"); 772 done("rob"); 773 } 774 break; 775 case 'c': /* check order */ 776 args.cflag = 1; 777 break; 778 case 'u': /* unique */ 779 args.uflag = 1; 780 break; 781 case 'v': /* debugging noise */ 782 args.vflag = 1; 783 break; 784 785 case 'M': /* month */ 786 case 'b': /* skip blanks */ 787 case 'd': /* directory order */ 788 case 'f': /* fold case */ 789 case 'g': /* floating numbers */ 790 case 'i': /* ignore non-ascii */ 791 case 'n': /* numbers */ 792 case 'r': /* reverse */ 793 case 'w': /* ignore white */ 794 if(args.nfield > 0) 795 fprint(2, "sort: global field set after -k\n"); 796 setfield(0, c); 797 break; 798 case 'm': 799 /* option m silently ignored but required by posix */ 800 break; 801 default: 802 fprint(2, "sort: unknown option: -%C\n", c); 803 done("option"); 804 } 805 continue; 806 } 807 if(c == '+') { 808 argv[i] = 0; /* clobber args processed */ 809 c = *s; 810 if(c == '.' || (c >= '0' && c <= '9')) { 811 newfield(); 812 f = &args.field[args.nfield]; 813 dofield(s, &f->beg1, &f->beg2, 0, 0); 814 if(f->flags & Bflag) { 815 f->flags |= B1flag; 816 f->flags &= ~Bflag; 817 } 818 hadplus = 1; 819 continue; 820 } 821 fprint(2, "sort: unknown option: +%C\n", c); 822 done("option"); 823 } 824 args.nfile++; 825 } 826 827 for(i=0; i<=args.nfield; i++) { 828 f = &args.field[i]; 829 830 /* 831 * global options apply to fields that 832 * specify no options 833 */ 834 if(f->flags == 0) { 835 f->flags = args.field[0].flags; 836 if(args.field[0].flags & Bflag) 837 f->flags |= B1flag; 838 } 839 840 841 /* 842 * build buildkey specification 843 */ 844 switch(f->flags & ~(Bflag|B1flag)) { 845 default: 846 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags); 847 done("option"); 848 case 0: 849 f->dokey = dokey_; 850 break; 851 case Rflag: 852 f->dokey = dokey_r; 853 break; 854 case Gflag: 855 case Nflag: 856 case Gflag|Nflag: 857 case Gflag|Rflag: 858 case Nflag|Rflag: 859 case Gflag|Nflag|Rflag: 860 f->dokey = dokey_gn; 861 break; 862 case Mflag: 863 case Mflag|Rflag: 864 f->dokey = dokey_m; 865 makemapm(f); 866 break; 867 case Dflag: 868 case Dflag|Fflag: 869 case Dflag|Fflag|Iflag: 870 case Dflag|Fflag|Iflag|Rflag: 871 case Dflag|Fflag|Iflag|Rflag|Wflag: 872 case Dflag|Fflag|Iflag|Wflag: 873 case Dflag|Fflag|Rflag: 874 case Dflag|Fflag|Rflag|Wflag: 875 case Dflag|Fflag|Wflag: 876 case Dflag|Iflag: 877 case Dflag|Iflag|Rflag: 878 case Dflag|Iflag|Rflag|Wflag: 879 case Dflag|Iflag|Wflag: 880 case Dflag|Rflag: 881 case Dflag|Rflag|Wflag: 882 case Dflag|Wflag: 883 case Fflag: 884 case Fflag|Iflag: 885 case Fflag|Iflag|Rflag: 886 case Fflag|Iflag|Rflag|Wflag: 887 case Fflag|Iflag|Wflag: 888 case Fflag|Rflag: 889 case Fflag|Rflag|Wflag: 890 case Fflag|Wflag: 891 case Iflag: 892 case Iflag|Rflag: 893 case Iflag|Rflag|Wflag: 894 case Iflag|Wflag: 895 case Wflag: 896 f->dokey = dokey_dfi; 897 makemapd(f); 898 break; 899 } 900 } 901 902 /* 903 * random spot checks 904 */ 905 if(args.nfile > 1 && args.cflag) { 906 fprint(2, "sort: -c can have at most one input file\n"); 907 done("option"); 908 } 909 return; 910 } 911 912 uchar* 913 skip(uchar *l, int n1, int n2, int bflag, int endfield) 914 { 915 int i, c, tc; 916 Rune r; 917 918 if(endfield && n1 < 0) 919 return 0; 920 921 c = *l++; 922 tc = args.tabchar; 923 if(tc) { 924 if(tc < Runeself) { 925 for(i=n1; i>0; i--) { 926 while(c != tc) { 927 if(c == '\n') 928 return 0; 929 c = *l++; 930 } 931 if(!(endfield && i == 1)) 932 c = *l++; 933 } 934 } else { 935 l--; 936 l += chartorune(&r, (char*)l); 937 for(i=n1; i>0; i--) { 938 while(r != tc) { 939 if(r == '\n') 940 return 0; 941 l += chartorune(&r, (char*)l); 942 } 943 if(!(endfield && i == 1)) 944 l += chartorune(&r, (char*)l); 945 } 946 c = r; 947 } 948 } else { 949 for(i=n1; i>0; i--) { 950 while(c == ' ' || c == '\t') 951 c = *l++; 952 while(c != ' ' && c != '\t') { 953 if(c == '\n') 954 return 0; 955 c = *l++; 956 } 957 } 958 } 959 960 if(bflag) 961 while(c == ' ' || c == '\t') 962 c = *l++; 963 964 l--; 965 for(i=n2; i>0; i--) { 966 c = *l; 967 if(c < Runeself) { 968 if(c == '\n') 969 return 0; 970 l++; 971 continue; 972 } 973 l += chartorune(&r, (char*)l); 974 } 975 return l; 976 } 977 978 void 979 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f) 980 { 981 uchar *kp; 982 int c, cl, dp; 983 int state, nzero, exp, expsign, rflag; 984 985 cl = k->klen + 3; 986 kp = k->key + cl; /* skip place for sign, exponent[2] */ 987 988 nzero = 0; /* number of trailing zeros */ 989 exp = 0; /* value of the exponent */ 990 expsign = 0; /* sign of the exponent */ 991 dp = 0x4040; /* location of decimal point */ 992 rflag = f->flags&Rflag; /* xor of rflag and - sign */ 993 state = NSstart; 994 995 for(;; lp++) { 996 if(lp >= lpe) 997 break; 998 c = *lp; 999 1000 if(c == ' ' || c == '\t') { 1001 switch(state) { 1002 case NSstart: 1003 case NSsign: 1004 continue; 1005 } 1006 break; 1007 } 1008 if(c == '+' || c == '-') { 1009 switch(state) { 1010 case NSstart: 1011 state = NSsign; 1012 if(c == '-') 1013 rflag = !rflag; 1014 continue; 1015 case NSexp: 1016 state = NSexpsign; 1017 if(c == '-') 1018 expsign = 1; 1019 continue; 1020 } 1021 break; 1022 } 1023 if(c == '0') { 1024 switch(state) { 1025 case NSdigit: 1026 if(rflag) 1027 c = ~c; 1028 *kp++ = c; 1029 cl++; 1030 nzero++; 1031 dp++; 1032 state = NSdigit; 1033 continue; 1034 case NSfract: 1035 if(rflag) 1036 c = ~c; 1037 *kp++ = c; 1038 cl++; 1039 nzero++; 1040 state = NSfract; 1041 continue; 1042 case NSstart: 1043 case NSsign: 1044 case NSzero: 1045 state = NSzero; 1046 continue; 1047 case NSzerofract: 1048 case NSpoint: 1049 dp--; 1050 state = NSzerofract; 1051 continue; 1052 case NSexpsign: 1053 case NSexp: 1054 case NSexpdigit: 1055 exp = exp*10 + (c - '0'); 1056 state = NSexpdigit; 1057 continue; 1058 } 1059 break; 1060 } 1061 if(c >= '1' && c <= '9') { 1062 switch(state) { 1063 case NSzero: 1064 case NSstart: 1065 case NSsign: 1066 case NSdigit: 1067 if(rflag) 1068 c = ~c; 1069 *kp++ = c; 1070 cl++; 1071 nzero = 0; 1072 dp++; 1073 state = NSdigit; 1074 continue; 1075 case NSzerofract: 1076 case NSpoint: 1077 case NSfract: 1078 if(rflag) 1079 c = ~c; 1080 *kp++ = c; 1081 cl++; 1082 nzero = 0; 1083 state = NSfract; 1084 continue; 1085 case NSexpsign: 1086 case NSexp: 1087 case NSexpdigit: 1088 exp = exp*10 + (c - '0'); 1089 state = NSexpdigit; 1090 continue; 1091 } 1092 break; 1093 } 1094 if(c == '.') { 1095 switch(state) { 1096 case NSstart: 1097 case NSsign: 1098 state = NSpoint; 1099 continue; 1100 case NSzero: 1101 state = NSzerofract; 1102 continue; 1103 case NSdigit: 1104 state = NSfract; 1105 continue; 1106 } 1107 break; 1108 } 1109 if((f->flags & Gflag) && (c == 'e' || c == 'E')) { 1110 switch(state) { 1111 case NSdigit: 1112 case NSfract: 1113 state = NSexp; 1114 continue; 1115 } 1116 break; 1117 } 1118 break; 1119 } 1120 1121 switch(state) { 1122 /* 1123 * result is zero 1124 */ 1125 case NSstart: 1126 case NSsign: 1127 case NSzero: 1128 case NSzerofract: 1129 case NSpoint: 1130 kp = k->key + k->klen; 1131 k->klen += 2; 1132 kp[0] = 0x20; /* between + and - */ 1133 kp[1] = 0; 1134 return; 1135 /* 1136 * result has exponent 1137 */ 1138 case NSexpsign: 1139 case NSexp: 1140 case NSexpdigit: 1141 if(expsign) 1142 exp = -exp; 1143 dp += exp; 1144 1145 /* 1146 * result is fixed point number 1147 */ 1148 case NSdigit: 1149 case NSfract: 1150 kp -= nzero; 1151 cl -= nzero; 1152 break; 1153 } 1154 1155 /* 1156 * end of number 1157 */ 1158 c = 0; 1159 if(rflag) 1160 c = ~c; 1161 *kp = c; 1162 1163 /* 1164 * sign and exponent 1165 */ 1166 c = 0x30; 1167 if(rflag) { 1168 c = 0x10; 1169 dp = ~dp; 1170 } 1171 kp = k->key + k->klen; 1172 kp[0] = c; 1173 kp[1] = (dp >> 8); 1174 kp[2] = dp; 1175 k->klen = cl+1; 1176 } 1177 1178 void 1179 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f) 1180 { 1181 uchar *kp; 1182 Rune r, place[3]; 1183 int c, cl, pc; 1184 int rflag; 1185 1186 rflag = f->flags&Rflag; 1187 pc = 0; 1188 1189 cl = k->klen; 1190 kp = k->key + cl; 1191 1192 for(;;) { 1193 /* 1194 * get the character 1195 */ 1196 if(lp >= lpe) 1197 break; 1198 c = *lp; 1199 if(c >= Runeself) { 1200 lp += chartorune(&r, (char*)lp); 1201 c = r; 1202 } else 1203 lp++; 1204 1205 if(c < nelem(f->mapto)) { 1206 c = f->mapto[c]; 1207 if(c == 0) 1208 continue; 1209 } 1210 place[pc++] = c; 1211 if(pc < 3) 1212 continue; 1213 for(c=11; c>=0; c--) 1214 if(memcmp(month[c], place, sizeof(place)) == 0) 1215 break; 1216 c += 10; 1217 if(rflag) 1218 c = ~c; 1219 *kp++ = c; 1220 cl++; 1221 break; 1222 } 1223 1224 c = 0; 1225 if(rflag) 1226 c = ~c; 1227 *kp = c; 1228 k->klen = cl+1; 1229 } 1230 1231 void 1232 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f) 1233 { 1234 uchar *kp; 1235 Rune r; 1236 int c, cl, n, rflag; 1237 1238 cl = k->klen; 1239 kp = k->key + cl; 1240 rflag = f->flags & Rflag; 1241 1242 for(;;) { 1243 /* 1244 * get the character 1245 */ 1246 if(lp >= lpe) 1247 break; 1248 c = *lp; 1249 if(c >= Runeself) { 1250 lp += chartorune(&r, (char*)lp); 1251 c = r; 1252 } else 1253 lp++; 1254 1255 /* 1256 * do the various mappings. 1257 * the common case is handled 1258 * completely by the table. 1259 */ 1260 if(c != 0 && c < Runeself) { 1261 c = f->mapto[c]; 1262 if(c) { 1263 *kp++ = c; 1264 cl++; 1265 } 1266 continue; 1267 } 1268 1269 /* 1270 * for characters out of range, 1271 * the table does not do Rflag. 1272 * ignore is based on mapto[255] 1273 */ 1274 if(c != 0 && c < nelem(f->mapto)) { 1275 c = f->mapto[c]; 1276 if(c == 0) 1277 continue; 1278 } else 1279 if(f->mapto[nelem(f->mapto)-1] == 0) 1280 continue; 1281 1282 /* 1283 * put it in the key 1284 */ 1285 r = c; 1286 n = runetochar((char*)kp, &r); 1287 kp += n; 1288 cl += n; 1289 if(rflag) 1290 while(n > 0) { 1291 kp[-n] = ~kp[-n]; 1292 n--; 1293 } 1294 } 1295 1296 /* 1297 * end of key 1298 */ 1299 k->klen = cl+1; 1300 if(rflag) { 1301 *kp = ~0; 1302 return; 1303 } 1304 *kp = 0; 1305 } 1306 1307 void 1308 dokey_r(Key *k, uchar *lp, uchar *lpe, Field *f) 1309 { 1310 int cl, n; 1311 uchar *kp; 1312 1313 USED(f); 1314 n = lpe - lp; 1315 if(n < 0) 1316 n = 0; 1317 cl = k->klen; 1318 kp = k->key + cl; 1319 k->klen = cl+n+1; 1320 1321 lpe -= 3; 1322 while(lp < lpe) { 1323 kp[0] = ~lp[0]; 1324 kp[1] = ~lp[1]; 1325 kp[2] = ~lp[2]; 1326 kp[3] = ~lp[3]; 1327 kp += 4; 1328 lp += 4; 1329 } 1330 1331 lpe += 3; 1332 while(lp < lpe) 1333 *kp++ = ~*lp++; 1334 *kp = ~0; 1335 } 1336 1337 void 1338 dokey_(Key *k, uchar *lp, uchar *lpe, Field *f) 1339 { 1340 int n, cl; 1341 uchar *kp; 1342 1343 USED(f); 1344 n = lpe - lp; 1345 if(n < 0) 1346 n = 0; 1347 cl = k->klen; 1348 kp = k->key + cl; 1349 k->klen = cl+n+1; 1350 memmove(kp, lp, n); 1351 kp[n] = 0; 1352 } 1353 1354 void 1355 buildkey(Line *l) 1356 { 1357 Key *k; 1358 uchar *lp, *lpe; 1359 int ll, kl, cl, i, n; 1360 Field *f; 1361 1362 ll = l->llen - 1; 1363 kl = 0; /* allocated length */ 1364 cl = 0; /* current length */ 1365 k = 0; 1366 1367 for(i=1; i<=args.nfield; i++) { 1368 f = &args.field[i]; 1369 lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0); 1370 if(lp == 0) 1371 lp = l->line + ll; 1372 lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1); 1373 if(lpe == 0) 1374 lpe = l->line + ll; 1375 n = (lpe - lp) + 1; 1376 if(n <= 0) 1377 n = 1; 1378 if(cl+(n+4) > kl) { 1379 kl = cl+(n+4); 1380 k = realloc(k, sizeof(Key) + 1381 (kl-1)*sizeof(k->key[0])); 1382 if(k == 0) 1383 nomem(); 1384 } 1385 k->klen = cl; 1386 (*f->dokey)(k, lp, lpe, f); 1387 cl = k->klen; 1388 } 1389 1390 /* 1391 * global comparisons 1392 */ 1393 if(!(args.uflag && cl > 0)) { 1394 f = &args.field[0]; 1395 if(cl+(ll+4) > kl) { 1396 kl = cl+(ll+4); 1397 k = realloc(k, sizeof(Key) + 1398 (kl-1)*sizeof(k->key[0])); 1399 if(k == 0) 1400 nomem(); 1401 } 1402 k->klen = cl; 1403 (*f->dokey)(k, l->line, l->line+ll, f); 1404 cl = k->klen; 1405 } 1406 1407 l->key = k; 1408 k->klen = cl; 1409 1410 if(args.vflag) { 1411 fprint(2, "%.*s", l->llen, l->line); 1412 for(i=0; i<k->klen; i++) { 1413 fprint(2, " %.2x", k->key[i]); 1414 if(k->key[i] == 0x00 || k->key[i] == 0xff) 1415 fprint(2, "\n"); 1416 } 1417 } 1418 } 1419 1420 void 1421 makemapm(Field *f) 1422 { 1423 int i, c; 1424 1425 for(i=0; i<nelem(f->mapto); i++) { 1426 c = 1; 1427 if(i == ' ' || i == '\t') 1428 c = 0; 1429 if(i >= 'a' && i <= 'z') 1430 c = i + ('A' - 'a'); 1431 if(i >= 'A' && i <= 'Z') 1432 c = i; 1433 f->mapto[i] = c; 1434 if(args.vflag) { 1435 if((i & 15) == 0) 1436 fprint(2, " "); 1437 fprint(2, " %.2x", c); 1438 if((i & 15) == 15) 1439 fprint(2, "\n"); 1440 } 1441 } 1442 } 1443 1444 void 1445 makemapd(Field *f) 1446 { 1447 int i, j, c; 1448 1449 for(i=0; i<nelem(f->mapto); i++) { 1450 c = i; 1451 if(f->flags & Iflag) 1452 if(c < 040 || c > 0176) 1453 c = -1; 1454 if((f->flags & Wflag) && c >= 0) 1455 if(c == ' ' || c == '\t') 1456 c = -1; 1457 if((f->flags & Dflag) && c >= 0) 1458 if(!(c == ' ' || c == '\t' || 1459 (c >= 'a' && c <= 'z') || 1460 (c >= 'A' && c <= 'Z') || 1461 (c >= '0' && c <= '9'))) { 1462 for(j=0; latinmap[j]; j+=3) 1463 if(c == latinmap[j+0] || 1464 c == latinmap[j+1]) 1465 break; 1466 if(latinmap[j] == 0) 1467 c = -1; 1468 } 1469 if((f->flags & Fflag) && c >= 0) { 1470 if(c >= 'a' && c <= 'z') 1471 c += 'A' - 'a'; 1472 for(j=0; latinmap[j]; j+=3) 1473 if(c == latinmap[j+0] || 1474 c == latinmap[j+1]) { 1475 c = latinmap[j+2]; 1476 break; 1477 } 1478 } 1479 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself) 1480 c = ~c & 0xff; 1481 if(c < 0) 1482 c = 0; 1483 f->mapto[i] = c; 1484 if(args.vflag) { 1485 if((i & 15) == 0) 1486 fprint(2, " "); 1487 fprint(2, " %.2x", c); 1488 if((i & 15) == 15) 1489 fprint(2, "\n"); 1490 } 1491 } 1492 } 1493 1494 int latinmap[] = 1495 { 1496 /* lcase ucase fold */ 1497 L'à', L'À', L'A', 1498 L'á', L'Á', L'A', 1499 L'â', L'Â', L'A', 1500 L'ä', L'Ä', L'A', 1501 L'ã', L'Ã', L'A', 1502 L'å', L'Å', L'A', 1503 L'è', L'È', L'E', 1504 L'é', L'É', L'E', 1505 L'ê', L'Ê', L'E', 1506 L'ë', L'Ë', L'E', 1507 L'ì', L'Ì', L'I', 1508 L'í', L'Í', L'I', 1509 L'î', L'Î', L'I', 1510 L'ï', L'Ï', L'I', 1511 L'ò', L'Ò', L'O', 1512 L'ó', L'Ó', L'O', 1513 L'ô', L'Ô', L'O', 1514 L'ö', L'Ö', L'O', 1515 L'õ', L'Õ', L'O', 1516 L'ø', L'Ø', L'O', 1517 L'ù', L'Ù', L'U', 1518 L'ú', L'Ú', L'U', 1519 L'û', L'Û', L'U', 1520 L'ü', L'Ü', L'U', 1521 L'æ', L'Æ', L'A', 1522 L'ð', L'Ð', L'D', 1523 L'ñ', L'Ñ', L'N', 1524 L'ý', L'Ý', L'Y', 1525 L'ç', L'Ç', L'C', 1526 0, 1527 }; 1528 1529 Rune* month[12] = 1530 { 1531 L"JAN", 1532 L"FEB", 1533 L"MAR", 1534 L"APR", 1535 L"MAY", 1536 L"JUN", 1537 L"JUL", 1538 L"AUG", 1539 L"SEP", 1540 L"OCT", 1541 L"NOV", 1542 L"DEC", 1543 }; 1544 1545 /************** radix sort ***********/ 1546 1547 enum 1548 { 1549 Threshold = 14, 1550 }; 1551 1552 void rsort4(Key***, ulong, int); 1553 void bsort4(Key***, ulong, int); 1554 1555 void 1556 sort4(void *a, ulong n) 1557 { 1558 if(n > Threshold) 1559 rsort4((Key***)a, n, 0); 1560 else 1561 bsort4((Key***)a, n, 0); 1562 } 1563 1564 void 1565 rsort4(Key ***a, ulong n, int b) 1566 { 1567 Key ***ea, ***t, ***u, **t1, **u1, *k; 1568 Key ***part[257]; 1569 static long count[257]; 1570 long clist[257+257], *cp, *cp1; 1571 int c, lowc, higc; 1572 1573 /* 1574 * pass 1 over all keys, 1575 * count the number of each key[b]. 1576 * find low count and high count. 1577 */ 1578 lowc = 256; 1579 higc = 0; 1580 ea = a+n; 1581 for(t=a; t<ea; t++) { 1582 k = **t; 1583 n = k->klen; 1584 if(b >= n) { 1585 count[256]++; 1586 continue; 1587 } 1588 c = k->key[b]; 1589 n = count[c]++; 1590 if(n == 0) { 1591 if(c < lowc) 1592 lowc = c; 1593 if(c > higc) 1594 higc = c; 1595 } 1596 } 1597 1598 /* 1599 * pass 2 over all counts, 1600 * put partition pointers in part[c]. 1601 * save compacted indexes and counts 1602 * in clist[]. 1603 */ 1604 t = a; 1605 n = count[256]; 1606 clist[0] = n; 1607 part[256] = t; 1608 t += n; 1609 1610 cp1 = clist+1; 1611 cp = count+lowc; 1612 for(c=lowc; c<=higc; c++,cp++) { 1613 n = *cp; 1614 if(n) { 1615 cp1[0] = n; 1616 cp1[1] = c; 1617 cp1 += 2; 1618 part[c] = t; 1619 t += n; 1620 } 1621 } 1622 *cp1 = 0; 1623 1624 /* 1625 * pass 3 over all counts. 1626 * chase lowest pointer in each partition 1627 * around a permutation until it comes 1628 * back and is stored where it started. 1629 * static array, count[], should be 1630 * reduced to zero entries except maybe 1631 * count[256]. 1632 */ 1633 for(cp1=clist+1; cp1[0]; cp1+=2) { 1634 c = cp1[1]; 1635 cp = count+c; 1636 while(*cp) { 1637 t1 = *part[c]; 1638 for(;;) { 1639 k = *t1; 1640 n = 256; 1641 if(b < k->klen) 1642 n = k->key[b]; 1643 u = part[n]++; 1644 count[n]--; 1645 u1 = *u; 1646 *u = t1; 1647 if(n == c) 1648 break; 1649 t1 = u1; 1650 } 1651 } 1652 } 1653 1654 /* 1655 * pass 4 over all partitions. 1656 * call recursively. 1657 */ 1658 b++; 1659 t = a + clist[0]; 1660 count[256] = 0; 1661 for(cp1=clist+1; n=cp1[0]; cp1+=2) { 1662 if(n > Threshold) 1663 rsort4(t, n, b); 1664 else 1665 if(n > 1) 1666 bsort4(t, n, b); 1667 t += n; 1668 } 1669 } 1670 1671 /* 1672 * bubble sort to pick up 1673 * the pieces. 1674 */ 1675 void 1676 bsort4(Key ***a, ulong n, int b) 1677 { 1678 Key ***i, ***j, ***k, ***l, **t; 1679 Key *ka, *kb; 1680 int n1, n2; 1681 1682 l = a+n; 1683 j = a; 1684 1685 loop: 1686 i = j; 1687 j++; 1688 if(j >= l) 1689 return; 1690 1691 ka = **i; 1692 kb = **j; 1693 n1 = ka->klen - b; 1694 n2 = kb->klen - b; 1695 if(n1 > n2) 1696 n1 = n2; 1697 if(n1 <= 0) 1698 goto loop; 1699 n2 = ka->key[b] - kb->key[b]; 1700 if(n2 == 0) 1701 n2 = memcmp(ka->key+b, kb->key+b, n1); 1702 if(n2 <= 0) 1703 goto loop; 1704 1705 for(;;) { 1706 k = i+1; 1707 1708 t = *k; 1709 *k = *i; 1710 *i = t; 1711 1712 if(i <= a) 1713 goto loop; 1714 1715 i--; 1716 ka = **i; 1717 kb = *t; 1718 n1 = ka->klen - b; 1719 n2 = kb->klen - b; 1720 if(n1 > n2) 1721 n1 = n2; 1722 if(n1 <= 0) 1723 goto loop; 1724 n2 = ka->key[b] - kb->key[b]; 1725 if(n2 == 0) 1726 n2 = memcmp(ka->key+b, kb->key+b, n1); 1727 if(n2 <= 0) 1728 goto loop; 1729 } 1730 } 1731