1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 5 /* 6 bugs: 7 00/ff for end of file can conflict with 00/ff characters 8 */ 9 10 enum 11 { 12 Nline = 100000, /* default max number of lines saved in memory */ 13 Nmerge = 10, /* max number of temporary files merged */ 14 Nfield = 20, /* max number of argument fields */ 15 16 Bflag = 1<<0, /* flags per field */ 17 B1flag = 1<<1, 18 19 Dflag = 1<<2, 20 Fflag = 1<<3, 21 Gflag = 1<<4, 22 Iflag = 1<<5, 23 Mflag = 1<<6, 24 Nflag = 1<<7, 25 Rflag = 1<<8, 26 Wflag = 1<<9, 27 28 NSstart = 0, /* states for number to key decoding */ 29 NSsign, 30 NSzero, 31 NSdigit, 32 NSpoint, 33 NSfract, 34 NSzerofract, 35 NSexp, 36 NSexpsign, 37 NSexpdigit, 38 }; 39 40 typedef struct Line Line; 41 typedef struct Key Key; 42 typedef struct Merge Merge; 43 typedef struct Field Field; 44 45 struct Line 46 { 47 Key* key; 48 int llen; /* always >= 1 */ 49 uchar line[1]; /* always ends in '\n' */ 50 }; 51 52 struct Merge 53 { 54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */ 55 Line* line; /* line at the head of a merged temp file */ 56 int fd; /* file descriptor */ 57 Biobuf b; /* iobuf for reading a temp file */ 58 }; 59 60 struct Key 61 { 62 int klen; 63 uchar key[1]; 64 }; 65 66 struct Field 67 { 68 int beg1; 69 int beg2; 70 int end1; 71 int end2; 72 73 long flags; 74 uchar mapto[256]; 75 76 void (*dokey)(Key*, uchar*, uchar*, Field*); 77 }; 78 79 struct args 80 { 81 char* ofile; 82 char* tname; 83 Rune tabchar; 84 char cflag; 85 char uflag; 86 char vflag; 87 int nfield; 88 int nfile; 89 Field field[Nfield]; 90 91 Line** linep; 92 long nline; /* number of lines in this temp file */ 93 long lineno; /* overall ordinal for -s option */ 94 int ntemp; 95 long mline; /* max lines per file */ 96 } args; 97 98 extern int latinmap[]; 99 extern Rune* month[12]; 100 101 void buildkey(Line*); 102 void doargs(int, char*[]); 103 void dofield(char*, int*, int*, int, int); 104 void dofile(Biobuf*); 105 void dokey_(Key*, uchar*, uchar*, Field*); 106 void dokey_dfi(Key*, uchar*, uchar*, Field*); 107 void dokey_gn(Key*, uchar*, uchar*, Field*); 108 void dokey_m(Key*, uchar*, uchar*, Field*); 109 void dokey_r(Key*, uchar*, uchar*, Field*); 110 void done(char*); 111 int kcmp(Key*, Key*); 112 void makemapd(Field*); 113 void makemapm(Field*); 114 void mergefiles(int, int, Biobuf*); 115 void mergeout(Biobuf*); 116 void newfield(void); 117 Line* newline(Biobuf*); 118 void nomem(void); 119 void notifyf(void*, char*); 120 void printargs(void); 121 void printout(Biobuf*); 122 void setfield(int, int); 123 uchar* skip(uchar*, int, int, int, int); 124 void sort4(void*, ulong); 125 char* tempfile(int); 126 void tempout(void); 127 void lineout(Biobuf*, Line*); 128 129 void 130 main(int argc, char *argv[]) 131 { 132 int i, f; 133 char *s; 134 Biobuf bbuf; 135 136 notify(notifyf); /**/ 137 doargs(argc, argv); 138 if(args.vflag) 139 printargs(); 140 141 for(i=1; i<argc; i++) { 142 s = argv[i]; 143 if(s == 0) 144 continue; 145 if(strcmp(s, "-") == 0) { 146 Binit(&bbuf, 0, OREAD); 147 dofile(&bbuf); 148 Bterm(&bbuf); 149 continue; 150 } 151 f = open(s, OREAD); 152 if(f < 0) { 153 fprint(2, "sort: open %s: %r\n", s); 154 done("open"); 155 } 156 Binit(&bbuf, f, OREAD); 157 dofile(&bbuf); 158 Bterm(&bbuf); 159 close(f); 160 } 161 if(args.nfile == 0) { 162 Binit(&bbuf, 0, OREAD); 163 dofile(&bbuf); 164 Bterm(&bbuf); 165 } 166 if(args.cflag) 167 done(0); 168 if(args.vflag) 169 fprint(2, "=========\n"); 170 171 f = 1; 172 if(args.ofile) { 173 f = create(args.ofile, OWRITE, 0666); 174 if(f < 0) { 175 fprint(2, "sort: create %s: %r\n", args.ofile); 176 done("create"); 177 } 178 } 179 180 Binit(&bbuf, f, OWRITE); 181 if(args.ntemp) { 182 tempout(); 183 mergeout(&bbuf); 184 } else { 185 printout(&bbuf); 186 } 187 Bterm(&bbuf); 188 done(0); 189 } 190 191 void 192 dofile(Biobuf *b) 193 { 194 Line *l, *ol; 195 int n; 196 197 if(args.cflag) { 198 ol = newline(b); 199 if(ol == 0) 200 return; 201 for(;;) { 202 l = newline(b); 203 if(l == 0) 204 break; 205 n = kcmp(ol->key, l->key); 206 if(n > 0 || (n == 0 && args.uflag)) { 207 fprint(2, "sort: -c file not in sort\n"); /**/ 208 done("order"); 209 } 210 free(ol->key); 211 free(ol); 212 ol = l; 213 } 214 return; 215 } 216 217 if(args.linep == 0) { 218 args.linep = malloc(args.mline * sizeof(args.linep)); 219 if(args.linep == 0) 220 nomem(); 221 } 222 for(;;) { 223 l = newline(b); 224 if(l == 0) 225 break; 226 if(args.nline >= args.mline) 227 tempout(); 228 args.linep[args.nline] = l; 229 args.nline++; 230 args.lineno++; 231 } 232 } 233 234 void 235 notifyf(void*, char *s) 236 { 237 238 if(strcmp(s, "interrupt") == 0) 239 done(0); 240 if(strcmp(s, "hangup") == 0) 241 done(0); 242 if(strcmp(s, "kill") == 0) 243 done(0); 244 if(strncmp(s, "sys: write on closed pipe", 25) == 0) 245 done(0); 246 fprint(2, "sort: note: %s\n", s); 247 abort(); 248 } 249 250 Line* 251 newline(Biobuf *b) 252 { 253 Line *l; 254 char *p; 255 int n, c; 256 257 p = Brdline(b, '\n'); 258 n = Blinelen(b); 259 if(p == 0) { 260 if(n == 0) 261 return 0; 262 l = 0; 263 for(n=0;;) { 264 if((n & 31) == 0) { 265 l = realloc(l, sizeof(Line) + 266 (n+31)*sizeof(l->line[0])); 267 if(l == 0) 268 nomem(); 269 } 270 c = Bgetc(b); 271 if(c < 0) { 272 fprint(2, "sort: newline added\n"); 273 c = '\n'; 274 } 275 l->line[n++] = c; 276 if(c == '\n') 277 break; 278 } 279 l->llen = n; 280 buildkey(l); 281 return l; 282 } 283 l = malloc(sizeof(Line) + 284 (n-1)*sizeof(l->line[0])); 285 if(l == 0) 286 nomem(); 287 l->llen = n; 288 memmove(l->line, p, n); 289 buildkey(l); 290 return l; 291 } 292 293 void 294 lineout(Biobuf *b, Line *l) 295 { 296 int n, m; 297 298 n = l->llen; 299 m = Bwrite(b, l->line, n); 300 if(n != m) 301 exits("write"); 302 } 303 304 void 305 tempout(void) 306 { 307 long n; 308 Line **lp, *l; 309 char *tf; 310 int f; 311 Biobuf tb; 312 313 sort4(args.linep, args.nline); 314 tf = tempfile(args.ntemp); 315 args.ntemp++; 316 f = create(tf, OWRITE, 0666); 317 if(f < 0) { 318 fprint(2, "sort: create %s: %r\n", tf); 319 done("create"); 320 } 321 322 Binit(&tb, f, OWRITE); 323 lp = args.linep; 324 for(n=args.nline; n>0; n--) { 325 l = *lp++; 326 lineout(&tb, l); 327 free(l->key); 328 free(l); 329 } 330 args.nline = 0; 331 Bterm(&tb); 332 close(f); 333 } 334 335 void 336 done(char *xs) 337 { 338 int i; 339 340 for(i=0; i<args.ntemp; i++) 341 remove(tempfile(i)); 342 exits(xs); 343 } 344 345 void 346 nomem(void) 347 { 348 fprint(2, "sort: out of memory\n"); 349 done("mem"); 350 } 351 352 char* 353 tempfile(int n) 354 { 355 static char file[100]; 356 static uint pid; 357 char *dir; 358 359 dir = "/tmp"; 360 if(args.tname) 361 dir = args.tname; 362 if(strlen(dir) >= nelem(file)-20) { 363 fprint(2, "temp file directory name is too long: %s\n", dir); 364 done("tdir"); 365 } 366 367 if(pid == 0) { 368 pid = getpid(); 369 if(pid == 0) { 370 pid = time(0); 371 if(pid == 0) 372 pid = 1; 373 } 374 } 375 376 sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n); 377 return file; 378 } 379 380 void 381 mergeout(Biobuf *b) 382 { 383 int n, i, f; 384 char *tf; 385 Biobuf tb; 386 387 for(i=0; i<args.ntemp; i+=n) { 388 n = args.ntemp - i; 389 if(n > Nmerge) { 390 tf = tempfile(args.ntemp); 391 args.ntemp++; 392 f = create(tf, OWRITE, 0666); 393 if(f < 0) { 394 fprint(2, "sort: create %s: %r\n", tf); 395 done("create"); 396 } 397 Binit(&tb, f, OWRITE); 398 399 n = Nmerge; 400 mergefiles(i, n, &tb); 401 402 Bterm(&tb); 403 close(f); 404 } else 405 mergefiles(i, n, b); 406 } 407 } 408 409 void 410 mergefiles(int t, int n, Biobuf *b) 411 { 412 Merge *m, *mp, **mmp; 413 Key *ok; 414 Line *l; 415 char *tf; 416 int i, f, nn; 417 418 mmp = malloc(n*sizeof(*mmp)); 419 mp = malloc(n*sizeof(*mp)); 420 if(mmp == 0 || mp == 0) 421 nomem(); 422 423 nn = 0; 424 m = mp; 425 for(i=0; i<n; i++,m++) { 426 tf = tempfile(t+i); 427 f = open(tf, OREAD); 428 if(f < 0) { 429 fprint(2, "sort: reopen %s: %r\n", tf); 430 done("open"); 431 } 432 m->fd = f; 433 Binit(&m->b, f, OREAD); 434 mmp[nn] = m; 435 436 l = newline(&m->b); 437 if(l == 0) 438 continue; 439 nn++; 440 m->line = l; 441 m->key = l->key; 442 } 443 444 ok = 0; 445 for(;;) { 446 sort4(mmp, nn); 447 m = *mmp; 448 if(nn == 0) 449 break; 450 for(;;) { 451 l = m->line; 452 if(args.uflag && ok && kcmp(ok, l->key) == 0) { 453 free(l->key); 454 free(l); 455 } else { 456 lineout(b, l); 457 if(ok) 458 free(ok); 459 ok = l->key; 460 free(l); 461 } 462 463 l = newline(&m->b); 464 if(l == 0) { 465 nn--; 466 mmp[0] = mmp[nn]; 467 break; 468 } 469 m->line = l; 470 m->key = l->key; 471 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0) 472 break; 473 } 474 } 475 if(ok) 476 free(ok); 477 478 m = mp; 479 for(i=0; i<n; i++,m++) { 480 Bterm(&m->b); 481 close(m->fd); 482 } 483 484 free(mp); 485 free(mmp); 486 } 487 488 int 489 kcmp(Key *ka, Key *kb) 490 { 491 int n, m; 492 493 /* 494 * set n to length of smaller key 495 */ 496 n = ka->klen; 497 m = kb->klen; 498 if(n > m) 499 n = m; 500 return memcmp(ka->key, kb->key, n); 501 } 502 503 void 504 printout(Biobuf *b) 505 { 506 long n; 507 Line **lp, *l; 508 Key *ok; 509 510 sort4(args.linep, args.nline); 511 lp = args.linep; 512 ok = 0; 513 for(n=args.nline; n>0; n--) { 514 l = *lp++; 515 if(args.uflag && ok && kcmp(ok, l->key) == 0) 516 continue; 517 lineout(b, l); 518 ok = l->key; 519 } 520 } 521 522 void 523 setfield(int n, int c) 524 { 525 Field *f; 526 527 f = &args.field[n]; 528 switch(c) { 529 default: 530 fprint(2, "sort: unknown option: field.%C\n", c); 531 done("option"); 532 case 'b': /* skip blanks */ 533 f->flags |= Bflag; 534 break; 535 case 'd': /* directory order */ 536 f->flags |= Dflag; 537 break; 538 case 'f': /* fold case */ 539 f->flags |= Fflag; 540 break; 541 case 'g': /* floating point -n case */ 542 f->flags |= Gflag; 543 break; 544 case 'i': /* ignore non-ascii */ 545 f->flags |= Iflag; 546 break; 547 case 'M': /* month */ 548 f->flags |= Mflag; 549 break; 550 case 'n': /* numbers */ 551 f->flags |= Nflag; 552 break; 553 case 'r': /* reverse */ 554 f->flags |= Rflag; 555 break; 556 case 'w': /* ignore white */ 557 f->flags |= Wflag; 558 break; 559 } 560 } 561 562 void 563 dofield(char *s, int *n1, int *n2, int off1, int off2) 564 { 565 int c, n; 566 567 c = *s++; 568 if(c >= '0' && c <= '9') { 569 n = 0; 570 while(c >= '0' && c <= '9') { 571 n = n*10 + (c-'0'); 572 c = *s++; 573 } 574 n -= off1; /* posix committee: rot in hell */ 575 if(n < 0) { 576 fprint(2, "sort: field offset must be positive\n"); 577 done("option"); 578 } 579 *n1 = n; 580 } 581 if(c == '.') { 582 c = *s++; 583 if(c >= '0' && c <= '9') { 584 n = 0; 585 while(c >= '0' && c <= '9') { 586 n = n*10 + (c-'0'); 587 c = *s++; 588 } 589 n -= off2; 590 if(n < 0) { 591 fprint(2, "sort: character offset must be positive\n"); 592 done("option"); 593 } 594 *n2 = n; 595 } 596 } 597 while(c != 0) { 598 setfield(args.nfield, c); 599 c = *s++; 600 } 601 } 602 603 void 604 printargs(void) 605 { 606 int i, n; 607 Field *f; 608 char *prefix; 609 610 fprint(2, "sort"); 611 for(i=0; i<=args.nfield; i++) { 612 f = &args.field[i]; 613 prefix = " -"; 614 if(i) { 615 n = f->beg1; 616 if(n >= 0) 617 fprint(2, " +%d", n); 618 else 619 fprint(2, " +*"); 620 n = f->beg2; 621 if(n >= 0) 622 fprint(2, ".%d", n); 623 else 624 fprint(2, ".*"); 625 626 if(f->flags & B1flag) 627 fprint(2, "b"); 628 629 n = f->end1; 630 if(n >= 0) 631 fprint(2, " -%d", n); 632 else 633 fprint(2, " -*"); 634 n = f->end2; 635 if(n >= 0) 636 fprint(2, ".%d", n); 637 else 638 fprint(2, ".*"); 639 prefix = ""; 640 } 641 if(f->flags & Bflag) 642 fprint(2, "%sb", prefix); 643 if(f->flags & Dflag) 644 fprint(2, "%sd", prefix); 645 if(f->flags & Fflag) 646 fprint(2, "%sf", prefix); 647 if(f->flags & Gflag) 648 fprint(2, "%sg", prefix); 649 if(f->flags & Iflag) 650 fprint(2, "%si", prefix); 651 if(f->flags & Mflag) 652 fprint(2, "%sM", prefix); 653 if(f->flags & Nflag) 654 fprint(2, "%sn", prefix); 655 if(f->flags & Rflag) 656 fprint(2, "%sr", prefix); 657 if(f->flags & Wflag) 658 fprint(2, "%sw", prefix); 659 } 660 if(args.cflag) 661 fprint(2, " -c"); 662 if(args.uflag) 663 fprint(2, " -u"); 664 if(args.ofile) 665 fprint(2, " -o %s", args.ofile); 666 if(args.mline != Nline) 667 fprint(2, " -l %ld", args.mline); 668 fprint(2, "\n"); 669 } 670 671 void 672 newfield(void) 673 { 674 int n; 675 Field *f; 676 677 n = args.nfield + 1; 678 if(n >= Nfield) { 679 fprint(2, "sort: too many fields specified\n"); 680 done("option"); 681 } 682 args.nfield = n; 683 f = &args.field[n]; 684 f->beg1 = -1; 685 f->beg2 = -1; 686 f->end1 = -1; 687 f->end2 = -1; 688 } 689 690 void 691 doargs(int argc, char *argv[]) 692 { 693 int i, c, hadplus; 694 char *s, *p, *q; 695 Field *f; 696 697 hadplus = 0; 698 args.mline = Nline; 699 for(i=1; i<argc; i++) { 700 s = argv[i]; 701 c = *s++; 702 if(c == '-') { 703 c = *s; 704 if(c == 0) /* forced end of arg marker */ 705 break; 706 argv[i] = 0; /* clobber args processed */ 707 if(c == '.' || (c >= '0' && c <= '9')) { 708 if(!hadplus) 709 newfield(); 710 f = &args.field[args.nfield]; 711 dofield(s, &f->end1, &f->end2, 0, 0); 712 hadplus = 0; 713 continue; 714 } 715 716 while(c = *s++) 717 switch(c) { 718 case '-': /* end of options */ 719 i = argc; 720 continue; 721 case 'T': /* temp directory */ 722 if(*s == 0) { 723 i++; 724 if(i < argc) { 725 args.tname = argv[i]; 726 argv[i] = 0; 727 } 728 } else 729 args.tname = s; 730 s = strchr(s, 0); 731 break; 732 case 'o': /* output file */ 733 if(*s == 0) { 734 i++; 735 if(i < argc) { 736 args.ofile = argv[i]; 737 argv[i] = 0; 738 } 739 } else 740 args.ofile = s; 741 s = strchr(s, 0); 742 break; 743 case 'k': /* posix key (what were they thinking?) */ 744 p = 0; 745 if(*s == 0) { 746 i++; 747 if(i < argc) { 748 p = argv[i]; 749 argv[i] = 0; 750 } 751 } else 752 p = s; 753 s = strchr(s, 0); 754 if(p == 0) 755 break; 756 757 newfield(); 758 q = strchr(p, ','); 759 if(q) 760 *q++ = 0; 761 f = &args.field[args.nfield]; 762 dofield(p, &f->beg1, &f->beg2, 1, 1); 763 if(f->flags & Bflag) { 764 f->flags |= B1flag; 765 f->flags &= ~Bflag; 766 } 767 if(q) { 768 dofield(q, &f->end1, &f->end2, 1, 0); 769 if(f->end2 <= 0) 770 f->end1++; 771 } 772 hadplus = 0; 773 break; 774 case 't': /* tab character */ 775 if(*s == 0) { 776 i++; 777 if(i < argc) { 778 chartorune(&args.tabchar, argv[i]); 779 argv[i] = 0; 780 } 781 } else 782 s += chartorune(&args.tabchar, s); 783 if(args.tabchar == '\n') { 784 fprint(2, "aw come on, rob\n"); 785 done("rob"); 786 } 787 break; 788 case 'c': /* check order */ 789 args.cflag = 1; 790 break; 791 case 'u': /* unique */ 792 args.uflag = 1; 793 break; 794 case 'v': /* debugging noise */ 795 args.vflag = 1; 796 break; 797 case 'l': 798 if(*s == 0) { 799 i++; 800 if(i < argc) { 801 args.mline = atol(argv[i]); 802 argv[i] = 0; 803 } 804 } else 805 args.mline = atol(s); 806 s = strchr(s, 0); 807 break; 808 809 case 'M': /* month */ 810 case 'b': /* skip blanks */ 811 case 'd': /* directory order */ 812 case 'f': /* fold case */ 813 case 'g': /* floating numbers */ 814 case 'i': /* ignore non-ascii */ 815 case 'n': /* numbers */ 816 case 'r': /* reverse */ 817 case 'w': /* ignore white */ 818 if(args.nfield > 0) 819 fprint(2, "sort: global field set after -k\n"); 820 setfield(0, c); 821 break; 822 case 'm': 823 /* option m silently ignored but required by posix */ 824 break; 825 default: 826 fprint(2, "sort: unknown option: -%C\n", c); 827 done("option"); 828 } 829 continue; 830 } 831 if(c == '+') { 832 argv[i] = 0; /* clobber args processed */ 833 c = *s; 834 if(c == '.' || (c >= '0' && c <= '9')) { 835 newfield(); 836 f = &args.field[args.nfield]; 837 dofield(s, &f->beg1, &f->beg2, 0, 0); 838 if(f->flags & Bflag) { 839 f->flags |= B1flag; 840 f->flags &= ~Bflag; 841 } 842 hadplus = 1; 843 continue; 844 } 845 fprint(2, "sort: unknown option: +%C\n", c); 846 done("option"); 847 } 848 args.nfile++; 849 } 850 851 for(i=0; i<=args.nfield; i++) { 852 f = &args.field[i]; 853 854 /* 855 * global options apply to fields that 856 * specify no options 857 */ 858 if(f->flags == 0) { 859 f->flags = args.field[0].flags; 860 if(args.field[0].flags & Bflag) 861 f->flags |= B1flag; 862 } 863 864 865 /* 866 * build buildkey specification 867 */ 868 switch(f->flags & ~(Bflag|B1flag)) { 869 default: 870 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags); 871 done("option"); 872 case 0: 873 f->dokey = dokey_; 874 break; 875 case Rflag: 876 f->dokey = dokey_r; 877 break; 878 case Gflag: 879 case Nflag: 880 case Gflag|Nflag: 881 case Gflag|Rflag: 882 case Nflag|Rflag: 883 case Gflag|Nflag|Rflag: 884 f->dokey = dokey_gn; 885 break; 886 case Mflag: 887 case Mflag|Rflag: 888 f->dokey = dokey_m; 889 makemapm(f); 890 break; 891 case Dflag: 892 case Dflag|Fflag: 893 case Dflag|Fflag|Iflag: 894 case Dflag|Fflag|Iflag|Rflag: 895 case Dflag|Fflag|Iflag|Rflag|Wflag: 896 case Dflag|Fflag|Iflag|Wflag: 897 case Dflag|Fflag|Rflag: 898 case Dflag|Fflag|Rflag|Wflag: 899 case Dflag|Fflag|Wflag: 900 case Dflag|Iflag: 901 case Dflag|Iflag|Rflag: 902 case Dflag|Iflag|Rflag|Wflag: 903 case Dflag|Iflag|Wflag: 904 case Dflag|Rflag: 905 case Dflag|Rflag|Wflag: 906 case Dflag|Wflag: 907 case Fflag: 908 case Fflag|Iflag: 909 case Fflag|Iflag|Rflag: 910 case Fflag|Iflag|Rflag|Wflag: 911 case Fflag|Iflag|Wflag: 912 case Fflag|Rflag: 913 case Fflag|Rflag|Wflag: 914 case Fflag|Wflag: 915 case Iflag: 916 case Iflag|Rflag: 917 case Iflag|Rflag|Wflag: 918 case Iflag|Wflag: 919 case Wflag: 920 f->dokey = dokey_dfi; 921 makemapd(f); 922 break; 923 } 924 } 925 926 /* 927 * random spot checks 928 */ 929 if(args.nfile > 1 && args.cflag) { 930 fprint(2, "sort: -c can have at most one input file\n"); 931 done("option"); 932 } 933 return; 934 } 935 936 uchar* 937 skip(uchar *l, int n1, int n2, int bflag, int endfield) 938 { 939 int i, c, tc; 940 Rune r; 941 942 if(endfield && n1 < 0) 943 return 0; 944 945 c = *l++; 946 tc = args.tabchar; 947 if(tc) { 948 if(tc < Runeself) { 949 for(i=n1; i>0; i--) { 950 while(c != tc) { 951 if(c == '\n') 952 return 0; 953 c = *l++; 954 } 955 if(!(endfield && i == 1)) 956 c = *l++; 957 } 958 } else { 959 l--; 960 l += chartorune(&r, (char*)l); 961 for(i=n1; i>0; i--) { 962 while(r != tc) { 963 if(r == '\n') 964 return 0; 965 l += chartorune(&r, (char*)l); 966 } 967 if(!(endfield && i == 1)) 968 l += chartorune(&r, (char*)l); 969 } 970 c = r; 971 } 972 } else { 973 for(i=n1; i>0; i--) { 974 while(c == ' ' || c == '\t') 975 c = *l++; 976 while(c != ' ' && c != '\t') { 977 if(c == '\n') 978 return 0; 979 c = *l++; 980 } 981 } 982 } 983 984 if(bflag) 985 while(c == ' ' || c == '\t') 986 c = *l++; 987 988 l--; 989 for(i=n2; i>0; i--) { 990 c = *l; 991 if(c < Runeself) { 992 if(c == '\n') 993 return 0; 994 l++; 995 continue; 996 } 997 l += chartorune(&r, (char*)l); 998 } 999 return l; 1000 } 1001 1002 void 1003 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f) 1004 { 1005 uchar *kp; 1006 int c, cl, dp; 1007 int state, nzero, exp, expsign, rflag; 1008 1009 cl = k->klen + 3; 1010 kp = k->key + cl; /* skip place for sign, exponent[2] */ 1011 1012 nzero = 0; /* number of trailing zeros */ 1013 exp = 0; /* value of the exponent */ 1014 expsign = 0; /* sign of the exponent */ 1015 dp = 0x4040; /* location of decimal point */ 1016 rflag = f->flags&Rflag; /* xor of rflag and - sign */ 1017 state = NSstart; 1018 1019 for(;; lp++) { 1020 if(lp >= lpe) 1021 break; 1022 c = *lp; 1023 1024 if(c == ' ' || c == '\t') { 1025 switch(state) { 1026 case NSstart: 1027 case NSsign: 1028 continue; 1029 } 1030 break; 1031 } 1032 if(c == '+' || c == '-') { 1033 switch(state) { 1034 case NSstart: 1035 state = NSsign; 1036 if(c == '-') 1037 rflag = !rflag; 1038 continue; 1039 case NSexp: 1040 state = NSexpsign; 1041 if(c == '-') 1042 expsign = 1; 1043 continue; 1044 } 1045 break; 1046 } 1047 if(c == '0') { 1048 switch(state) { 1049 case NSdigit: 1050 if(rflag) 1051 c = ~c; 1052 *kp++ = c; 1053 cl++; 1054 nzero++; 1055 dp++; 1056 state = NSdigit; 1057 continue; 1058 case NSfract: 1059 if(rflag) 1060 c = ~c; 1061 *kp++ = c; 1062 cl++; 1063 nzero++; 1064 state = NSfract; 1065 continue; 1066 case NSstart: 1067 case NSsign: 1068 case NSzero: 1069 state = NSzero; 1070 continue; 1071 case NSzerofract: 1072 case NSpoint: 1073 dp--; 1074 state = NSzerofract; 1075 continue; 1076 case NSexpsign: 1077 case NSexp: 1078 case NSexpdigit: 1079 exp = exp*10 + (c - '0'); 1080 state = NSexpdigit; 1081 continue; 1082 } 1083 break; 1084 } 1085 if(c >= '1' && c <= '9') { 1086 switch(state) { 1087 case NSzero: 1088 case NSstart: 1089 case NSsign: 1090 case NSdigit: 1091 if(rflag) 1092 c = ~c; 1093 *kp++ = c; 1094 cl++; 1095 nzero = 0; 1096 dp++; 1097 state = NSdigit; 1098 continue; 1099 case NSzerofract: 1100 case NSpoint: 1101 case NSfract: 1102 if(rflag) 1103 c = ~c; 1104 *kp++ = c; 1105 cl++; 1106 nzero = 0; 1107 state = NSfract; 1108 continue; 1109 case NSexpsign: 1110 case NSexp: 1111 case NSexpdigit: 1112 exp = exp*10 + (c - '0'); 1113 state = NSexpdigit; 1114 continue; 1115 } 1116 break; 1117 } 1118 if(c == '.') { 1119 switch(state) { 1120 case NSstart: 1121 case NSsign: 1122 state = NSpoint; 1123 continue; 1124 case NSzero: 1125 state = NSzerofract; 1126 continue; 1127 case NSdigit: 1128 state = NSfract; 1129 continue; 1130 } 1131 break; 1132 } 1133 if((f->flags & Gflag) && (c == 'e' || c == 'E')) { 1134 switch(state) { 1135 case NSdigit: 1136 case NSfract: 1137 state = NSexp; 1138 continue; 1139 } 1140 break; 1141 } 1142 break; 1143 } 1144 1145 switch(state) { 1146 /* 1147 * result is zero 1148 */ 1149 case NSstart: 1150 case NSsign: 1151 case NSzero: 1152 case NSzerofract: 1153 case NSpoint: 1154 kp = k->key + k->klen; 1155 k->klen += 2; 1156 kp[0] = 0x20; /* between + and - */ 1157 kp[1] = 0; 1158 return; 1159 /* 1160 * result has exponent 1161 */ 1162 case NSexpsign: 1163 case NSexp: 1164 case NSexpdigit: 1165 if(expsign) 1166 exp = -exp; 1167 dp += exp; 1168 1169 /* 1170 * result is fixed point number 1171 */ 1172 case NSdigit: 1173 case NSfract: 1174 kp -= nzero; 1175 cl -= nzero; 1176 break; 1177 } 1178 1179 /* 1180 * end of number 1181 */ 1182 c = 0; 1183 if(rflag) 1184 c = ~c; 1185 *kp = c; 1186 1187 /* 1188 * sign and exponent 1189 */ 1190 c = 0x30; 1191 if(rflag) { 1192 c = 0x10; 1193 dp = ~dp; 1194 } 1195 kp = k->key + k->klen; 1196 kp[0] = c; 1197 kp[1] = (dp >> 8); 1198 kp[2] = dp; 1199 k->klen = cl+1; 1200 } 1201 1202 void 1203 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f) 1204 { 1205 uchar *kp; 1206 Rune r, place[3]; 1207 int c, cl, pc; 1208 int rflag; 1209 1210 rflag = f->flags&Rflag; 1211 pc = 0; 1212 1213 cl = k->klen; 1214 kp = k->key + cl; 1215 1216 for(;;) { 1217 /* 1218 * get the character 1219 */ 1220 if(lp >= lpe) 1221 break; 1222 c = *lp; 1223 if(c >= Runeself) { 1224 lp += chartorune(&r, (char*)lp); 1225 c = r; 1226 } else 1227 lp++; 1228 1229 if(c < nelem(f->mapto)) { 1230 c = f->mapto[c]; 1231 if(c == 0) 1232 continue; 1233 } 1234 place[pc++] = c; 1235 if(pc < 3) 1236 continue; 1237 for(c=11; c>=0; c--) 1238 if(memcmp(month[c], place, sizeof(place)) == 0) 1239 break; 1240 c += 10; 1241 if(rflag) 1242 c = ~c; 1243 *kp++ = c; 1244 cl++; 1245 break; 1246 } 1247 1248 c = 0; 1249 if(rflag) 1250 c = ~c; 1251 *kp = c; 1252 k->klen = cl+1; 1253 } 1254 1255 void 1256 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f) 1257 { 1258 uchar *kp; 1259 Rune r; 1260 int c, cl, n, rflag; 1261 1262 cl = k->klen; 1263 kp = k->key + cl; 1264 rflag = f->flags & Rflag; 1265 1266 for(;;) { 1267 /* 1268 * get the character 1269 */ 1270 if(lp >= lpe) 1271 break; 1272 c = *lp; 1273 if(c >= Runeself) { 1274 lp += chartorune(&r, (char*)lp); 1275 c = r; 1276 } else 1277 lp++; 1278 1279 /* 1280 * do the various mappings. 1281 * the common case is handled 1282 * completely by the table. 1283 */ 1284 if(c != 0 && c < Runeself) { 1285 c = f->mapto[c]; 1286 if(c) { 1287 *kp++ = c; 1288 cl++; 1289 } 1290 continue; 1291 } 1292 1293 /* 1294 * for characters out of range, 1295 * the table does not do Rflag. 1296 * ignore is based on mapto[255] 1297 */ 1298 if(c != 0 && c < nelem(f->mapto)) { 1299 c = f->mapto[c]; 1300 if(c == 0) 1301 continue; 1302 } else 1303 if(f->mapto[nelem(f->mapto)-1] == 0) 1304 continue; 1305 1306 /* 1307 * put it in the key 1308 */ 1309 r = c; 1310 n = runetochar((char*)kp, &r); 1311 kp += n; 1312 cl += n; 1313 if(rflag) 1314 while(n > 0) { 1315 kp[-n] = ~kp[-n]; 1316 n--; 1317 } 1318 } 1319 1320 /* 1321 * end of key 1322 */ 1323 k->klen = cl+1; 1324 if(rflag) { 1325 *kp = ~0; 1326 return; 1327 } 1328 *kp = 0; 1329 } 1330 1331 void 1332 dokey_r(Key *k, uchar *lp, uchar *lpe, Field*) 1333 { 1334 int cl, n; 1335 uchar *kp; 1336 1337 n = lpe - lp; 1338 if(n < 0) 1339 n = 0; 1340 cl = k->klen; 1341 kp = k->key + cl; 1342 k->klen = cl+n+1; 1343 1344 lpe -= 3; 1345 while(lp < lpe) { 1346 kp[0] = ~lp[0]; 1347 kp[1] = ~lp[1]; 1348 kp[2] = ~lp[2]; 1349 kp[3] = ~lp[3]; 1350 kp += 4; 1351 lp += 4; 1352 } 1353 1354 lpe += 3; 1355 while(lp < lpe) 1356 *kp++ = ~*lp++; 1357 *kp = ~0; 1358 } 1359 1360 void 1361 dokey_(Key *k, uchar *lp, uchar *lpe, Field*) 1362 { 1363 int n, cl; 1364 uchar *kp; 1365 1366 n = lpe - lp; 1367 if(n < 0) 1368 n = 0; 1369 cl = k->klen; 1370 kp = k->key + cl; 1371 k->klen = cl+n+1; 1372 memmove(kp, lp, n); 1373 kp[n] = 0; 1374 } 1375 1376 void 1377 buildkey(Line *l) 1378 { 1379 Key *k; 1380 uchar *lp, *lpe; 1381 int ll, kl, cl, i, n; 1382 Field *f; 1383 1384 ll = l->llen - 1; 1385 kl = 0; /* allocated length */ 1386 cl = 0; /* current length */ 1387 k = 0; 1388 1389 for(i=1; i<=args.nfield; i++) { 1390 f = &args.field[i]; 1391 lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0); 1392 if(lp == 0) 1393 lp = l->line + ll; 1394 lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1); 1395 if(lpe == 0) 1396 lpe = l->line + ll; 1397 n = (lpe - lp) + 1; 1398 if(n <= 0) 1399 n = 1; 1400 if(cl+(n+4) > kl) { 1401 kl = cl+(n+4); 1402 k = realloc(k, sizeof(Key) + 1403 (kl-1)*sizeof(k->key[0])); 1404 if(k == 0) 1405 nomem(); 1406 } 1407 k->klen = cl; 1408 (*f->dokey)(k, lp, lpe, f); 1409 cl = k->klen; 1410 } 1411 1412 /* 1413 * global comparisons 1414 */ 1415 if(!(args.uflag && cl > 0)) { 1416 f = &args.field[0]; 1417 if(cl+(ll+4) > kl) { 1418 kl = cl+(ll+4); 1419 k = realloc(k, sizeof(Key) + 1420 (kl-1)*sizeof(k->key[0])); 1421 if(k == 0) 1422 nomem(); 1423 } 1424 k->klen = cl; 1425 (*f->dokey)(k, l->line, l->line+ll, f); 1426 cl = k->klen; 1427 } 1428 1429 l->key = k; 1430 k->klen = cl; 1431 1432 if(args.vflag) { 1433 write(2, l->line, l->llen); 1434 for(i=0; i<k->klen; i++) { 1435 fprint(2, " %.2x", k->key[i]); 1436 if(k->key[i] == 0x00 || k->key[i] == 0xff) 1437 fprint(2, "\n"); 1438 } 1439 } 1440 } 1441 1442 void 1443 makemapm(Field *f) 1444 { 1445 int i, c; 1446 1447 for(i=0; i<nelem(f->mapto); i++) { 1448 c = 1; 1449 if(i == ' ' || i == '\t') 1450 c = 0; 1451 if(i >= 'a' && i <= 'z') 1452 c = i + ('A' - 'a'); 1453 if(i >= 'A' && i <= 'Z') 1454 c = i; 1455 f->mapto[i] = c; 1456 if(args.vflag) { 1457 if((i & 15) == 0) 1458 fprint(2, " "); 1459 fprint(2, " %.2x", c); 1460 if((i & 15) == 15) 1461 fprint(2, "\n"); 1462 } 1463 } 1464 } 1465 1466 void 1467 makemapd(Field *f) 1468 { 1469 int i, j, c; 1470 1471 for(i=0; i<nelem(f->mapto); i++) { 1472 c = i; 1473 if(f->flags & Iflag) 1474 if(c < 040 || c > 0176) 1475 c = -1; 1476 if((f->flags & Wflag) && c >= 0) 1477 if(c == ' ' || c == '\t') 1478 c = -1; 1479 if((f->flags & Dflag) && c >= 0) 1480 if(!(c == ' ' || c == '\t' || 1481 (c >= 'a' && c <= 'z') || 1482 (c >= 'A' && c <= 'Z') || 1483 (c >= '0' && c <= '9'))) { 1484 for(j=0; latinmap[j]; j+=3) 1485 if(c == latinmap[j+0] || 1486 c == latinmap[j+1]) 1487 break; 1488 if(latinmap[j] == 0) 1489 c = -1; 1490 } 1491 if((f->flags & Fflag) && c >= 0) { 1492 if(c >= 'a' && c <= 'z') 1493 c += 'A' - 'a'; 1494 for(j=0; latinmap[j]; j+=3) 1495 if(c == latinmap[j+0] || 1496 c == latinmap[j+1]) { 1497 c = latinmap[j+2]; 1498 break; 1499 } 1500 } 1501 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself) 1502 c = ~c & 0xff; 1503 if(c < 0) 1504 c = 0; 1505 f->mapto[i] = c; 1506 if(args.vflag) { 1507 if((i & 15) == 0) 1508 fprint(2, " "); 1509 fprint(2, " %.2x", c); 1510 if((i & 15) == 15) 1511 fprint(2, "\n"); 1512 } 1513 } 1514 } 1515 1516 int latinmap[] = 1517 { 1518 /* lcase ucase fold */ 1519 L'à', L'À', L'A', 1520 L'á', L'Á', L'A', 1521 L'â', L'Â', L'A', 1522 L'ä', L'Ä', L'A', 1523 L'ã', L'Ã', L'A', 1524 L'å', L'Å', L'A', 1525 L'è', L'È', L'E', 1526 L'é', L'É', L'E', 1527 L'ê', L'Ê', L'E', 1528 L'ë', L'Ë', L'E', 1529 L'ì', L'Ì', L'I', 1530 L'í', L'Í', L'I', 1531 L'î', L'Î', L'I', 1532 L'ï', L'Ï', L'I', 1533 L'ò', L'Ò', L'O', 1534 L'ó', L'Ó', L'O', 1535 L'ô', L'Ô', L'O', 1536 L'ö', L'Ö', L'O', 1537 L'õ', L'Õ', L'O', 1538 L'ø', L'Ø', L'O', 1539 L'ù', L'Ù', L'U', 1540 L'ú', L'Ú', L'U', 1541 L'û', L'Û', L'U', 1542 L'ü', L'Ü', L'U', 1543 L'æ', L'Æ', L'A', 1544 L'ð', L'Ð', L'D', 1545 L'ñ', L'Ñ', L'N', 1546 L'ý', L'Ý', L'Y', 1547 L'ç', L'Ç', L'C', 1548 0, 1549 }; 1550 1551 Rune* month[12] = 1552 { 1553 L"JAN", 1554 L"FEB", 1555 L"MAR", 1556 L"APR", 1557 L"MAY", 1558 L"JUN", 1559 L"JUL", 1560 L"AUG", 1561 L"SEP", 1562 L"OCT", 1563 L"NOV", 1564 L"DEC", 1565 }; 1566 1567 /************** radix sort ***********/ 1568 1569 enum 1570 { 1571 Threshold = 14, 1572 }; 1573 1574 void rsort4(Key***, ulong, int); 1575 void bsort4(Key***, ulong, int); 1576 1577 void 1578 sort4(void *a, ulong n) 1579 { 1580 if(n > Threshold) 1581 rsort4((Key***)a, n, 0); 1582 else 1583 bsort4((Key***)a, n, 0); 1584 } 1585 1586 void 1587 rsort4(Key ***a, ulong n, int b) 1588 { 1589 Key ***ea, ***t, ***u, **t1, **u1, *k; 1590 Key ***part[257]; 1591 static long count[257]; 1592 long clist[257+257], *cp, *cp1; 1593 int c, lowc, higc; 1594 1595 /* 1596 * pass 1 over all keys, 1597 * count the number of each key[b]. 1598 * find low count and high count. 1599 */ 1600 lowc = 256; 1601 higc = 0; 1602 ea = a+n; 1603 for(t=a; t<ea; t++) { 1604 k = **t; 1605 n = k->klen; 1606 if(b >= n) { 1607 count[256]++; 1608 continue; 1609 } 1610 c = k->key[b]; 1611 n = count[c]++; 1612 if(n == 0) { 1613 if(c < lowc) 1614 lowc = c; 1615 if(c > higc) 1616 higc = c; 1617 } 1618 } 1619 1620 /* 1621 * pass 2 over all counts, 1622 * put partition pointers in part[c]. 1623 * save compacted indexes and counts 1624 * in clist[]. 1625 */ 1626 t = a; 1627 n = count[256]; 1628 clist[0] = n; 1629 part[256] = t; 1630 t += n; 1631 1632 cp1 = clist+1; 1633 cp = count+lowc; 1634 for(c=lowc; c<=higc; c++,cp++) { 1635 n = *cp; 1636 if(n) { 1637 cp1[0] = n; 1638 cp1[1] = c; 1639 cp1 += 2; 1640 part[c] = t; 1641 t += n; 1642 } 1643 } 1644 *cp1 = 0; 1645 1646 /* 1647 * pass 3 over all counts. 1648 * chase lowest pointer in each partition 1649 * around a permutation until it comes 1650 * back and is stored where it started. 1651 * static array, count[], should be 1652 * reduced to zero entries except maybe 1653 * count[256]. 1654 */ 1655 for(cp1=clist+1; cp1[0]; cp1+=2) { 1656 c = cp1[1]; 1657 cp = count+c; 1658 while(*cp) { 1659 t1 = *part[c]; 1660 for(;;) { 1661 k = *t1; 1662 n = 256; 1663 if(b < k->klen) 1664 n = k->key[b]; 1665 u = part[n]++; 1666 count[n]--; 1667 u1 = *u; 1668 *u = t1; 1669 if(n == c) 1670 break; 1671 t1 = u1; 1672 } 1673 } 1674 } 1675 1676 /* 1677 * pass 4 over all partitions. 1678 * call recursively. 1679 */ 1680 b++; 1681 t = a + clist[0]; 1682 count[256] = 0; 1683 for(cp1=clist+1; n=cp1[0]; cp1+=2) { 1684 if(n > Threshold) 1685 rsort4(t, n, b); 1686 else 1687 if(n > 1) 1688 bsort4(t, n, b); 1689 t += n; 1690 } 1691 } 1692 1693 /* 1694 * bubble sort to pick up 1695 * the pieces. 1696 */ 1697 void 1698 bsort4(Key ***a, ulong n, int b) 1699 { 1700 Key ***i, ***j, ***k, ***l, **t; 1701 Key *ka, *kb; 1702 int n1, n2; 1703 1704 l = a+n; 1705 j = a; 1706 1707 loop: 1708 i = j; 1709 j++; 1710 if(j >= l) 1711 return; 1712 1713 ka = **i; 1714 kb = **j; 1715 n1 = ka->klen - b; 1716 n2 = kb->klen - b; 1717 if(n1 > n2) 1718 n1 = n2; 1719 if(n1 <= 0) 1720 goto loop; 1721 n2 = ka->key[b] - kb->key[b]; 1722 if(n2 == 0) 1723 n2 = memcmp(ka->key+b, kb->key+b, n1); 1724 if(n2 <= 0) 1725 goto loop; 1726 1727 for(;;) { 1728 k = i+1; 1729 1730 t = *k; 1731 *k = *i; 1732 *i = t; 1733 1734 if(i <= a) 1735 goto loop; 1736 1737 i--; 1738 ka = **i; 1739 kb = *t; 1740 n1 = ka->klen - b; 1741 n2 = kb->klen - b; 1742 if(n1 > n2) 1743 n1 = n2; 1744 if(n1 <= 0) 1745 goto loop; 1746 n2 = ka->key[b] - kb->key[b]; 1747 if(n2 == 0) 1748 n2 = memcmp(ka->key+b, kb->key+b, n1); 1749 if(n2 <= 0) 1750 goto loop; 1751 } 1752 } 1753