1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4
5 /*
6 bugs:
7 00/ff for end of file can conflict with 00/ff characters
8 */
9
10 enum
11 {
12 Nline = 100000, /* default max number of lines saved in memory */
13 Nmerge = 10, /* max number of temporary files merged */
14 Nfield = 20, /* max number of argument fields */
15
16 Bflag = 1<<0, /* flags per field */
17 B1flag = 1<<1,
18
19 Dflag = 1<<2,
20 Fflag = 1<<3,
21 Gflag = 1<<4,
22 Iflag = 1<<5,
23 Mflag = 1<<6,
24 Nflag = 1<<7,
25 Rflag = 1<<8,
26 Wflag = 1<<9,
27
28 NSstart = 0, /* states for number to key decoding */
29 NSsign,
30 NSzero,
31 NSdigit,
32 NSpoint,
33 NSfract,
34 NSzerofract,
35 NSexp,
36 NSexpsign,
37 NSexpdigit,
38 };
39
40 typedef struct Line Line;
41 typedef struct Key Key;
42 typedef struct Merge Merge;
43 typedef struct Field Field;
44
45 struct Line
46 {
47 Key* key;
48 int llen; /* always >= 1 */
49 uchar line[1]; /* always ends in '\n' */
50 };
51
52 struct Merge
53 {
54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */
55 Line* line; /* line at the head of a merged temp file */
56 int fd; /* file descriptor */
57 Biobuf b; /* iobuf for reading a temp file */
58 };
59
60 struct Key
61 {
62 int klen;
63 uchar key[1];
64 };
65
66 struct Field
67 {
68 int beg1;
69 int beg2;
70 int end1;
71 int end2;
72
73 long flags;
74 uchar mapto[1+255];
75
76 void (*dokey)(Key*, uchar*, uchar*, Field*);
77 };
78
79 struct args
80 {
81 char* ofile;
82 char* tname;
83 Rune tabchar;
84 char cflag;
85 char uflag;
86 char vflag;
87 int nfield;
88 int nfile;
89 Field field[Nfield];
90
91 Line** linep;
92 long nline; /* number of lines in this temp file */
93 long lineno; /* overall ordinal for -s option */
94 int ntemp;
95 long mline; /* max lines per file */
96 } args;
97
98 extern Rune* month[12];
99
100 void buildkey(Line*);
101 void doargs(int, char*[]);
102 void dofield(char*, int*, int*, int, int);
103 void dofile(Biobuf*);
104 void dokey_(Key*, uchar*, uchar*, Field*);
105 void dokey_dfi(Key*, uchar*, uchar*, Field*);
106 void dokey_gn(Key*, uchar*, uchar*, Field*);
107 void dokey_m(Key*, uchar*, uchar*, Field*);
108 void dokey_r(Key*, uchar*, uchar*, Field*);
109 void done(char*);
110 int kcmp(Key*, Key*);
111 void makemapd(Field*);
112 void makemapm(Field*);
113 void mergefiles(int, int, Biobuf*);
114 void mergeout(Biobuf*);
115 void newfield(void);
116 Line* newline(Biobuf*);
117 void nomem(void);
118 void notifyf(void*, char*);
119 void printargs(void);
120 void printout(Biobuf*);
121 void setfield(int, int);
122 uchar* skip(uchar*, int, int, int, int);
123 void sort4(void*, ulong);
124 char* tempfile(int);
125 void tempout(void);
126 void lineout(Biobuf*, Line*);
127
128 void
main(int argc,char * argv[])129 main(int argc, char *argv[])
130 {
131 int i, f;
132 char *s;
133 Biobuf bbuf;
134
135 notify(notifyf); /**/
136 doargs(argc, argv);
137 if(args.vflag)
138 printargs();
139
140 for(i=1; i<argc; i++) {
141 s = argv[i];
142 if(s == 0)
143 continue;
144 if(strcmp(s, "-") == 0) {
145 Binit(&bbuf, 0, OREAD);
146 dofile(&bbuf);
147 Bterm(&bbuf);
148 continue;
149 }
150 f = open(s, OREAD);
151 if(f < 0) {
152 fprint(2, "sort: open %s: %r\n", s);
153 done("open");
154 }
155 Binit(&bbuf, f, OREAD);
156 dofile(&bbuf);
157 Bterm(&bbuf);
158 close(f);
159 }
160 if(args.nfile == 0) {
161 Binit(&bbuf, 0, OREAD);
162 dofile(&bbuf);
163 Bterm(&bbuf);
164 }
165 if(args.cflag)
166 done(0);
167 if(args.vflag)
168 fprint(2, "=========\n");
169
170 f = 1;
171 if(args.ofile) {
172 f = create(args.ofile, OWRITE, 0666);
173 if(f < 0) {
174 fprint(2, "sort: create %s: %r\n", args.ofile);
175 done("create");
176 }
177 }
178
179 Binit(&bbuf, f, OWRITE);
180 if(args.ntemp) {
181 tempout();
182 mergeout(&bbuf);
183 } else {
184 printout(&bbuf);
185 }
186 Bterm(&bbuf);
187 done(0);
188 }
189
190 void
dofile(Biobuf * b)191 dofile(Biobuf *b)
192 {
193 Line *l, *ol;
194 int n;
195
196 if(args.cflag) {
197 ol = newline(b);
198 if(ol == 0)
199 return;
200 for(;;) {
201 l = newline(b);
202 if(l == 0)
203 break;
204 n = kcmp(ol->key, l->key);
205 if(n > 0 || (n == 0 && args.uflag)) {
206 fprint(2, "sort: -c file not in sort\n"); /**/
207 done("order");
208 }
209 free(ol->key);
210 free(ol);
211 ol = l;
212 }
213 return;
214 }
215
216 if(args.linep == 0) {
217 args.linep = malloc(args.mline * sizeof(args.linep));
218 if(args.linep == 0)
219 nomem();
220 }
221 for(;;) {
222 l = newline(b);
223 if(l == 0)
224 break;
225 if(args.nline >= args.mline)
226 tempout();
227 args.linep[args.nline] = l;
228 args.nline++;
229 args.lineno++;
230 }
231 }
232
233 void
notifyf(void *,char * s)234 notifyf(void*, char *s)
235 {
236
237 if(strcmp(s, "interrupt") == 0)
238 done(0);
239 if(strcmp(s, "hangup") == 0)
240 done(0);
241 if(strcmp(s, "kill") == 0)
242 done(0);
243 if(strncmp(s, "sys: write on closed pipe", 25) == 0)
244 done(0);
245 fprint(2, "sort: note: %s\n", s);
246 abort();
247 }
248
249 Line*
newline(Biobuf * b)250 newline(Biobuf *b)
251 {
252 Line *l;
253 char *p;
254 int n, c;
255
256 p = Brdline(b, '\n');
257 n = Blinelen(b);
258 if(p == 0) {
259 if(n == 0)
260 return 0;
261 l = 0;
262 for(n=0;;) {
263 if((n & 31) == 0) {
264 l = realloc(l, sizeof(Line) +
265 (n+31)*sizeof(l->line[0]));
266 if(l == 0)
267 nomem();
268 }
269 c = Bgetc(b);
270 if(c < 0) {
271 fprint(2, "sort: newline added\n");
272 c = '\n';
273 }
274 l->line[n++] = c;
275 if(c == '\n')
276 break;
277 }
278 l->llen = n;
279 buildkey(l);
280 return l;
281 }
282 l = malloc(sizeof(Line) +
283 (n-1)*sizeof(l->line[0]));
284 if(l == 0)
285 nomem();
286 l->llen = n;
287 memmove(l->line, p, n);
288 buildkey(l);
289 return l;
290 }
291
292 void
lineout(Biobuf * b,Line * l)293 lineout(Biobuf *b, Line *l)
294 {
295 int n, m;
296
297 n = l->llen;
298 m = Bwrite(b, l->line, n);
299 if(n != m)
300 exits("write");
301 }
302
303 void
tempout(void)304 tempout(void)
305 {
306 long n;
307 Line **lp, *l;
308 char *tf;
309 int f;
310 Biobuf tb;
311
312 sort4(args.linep, args.nline);
313 tf = tempfile(args.ntemp);
314 args.ntemp++;
315 f = create(tf, OWRITE, 0666);
316 if(f < 0) {
317 fprint(2, "sort: create %s: %r\n", tf);
318 done("create");
319 }
320
321 Binit(&tb, f, OWRITE);
322 lp = args.linep;
323 for(n=args.nline; n>0; n--) {
324 l = *lp++;
325 lineout(&tb, l);
326 free(l->key);
327 free(l);
328 }
329 args.nline = 0;
330 Bterm(&tb);
331 close(f);
332 }
333
334 void
done(char * xs)335 done(char *xs)
336 {
337 int i;
338
339 for(i=0; i<args.ntemp; i++)
340 remove(tempfile(i));
341 exits(xs);
342 }
343
344 void
nomem(void)345 nomem(void)
346 {
347 fprint(2, "sort: out of memory\n");
348 done("mem");
349 }
350
351 char*
tempfile(int n)352 tempfile(int n)
353 {
354 static char file[100];
355 static uint pid;
356 char *dir;
357
358 dir = "/tmp";
359 if(args.tname)
360 dir = args.tname;
361 if(strlen(dir) >= nelem(file)-20) {
362 fprint(2, "temp file directory name is too long: %s\n", dir);
363 done("tdir");
364 }
365
366 if(pid == 0) {
367 pid = getpid();
368 if(pid == 0) {
369 pid = time(0);
370 if(pid == 0)
371 pid = 1;
372 }
373 }
374
375 sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n);
376 return file;
377 }
378
379 void
mergeout(Biobuf * b)380 mergeout(Biobuf *b)
381 {
382 int n, i, f;
383 char *tf;
384 Biobuf tb;
385
386 for(i=0; i<args.ntemp; i+=n) {
387 n = args.ntemp - i;
388 if(n > Nmerge) {
389 tf = tempfile(args.ntemp);
390 args.ntemp++;
391 f = create(tf, OWRITE, 0666);
392 if(f < 0) {
393 fprint(2, "sort: create %s: %r\n", tf);
394 done("create");
395 }
396 Binit(&tb, f, OWRITE);
397
398 n = Nmerge;
399 mergefiles(i, n, &tb);
400
401 Bterm(&tb);
402 close(f);
403 } else
404 mergefiles(i, n, b);
405 }
406 }
407
408 void
mergefiles(int t,int n,Biobuf * b)409 mergefiles(int t, int n, Biobuf *b)
410 {
411 Merge *m, *mp, **mmp;
412 Key *ok;
413 Line *l;
414 char *tf;
415 int i, f, nn;
416
417 mmp = malloc(n*sizeof(*mmp));
418 mp = malloc(n*sizeof(*mp));
419 if(mmp == 0 || mp == 0)
420 nomem();
421
422 nn = 0;
423 m = mp;
424 for(i=0; i<n; i++,m++) {
425 tf = tempfile(t+i);
426 f = open(tf, OREAD);
427 if(f < 0) {
428 fprint(2, "sort: reopen %s: %r\n", tf);
429 done("open");
430 }
431 m->fd = f;
432 Binit(&m->b, f, OREAD);
433 mmp[nn] = m;
434
435 l = newline(&m->b);
436 if(l == 0)
437 continue;
438 nn++;
439 m->line = l;
440 m->key = l->key;
441 }
442
443 ok = 0;
444 for(;;) {
445 sort4(mmp, nn);
446 m = *mmp;
447 if(nn == 0)
448 break;
449 for(;;) {
450 l = m->line;
451 if(args.uflag && ok && kcmp(ok, l->key) == 0) {
452 free(l->key);
453 free(l);
454 } else {
455 lineout(b, l);
456 if(ok)
457 free(ok);
458 ok = l->key;
459 free(l);
460 }
461
462 l = newline(&m->b);
463 if(l == 0) {
464 nn--;
465 mmp[0] = mmp[nn];
466 break;
467 }
468 m->line = l;
469 m->key = l->key;
470 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0)
471 break;
472 }
473 }
474 if(ok)
475 free(ok);
476
477 m = mp;
478 for(i=0; i<n; i++,m++) {
479 Bterm(&m->b);
480 close(m->fd);
481 }
482
483 free(mp);
484 free(mmp);
485 }
486
487 int
kcmp(Key * ka,Key * kb)488 kcmp(Key *ka, Key *kb)
489 {
490 int n, m;
491
492 /*
493 * set n to length of smaller key
494 */
495 n = ka->klen;
496 m = kb->klen;
497 if(n > m)
498 n = m;
499 return memcmp(ka->key, kb->key, n);
500 }
501
502 void
printout(Biobuf * b)503 printout(Biobuf *b)
504 {
505 long n;
506 Line **lp, *l;
507 Key *ok;
508
509 sort4(args.linep, args.nline);
510 lp = args.linep;
511 ok = 0;
512 for(n=args.nline; n>0; n--) {
513 l = *lp++;
514 if(args.uflag && ok && kcmp(ok, l->key) == 0)
515 continue;
516 lineout(b, l);
517 ok = l->key;
518 }
519 }
520
521 void
setfield(int n,int c)522 setfield(int n, int c)
523 {
524 Field *f;
525
526 f = &args.field[n];
527 switch(c) {
528 default:
529 fprint(2, "sort: unknown option: field.%C\n", c);
530 done("option");
531 case 'b': /* skip blanks */
532 f->flags |= Bflag;
533 break;
534 case 'd': /* directory order */
535 f->flags |= Dflag;
536 break;
537 case 'f': /* fold case */
538 f->flags |= Fflag;
539 break;
540 case 'g': /* floating point -n case */
541 f->flags |= Gflag;
542 break;
543 case 'i': /* ignore non-ascii */
544 f->flags |= Iflag;
545 break;
546 case 'M': /* month */
547 f->flags |= Mflag;
548 break;
549 case 'n': /* numbers */
550 f->flags |= Nflag;
551 break;
552 case 'r': /* reverse */
553 f->flags |= Rflag;
554 break;
555 case 'w': /* ignore white */
556 f->flags |= Wflag;
557 break;
558 }
559 }
560
561 void
dofield(char * s,int * n1,int * n2,int off1,int off2)562 dofield(char *s, int *n1, int *n2, int off1, int off2)
563 {
564 int c, n;
565
566 c = *s++;
567 if(c >= '0' && c <= '9') {
568 n = 0;
569 while(c >= '0' && c <= '9') {
570 n = n*10 + (c-'0');
571 c = *s++;
572 }
573 n -= off1; /* posix committee: rot in hell */
574 if(n < 0) {
575 fprint(2, "sort: field offset must be positive\n");
576 done("option");
577 }
578 *n1 = n;
579 }
580 if(c == '.') {
581 c = *s++;
582 if(c >= '0' && c <= '9') {
583 n = 0;
584 while(c >= '0' && c <= '9') {
585 n = n*10 + (c-'0');
586 c = *s++;
587 }
588 n -= off2;
589 if(n < 0) {
590 fprint(2, "sort: character offset must be positive\n");
591 done("option");
592 }
593 *n2 = n;
594 }
595 }
596 while(c != 0) {
597 setfield(args.nfield, c);
598 c = *s++;
599 }
600 }
601
602 void
printargs(void)603 printargs(void)
604 {
605 int i, n;
606 Field *f;
607 char *prefix;
608
609 fprint(2, "sort");
610 for(i=0; i<=args.nfield; i++) {
611 f = &args.field[i];
612 prefix = " -";
613 if(i) {
614 n = f->beg1;
615 if(n >= 0)
616 fprint(2, " +%d", n);
617 else
618 fprint(2, " +*");
619 n = f->beg2;
620 if(n >= 0)
621 fprint(2, ".%d", n);
622 else
623 fprint(2, ".*");
624
625 if(f->flags & B1flag)
626 fprint(2, "b");
627
628 n = f->end1;
629 if(n >= 0)
630 fprint(2, " -%d", n);
631 else
632 fprint(2, " -*");
633 n = f->end2;
634 if(n >= 0)
635 fprint(2, ".%d", n);
636 else
637 fprint(2, ".*");
638 prefix = "";
639 }
640 if(f->flags & Bflag)
641 fprint(2, "%sb", prefix);
642 if(f->flags & Dflag)
643 fprint(2, "%sd", prefix);
644 if(f->flags & Fflag)
645 fprint(2, "%sf", prefix);
646 if(f->flags & Gflag)
647 fprint(2, "%sg", prefix);
648 if(f->flags & Iflag)
649 fprint(2, "%si", prefix);
650 if(f->flags & Mflag)
651 fprint(2, "%sM", prefix);
652 if(f->flags & Nflag)
653 fprint(2, "%sn", prefix);
654 if(f->flags & Rflag)
655 fprint(2, "%sr", prefix);
656 if(f->flags & Wflag)
657 fprint(2, "%sw", prefix);
658 }
659 if(args.cflag)
660 fprint(2, " -c");
661 if(args.uflag)
662 fprint(2, " -u");
663 if(args.ofile)
664 fprint(2, " -o %s", args.ofile);
665 if(args.mline != Nline)
666 fprint(2, " -l %ld", args.mline);
667 fprint(2, "\n");
668 }
669
670 void
newfield(void)671 newfield(void)
672 {
673 int n;
674 Field *f;
675
676 n = args.nfield + 1;
677 if(n >= Nfield) {
678 fprint(2, "sort: too many fields specified\n");
679 done("option");
680 }
681 args.nfield = n;
682 f = &args.field[n];
683 f->beg1 = -1;
684 f->beg2 = -1;
685 f->end1 = -1;
686 f->end2 = -1;
687 }
688
689 void
doargs(int argc,char * argv[])690 doargs(int argc, char *argv[])
691 {
692 int i, c, hadplus;
693 char *s, *p, *q;
694 Field *f;
695
696 hadplus = 0;
697 args.mline = Nline;
698 for(i=1; i<argc; i++) {
699 s = argv[i];
700 c = *s++;
701 if(c == '-') {
702 c = *s;
703 if(c == 0) /* forced end of arg marker */
704 break;
705 argv[i] = 0; /* clobber args processed */
706 if(c == '.' || (c >= '0' && c <= '9')) {
707 if(!hadplus)
708 newfield();
709 f = &args.field[args.nfield];
710 dofield(s, &f->end1, &f->end2, 0, 0);
711 hadplus = 0;
712 continue;
713 }
714
715 while(c = *s++)
716 switch(c) {
717 case '-': /* end of options */
718 i = argc;
719 continue;
720 case 'T': /* temp directory */
721 if(*s == 0) {
722 i++;
723 if(i < argc) {
724 args.tname = argv[i];
725 argv[i] = 0;
726 }
727 } else
728 args.tname = s;
729 s = strchr(s, 0);
730 break;
731 case 'o': /* output file */
732 if(*s == 0) {
733 i++;
734 if(i < argc) {
735 args.ofile = argv[i];
736 argv[i] = 0;
737 }
738 } else
739 args.ofile = s;
740 s = strchr(s, 0);
741 break;
742 case 'k': /* posix key (what were they thinking?) */
743 p = 0;
744 if(*s == 0) {
745 i++;
746 if(i < argc) {
747 p = argv[i];
748 argv[i] = 0;
749 }
750 } else
751 p = s;
752 s = strchr(s, 0);
753 if(p == 0)
754 break;
755
756 newfield();
757 q = strchr(p, ',');
758 if(q)
759 *q++ = 0;
760 f = &args.field[args.nfield];
761 dofield(p, &f->beg1, &f->beg2, 1, 1);
762 if(f->flags & Bflag) {
763 f->flags |= B1flag;
764 f->flags &= ~Bflag;
765 }
766 if(q) {
767 dofield(q, &f->end1, &f->end2, 1, 0);
768 if(f->end2 <= 0)
769 f->end1++;
770 }
771 hadplus = 0;
772 break;
773 case 't': /* tab character */
774 if(*s == 0) {
775 i++;
776 if(i < argc) {
777 chartorune(&args.tabchar, argv[i]);
778 argv[i] = 0;
779 }
780 } else
781 s += chartorune(&args.tabchar, s);
782 if(args.tabchar == '\n') {
783 fprint(2, "aw come on, rob\n");
784 done("rob");
785 }
786 break;
787 case 'c': /* check order */
788 args.cflag = 1;
789 break;
790 case 'u': /* unique */
791 args.uflag = 1;
792 break;
793 case 'v': /* debugging noise */
794 args.vflag = 1;
795 break;
796 case 'l':
797 if(*s == 0) {
798 i++;
799 if(i < argc) {
800 args.mline = atol(argv[i]);
801 argv[i] = 0;
802 }
803 } else
804 args.mline = atol(s);
805 s = strchr(s, 0);
806 break;
807
808 case 'M': /* month */
809 case 'b': /* skip blanks */
810 case 'd': /* directory order */
811 case 'f': /* fold case */
812 case 'g': /* floating numbers */
813 case 'i': /* ignore non-ascii */
814 case 'n': /* numbers */
815 case 'r': /* reverse */
816 case 'w': /* ignore white */
817 if(args.nfield > 0)
818 fprint(2, "sort: global field set after -k\n");
819 setfield(0, c);
820 break;
821 case 'm':
822 /* option m silently ignored but required by posix */
823 break;
824 default:
825 fprint(2, "sort: unknown option: -%C\n", c);
826 done("option");
827 }
828 continue;
829 }
830 if(c == '+') {
831 argv[i] = 0; /* clobber args processed */
832 c = *s;
833 if(c == '.' || (c >= '0' && c <= '9')) {
834 newfield();
835 f = &args.field[args.nfield];
836 dofield(s, &f->beg1, &f->beg2, 0, 0);
837 if(f->flags & Bflag) {
838 f->flags |= B1flag;
839 f->flags &= ~Bflag;
840 }
841 hadplus = 1;
842 continue;
843 }
844 fprint(2, "sort: unknown option: +%C\n", c);
845 done("option");
846 }
847 args.nfile++;
848 }
849
850 for(i=0; i<=args.nfield; i++) {
851 f = &args.field[i];
852
853 /*
854 * global options apply to fields that
855 * specify no options
856 */
857 if(f->flags == 0) {
858 f->flags = args.field[0].flags;
859 if(args.field[0].flags & Bflag)
860 f->flags |= B1flag;
861 }
862
863
864 /*
865 * build buildkey specification
866 */
867 switch(f->flags & ~(Bflag|B1flag)) {
868 default:
869 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags);
870 done("option");
871 case 0:
872 f->dokey = dokey_;
873 break;
874 case Rflag:
875 f->dokey = dokey_r;
876 break;
877 case Gflag:
878 case Nflag:
879 case Gflag|Nflag:
880 case Gflag|Rflag:
881 case Nflag|Rflag:
882 case Gflag|Nflag|Rflag:
883 f->dokey = dokey_gn;
884 break;
885 case Mflag:
886 case Mflag|Rflag:
887 f->dokey = dokey_m;
888 makemapm(f);
889 break;
890 case Dflag:
891 case Dflag|Fflag:
892 case Dflag|Fflag|Iflag:
893 case Dflag|Fflag|Iflag|Rflag:
894 case Dflag|Fflag|Iflag|Rflag|Wflag:
895 case Dflag|Fflag|Iflag|Wflag:
896 case Dflag|Fflag|Rflag:
897 case Dflag|Fflag|Rflag|Wflag:
898 case Dflag|Fflag|Wflag:
899 case Dflag|Iflag:
900 case Dflag|Iflag|Rflag:
901 case Dflag|Iflag|Rflag|Wflag:
902 case Dflag|Iflag|Wflag:
903 case Dflag|Rflag:
904 case Dflag|Rflag|Wflag:
905 case Dflag|Wflag:
906 case Fflag:
907 case Fflag|Iflag:
908 case Fflag|Iflag|Rflag:
909 case Fflag|Iflag|Rflag|Wflag:
910 case Fflag|Iflag|Wflag:
911 case Fflag|Rflag:
912 case Fflag|Rflag|Wflag:
913 case Fflag|Wflag:
914 case Iflag:
915 case Iflag|Rflag:
916 case Iflag|Rflag|Wflag:
917 case Iflag|Wflag:
918 case Wflag:
919 f->dokey = dokey_dfi;
920 makemapd(f);
921 break;
922 }
923 }
924
925 /*
926 * random spot checks
927 */
928 if(args.nfile > 1 && args.cflag) {
929 fprint(2, "sort: -c can have at most one input file\n");
930 done("option");
931 }
932 return;
933 }
934
935 uchar*
skip(uchar * l,int n1,int n2,int bflag,int endfield)936 skip(uchar *l, int n1, int n2, int bflag, int endfield)
937 {
938 int i, c, tc;
939 Rune r;
940
941 if(endfield && n1 < 0)
942 return 0;
943
944 c = *l++;
945 tc = args.tabchar;
946 if(tc) {
947 if(tc < Runeself) {
948 for(i=n1; i>0; i--) {
949 while(c != tc) {
950 if(c == '\n')
951 return 0;
952 c = *l++;
953 }
954 if(!(endfield && i == 1))
955 c = *l++;
956 }
957 } else {
958 l--;
959 l += chartorune(&r, (char*)l);
960 for(i=n1; i>0; i--) {
961 while(r != tc) {
962 if(r == '\n')
963 return 0;
964 l += chartorune(&r, (char*)l);
965 }
966 if(!(endfield && i == 1))
967 l += chartorune(&r, (char*)l);
968 }
969 c = r;
970 }
971 } else {
972 for(i=n1; i>0; i--) {
973 while(c == ' ' || c == '\t')
974 c = *l++;
975 while(c != ' ' && c != '\t') {
976 if(c == '\n')
977 return 0;
978 c = *l++;
979 }
980 }
981 }
982
983 if(bflag)
984 while(c == ' ' || c == '\t')
985 c = *l++;
986
987 l--;
988 for(i=n2; i>0; i--) {
989 c = *l;
990 if(c < Runeself) {
991 if(c == '\n')
992 return 0;
993 l++;
994 continue;
995 }
996 l += chartorune(&r, (char*)l);
997 }
998 return l;
999 }
1000
1001 void
dokey_gn(Key * k,uchar * lp,uchar * lpe,Field * f)1002 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f)
1003 {
1004 uchar *kp;
1005 int c, cl, dp;
1006 int state, nzero, exp, expsign, rflag;
1007
1008 cl = k->klen + 3;
1009 kp = k->key + cl; /* skip place for sign, exponent[2] */
1010
1011 nzero = 0; /* number of trailing zeros */
1012 exp = 0; /* value of the exponent */
1013 expsign = 0; /* sign of the exponent */
1014 dp = 0x4040; /* location of decimal point */
1015 rflag = f->flags&Rflag; /* xor of rflag and - sign */
1016 state = NSstart;
1017
1018 for(;; lp++) {
1019 if(lp >= lpe)
1020 break;
1021 c = *lp;
1022
1023 if(c == ' ' || c == '\t') {
1024 switch(state) {
1025 case NSstart:
1026 case NSsign:
1027 continue;
1028 }
1029 break;
1030 }
1031 if(c == '+' || c == '-') {
1032 switch(state) {
1033 case NSstart:
1034 state = NSsign;
1035 if(c == '-')
1036 rflag = !rflag;
1037 continue;
1038 case NSexp:
1039 state = NSexpsign;
1040 if(c == '-')
1041 expsign = 1;
1042 continue;
1043 }
1044 break;
1045 }
1046 if(c == '0') {
1047 switch(state) {
1048 case NSdigit:
1049 if(rflag)
1050 c = ~c;
1051 *kp++ = c;
1052 cl++;
1053 nzero++;
1054 dp++;
1055 state = NSdigit;
1056 continue;
1057 case NSfract:
1058 if(rflag)
1059 c = ~c;
1060 *kp++ = c;
1061 cl++;
1062 nzero++;
1063 state = NSfract;
1064 continue;
1065 case NSstart:
1066 case NSsign:
1067 case NSzero:
1068 state = NSzero;
1069 continue;
1070 case NSzerofract:
1071 case NSpoint:
1072 dp--;
1073 state = NSzerofract;
1074 continue;
1075 case NSexpsign:
1076 case NSexp:
1077 case NSexpdigit:
1078 exp = exp*10 + (c - '0');
1079 state = NSexpdigit;
1080 continue;
1081 }
1082 break;
1083 }
1084 if(c >= '1' && c <= '9') {
1085 switch(state) {
1086 case NSzero:
1087 case NSstart:
1088 case NSsign:
1089 case NSdigit:
1090 if(rflag)
1091 c = ~c;
1092 *kp++ = c;
1093 cl++;
1094 nzero = 0;
1095 dp++;
1096 state = NSdigit;
1097 continue;
1098 case NSzerofract:
1099 case NSpoint:
1100 case NSfract:
1101 if(rflag)
1102 c = ~c;
1103 *kp++ = c;
1104 cl++;
1105 nzero = 0;
1106 state = NSfract;
1107 continue;
1108 case NSexpsign:
1109 case NSexp:
1110 case NSexpdigit:
1111 exp = exp*10 + (c - '0');
1112 state = NSexpdigit;
1113 continue;
1114 }
1115 break;
1116 }
1117 if(c == '.') {
1118 switch(state) {
1119 case NSstart:
1120 case NSsign:
1121 state = NSpoint;
1122 continue;
1123 case NSzero:
1124 state = NSzerofract;
1125 continue;
1126 case NSdigit:
1127 state = NSfract;
1128 continue;
1129 }
1130 break;
1131 }
1132 if((f->flags & Gflag) && (c == 'e' || c == 'E')) {
1133 switch(state) {
1134 case NSdigit:
1135 case NSfract:
1136 state = NSexp;
1137 continue;
1138 }
1139 break;
1140 }
1141 break;
1142 }
1143
1144 switch(state) {
1145 /*
1146 * result is zero
1147 */
1148 case NSstart:
1149 case NSsign:
1150 case NSzero:
1151 case NSzerofract:
1152 case NSpoint:
1153 kp = k->key + k->klen;
1154 k->klen += 2;
1155 kp[0] = 0x20; /* between + and - */
1156 kp[1] = 0;
1157 return;
1158 /*
1159 * result has exponent
1160 */
1161 case NSexpsign:
1162 case NSexp:
1163 case NSexpdigit:
1164 if(expsign)
1165 exp = -exp;
1166 dp += exp;
1167
1168 /*
1169 * result is fixed point number
1170 */
1171 case NSdigit:
1172 case NSfract:
1173 kp -= nzero;
1174 cl -= nzero;
1175 break;
1176 }
1177
1178 /*
1179 * end of number
1180 */
1181 c = 0;
1182 if(rflag)
1183 c = ~c;
1184 *kp = c;
1185
1186 /*
1187 * sign and exponent
1188 */
1189 c = 0x30;
1190 if(rflag) {
1191 c = 0x10;
1192 dp = ~dp;
1193 }
1194 kp = k->key + k->klen;
1195 kp[0] = c;
1196 kp[1] = (dp >> 8);
1197 kp[2] = dp;
1198 k->klen = cl+1;
1199 }
1200
1201 void
dokey_m(Key * k,uchar * lp,uchar * lpe,Field * f)1202 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f)
1203 {
1204 uchar *kp;
1205 Rune r, place[3];
1206 int c, cl, pc;
1207 int rflag;
1208
1209 rflag = f->flags&Rflag;
1210 pc = 0;
1211
1212 cl = k->klen;
1213 kp = k->key + cl;
1214
1215 for(;;) {
1216 /*
1217 * get the character
1218 */
1219 if(lp >= lpe)
1220 break;
1221 c = *lp;
1222 if(c >= Runeself) {
1223 lp += chartorune(&r, (char*)lp);
1224 c = r;
1225 } else
1226 lp++;
1227
1228 if(c < nelem(f->mapto)) {
1229 c = f->mapto[c];
1230 if(c == 0)
1231 continue;
1232 }
1233 place[pc++] = c;
1234 if(pc < 3)
1235 continue;
1236 for(c=11; c>=0; c--)
1237 if(memcmp(month[c], place, sizeof(place)) == 0)
1238 break;
1239 c += 10;
1240 if(rflag)
1241 c = ~c;
1242 *kp++ = c;
1243 cl++;
1244 break;
1245 }
1246
1247 c = 0;
1248 if(rflag)
1249 c = ~c;
1250 *kp = c;
1251 k->klen = cl+1;
1252 }
1253
1254 void
dokey_dfi(Key * k,uchar * lp,uchar * lpe,Field * f)1255 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f)
1256 {
1257 uchar *kp;
1258 Rune r;
1259 int c, cl, n, rflag;
1260
1261 cl = k->klen;
1262 kp = k->key + cl;
1263 rflag = f->flags & Rflag;
1264
1265 for(;;) {
1266 /*
1267 * get the character
1268 */
1269 if(lp >= lpe)
1270 break;
1271 c = *lp;
1272 if(c >= Runeself) {
1273 lp += chartorune(&r, (char*)lp);
1274 c = r;
1275 } else
1276 lp++;
1277
1278 /*
1279 * do the various mappings.
1280 * the common case is handled
1281 * completely by the table.
1282 */
1283 if(c != 0 && c < Runeself) {
1284 c = f->mapto[c];
1285 if(c) {
1286 *kp++ = c;
1287 cl++;
1288 }
1289 continue;
1290 }
1291
1292 /*
1293 * for characters out of range,
1294 * the table does not do Rflag.
1295 * ignore is based on mapto[nelem(f->mapto)-1]
1296 */
1297 if(c != 0 && c < nelem(f->mapto)) {
1298 c = f->mapto[c];
1299 if(c == 0)
1300 continue;
1301 } else {
1302 if(f->mapto[nelem(f->mapto)-1] == 0)
1303 continue;
1304 /*
1305 * consider building maps as necessary
1306 */
1307 if(f->flags & Fflag)
1308 c = tolowerrune(tobaserune(c));
1309 if(f->flags & Dflag && !isalpharune(c) &&
1310 !isdigitrune(c) && !isspacerune(c))
1311 continue;
1312 if((f->flags & Wflag) && isspacerune(c))
1313 continue;
1314 }
1315
1316 /*
1317 * put it in the key
1318 */
1319 r = c;
1320 n = runetochar((char*)kp, &r);
1321 kp += n;
1322 cl += n;
1323 if(rflag)
1324 while(n > 0) {
1325 kp[-n] = ~kp[-n];
1326 n--;
1327 }
1328 }
1329
1330 /*
1331 * end of key
1332 */
1333 k->klen = cl+1;
1334 if(rflag) {
1335 *kp = ~0;
1336 return;
1337 }
1338 *kp = 0;
1339 }
1340
1341 void
dokey_r(Key * k,uchar * lp,uchar * lpe,Field *)1342 dokey_r(Key *k, uchar *lp, uchar *lpe, Field*)
1343 {
1344 int cl, n;
1345 uchar *kp;
1346
1347 n = lpe - lp;
1348 if(n < 0)
1349 n = 0;
1350 cl = k->klen;
1351 kp = k->key + cl;
1352 k->klen = cl+n+1;
1353
1354 lpe -= 3;
1355 while(lp < lpe) {
1356 kp[0] = ~lp[0];
1357 kp[1] = ~lp[1];
1358 kp[2] = ~lp[2];
1359 kp[3] = ~lp[3];
1360 kp += 4;
1361 lp += 4;
1362 }
1363
1364 lpe += 3;
1365 while(lp < lpe)
1366 *kp++ = ~*lp++;
1367 *kp = ~0;
1368 }
1369
1370 void
dokey_(Key * k,uchar * lp,uchar * lpe,Field *)1371 dokey_(Key *k, uchar *lp, uchar *lpe, Field*)
1372 {
1373 int n, cl;
1374 uchar *kp;
1375
1376 n = lpe - lp;
1377 if(n < 0)
1378 n = 0;
1379 cl = k->klen;
1380 kp = k->key + cl;
1381 k->klen = cl+n+1;
1382 memmove(kp, lp, n);
1383 kp[n] = 0;
1384 }
1385
1386 void
buildkey(Line * l)1387 buildkey(Line *l)
1388 {
1389 Key *k;
1390 uchar *lp, *lpe;
1391 int ll, kl, cl, i, n;
1392 Field *f;
1393
1394 ll = l->llen - 1;
1395 kl = 0; /* allocated length */
1396 cl = 0; /* current length */
1397 k = 0;
1398
1399 for(i=1; i<=args.nfield; i++) {
1400 f = &args.field[i];
1401 lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0);
1402 if(lp == 0)
1403 lp = l->line + ll;
1404 lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1);
1405 if(lpe == 0)
1406 lpe = l->line + ll;
1407 n = (lpe - lp) + 1;
1408 if(n <= 0)
1409 n = 1;
1410 if(cl+(n+4) > kl) {
1411 kl = cl+(n+4);
1412 k = realloc(k, sizeof(Key) +
1413 (kl-1)*sizeof(k->key[0]));
1414 if(k == 0)
1415 nomem();
1416 }
1417 k->klen = cl;
1418 (*f->dokey)(k, lp, lpe, f);
1419 cl = k->klen;
1420 }
1421
1422 /*
1423 * global comparisons
1424 */
1425 if(!(args.uflag && cl > 0)) {
1426 f = &args.field[0];
1427 if(cl+(ll+4) > kl) {
1428 kl = cl+(ll+4);
1429 k = realloc(k, sizeof(Key) +
1430 (kl-1)*sizeof(k->key[0]));
1431 if(k == 0)
1432 nomem();
1433 }
1434 k->klen = cl;
1435 (*f->dokey)(k, l->line, l->line+ll, f);
1436 cl = k->klen;
1437 }
1438
1439 l->key = k;
1440 k->klen = cl;
1441
1442 if(args.vflag) {
1443 if(write(2, l->line, l->llen) != l->llen)
1444 exits("write");
1445 for(i=0; i<k->klen; i++) {
1446 fprint(2, " %.2x", k->key[i]);
1447 if(k->key[i] == 0x00 || k->key[i] == 0xff)
1448 fprint(2, "\n");
1449 }
1450 }
1451 }
1452
1453 void
makemapm(Field * f)1454 makemapm(Field *f)
1455 {
1456 int i, c;
1457
1458 for(i=0; i<nelem(f->mapto); i++) {
1459 c = 1;
1460 if(i == ' ' || i == '\t')
1461 c = 0;
1462 if(i >= 'a' && i <= 'z')
1463 c = i + ('A' - 'a');
1464 if(i >= 'A' && i <= 'Z')
1465 c = i;
1466 f->mapto[i] = c;
1467 if(args.vflag) {
1468 if((i & 15) == 0)
1469 fprint(2, " ");
1470 fprint(2, " %.2x", c);
1471 if((i & 15) == 15)
1472 fprint(2, "\n");
1473 }
1474 }
1475 }
1476
1477 void
makemapd(Field * f)1478 makemapd(Field *f)
1479 {
1480 int i, c;
1481
1482 for(i=0; i<nelem(f->mapto); i++) {
1483 c = i;
1484 if(f->flags & Iflag)
1485 if(c < 040 || c > 0176)
1486 c = -1;
1487 if((f->flags & Wflag) && c >= 0)
1488 if(c == ' ' || c == '\t')
1489 c = -1;
1490 if((f->flags & Dflag) && c >= 0)
1491 if(!(c == ' ' || c == '\t' ||
1492 (c >= 'a' && c <= 'z') ||
1493 (c >= 'A' && c <= 'Z') ||
1494 (c >= '0' && c <= '9'))){
1495 if(!isupperrune(c = toupperrune(c)))
1496 c = -1;
1497 }
1498 if((f->flags & Fflag) && c >= 0)
1499 c = toupperrune(tobaserune(c));
1500 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself)
1501 c = ~c & 0xff;
1502 if(c < 0)
1503 c = 0;
1504 f->mapto[i] = c;
1505 if(args.vflag) {
1506 if((i & 15) == 0)
1507 fprint(2, " ");
1508 fprint(2, " %.2x", c);
1509 if((i & 15) == 15)
1510 fprint(2, "\n");
1511 }
1512 }
1513 }
1514
1515 Rune* month[12] =
1516 {
1517 L"JAN",
1518 L"FEB",
1519 L"MAR",
1520 L"APR",
1521 L"MAY",
1522 L"JUN",
1523 L"JUL",
1524 L"AUG",
1525 L"SEP",
1526 L"OCT",
1527 L"NOV",
1528 L"DEC",
1529 };
1530
1531 /************** radix sort ***********/
1532
1533 enum
1534 {
1535 Threshold = 14,
1536 };
1537
1538 void rsort4(Key***, ulong, int);
1539 void bsort4(Key***, ulong, int);
1540
1541 void
sort4(void * a,ulong n)1542 sort4(void *a, ulong n)
1543 {
1544 if(n > Threshold)
1545 rsort4((Key***)a, n, 0);
1546 else
1547 bsort4((Key***)a, n, 0);
1548 }
1549
1550 void
rsort4(Key *** a,ulong n,int b)1551 rsort4(Key ***a, ulong n, int b)
1552 {
1553 Key ***ea, ***t, ***u, **t1, **u1, *k;
1554 Key ***part[257];
1555 static long count[257];
1556 long clist[257+257], *cp, *cp1;
1557 int c, lowc, higc;
1558
1559 /*
1560 * pass 1 over all keys,
1561 * count the number of each key[b].
1562 * find low count and high count.
1563 */
1564 lowc = 256;
1565 higc = 0;
1566 ea = a+n;
1567 for(t=a; t<ea; t++) {
1568 k = **t;
1569 n = k->klen;
1570 if(b >= n) {
1571 count[256]++;
1572 continue;
1573 }
1574 c = k->key[b];
1575 n = count[c]++;
1576 if(n == 0) {
1577 if(c < lowc)
1578 lowc = c;
1579 if(c > higc)
1580 higc = c;
1581 }
1582 }
1583
1584 /*
1585 * pass 2 over all counts,
1586 * put partition pointers in part[c].
1587 * save compacted indexes and counts
1588 * in clist[].
1589 */
1590 t = a;
1591 n = count[256];
1592 clist[0] = n;
1593 part[256] = t;
1594 t += n;
1595
1596 cp1 = clist+1;
1597 cp = count+lowc;
1598 for(c=lowc; c<=higc; c++,cp++) {
1599 n = *cp;
1600 if(n) {
1601 cp1[0] = n;
1602 cp1[1] = c;
1603 cp1 += 2;
1604 part[c] = t;
1605 t += n;
1606 }
1607 }
1608 *cp1 = 0;
1609
1610 /*
1611 * pass 3 over all counts.
1612 * chase lowest pointer in each partition
1613 * around a permutation until it comes
1614 * back and is stored where it started.
1615 * static array, count[], should be
1616 * reduced to zero entries except maybe
1617 * count[256].
1618 */
1619 for(cp1=clist+1; cp1[0]; cp1+=2) {
1620 c = cp1[1];
1621 cp = count+c;
1622 while(*cp) {
1623 t1 = *part[c];
1624 for(;;) {
1625 k = *t1;
1626 n = 256;
1627 if(b < k->klen)
1628 n = k->key[b];
1629 u = part[n]++;
1630 count[n]--;
1631 u1 = *u;
1632 *u = t1;
1633 if(n == c)
1634 break;
1635 t1 = u1;
1636 }
1637 }
1638 }
1639
1640 /*
1641 * pass 4 over all partitions.
1642 * call recursively.
1643 */
1644 b++;
1645 t = a + clist[0];
1646 count[256] = 0;
1647 for(cp1=clist+1; n=cp1[0]; cp1+=2) {
1648 if(n > Threshold)
1649 rsort4(t, n, b);
1650 else
1651 if(n > 1)
1652 bsort4(t, n, b);
1653 t += n;
1654 }
1655 }
1656
1657 /*
1658 * bubble sort to pick up
1659 * the pieces.
1660 */
1661 void
bsort4(Key *** a,ulong n,int b)1662 bsort4(Key ***a, ulong n, int b)
1663 {
1664 Key ***i, ***j, ***k, ***l, **t;
1665 Key *ka, *kb;
1666 int n1, n2;
1667
1668 l = a+n;
1669 j = a;
1670
1671 loop:
1672 i = j;
1673 j++;
1674 if(j >= l)
1675 return;
1676
1677 ka = **i;
1678 kb = **j;
1679 n1 = ka->klen - b;
1680 n2 = kb->klen - b;
1681 if(n1 > n2)
1682 n1 = n2;
1683 if(n1 <= 0)
1684 goto loop;
1685 n2 = ka->key[b] - kb->key[b];
1686 if(n2 == 0)
1687 n2 = memcmp(ka->key+b, kb->key+b, n1);
1688 if(n2 <= 0)
1689 goto loop;
1690
1691 for(;;) {
1692 k = i+1;
1693
1694 t = *k;
1695 *k = *i;
1696 *i = t;
1697
1698 if(i <= a)
1699 goto loop;
1700
1701 i--;
1702 ka = **i;
1703 kb = *t;
1704 n1 = ka->klen - b;
1705 n2 = kb->klen - b;
1706 if(n1 > n2)
1707 n1 = n2;
1708 if(n1 <= 0)
1709 goto loop;
1710 n2 = ka->key[b] - kb->key[b];
1711 if(n2 == 0)
1712 n2 = memcmp(ka->key+b, kb->key+b, n1);
1713 if(n2 <= 0)
1714 goto loop;
1715 }
1716 }
1717