1 /* $OpenBSD: file.c,v 1.20 2015/04/03 10:37:24 tobias Exp $ */ 2 3 /*- 4 * Copyright (C) 2009 Gabor Kovesdan <gabor@FreeBSD.org> 5 * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/mman.h> 31 #include <sys/stat.h> 32 #include <sys/types.h> 33 #include <sys/queue.h> 34 35 #include <err.h> 36 #include <fcntl.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <signal.h> 40 #include <string.h> 41 #include <unistd.h> 42 #include <wchar.h> 43 #include <wctype.h> 44 45 #include "coll.h" 46 #include "file.h" 47 #include "radixsort.h" 48 49 unsigned long long available_free_memory = 1000000; 50 51 bool use_mmap; 52 53 const char *tmpdir = "/var/tmp"; 54 const char *compress_program; 55 56 size_t max_open_files = 16; 57 58 /* 59 * How much space we read from file at once 60 */ 61 #define READ_CHUNK 4096 62 63 /* 64 * File reader structure 65 */ 66 struct file_reader { 67 struct reader_buffer rb; 68 FILE *file; 69 char *fname; 70 unsigned char *buffer; 71 unsigned char *mmapaddr; 72 unsigned char *mmapptr; 73 size_t bsz; 74 size_t cbsz; 75 size_t mmapsize; 76 size_t strbeg; 77 char elsymb; 78 }; 79 80 /* 81 * Structure to be used in file merge process. 82 */ 83 struct file_header { 84 struct file_reader *fr; 85 struct sort_list_item *si; /* current top line */ 86 size_t file_pos; 87 }; 88 89 /* 90 * List elements of "cleanable" files list. 91 */ 92 struct CLEANABLE_FILE { 93 char *fn; 94 LIST_ENTRY(CLEANABLE_FILE) files; 95 }; 96 97 /* 98 * List header of "cleanable" files list. 99 */ 100 static LIST_HEAD(CLEANABLE_FILES, CLEANABLE_FILE) tmp_files; 101 102 /* 103 * Init tmp files list 104 */ 105 void 106 init_tmp_files(void) 107 { 108 LIST_INIT(&tmp_files); 109 } 110 111 /* 112 * Save name of a tmp file for signal cleanup 113 */ 114 void 115 tmp_file_atexit(const char *tmp_file) 116 { 117 struct CLEANABLE_FILE *item; 118 sigset_t mask, oldmask; 119 120 item = sort_malloc(sizeof(struct CLEANABLE_FILE)); 121 item->fn = sort_strdup(tmp_file); 122 123 sigfillset(&mask); 124 sigprocmask(SIG_BLOCK, &mask, &oldmask); 125 LIST_INSERT_HEAD(&tmp_files, item, files); 126 sigprocmask(SIG_SETMASK, &oldmask, NULL); 127 } 128 129 /* 130 * Clear tmp files 131 */ 132 void 133 clear_tmp_files(void) 134 { 135 struct CLEANABLE_FILE *item; 136 137 LIST_FOREACH(item, &tmp_files, files) { 138 if (item != NULL && item->fn != NULL) 139 unlink(item->fn); 140 } 141 } 142 143 /* 144 * Check whether a file is a temporary file 145 */ 146 static bool 147 file_is_tmp(const char *fn) 148 { 149 struct CLEANABLE_FILE *item; 150 151 LIST_FOREACH(item, &tmp_files, files) { 152 if (item->fn != NULL && strcmp(item->fn, fn) == 0) 153 return true; 154 } 155 156 return false; 157 } 158 159 /* 160 * Generate new temporary file name 161 */ 162 char * 163 new_tmp_file_name(void) 164 { 165 char *ret; 166 int fd; 167 168 sort_asprintf(&ret, "%s/.bsdsort.XXXXXXXXXX", tmpdir); 169 if ((fd = mkstemp(ret)) == -1) 170 err(2, "%s", ret); 171 close(fd); 172 tmp_file_atexit(ret); 173 return ret; 174 } 175 176 /* 177 * Initialize file list 178 */ 179 void 180 file_list_init(struct file_list *fl, bool tmp) 181 { 182 fl->count = 0; 183 fl->sz = 0; 184 fl->fns = NULL; 185 fl->tmp = tmp; 186 } 187 188 /* 189 * Add a file name to the list 190 */ 191 void 192 file_list_add(struct file_list *fl, char *fn, bool allocate) 193 { 194 if (fl->count >= fl->sz) { 195 fl->fns = sort_reallocarray(fl->fns, 196 fl->sz ? fl->sz : (fl->sz = 1), 2 * sizeof(char *)); 197 fl->sz *= 2; 198 } 199 fl->fns[fl->count] = allocate ? sort_strdup(fn) : fn; 200 fl->count += 1; 201 } 202 203 /* 204 * Populate file list from array of file names 205 */ 206 void 207 file_list_populate(struct file_list *fl, int argc, char **argv, bool allocate) 208 { 209 int i; 210 211 for (i = 0; i < argc; i++) 212 file_list_add(fl, argv[i], allocate); 213 } 214 215 /* 216 * Clean file list data and delete the files, 217 * if this is a list of temporary files 218 */ 219 void 220 file_list_clean(struct file_list *fl) 221 { 222 if (fl->fns) { 223 size_t i; 224 225 for (i = 0; i < fl->count; i++) { 226 if (fl->fns[i]) { 227 if (fl->tmp) 228 unlink(fl->fns[i]); 229 sort_free(fl->fns[i]); 230 fl->fns[i] = NULL; 231 } 232 } 233 sort_free(fl->fns); 234 fl->fns = NULL; 235 } 236 fl->sz = 0; 237 fl->count = 0; 238 fl->tmp = false; 239 } 240 241 /* 242 * Init sort list 243 */ 244 void 245 sort_list_init(struct sort_list *l) 246 { 247 l->count = 0; 248 l->size = 0; 249 l->memsize = sizeof(struct sort_list); 250 l->list = NULL; 251 } 252 253 /* 254 * Add string to sort list 255 */ 256 void 257 sort_list_add(struct sort_list *l, struct bwstring *str) 258 { 259 size_t indx = l->count; 260 261 if ((l->list == NULL) || (indx >= l->size)) { 262 size_t newsize = (l->size + 1) + 1024; 263 264 l->list = sort_reallocarray(l->list, newsize, 265 sizeof(struct sort_list_item *)); 266 l->memsize += (newsize - l->size) * 267 sizeof(struct sort_list_item *); 268 l->size = newsize; 269 } 270 l->list[indx] = sort_list_item_alloc(); 271 sort_list_item_set(l->list[indx], str); 272 l->memsize += sort_list_item_size(l->list[indx]); 273 l->count += 1; 274 } 275 276 /* 277 * Clean sort list data 278 */ 279 void 280 sort_list_clean(struct sort_list *l) 281 { 282 if (l->list) { 283 size_t i; 284 285 for (i = 0; i < l->count; i++) { 286 struct sort_list_item *item; 287 288 item = l->list[i]; 289 290 if (item) { 291 sort_list_item_clean(item); 292 sort_free(item); 293 l->list[i] = NULL; 294 } 295 } 296 sort_free(l->list); 297 l->list = NULL; 298 } 299 l->count = 0; 300 l->size = 0; 301 l->memsize = sizeof(struct sort_list); 302 } 303 304 /* 305 * Write sort list to file 306 */ 307 void 308 sort_list_dump(struct sort_list *l, const char *fn) 309 { 310 FILE *f; 311 312 f = openfile(fn, "w"); 313 if (f == NULL) 314 err(2, "%s", fn); 315 316 if (l->list) { 317 size_t i; 318 319 if (!sort_opts_vals.uflag) { 320 for (i = 0; i < l->count; ++i) 321 bwsfwrite(l->list[i]->str, f, 322 sort_opts_vals.zflag); 323 } else { 324 struct sort_list_item *last_printed_item = NULL; 325 struct sort_list_item *item; 326 for (i = 0; i < l->count; ++i) { 327 item = l->list[i]; 328 if ((last_printed_item == NULL) || 329 list_coll(&last_printed_item, &item)) { 330 bwsfwrite(item->str, f, sort_opts_vals.zflag); 331 last_printed_item = item; 332 } 333 } 334 } 335 } 336 337 closefile(f, fn); 338 } 339 340 /* 341 * Checks if the given file is sorted. Stops at the first disorder, 342 * prints the disordered line and returns 1. 343 */ 344 int 345 check(const char *fn) 346 { 347 struct bwstring *s1, *s2; 348 struct file_reader *fr; 349 struct keys_array *ka1, *ka2; 350 int res; 351 size_t pos, posdisorder; 352 353 s1 = s2 = NULL; 354 ka1 = ka2 = NULL; 355 356 fr = file_reader_init(fn); 357 358 res = 0; 359 pos = 1; 360 posdisorder = 1; 361 362 if (fr == NULL) 363 err(2, "%s", fn); 364 365 s1 = file_reader_readline(fr); 366 if (s1 == NULL) 367 goto end; 368 369 ka1 = keys_array_alloc(); 370 preproc(s1, ka1); 371 372 s2 = file_reader_readline(fr); 373 if (s2 == NULL) 374 goto end; 375 376 ka2 = keys_array_alloc(); 377 preproc(s2, ka2); 378 379 for (;;) { 380 381 if (debug_sort) { 382 bwsprintf(stdout, s2, "s1=<", ">"); 383 bwsprintf(stdout, s1, "s2=<", ">"); 384 } 385 int cmp = key_coll(ka2, ka1, 0); 386 if (debug_sort) 387 printf("; cmp1=%d", cmp); 388 389 if (!cmp && sort_opts_vals.complex_sort && 390 !(sort_opts_vals.uflag) && !(sort_opts_vals.sflag)) { 391 cmp = top_level_str_coll(s2, s1); 392 if (debug_sort) 393 printf("; cmp2=%d", cmp); 394 } 395 if (debug_sort) 396 printf("\n"); 397 398 if ((sort_opts_vals.uflag && (cmp <= 0)) || (cmp < 0)) { 399 if (!(sort_opts_vals.csilentflag)) { 400 bws_disorder_warnx(s2, fn, posdisorder); 401 posdisorder = pos; 402 if (debug_sort) 403 bws_disorder_warnx(s1, fn, posdisorder); 404 } 405 res = 1; 406 goto end; 407 } 408 409 pos++; 410 411 clean_keys_array(s1, ka1); 412 sort_free(ka1); 413 ka1 = ka2; 414 ka2 = NULL; 415 416 bwsfree(s1); 417 s1 = s2; 418 419 s2 = file_reader_readline(fr); 420 if (s2 == NULL) 421 goto end; 422 423 ka2 = keys_array_alloc(); 424 preproc(s2, ka2); 425 } 426 427 end: 428 if (ka1) { 429 clean_keys_array(s1, ka1); 430 sort_free(ka1); 431 } 432 433 bwsfree(s1); 434 435 if (ka2) { 436 clean_keys_array(s2, ka2); 437 sort_free(ka2); 438 } 439 440 bwsfree(s2); 441 442 if (fn == NULL || *fn == 0 || strcmp(fn, "-") == 0) { 443 for (;;) { 444 s2 = file_reader_readline(fr); 445 if (s2 == NULL) 446 break; 447 bwsfree(s2); 448 } 449 } 450 451 file_reader_free(fr); 452 453 return res; 454 } 455 456 /* 457 * Opens a file. If the given filename is "-", stdout will be 458 * opened. 459 */ 460 FILE * 461 openfile(const char *fn, const char *mode) 462 { 463 FILE *file; 464 465 if (strcmp(fn, "-") == 0) 466 return (mode[0] == 'r') ? stdin : stdout; 467 468 if (file_is_tmp(fn) && (compress_program != NULL)) { 469 char *cmd; 470 471 fflush(stdout); 472 473 if (mode[0] == 'r') 474 sort_asprintf(&cmd, "%s -d < %s", 475 compress_program, fn); 476 else if (mode[0] == 'w') 477 sort_asprintf(&cmd, "%s > %s", 478 compress_program, fn); 479 else 480 err(2, "invalid file mode"); 481 482 if ((file = popen(cmd, mode)) == NULL) 483 err(2, "%s", compress_program); 484 485 sort_free(cmd); 486 487 } else if ((file = fopen(fn, mode)) == NULL) 488 err(2, "%s", fn); 489 490 return file; 491 } 492 493 /* 494 * Close file 495 */ 496 void 497 closefile(FILE *f, const char *fn) 498 { 499 if (f == NULL) { 500 ; 501 } else if (f == stdin) { 502 ; 503 } else if (f == stdout) { 504 fflush(f); 505 } else { 506 if (file_is_tmp(fn) && compress_program != NULL) { 507 if (pclose(f) < 0) 508 err(2, NULL); 509 } else 510 fclose(f); 511 } 512 } 513 514 /* 515 * Reads a file into the internal buffer. 516 */ 517 struct file_reader * 518 file_reader_init(const char *fsrc) 519 { 520 struct file_reader *ret; 521 522 if (fsrc == NULL) 523 fsrc = "-"; 524 525 ret = sort_calloc(1, sizeof(struct file_reader)); 526 527 ret->elsymb = '\n'; 528 if (sort_opts_vals.zflag) 529 ret->elsymb = 0; 530 531 ret->fname = sort_strdup(fsrc); 532 533 if (strcmp(fsrc, "-") && (compress_program == NULL) && use_mmap) { 534 struct stat stat_buf; 535 void *addr; 536 size_t sz = 0; 537 int fd; 538 539 fd = open(fsrc, O_RDONLY); 540 if (fd < 0) 541 err(2, "%s", fsrc); 542 543 if (fstat(fd, &stat_buf) < 0) 544 err(2, "%s", fsrc); 545 sz = stat_buf.st_size; 546 547 addr = mmap(NULL, sz, PROT_READ, 0, fd, 0); 548 close(fd); 549 if (addr != MAP_FAILED) { 550 ret->mmapaddr = addr; 551 ret->mmapsize = sz; 552 ret->mmapptr = ret->mmapaddr; 553 posix_madvise(addr, sz, POSIX_MADV_SEQUENTIAL); 554 } 555 } 556 557 if (ret->mmapaddr == NULL) { 558 ret->file = openfile(fsrc, "r"); 559 if (ret->file == NULL) 560 err(2, "%s", fsrc); 561 562 if (strcmp(fsrc, "-")) { 563 ret->cbsz = READ_CHUNK; 564 ret->buffer = sort_malloc(ret->cbsz); 565 ret->bsz = 0; 566 ret->strbeg = 0; 567 568 ret->bsz = fread(ret->buffer, 1, ret->cbsz, ret->file); 569 if (ret->bsz == 0) { 570 if (ferror(ret->file)) 571 err(2, NULL); 572 } 573 } 574 } 575 576 return ret; 577 } 578 579 struct bwstring * 580 file_reader_readline(struct file_reader *fr) 581 { 582 struct bwstring *ret = NULL; 583 584 if (fr->mmapaddr) { 585 unsigned char *mmapend; 586 587 mmapend = fr->mmapaddr + fr->mmapsize; 588 if (fr->mmapptr >= mmapend) 589 return NULL; 590 else { 591 unsigned char *strend; 592 size_t sz; 593 594 sz = mmapend - fr->mmapptr; 595 strend = memchr(fr->mmapptr, fr->elsymb, sz); 596 597 if (strend == NULL) { 598 ret = bwscsbdup(fr->mmapptr, sz); 599 fr->mmapptr = mmapend; 600 } else { 601 ret = bwscsbdup(fr->mmapptr, strend - 602 fr->mmapptr); 603 fr->mmapptr = strend + 1; 604 } 605 } 606 607 } else if (fr->file != stdin) { 608 unsigned char *strend; 609 size_t bsz1, remsz, search_start; 610 611 search_start = 0; 612 remsz = 0; 613 strend = NULL; 614 615 if (fr->bsz > fr->strbeg) 616 remsz = fr->bsz - fr->strbeg; 617 618 /* line read cycle */ 619 for (;;) { 620 if (remsz > search_start) 621 strend = memchr(fr->buffer + fr->strbeg + 622 search_start, fr->elsymb, remsz - 623 search_start); 624 else 625 strend = NULL; 626 627 if (strend) 628 break; 629 if (feof(fr->file)) 630 break; 631 632 if (fr->bsz != fr->cbsz) 633 /* NOTREACHED */ 634 err(2, "File read software error 1"); 635 636 if (remsz > (READ_CHUNK >> 1)) { 637 search_start = fr->cbsz - fr->strbeg; 638 fr->cbsz += READ_CHUNK; 639 fr->buffer = sort_reallocarray(fr->buffer, 640 1, fr->cbsz); 641 bsz1 = fread(fr->buffer + fr->bsz, 1, 642 READ_CHUNK, fr->file); 643 if (bsz1 == 0) { 644 if (ferror(fr->file)) 645 err(2, NULL); 646 break; 647 } 648 fr->bsz += bsz1; 649 remsz += bsz1; 650 } else { 651 if (remsz > 0 && fr->strbeg > 0) { 652 memmove(fr->buffer, 653 fr->buffer + fr->strbeg, remsz); 654 } 655 fr->strbeg = 0; 656 search_start = remsz; 657 bsz1 = fread(fr->buffer + remsz, 1, 658 fr->cbsz - remsz, fr->file); 659 if (bsz1 == 0) { 660 if (ferror(fr->file)) 661 err(2, NULL); 662 break; 663 } 664 fr->bsz = remsz + bsz1; 665 remsz = fr->bsz; 666 } 667 } 668 669 if (strend == NULL) 670 strend = fr->buffer + fr->bsz; 671 672 if ((fr->buffer + fr->strbeg <= strend) && 673 (fr->strbeg < fr->bsz) && (remsz>0)) 674 ret = bwscsbdup(fr->buffer + fr->strbeg, strend - 675 fr->buffer - fr->strbeg); 676 677 fr->strbeg = (strend - fr->buffer) + 1; 678 } else { 679 size_t len = 0; 680 681 ret = bwsfgetln(fr->file, &len, sort_opts_vals.zflag, 682 &(fr->rb)); 683 } 684 685 return ret; 686 } 687 688 static void 689 file_reader_clean(struct file_reader *fr) 690 { 691 if (fr->mmapaddr) 692 munmap(fr->mmapaddr, fr->mmapsize); 693 694 sort_free(fr->buffer); 695 696 if (fr->file) 697 closefile(fr->file, fr->fname); 698 699 sort_free(fr->fname); 700 701 memset(fr, 0, sizeof(struct file_reader)); 702 } 703 704 void 705 file_reader_free(struct file_reader *fr) 706 { 707 file_reader_clean(fr); 708 sort_free(fr); 709 } 710 711 int 712 procfile(const char *fsrc, struct sort_list *list, struct file_list *fl) 713 { 714 struct file_reader *fr; 715 716 fr = file_reader_init(fsrc); 717 if (fr == NULL) 718 err(2, "%s", fsrc); 719 720 /* file browse cycle */ 721 for (;;) { 722 struct bwstring *bws; 723 724 bws = file_reader_readline(fr); 725 726 if (bws == NULL) 727 break; 728 729 sort_list_add(list, bws); 730 731 if (list->memsize >= available_free_memory) { 732 char *fn; 733 734 fn = new_tmp_file_name(); 735 sort_list_to_file(list, fn); 736 file_list_add(fl, fn, false); 737 sort_list_clean(list); 738 } 739 } 740 741 file_reader_free(fr); 742 743 return 0; 744 } 745 746 /* 747 * Compare file headers. Files with EOF always go to the end of the list. 748 */ 749 static int 750 file_header_cmp(struct file_header *f1, struct file_header *f2) 751 { 752 int ret; 753 754 if (f1 == f2) 755 return 0; 756 if (f1->fr == NULL) 757 return (f2->fr == NULL) ? 0 : 1; 758 if (f2->fr == NULL) 759 return -1; 760 761 ret = list_coll(&(f1->si), &(f2->si)); 762 if (!ret) 763 return (f1->file_pos < f2->file_pos) ? -1 : 1; 764 return ret; 765 } 766 767 /* 768 * Allocate and init file header structure 769 */ 770 static void 771 file_header_init(struct file_header **fh, const char *fn, size_t file_pos) 772 { 773 struct bwstring *line; 774 775 *fh = sort_malloc(sizeof(struct file_header)); 776 (*fh)->file_pos = file_pos; 777 (*fh)->fr = file_reader_init(fn); 778 if ((*fh)->fr == NULL) { 779 err(2, "Cannot open %s for reading", 780 strcmp(fn, "-") == 0 ? "stdin" : fn); 781 } 782 line = file_reader_readline((*fh)->fr); 783 if (line == NULL) { 784 file_reader_free((*fh)->fr); 785 (*fh)->fr = NULL; 786 (*fh)->si = NULL; 787 } else { 788 (*fh)->si = sort_list_item_alloc(); 789 sort_list_item_set((*fh)->si, line); 790 } 791 } 792 793 /* 794 * Close file 795 */ 796 static void 797 file_header_close(struct file_header **fh) 798 { 799 if ((*fh)->fr) { 800 file_reader_free((*fh)->fr); 801 (*fh)->fr = NULL; 802 } 803 if ((*fh)->si) { 804 sort_list_item_clean((*fh)->si); 805 sort_free((*fh)->si); 806 (*fh)->si = NULL; 807 } 808 sort_free(*fh); 809 *fh = NULL; 810 } 811 812 /* 813 * Swap two array elements 814 */ 815 static void 816 file_header_swap(struct file_header **fh, size_t i1, size_t i2) 817 { 818 struct file_header *tmp; 819 820 tmp = fh[i1]; 821 fh[i1] = fh[i2]; 822 fh[i2] = tmp; 823 } 824 825 /* heap algorithm ==>> */ 826 827 /* 828 * See heap sort algorithm 829 * "Raises" last element to its right place 830 */ 831 static void 832 file_header_heap_swim(struct file_header **fh, size_t indx) 833 { 834 if (indx > 0) { 835 size_t parent_index; 836 837 parent_index = (indx - 1) >> 1; 838 839 if (file_header_cmp(fh[indx], fh[parent_index]) < 0) { 840 /* swap child and parent and continue */ 841 file_header_swap(fh, indx, parent_index); 842 file_header_heap_swim(fh, parent_index); 843 } 844 } 845 } 846 847 /* 848 * Sink the top element to its correct position 849 */ 850 static void 851 file_header_heap_sink(struct file_header **fh, size_t indx, size_t size) 852 { 853 size_t left_child_index; 854 size_t right_child_index; 855 856 left_child_index = indx + indx + 1; 857 right_child_index = left_child_index + 1; 858 859 if (left_child_index < size) { 860 size_t min_child_index; 861 862 min_child_index = left_child_index; 863 864 if ((right_child_index < size) && 865 (file_header_cmp(fh[left_child_index], 866 fh[right_child_index]) > 0)) 867 min_child_index = right_child_index; 868 if (file_header_cmp(fh[indx], fh[min_child_index]) > 0) { 869 file_header_swap(fh, indx, min_child_index); 870 file_header_heap_sink(fh, min_child_index, size); 871 } 872 } 873 } 874 875 /* <<== heap algorithm */ 876 877 /* 878 * Adds element to the "left" end 879 */ 880 static void 881 file_header_list_rearrange_from_header(struct file_header **fh, size_t size) 882 { 883 file_header_heap_sink(fh, 0, size); 884 } 885 886 /* 887 * Adds element to the "right" end 888 */ 889 static void 890 file_header_list_push(struct file_header *f, struct file_header **fh, size_t size) 891 { 892 fh[size++] = f; 893 file_header_heap_swim(fh, size - 1); 894 } 895 896 struct last_printed 897 { 898 struct bwstring *str; 899 }; 900 901 /* 902 * Prints the current line of the file 903 */ 904 static void 905 file_header_print(struct file_header *fh, FILE *f_out, struct last_printed *lp) 906 { 907 if (sort_opts_vals.uflag) { 908 if ((lp->str == NULL) || (str_list_coll(lp->str, &(fh->si)))) { 909 bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); 910 if (lp->str) 911 bwsfree(lp->str); 912 lp->str = bwsdup(fh->si->str); 913 } 914 } else 915 bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); 916 } 917 918 /* 919 * Read next line 920 */ 921 static void 922 file_header_read_next(struct file_header *fh) 923 { 924 struct bwstring *tmp; 925 926 tmp = file_reader_readline(fh->fr); 927 if (tmp == NULL) { 928 file_reader_free(fh->fr); 929 fh->fr = NULL; 930 if (fh->si) { 931 sort_list_item_clean(fh->si); 932 sort_free(fh->si); 933 fh->si = NULL; 934 } 935 } else { 936 if (fh->si == NULL) 937 fh->si = sort_list_item_alloc(); 938 sort_list_item_set(fh->si, tmp); 939 } 940 } 941 942 /* 943 * Merge array of "files headers" 944 */ 945 static void 946 file_headers_merge(size_t fnum, struct file_header **fh, FILE *f_out) 947 { 948 struct last_printed lp; 949 size_t i; 950 951 memset(&lp, 0, sizeof(lp)); 952 953 /* 954 * construct the initial sort structure 955 */ 956 for (i = 0; i < fnum; i++) 957 file_header_list_push(fh[i], fh, i); 958 959 while (fh[0]->fr) { /* unfinished files are always in front */ 960 /* output the smallest line: */ 961 file_header_print(fh[0], f_out, &lp); 962 /* read a new line, if possible: */ 963 file_header_read_next(fh[0]); 964 /* re-arrange the list: */ 965 file_header_list_rearrange_from_header(fh, fnum); 966 } 967 968 if (lp.str) 969 bwsfree(lp.str); 970 } 971 972 /* 973 * Merges the given files into the output file, which can be 974 * stdout. 975 */ 976 static void 977 merge_files_array(size_t argc, char **argv, const char *fn_out) 978 { 979 struct file_header **fh; 980 FILE *f_out; 981 size_t i; 982 983 f_out = openfile(fn_out, "w"); 984 985 if (f_out == NULL) 986 err(2, "%s", fn_out); 987 988 fh = sort_reallocarray(NULL, argc + 1, sizeof(struct file_header *)); 989 990 for (i = 0; i < argc; i++) 991 file_header_init(fh + i, argv[i], i); 992 993 file_headers_merge(argc, fh, f_out); 994 995 for (i = 0; i < argc; i++) 996 file_header_close(fh + i); 997 998 sort_free(fh); 999 1000 closefile(f_out, fn_out); 1001 } 1002 1003 /* 1004 * Shrinks the file list until its size smaller than max number of opened files 1005 */ 1006 static int 1007 shrink_file_list(struct file_list *fl) 1008 { 1009 struct file_list new_fl; 1010 size_t indx = 0; 1011 1012 if (fl->count < max_open_files) 1013 return 0; 1014 1015 file_list_init(&new_fl, true); 1016 while (indx < fl->count) { 1017 char *fnew; 1018 size_t num; 1019 1020 num = fl->count - indx; 1021 fnew = new_tmp_file_name(); 1022 1023 if (num >= max_open_files) 1024 num = max_open_files - 1; 1025 merge_files_array(num, fl->fns + indx, fnew); 1026 if (fl->tmp) { 1027 size_t i; 1028 1029 for (i = 0; i < num; i++) 1030 unlink(fl->fns[indx + i]); 1031 } 1032 file_list_add(&new_fl, fnew, false); 1033 indx += num; 1034 } 1035 fl->tmp = false; /* already taken care of */ 1036 file_list_clean(fl); 1037 1038 fl->count = new_fl.count; 1039 fl->fns = new_fl.fns; 1040 fl->sz = new_fl.sz; 1041 fl->tmp = new_fl.tmp; 1042 1043 return 1; 1044 } 1045 1046 /* 1047 * Merge list of files 1048 */ 1049 void 1050 merge_files(struct file_list *fl, const char *fn_out) 1051 { 1052 while (shrink_file_list(fl)) 1053 ; 1054 1055 merge_files_array(fl->count, fl->fns, fn_out); 1056 } 1057 1058 static const char * 1059 get_sort_method_name(int sm) 1060 { 1061 if (sm == SORT_MERGESORT) 1062 return "mergesort"; 1063 else if (sort_opts_vals.sort_method == SORT_RADIXSORT) 1064 return "radixsort"; 1065 else if (sort_opts_vals.sort_method == SORT_HEAPSORT) 1066 return "heapsort"; 1067 else 1068 return "quicksort"; 1069 } 1070 1071 /* 1072 * Sort list of lines and writes it to the file 1073 */ 1074 void 1075 sort_list_to_file(struct sort_list *list, const char *outfile) 1076 { 1077 struct sort_mods *sm = &(keys[0].sm); 1078 1079 if (!sm->Mflag && !sm->Rflag && !sm->Vflag && 1080 !sm->gflag && !sm->hflag && !sm->nflag) { 1081 if ((sort_opts_vals.sort_method == SORT_DEFAULT) && byte_sort) 1082 sort_opts_vals.sort_method = SORT_RADIXSORT; 1083 1084 } else if (sort_opts_vals.sort_method == SORT_RADIXSORT) 1085 err(2, "Radix sort cannot be used with these sort options"); 1086 1087 /* 1088 * To handle stable sort and the unique cases in the 1089 * right order, we need to use a stable algorithm. 1090 */ 1091 if (sort_opts_vals.sflag) { 1092 switch (sort_opts_vals.sort_method){ 1093 case SORT_MERGESORT: 1094 break; 1095 case SORT_RADIXSORT: 1096 break; 1097 case SORT_DEFAULT: 1098 sort_opts_vals.sort_method = SORT_MERGESORT; 1099 break; 1100 default: 1101 errx(2, "The chosen sort method cannot be used with " 1102 "stable and/or unique sort"); 1103 }; 1104 } 1105 1106 if (sort_opts_vals.sort_method == SORT_DEFAULT) 1107 sort_opts_vals.sort_method = DEFAULT_SORT_ALGORITHM; 1108 1109 if (debug_sort) 1110 printf("sort_method=%s\n", 1111 get_sort_method_name(sort_opts_vals.sort_method)); 1112 1113 switch (sort_opts_vals.sort_method){ 1114 case SORT_RADIXSORT: 1115 rxsort(list->list, list->count); 1116 break; 1117 case SORT_MERGESORT: 1118 mergesort(list->list, list->count, 1119 sizeof(struct sort_list_item *), list_coll); 1120 break; 1121 case SORT_HEAPSORT: 1122 heapsort(list->list, list->count, 1123 sizeof(struct sort_list_item *), list_coll); 1124 break; 1125 case SORT_QSORT: 1126 qsort(list->list, list->count, 1127 sizeof(struct sort_list_item *), list_coll); 1128 break; 1129 default: 1130 DEFAULT_SORT_FUNC(list->list, list->count, 1131 sizeof(struct sort_list_item *), list_coll); 1132 break; 1133 } 1134 sort_list_dump(list, outfile); 1135 } 1136