1 /* $OpenBSD: file.c,v 1.24 2024/09/20 02:00:46 jsg Exp $ */ 2 3 /*- 4 * Copyright (C) 2009 Gabor Kovesdan <gabor@FreeBSD.org> 5 * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/mman.h> 31 #include <sys/stat.h> 32 #include <sys/types.h> 33 #include <sys/queue.h> 34 35 #include <err.h> 36 #include <fcntl.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <signal.h> 40 #include <string.h> 41 #include <unistd.h> 42 #include <wchar.h> 43 #include <wctype.h> 44 45 #include "coll.h" 46 #include "file.h" 47 #include "radixsort.h" 48 49 unsigned long long available_free_memory = 1000000; 50 51 bool use_mmap; 52 53 const char *tmpdir = "/tmp"; 54 const char *compress_program; 55 56 size_t max_open_files = 16; 57 58 /* 59 * How much space we read from file at once 60 */ 61 #define READ_CHUNK 4096 62 63 /* 64 * File reader structure 65 */ 66 struct file_reader { 67 struct reader_buffer rb; 68 FILE *file; 69 char *fname; 70 unsigned char *buffer; 71 unsigned char *mmapaddr; 72 unsigned char *mmapptr; 73 size_t bsz; 74 size_t cbsz; 75 size_t mmapsize; 76 size_t strbeg; 77 char elsymb; 78 }; 79 80 /* 81 * Structure to be used in file merge process. 82 */ 83 struct file_header { 84 struct file_reader *fr; 85 struct sort_list_item *si; /* current top line */ 86 size_t file_pos; 87 }; 88 89 /* 90 * List elements of "cleanable" files list. 91 */ 92 struct CLEANABLE_FILE { 93 char *fn; 94 LIST_ENTRY(CLEANABLE_FILE) files; 95 }; 96 97 /* 98 * List header of "cleanable" files list. 99 */ 100 static LIST_HEAD(CLEANABLE_FILES, CLEANABLE_FILE) tmp_files; 101 102 /* 103 * Init tmp files list 104 */ 105 void 106 init_tmp_files(void) 107 { 108 LIST_INIT(&tmp_files); 109 } 110 111 /* 112 * Save name of a tmp file for signal cleanup 113 */ 114 void 115 tmp_file_atexit(const char *tmp_file) 116 { 117 struct CLEANABLE_FILE *item; 118 sigset_t mask, oldmask; 119 120 item = sort_malloc(sizeof(struct CLEANABLE_FILE)); 121 item->fn = sort_strdup(tmp_file); 122 123 sigfillset(&mask); 124 sigprocmask(SIG_BLOCK, &mask, &oldmask); 125 LIST_INSERT_HEAD(&tmp_files, item, files); 126 sigprocmask(SIG_SETMASK, &oldmask, NULL); 127 } 128 129 /* 130 * Clear tmp files 131 */ 132 void 133 clear_tmp_files(void) 134 { 135 struct CLEANABLE_FILE *item; 136 137 LIST_FOREACH(item, &tmp_files, files) { 138 if (item != NULL && item->fn != NULL) 139 unlink(item->fn); 140 } 141 } 142 143 /* 144 * Check whether a file is a temporary file 145 */ 146 static bool 147 file_is_tmp(const char *fn) 148 { 149 struct CLEANABLE_FILE *item; 150 151 LIST_FOREACH(item, &tmp_files, files) { 152 if (item->fn != NULL && strcmp(item->fn, fn) == 0) 153 return true; 154 } 155 156 return false; 157 } 158 159 /* 160 * Generate new temporary file name 161 */ 162 char * 163 new_tmp_file_name(void) 164 { 165 char *ret; 166 int fd; 167 168 sort_asprintf(&ret, "%s/.bsdsort.XXXXXXXXXX", tmpdir); 169 if ((fd = mkstemp(ret)) == -1) 170 err(2, "%s", ret); 171 close(fd); 172 tmp_file_atexit(ret); 173 return ret; 174 } 175 176 /* 177 * Initialize file list 178 */ 179 void 180 file_list_init(struct file_list *fl, bool tmp) 181 { 182 fl->count = 0; 183 fl->sz = 0; 184 fl->fns = NULL; 185 fl->tmp = tmp; 186 } 187 188 /* 189 * Add a file name to the list 190 */ 191 void 192 file_list_add(struct file_list *fl, char *fn, bool allocate) 193 { 194 if (fl->count >= fl->sz) { 195 fl->fns = sort_reallocarray(fl->fns, 196 fl->sz ? fl->sz : (fl->sz = 1), 2 * sizeof(char *)); 197 fl->sz *= 2; 198 } 199 fl->fns[fl->count] = allocate ? sort_strdup(fn) : fn; 200 fl->count += 1; 201 } 202 203 /* 204 * Populate file list from array of file names 205 */ 206 void 207 file_list_populate(struct file_list *fl, int argc, char **argv, bool allocate) 208 { 209 int i; 210 211 for (i = 0; i < argc; i++) 212 file_list_add(fl, argv[i], allocate); 213 } 214 215 /* 216 * Clean file list data and delete the files, 217 * if this is a list of temporary files 218 */ 219 void 220 file_list_clean(struct file_list *fl) 221 { 222 if (fl->fns) { 223 size_t i; 224 225 for (i = 0; i < fl->count; i++) { 226 if (fl->fns[i]) { 227 if (fl->tmp) 228 unlink(fl->fns[i]); 229 sort_free(fl->fns[i]); 230 fl->fns[i] = NULL; 231 } 232 } 233 sort_free(fl->fns); 234 fl->fns = NULL; 235 } 236 fl->sz = 0; 237 fl->count = 0; 238 fl->tmp = false; 239 } 240 241 /* 242 * Init sort list 243 */ 244 void 245 sort_list_init(struct sort_list *l) 246 { 247 l->count = 0; 248 l->size = 0; 249 l->memsize = sizeof(struct sort_list); 250 l->list = NULL; 251 } 252 253 /* 254 * Add string to sort list 255 */ 256 void 257 sort_list_add(struct sort_list *l, struct bwstring *str) 258 { 259 size_t indx = l->count; 260 261 if ((l->list == NULL) || (indx >= l->size)) { 262 size_t newsize = (l->size + 1) + 1024; 263 264 l->list = sort_reallocarray(l->list, newsize, 265 sizeof(struct sort_list_item *)); 266 l->memsize += (newsize - l->size) * 267 sizeof(struct sort_list_item *); 268 l->size = newsize; 269 } 270 l->list[indx] = sort_list_item_alloc(); 271 sort_list_item_set(l->list[indx], str); 272 l->memsize += sort_list_item_size(l->list[indx]); 273 l->count += 1; 274 } 275 276 /* 277 * Clean sort list data 278 */ 279 void 280 sort_list_clean(struct sort_list *l) 281 { 282 if (l->list) { 283 size_t i; 284 285 for (i = 0; i < l->count; i++) { 286 struct sort_list_item *item; 287 288 item = l->list[i]; 289 290 if (item) { 291 sort_list_item_clean(item); 292 sort_free(item); 293 l->list[i] = NULL; 294 } 295 } 296 sort_free(l->list); 297 l->list = NULL; 298 } 299 l->count = 0; 300 l->size = 0; 301 l->memsize = sizeof(struct sort_list); 302 } 303 304 /* 305 * Write sort list to file 306 */ 307 void 308 sort_list_dump(struct sort_list *l, const char *fn) 309 { 310 FILE *f; 311 312 f = openfile(fn, "w"); 313 if (f == NULL) 314 err(2, "%s", fn); 315 316 if (l->list) { 317 size_t i; 318 319 if (!sort_opts_vals.uflag) { 320 for (i = 0; i < l->count; ++i) 321 bwsfwrite(l->list[i]->str, f, 322 sort_opts_vals.zflag); 323 } else { 324 struct sort_list_item *last_printed_item = NULL; 325 struct sort_list_item *item; 326 for (i = 0; i < l->count; ++i) { 327 item = l->list[i]; 328 if ((last_printed_item == NULL) || 329 list_coll(&last_printed_item, &item)) { 330 bwsfwrite(item->str, f, sort_opts_vals.zflag); 331 last_printed_item = item; 332 } 333 } 334 } 335 } 336 337 closefile(f, fn); 338 } 339 340 /* 341 * Checks if the given file is sorted. Stops at the first disorder, 342 * prints the disordered line and returns 1. 343 */ 344 int 345 check(const char *fn) 346 { 347 struct bwstring *s1, *s2; 348 struct file_reader *fr; 349 struct keys_array *ka1, *ka2; 350 int res; 351 size_t pos, posdisorder; 352 353 s1 = s2 = NULL; 354 ka1 = ka2 = NULL; 355 356 fr = file_reader_init(fn); 357 358 res = 0; 359 pos = 1; 360 posdisorder = 1; 361 362 if (fr == NULL) 363 err(2, "%s", fn); 364 365 s1 = file_reader_readline(fr); 366 if (s1 == NULL) 367 goto end; 368 369 ka1 = keys_array_alloc(); 370 preproc(s1, ka1); 371 372 s2 = file_reader_readline(fr); 373 if (s2 == NULL) 374 goto end; 375 376 ka2 = keys_array_alloc(); 377 preproc(s2, ka2); 378 379 for (;;) { 380 381 if (debug_sort) { 382 bwsprintf(stdout, s2, "s1=<", ">"); 383 bwsprintf(stdout, s1, "s2=<", ">"); 384 } 385 int cmp = key_coll(ka2, ka1, 0); 386 if (debug_sort) 387 printf("; cmp1=%d", cmp); 388 389 if (!cmp && sort_opts_vals.complex_sort && 390 !(sort_opts_vals.uflag) && !(sort_opts_vals.sflag) && 391 !(sort_opts_vals.kflag)) { 392 cmp = top_level_str_coll(s2, s1); 393 if (debug_sort) 394 printf("; cmp2=%d", cmp); 395 } 396 if (debug_sort) 397 printf("\n"); 398 399 if ((sort_opts_vals.uflag && (cmp <= 0)) || (cmp < 0)) { 400 if (!(sort_opts_vals.csilentflag)) { 401 bws_disorder_warnx(s2, fn, posdisorder); 402 posdisorder = pos; 403 if (debug_sort) 404 bws_disorder_warnx(s1, fn, posdisorder); 405 } 406 res = 1; 407 goto end; 408 } 409 410 pos++; 411 412 clean_keys_array(s1, ka1); 413 sort_free(ka1); 414 ka1 = ka2; 415 ka2 = NULL; 416 417 bwsfree(s1); 418 s1 = s2; 419 420 s2 = file_reader_readline(fr); 421 if (s2 == NULL) 422 goto end; 423 424 ka2 = keys_array_alloc(); 425 preproc(s2, ka2); 426 } 427 428 end: 429 if (ka1) { 430 clean_keys_array(s1, ka1); 431 sort_free(ka1); 432 } 433 434 bwsfree(s1); 435 436 if (ka2) { 437 clean_keys_array(s2, ka2); 438 sort_free(ka2); 439 } 440 441 bwsfree(s2); 442 443 if (fn == NULL || *fn == 0 || strcmp(fn, "-") == 0) { 444 for (;;) { 445 s2 = file_reader_readline(fr); 446 if (s2 == NULL) 447 break; 448 bwsfree(s2); 449 } 450 } 451 452 file_reader_free(fr); 453 454 return res; 455 } 456 457 /* 458 * Opens a file. If the given filename is "-", stdout will be 459 * opened. 460 */ 461 FILE * 462 openfile(const char *fn, const char *mode) 463 { 464 FILE *file; 465 466 if (strcmp(fn, "-") == 0) 467 return (mode[0] == 'r') ? stdin : stdout; 468 469 if (file_is_tmp(fn) && (compress_program != NULL)) { 470 char *cmd; 471 472 fflush(stdout); 473 474 if (mode[0] == 'r') 475 sort_asprintf(&cmd, "%s -d < %s", 476 compress_program, fn); 477 else if (mode[0] == 'w') 478 sort_asprintf(&cmd, "%s > %s", 479 compress_program, fn); 480 else 481 err(2, "invalid file mode"); 482 483 if ((file = popen(cmd, mode)) == NULL) 484 err(2, "%s", compress_program); 485 486 sort_free(cmd); 487 488 } else if ((file = fopen(fn, mode)) == NULL) 489 err(2, "%s", fn); 490 491 return file; 492 } 493 494 /* 495 * Close file 496 */ 497 void 498 closefile(FILE *f, const char *fn) 499 { 500 if (f == NULL) { 501 ; 502 } else if (f == stdin) { 503 ; 504 } else if (f == stdout) { 505 fflush(f); 506 } else { 507 if (file_is_tmp(fn) && compress_program != NULL) { 508 if (pclose(f) < 0) 509 err(2, NULL); 510 } else 511 fclose(f); 512 } 513 } 514 515 /* 516 * Reads a file into the internal buffer. 517 */ 518 struct file_reader * 519 file_reader_init(const char *fsrc) 520 { 521 struct file_reader *ret; 522 523 if (fsrc == NULL) 524 fsrc = "-"; 525 526 ret = sort_calloc(1, sizeof(struct file_reader)); 527 528 ret->elsymb = '\n'; 529 if (sort_opts_vals.zflag) 530 ret->elsymb = 0; 531 532 ret->fname = sort_strdup(fsrc); 533 534 if (strcmp(fsrc, "-") && (compress_program == NULL) && use_mmap) { 535 struct stat stat_buf; 536 void *addr; 537 size_t sz = 0; 538 int fd; 539 540 fd = open(fsrc, O_RDONLY); 541 if (fd < 0) 542 err(2, "%s", fsrc); 543 544 if (fstat(fd, &stat_buf) < 0) 545 err(2, "%s", fsrc); 546 sz = stat_buf.st_size; 547 548 addr = mmap(NULL, sz, PROT_READ, 0, fd, 0); 549 close(fd); 550 if (addr != MAP_FAILED) { 551 ret->mmapaddr = addr; 552 ret->mmapsize = sz; 553 ret->mmapptr = ret->mmapaddr; 554 posix_madvise(addr, sz, POSIX_MADV_SEQUENTIAL); 555 } 556 } 557 558 if (ret->mmapaddr == NULL) { 559 ret->file = openfile(fsrc, "r"); 560 if (ret->file == NULL) 561 err(2, "%s", fsrc); 562 563 if (strcmp(fsrc, "-")) { 564 ret->cbsz = READ_CHUNK; 565 ret->buffer = sort_malloc(ret->cbsz); 566 ret->bsz = 0; 567 ret->strbeg = 0; 568 569 ret->bsz = fread(ret->buffer, 1, ret->cbsz, ret->file); 570 if (ret->bsz == 0) { 571 if (ferror(ret->file)) 572 err(2, NULL); 573 } 574 } 575 } 576 577 return ret; 578 } 579 580 struct bwstring * 581 file_reader_readline(struct file_reader *fr) 582 { 583 struct bwstring *ret = NULL; 584 585 if (fr->mmapaddr) { 586 unsigned char *mmapend; 587 588 mmapend = fr->mmapaddr + fr->mmapsize; 589 if (fr->mmapptr >= mmapend) 590 return NULL; 591 else { 592 unsigned char *strend; 593 size_t sz; 594 595 sz = mmapend - fr->mmapptr; 596 strend = memchr(fr->mmapptr, fr->elsymb, sz); 597 598 if (strend == NULL) { 599 ret = bwscsbdup(fr->mmapptr, sz); 600 fr->mmapptr = mmapend; 601 } else { 602 ret = bwscsbdup(fr->mmapptr, strend - 603 fr->mmapptr); 604 fr->mmapptr = strend + 1; 605 } 606 } 607 608 } else if (fr->file != stdin) { 609 unsigned char *strend; 610 size_t bsz1, remsz, search_start; 611 612 search_start = 0; 613 remsz = 0; 614 strend = NULL; 615 616 if (fr->bsz > fr->strbeg) 617 remsz = fr->bsz - fr->strbeg; 618 619 /* line read cycle */ 620 for (;;) { 621 if (remsz > search_start) 622 strend = memchr(fr->buffer + fr->strbeg + 623 search_start, fr->elsymb, remsz - 624 search_start); 625 else 626 strend = NULL; 627 628 if (strend) 629 break; 630 if (feof(fr->file)) 631 break; 632 633 if (fr->bsz != fr->cbsz) 634 /* NOTREACHED */ 635 err(2, "File read software error 1"); 636 637 if (remsz > (READ_CHUNK >> 1)) { 638 search_start = fr->cbsz - fr->strbeg; 639 fr->cbsz += READ_CHUNK; 640 fr->buffer = sort_reallocarray(fr->buffer, 641 1, fr->cbsz); 642 bsz1 = fread(fr->buffer + fr->bsz, 1, 643 READ_CHUNK, fr->file); 644 if (bsz1 == 0) { 645 if (ferror(fr->file)) 646 err(2, NULL); 647 break; 648 } 649 fr->bsz += bsz1; 650 remsz += bsz1; 651 } else { 652 if (remsz > 0 && fr->strbeg > 0) { 653 memmove(fr->buffer, 654 fr->buffer + fr->strbeg, remsz); 655 } 656 fr->strbeg = 0; 657 search_start = remsz; 658 bsz1 = fread(fr->buffer + remsz, 1, 659 fr->cbsz - remsz, fr->file); 660 if (bsz1 == 0) { 661 if (ferror(fr->file)) 662 err(2, NULL); 663 break; 664 } 665 fr->bsz = remsz + bsz1; 666 remsz = fr->bsz; 667 } 668 } 669 670 if (strend == NULL) 671 strend = fr->buffer + fr->bsz; 672 673 if ((fr->buffer + fr->strbeg <= strend) && 674 (fr->strbeg < fr->bsz) && (remsz>0)) 675 ret = bwscsbdup(fr->buffer + fr->strbeg, strend - 676 fr->buffer - fr->strbeg); 677 678 fr->strbeg = (strend - fr->buffer) + 1; 679 } else { 680 size_t len = 0; 681 682 ret = bwsfgetln(fr->file, &len, sort_opts_vals.zflag, 683 &(fr->rb)); 684 } 685 686 return ret; 687 } 688 689 static void 690 file_reader_clean(struct file_reader *fr) 691 { 692 if (fr->mmapaddr) 693 munmap(fr->mmapaddr, fr->mmapsize); 694 695 sort_free(fr->buffer); 696 697 if (fr->file) 698 closefile(fr->file, fr->fname); 699 700 sort_free(fr->fname); 701 702 memset(fr, 0, sizeof(struct file_reader)); 703 } 704 705 void 706 file_reader_free(struct file_reader *fr) 707 { 708 file_reader_clean(fr); 709 sort_free(fr); 710 } 711 712 int 713 procfile(const char *fsrc, struct sort_list *list, struct file_list *fl) 714 { 715 struct file_reader *fr; 716 717 fr = file_reader_init(fsrc); 718 if (fr == NULL) 719 err(2, "%s", fsrc); 720 721 /* file browse cycle */ 722 for (;;) { 723 struct bwstring *bws; 724 725 bws = file_reader_readline(fr); 726 727 if (bws == NULL) 728 break; 729 730 sort_list_add(list, bws); 731 732 if (list->memsize >= available_free_memory) { 733 char *fn; 734 735 fn = new_tmp_file_name(); 736 sort_list_to_file(list, fn); 737 file_list_add(fl, fn, false); 738 sort_list_clean(list); 739 } 740 } 741 742 file_reader_free(fr); 743 744 return 0; 745 } 746 747 /* 748 * Compare file headers. Files with EOF always go to the end of the list. 749 */ 750 static int 751 file_header_cmp(struct file_header *f1, struct file_header *f2) 752 { 753 int ret; 754 755 if (f1 == f2) 756 return 0; 757 if (f1->fr == NULL) 758 return (f2->fr == NULL) ? 0 : 1; 759 if (f2->fr == NULL) 760 return -1; 761 762 ret = list_coll(&(f1->si), &(f2->si)); 763 if (!ret) 764 return (f1->file_pos < f2->file_pos) ? -1 : 1; 765 return ret; 766 } 767 768 /* 769 * Allocate and init file header structure 770 */ 771 static void 772 file_header_init(struct file_header **fh, const char *fn, size_t file_pos) 773 { 774 struct bwstring *line; 775 776 *fh = sort_malloc(sizeof(struct file_header)); 777 (*fh)->file_pos = file_pos; 778 (*fh)->fr = file_reader_init(fn); 779 if ((*fh)->fr == NULL) { 780 err(2, "Cannot open %s for reading", 781 strcmp(fn, "-") == 0 ? "stdin" : fn); 782 } 783 line = file_reader_readline((*fh)->fr); 784 if (line == NULL) { 785 file_reader_free((*fh)->fr); 786 (*fh)->fr = NULL; 787 (*fh)->si = NULL; 788 } else { 789 (*fh)->si = sort_list_item_alloc(); 790 sort_list_item_set((*fh)->si, line); 791 } 792 } 793 794 /* 795 * Close file 796 */ 797 static void 798 file_header_close(struct file_header **fh) 799 { 800 if ((*fh)->fr) { 801 file_reader_free((*fh)->fr); 802 (*fh)->fr = NULL; 803 } 804 if ((*fh)->si) { 805 sort_list_item_clean((*fh)->si); 806 sort_free((*fh)->si); 807 (*fh)->si = NULL; 808 } 809 sort_free(*fh); 810 *fh = NULL; 811 } 812 813 /* 814 * Swap two array elements 815 */ 816 static void 817 file_header_swap(struct file_header **fh, size_t i1, size_t i2) 818 { 819 struct file_header *tmp; 820 821 tmp = fh[i1]; 822 fh[i1] = fh[i2]; 823 fh[i2] = tmp; 824 } 825 826 /* heap algorithm ==>> */ 827 828 /* 829 * See heap sort algorithm 830 * "Raises" last element to its right place 831 */ 832 static void 833 file_header_heap_swim(struct file_header **fh, size_t indx) 834 { 835 if (indx > 0) { 836 size_t parent_index; 837 838 parent_index = (indx - 1) >> 1; 839 840 if (file_header_cmp(fh[indx], fh[parent_index]) < 0) { 841 /* swap child and parent and continue */ 842 file_header_swap(fh, indx, parent_index); 843 file_header_heap_swim(fh, parent_index); 844 } 845 } 846 } 847 848 /* 849 * Sink the top element to its correct position 850 */ 851 static void 852 file_header_heap_sink(struct file_header **fh, size_t indx, size_t size) 853 { 854 size_t left_child_index; 855 size_t right_child_index; 856 857 left_child_index = indx + indx + 1; 858 right_child_index = left_child_index + 1; 859 860 if (left_child_index < size) { 861 size_t min_child_index; 862 863 min_child_index = left_child_index; 864 865 if ((right_child_index < size) && 866 (file_header_cmp(fh[left_child_index], 867 fh[right_child_index]) > 0)) 868 min_child_index = right_child_index; 869 if (file_header_cmp(fh[indx], fh[min_child_index]) > 0) { 870 file_header_swap(fh, indx, min_child_index); 871 file_header_heap_sink(fh, min_child_index, size); 872 } 873 } 874 } 875 876 /* <<== heap algorithm */ 877 878 /* 879 * Adds element to the "left" end 880 */ 881 static void 882 file_header_list_rearrange_from_header(struct file_header **fh, size_t size) 883 { 884 file_header_heap_sink(fh, 0, size); 885 } 886 887 /* 888 * Adds element to the "right" end 889 */ 890 static void 891 file_header_list_push(struct file_header *f, struct file_header **fh, size_t size) 892 { 893 fh[size++] = f; 894 file_header_heap_swim(fh, size - 1); 895 } 896 897 struct last_printed 898 { 899 struct bwstring *str; 900 }; 901 902 /* 903 * Prints the current line of the file 904 */ 905 static void 906 file_header_print(struct file_header *fh, FILE *f_out, struct last_printed *lp) 907 { 908 if (sort_opts_vals.uflag) { 909 if ((lp->str == NULL) || (str_list_coll(lp->str, &(fh->si)))) { 910 bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); 911 if (lp->str) 912 bwsfree(lp->str); 913 lp->str = bwsdup(fh->si->str); 914 } 915 } else 916 bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); 917 } 918 919 /* 920 * Read next line 921 */ 922 static void 923 file_header_read_next(struct file_header *fh) 924 { 925 struct bwstring *tmp; 926 927 tmp = file_reader_readline(fh->fr); 928 if (tmp == NULL) { 929 file_reader_free(fh->fr); 930 fh->fr = NULL; 931 if (fh->si) { 932 sort_list_item_clean(fh->si); 933 sort_free(fh->si); 934 fh->si = NULL; 935 } 936 } else { 937 if (fh->si == NULL) 938 fh->si = sort_list_item_alloc(); 939 sort_list_item_set(fh->si, tmp); 940 } 941 } 942 943 /* 944 * Merge array of "files headers" 945 */ 946 static void 947 file_headers_merge(size_t fnum, struct file_header **fh, FILE *f_out) 948 { 949 struct last_printed lp; 950 size_t i; 951 952 memset(&lp, 0, sizeof(lp)); 953 954 /* 955 * construct the initial sort structure 956 */ 957 for (i = 0; i < fnum; i++) 958 file_header_list_push(fh[i], fh, i); 959 960 while (fh[0]->fr) { /* unfinished files are always in front */ 961 /* output the smallest line: */ 962 file_header_print(fh[0], f_out, &lp); 963 /* read a new line, if possible: */ 964 file_header_read_next(fh[0]); 965 /* re-arrange the list: */ 966 file_header_list_rearrange_from_header(fh, fnum); 967 } 968 969 if (lp.str) 970 bwsfree(lp.str); 971 } 972 973 /* 974 * Merges the given files into the output file, which can be 975 * stdout. 976 */ 977 static void 978 merge_files_array(size_t argc, char **argv, const char *fn_out) 979 { 980 struct file_header **fh; 981 FILE *f_out; 982 size_t i; 983 984 f_out = openfile(fn_out, "w"); 985 986 if (f_out == NULL) 987 err(2, "%s", fn_out); 988 989 fh = sort_reallocarray(NULL, argc + 1, sizeof(struct file_header *)); 990 991 for (i = 0; i < argc; i++) 992 file_header_init(fh + i, argv[i], i); 993 994 file_headers_merge(argc, fh, f_out); 995 996 for (i = 0; i < argc; i++) 997 file_header_close(fh + i); 998 999 sort_free(fh); 1000 1001 closefile(f_out, fn_out); 1002 } 1003 1004 /* 1005 * Shrinks the file list until its size smaller than max number of opened files 1006 */ 1007 static int 1008 shrink_file_list(struct file_list *fl) 1009 { 1010 struct file_list new_fl; 1011 size_t indx = 0; 1012 1013 if (fl->count < max_open_files) 1014 return 0; 1015 1016 file_list_init(&new_fl, true); 1017 while (indx < fl->count) { 1018 char *fnew; 1019 size_t num; 1020 1021 num = fl->count - indx; 1022 fnew = new_tmp_file_name(); 1023 1024 if (num >= max_open_files) 1025 num = max_open_files - 1; 1026 merge_files_array(num, fl->fns + indx, fnew); 1027 if (fl->tmp) { 1028 size_t i; 1029 1030 for (i = 0; i < num; i++) 1031 unlink(fl->fns[indx + i]); 1032 } 1033 file_list_add(&new_fl, fnew, false); 1034 indx += num; 1035 } 1036 fl->tmp = false; /* already taken care of */ 1037 file_list_clean(fl); 1038 1039 fl->count = new_fl.count; 1040 fl->fns = new_fl.fns; 1041 fl->sz = new_fl.sz; 1042 fl->tmp = new_fl.tmp; 1043 1044 return 1; 1045 } 1046 1047 /* 1048 * Merge list of files 1049 */ 1050 void 1051 merge_files(struct file_list *fl, const char *fn_out) 1052 { 1053 while (shrink_file_list(fl)) 1054 ; 1055 1056 merge_files_array(fl->count, fl->fns, fn_out); 1057 } 1058 1059 static const char * 1060 get_sort_method_name(int sm) 1061 { 1062 if (sm == SORT_MERGESORT) 1063 return "mergesort"; 1064 else if (sort_opts_vals.sort_method == SORT_RADIXSORT) 1065 return "radixsort"; 1066 else if (sort_opts_vals.sort_method == SORT_HEAPSORT) 1067 return "heapsort"; 1068 else 1069 return "quicksort"; 1070 } 1071 1072 /* 1073 * Sort list of lines and writes it to the file 1074 */ 1075 void 1076 sort_list_to_file(struct sort_list *list, const char *outfile) 1077 { 1078 struct sort_mods *sm = &(keys[0].sm); 1079 1080 if (!sm->Mflag && !sm->Rflag && !sm->Vflag && 1081 !sm->gflag && !sm->hflag && !sm->nflag) { 1082 if (sort_opts_vals.sort_method == SORT_DEFAULT && 1083 sort_mb_cur_max == 1) 1084 sort_opts_vals.sort_method = SORT_RADIXSORT; 1085 1086 } else if (sort_opts_vals.sort_method == SORT_RADIXSORT) 1087 err(2, "Radix sort cannot be used with these sort options"); 1088 1089 /* 1090 * To handle stable sort and the unique cases in the 1091 * right order, we need to use a stable algorithm. 1092 */ 1093 if (sort_opts_vals.sflag) { 1094 switch (sort_opts_vals.sort_method){ 1095 case SORT_MERGESORT: 1096 break; 1097 case SORT_RADIXSORT: 1098 break; 1099 case SORT_DEFAULT: 1100 sort_opts_vals.sort_method = SORT_MERGESORT; 1101 break; 1102 default: 1103 errx(2, "The chosen sort method cannot be used with " 1104 "stable and/or unique sort"); 1105 } 1106 } 1107 1108 if (sort_opts_vals.sort_method == SORT_DEFAULT) 1109 sort_opts_vals.sort_method = DEFAULT_SORT_ALGORITHM; 1110 1111 if (debug_sort) 1112 printf("sort_method=%s\n", 1113 get_sort_method_name(sort_opts_vals.sort_method)); 1114 1115 switch (sort_opts_vals.sort_method){ 1116 case SORT_RADIXSORT: 1117 rxsort(list->list, list->count); 1118 break; 1119 case SORT_MERGESORT: 1120 mergesort(list->list, list->count, 1121 sizeof(struct sort_list_item *), list_coll); 1122 break; 1123 case SORT_HEAPSORT: 1124 heapsort(list->list, list->count, 1125 sizeof(struct sort_list_item *), list_coll); 1126 break; 1127 case SORT_QSORT: 1128 qsort(list->list, list->count, 1129 sizeof(struct sort_list_item *), list_coll); 1130 break; 1131 default: 1132 DEFAULT_SORT_FUNC(list->list, list->count, 1133 sizeof(struct sort_list_item *), list_coll); 1134 break; 1135 } 1136 sort_list_dump(list, outfile); 1137 } 1138