1 /* $NetBSD: msort.c,v 1.28 2009/10/09 20:29:43 dsl Exp $ */ 2 3 /*- 4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Ben Harris and Jaromir Dolecek. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1993 34 * The Regents of the University of California. All rights reserved. 35 * 36 * This code is derived from software contributed to Berkeley by 37 * Peter McIlroy. 38 * 39 * Redistribution and use in source and binary forms, with or without 40 * modification, are permitted provided that the following conditions 41 * are met: 42 * 1. Redistributions of source code must retain the above copyright 43 * notice, this list of conditions and the following disclaimer. 44 * 2. Redistributions in binary form must reproduce the above copyright 45 * notice, this list of conditions and the following disclaimer in the 46 * documentation and/or other materials provided with the distribution. 47 * 3. Neither the name of the University nor the names of its contributors 48 * may be used to endorse or promote products derived from this software 49 * without specific prior written permission. 50 * 51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 61 * SUCH DAMAGE. 62 */ 63 64 #include "sort.h" 65 #include "fsort.h" 66 67 #ifndef lint 68 __RCSID("$NetBSD: msort.c,v 1.28 2009/10/09 20:29:43 dsl Exp $"); 69 __SCCSID("@(#)msort.c 8.1 (Berkeley) 6/6/93"); 70 #endif /* not lint */ 71 72 #include <stdlib.h> 73 #include <string.h> 74 #include <unistd.h> 75 #include <util.h> 76 77 /* Subroutines using comparisons: merge sort and check order */ 78 #define DELETE (1) 79 80 typedef struct mfile { 81 FILE *fp; 82 get_func_t get; 83 RECHEADER *rec; 84 u_char *end; 85 } MFILE; 86 87 static int cmp(RECHEADER *, RECHEADER *); 88 static int insert(struct mfile **, struct mfile *, int, int); 89 static void merge_sort_fstack(FILE *, put_func_t, struct field *); 90 91 /* 92 * Number of files merge() can merge in one pass. 93 */ 94 #define MERGE_FNUM 16 95 96 static struct mfile fstack[MERGE_FNUM]; 97 static struct mfile fstack_1[MERGE_FNUM]; 98 static struct mfile fstack_2[MERGE_FNUM]; 99 static int fstack_count, fstack_1_count, fstack_2_count; 100 101 void 102 save_for_merge(FILE *fp, get_func_t get, struct field *ftbl) 103 { 104 FILE *mfp, *mfp1, *mfp2; 105 106 if (fstack_count == MERGE_FNUM) { 107 /* Must reduce the number of temporary files */ 108 mfp = ftmp(); 109 merge_sort_fstack(mfp, putrec, ftbl); 110 /* Save output in next layer */ 111 if (fstack_1_count == MERGE_FNUM) { 112 mfp1 = ftmp(); 113 memcpy(fstack, fstack_1, sizeof fstack); 114 merge_sort_fstack(mfp1, putrec, ftbl); 115 if (fstack_2_count == MERGE_FNUM) { 116 /* More than 4096 files! */ 117 mfp2 = ftmp(); 118 memcpy(fstack, fstack_2, sizeof fstack); 119 merge_sort_fstack(mfp2, putrec, ftbl); 120 fstack_2[0].fp = mfp2; 121 fstack_2_count = 1; 122 } 123 fstack_2[fstack_2_count].fp = mfp1; 124 fstack_2[fstack_2_count].get = geteasy; 125 fstack_2_count++; 126 fstack_1_count = 0; 127 } 128 fstack_1[fstack_1_count].fp = mfp; 129 fstack_1[fstack_1_count].get = geteasy; 130 fstack_1_count++; 131 fstack_count = 0; 132 } 133 134 fstack[fstack_count].fp = fp; 135 fstack[fstack_count++].get = get; 136 } 137 138 void 139 fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl) 140 { 141 get_func_t get = SINGL_FLD ? makeline : makekey; 142 FILE *fp; 143 int i; 144 145 for (i = 0; i < nfiles; i++) { 146 fp = fopen(filelist->names[i], "r"); 147 if (fp == NULL) 148 err(2, "%s", filelist->names[i]); 149 save_for_merge(fp, get, ftbl); 150 } 151 152 merge_sort(outfp, putline, ftbl); 153 } 154 155 void 156 merge_sort(FILE *outfp, put_func_t put, struct field *ftbl) 157 { 158 int count = fstack_1_count + fstack_2_count; 159 FILE *mfp; 160 int i; 161 162 if (count == 0) { 163 /* All files in initial array */ 164 merge_sort_fstack(outfp, put, ftbl); 165 return; 166 } 167 168 count += fstack_count; 169 170 /* Too many files for one merge sort */ 171 for (;;) { 172 /* Sort latest 16 files */ 173 i = count; 174 if (i > MERGE_FNUM) 175 i = MERGE_FNUM; 176 while (fstack_count > 0) 177 fstack[--i] = fstack[--fstack_count]; 178 while (i > 0 && fstack_1_count > 0) 179 fstack[--i] = fstack_1[--fstack_1_count]; 180 while (i > 0) 181 fstack[--i] = fstack_2[--fstack_2_count]; 182 if (count <= MERGE_FNUM) { 183 /* Got all the data */ 184 fstack_count = count; 185 merge_sort_fstack(outfp, put, ftbl); 186 return; 187 } 188 mfp = ftmp(); 189 fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count; 190 merge_sort_fstack(mfp, putrec, ftbl); 191 fstack[0].fp = mfp; 192 fstack[0].get = geteasy; 193 fstack_count = 1; 194 count -= MERGE_FNUM - 1; 195 } 196 } 197 198 static void 199 merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl) 200 { 201 struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile; 202 RECHEADER *new_rec; 203 u_char *new_end; 204 void *tmp; 205 int c, i, nfiles; 206 size_t sz; 207 208 /* Read one record from each file (read again if a duplicate) */ 209 for (nfiles = i = 0; i < fstack_count; i++) { 210 cfile = &fstack[i]; 211 if (cfile->rec == NULL) { 212 cfile->rec = emalloc(DEFLLEN); 213 cfile->end = (u_char *)cfile->rec + DEFLLEN; 214 } 215 rewind(cfile->fp); 216 217 for (;;) { 218 c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl); 219 if (c == EOF) 220 break; 221 222 if (c == BUFFEND) { 223 /* Double buffer size */ 224 sz = (cfile->end - (u_char *)cfile->rec) * 2; 225 cfile->rec = erealloc(cfile->rec, sz); 226 cfile->end = (u_char *)cfile->rec + sz; 227 continue; 228 } 229 230 if (nfiles != 0) { 231 if (insert(flist, cfile, nfiles, !DELETE)) 232 /* Duplicate removed */ 233 continue; 234 } else 235 flist[0] = cfile; 236 nfiles++; 237 break; 238 } 239 } 240 241 if (nfiles == 0) 242 return; 243 244 /* 245 * We now loop reading a new record from the file with the 246 * 'sorted first' existing record. 247 * As each record is added, the 'first' record is written to the 248 * output file - maintaining one record from each file in the sorted 249 * list. 250 */ 251 new_rec = emalloc(DEFLLEN); 252 new_end = (u_char *)new_rec + DEFLLEN; 253 for (;;) { 254 cfile = flist[0]; 255 c = cfile->get(cfile->fp, new_rec, new_end, ftbl); 256 if (c == EOF) { 257 /* Write out last record from now-empty input */ 258 put(cfile->rec, outfp); 259 if (--nfiles == 0) 260 break; 261 /* Replace from file with now-first sorted record. */ 262 /* (Moving base 'flist' saves copying everything!) */ 263 flist++; 264 continue; 265 } 266 if (c == BUFFEND) { 267 /* Buffer not large enough - double in size */ 268 sz = (new_end - (u_char *)new_rec) * 2; 269 new_rec = erealloc(new_rec, sz); 270 new_end = (u_char *)new_rec +sz; 271 continue; 272 } 273 274 /* Swap in new buffer, saving old */ 275 tmp = cfile->rec; 276 cfile->rec = new_rec; 277 new_rec = tmp; 278 tmp = cfile->end; 279 cfile->end = new_end; 280 new_end = tmp; 281 282 /* Add into sort, removing the original first entry */ 283 c = insert(flist, cfile, nfiles, DELETE); 284 if (c != 0 || (UNIQUE && cfile == flist[0] 285 && cmp(new_rec, cfile->rec) == 0)) { 286 /* Was an unwanted duplicate, restore buffer */ 287 tmp = cfile->rec; 288 cfile->rec = new_rec; 289 new_rec = tmp; 290 tmp = cfile->end; 291 cfile->end = new_end; 292 new_end = tmp; 293 continue; 294 } 295 296 /* Write out 'old' record */ 297 put(new_rec, outfp); 298 } 299 300 free(new_rec); 301 } 302 303 /* 304 * if delete: inserts rec in flist, deletes flist[0]; 305 * otherwise just inserts *rec in flist. 306 * Returns 1 if record is a duplicate to be ignored. 307 */ 308 static int 309 insert(struct mfile **flist, struct mfile *rec, int ttop, int delete) 310 { 311 int mid, top = ttop, bot = 0, cmpv = 1; 312 313 for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) { 314 cmpv = cmp(rec->rec, flist[mid]->rec); 315 if (cmpv == 0 ) { 316 if (UNIQUE) 317 /* Duplicate key, read another record */ 318 /* NB: This doesn't guarantee to keep any 319 * particular record. */ 320 return 1; 321 /* 322 * Apply sort by input file order. 323 * We could truncate the sort is the fileno are 324 * adjacent - but that is all too hard! 325 * The fileno cannot be equal, since we only have one 326 * record from each file (+ flist[0] which never 327 * comes here). 328 */ 329 cmpv = rec < flist[mid] ? -1 : 1; 330 if (REVERSE) 331 cmpv = -cmpv; 332 } 333 if (cmpv < 0) 334 top = mid; 335 else 336 bot = mid; 337 } 338 339 /* At this point we haven't yet compared against flist[0] */ 340 341 if (delete) { 342 /* flist[0] is ourselves, only the caller knows the old data */ 343 if (bot != 0) { 344 memmove(flist, flist + 1, bot * sizeof(MFILE *)); 345 flist[bot] = rec; 346 } 347 return 0; 348 } 349 350 /* Inserting original set of records */ 351 352 if (bot == 0 && cmpv != 0) { 353 /* Doesn't match flist[1], must compare with flist[0] */ 354 cmpv = cmp(rec->rec, flist[0]->rec); 355 if (cmpv == 0 && UNIQUE) 356 return 1; 357 /* Add matching keys in file order (ie new is later) */ 358 if (cmpv < 0) 359 bot = -1; 360 } 361 bot++; 362 memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *)); 363 flist[bot] = rec; 364 return 0; 365 } 366 367 /* 368 * check order on one file 369 */ 370 void 371 order(struct filelist *filelist, struct field *ftbl) 372 { 373 get_func_t get = SINGL_FLD ? makeline : makekey; 374 RECHEADER *crec, *prec, *trec; 375 u_char *crec_end, *prec_end, *trec_end; 376 FILE *fp; 377 int c; 378 379 fp = fopen(filelist->names[0], "r"); 380 if (fp == NULL) 381 err(2, "%s", filelist->names[0]); 382 383 crec = malloc(offsetof(RECHEADER, data[DEFLLEN])); 384 crec_end = crec->data + DEFLLEN; 385 prec = malloc(offsetof(RECHEADER, data[DEFLLEN])); 386 prec_end = prec->data + DEFLLEN; 387 388 /* XXX this does exit(0) for overlong lines */ 389 if (get(fp, prec, prec_end, ftbl) != 0) 390 exit(0); 391 while (get(fp, crec, crec_end, ftbl) == 0) { 392 if (0 < (c = cmp(prec, crec))) { 393 crec->data[crec->length-1] = 0; 394 errx(1, "found disorder: %s", crec->data+crec->offset); 395 } 396 if (UNIQUE && !c) { 397 crec->data[crec->length-1] = 0; 398 errx(1, "found non-uniqueness: %s", 399 crec->data+crec->offset); 400 } 401 /* 402 * Swap pointers so that this record is on place pointed 403 * to by prec and new record is read to place pointed to by 404 * crec. 405 */ 406 trec = prec; 407 prec = crec; 408 crec = trec; 409 trec_end = prec_end; 410 prec_end = crec_end; 411 crec_end = trec_end; 412 } 413 exit(0); 414 } 415 416 static int 417 cmp(RECHEADER *rec1, RECHEADER *rec2) 418 { 419 int len; 420 int r; 421 422 /* key is weights */ 423 len = min(rec1->keylen, rec2->keylen); 424 r = memcmp(rec1->data, rec2->data, len); 425 if (r == 0) 426 r = rec1->keylen - rec2->keylen; 427 if (REVERSE) 428 r = -r; 429 return r; 430 } 431