1 /* $NetBSD: fsort.c,v 1.32 2008/04/28 20:24:15 martin Exp $ */ 2 3 /*- 4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Ben Harris and Jaromir Dolecek. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1993 34 * The Regents of the University of California. All rights reserved. 35 * 36 * This code is derived from software contributed to Berkeley by 37 * Peter McIlroy. 38 * 39 * Redistribution and use in source and binary forms, with or without 40 * modification, are permitted provided that the following conditions 41 * are met: 42 * 1. Redistributions of source code must retain the above copyright 43 * notice, this list of conditions and the following disclaimer. 44 * 2. Redistributions in binary form must reproduce the above copyright 45 * notice, this list of conditions and the following disclaimer in the 46 * documentation and/or other materials provided with the distribution. 47 * 3. Neither the name of the University nor the names of its contributors 48 * may be used to endorse or promote products derived from this software 49 * without specific prior written permission. 50 * 51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 61 * SUCH DAMAGE. 62 */ 63 64 /* 65 * Read in the next bin. If it fits in one segment sort it; 66 * otherwise refine it by segment deeper by one character, 67 * and try again on smaller bins. Sort the final bin at this level 68 * of recursion to keep the head of fstack at 0. 69 * After PANIC passes, abort to merge sort. 70 */ 71 #include "sort.h" 72 #include "fsort.h" 73 74 #ifndef lint 75 __RCSID("$NetBSD: fsort.c,v 1.32 2008/04/28 20:24:15 martin Exp $"); 76 __SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93"); 77 #endif /* not lint */ 78 79 #include <stdlib.h> 80 #include <string.h> 81 82 static const u_char **keylist = 0; 83 u_char *buffer = 0, *linebuf = 0; 84 size_t bufsize = DEFBUFSIZE; 85 size_t linebuf_size; 86 #define FSORTMAX 4 87 int PANIC = FSORTMAX; 88 89 struct tempfile fstack[MAXFCT]; 90 #define MSTART (MAXFCT - MERGE_FNUM) 91 #define CHECKFSTACK(n) \ 92 if (n >= MAXFCT) \ 93 errx(2, "fstack: too many temporary files; use -H or sort in pieces") 94 95 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1)) 96 97 void 98 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl) 99 int binno, depth, top; 100 struct filelist *filelist; 101 int nfiles; 102 FILE *outfp; 103 struct field *ftbl; 104 { 105 const u_char **keypos; 106 u_char *bufend; 107 u_char *weights; 108 int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0; 109 int c, nelem, base; 110 long sizes [NBINS + 1]; 111 get_func_t get; 112 struct recheader *crec; 113 struct field tfield[2]; 114 FILE *prevfp, *tailfp[FSORTMAX + 1]; 115 u_char *nbuffer; 116 117 memset(tailfp, 0, sizeof(tailfp)); 118 prevfp = outfp; 119 memset(tfield, 0, sizeof(tfield)); 120 if (ftbl[0].flags & R) 121 tfield[0].weights = Rascii; 122 else 123 tfield[0].weights = ascii; 124 tfield[0].icol.num = 1; 125 weights = ftbl[0].weights; 126 if (!buffer) { 127 buffer = malloc(bufsize); 128 keylist = malloc(MAXNUM * sizeof(u_char *)); 129 memset(keylist, 0, MAXNUM * sizeof(u_char *)); 130 if (!SINGL_FLD) { 131 linebuf_size = DEFLLEN; 132 if ((linebuf = malloc(linebuf_size)) == NULL) 133 errx(2, "cannot allocate memory"); 134 } 135 } 136 bufend = buffer + bufsize; 137 if (binno >= 0) { 138 base = top + nfiles; 139 get = getnext; 140 } else { 141 base = 0; 142 if (SINGL_FLD) 143 get = makeline; 144 else 145 get = makekey; 146 } 147 for (;;) { 148 memset(sizes, 0, sizeof(sizes)); 149 c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */ 150 if (binno == weights[REC_D] && 151 !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */ 152 rd_append(weights[REC_D], top, 153 nfiles, prevfp, buffer, bufend); 154 break; 155 } else if (binno == weights[REC_D]) { 156 depth = 0; /* start over on flat weights */ 157 ftbl = tfield; 158 weights = ftbl[0].weights; 159 } 160 keypos = keylist; /* XXXGCC -Wuninitialized m68k */ 161 crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */ 162 while (c != EOF) { 163 keypos = keylist; 164 nelem = 0; 165 crec = (RECHEADER *) buffer; 166 167 do_read: 168 while ((c = get(binno, top, filelist, nfiles, crec, 169 bufend, ftbl)) == 0) { 170 *keypos++ = crec->data + depth; 171 if (++nelem == MAXNUM) { 172 c = BUFFEND; 173 break; 174 } 175 crec =(RECHEADER *)((char *) crec + 176 SALIGN(crec->length) + sizeof(TRECHEADER)); 177 } 178 179 if (c == BUFFEND && nelem < MAXNUM 180 && bufsize < MAXBUFSIZE) { 181 const u_char **keyp; 182 u_char *oldb = buffer; 183 184 /* buffer was too small for data, allocate 185 * bigger buffer */ 186 nbuffer = realloc(buffer, bufsize * 2); 187 if (!nbuffer) { 188 err(2, "failed to realloc buffer to %lu bytes", 189 (unsigned long) bufsize * 2); 190 } 191 buffer = nbuffer; 192 bufsize *= 2; 193 bufend = buffer + bufsize; 194 195 /* patch up keylist[] */ 196 for (keyp = &keypos[-1]; keyp >= keylist; keyp--) 197 *keyp = buffer + (*keyp - oldb); 198 199 crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb)); 200 goto do_read; 201 } 202 203 if (c != BUFFEND && !ntfiles && !mfct) { 204 /* do not push */ 205 continue; 206 } 207 208 /* push */ 209 if (panic >= PANIC) { 210 fstack[MSTART + mfct].fp = ftmp(); 211 if ((stable_sort) 212 ? sradixsort(keylist, nelem, 213 weights, REC_D) 214 : radixsort(keylist, nelem, 215 weights, REC_D) ) 216 err(2, NULL); 217 append(keylist, nelem, depth, 218 fstack[MSTART + mfct].fp, putrec, 219 ftbl); 220 mfct++; 221 /* reduce number of open files */ 222 if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) { 223 /* 224 * Only copy extra incomplete crec 225 * data if there are any. 226 */ 227 int nodata = (bufend >= (u_char *)crec 228 && bufend <= crec->data); 229 size_t sz=0; 230 u_char *tmpbuf=NULL; 231 232 if (!nodata) { 233 sz = bufend - crec->data; 234 tmpbuf = malloc(sz); 235 memmove(tmpbuf, crec->data, sz); 236 } 237 238 CHECKFSTACK(base + ntfiles); 239 fstack[base + ntfiles].fp = ftmp(); 240 fmerge(0, MSTART, filelist, 241 mfct, geteasy, 242 fstack[base + ntfiles].fp, 243 putrec, ftbl); 244 ntfiles++; 245 mfct = 0; 246 247 if (!nodata) { 248 memmove(crec->data, tmpbuf, sz); 249 free(tmpbuf); 250 } 251 } 252 } else { 253 CHECKFSTACK(base + ntfiles); 254 fstack[base + ntfiles].fp = ftmp(); 255 onepass(keylist, depth, nelem, sizes, 256 weights, fstack[base + ntfiles].fp); 257 ntfiles++; 258 } 259 } 260 if (!ntfiles && !mfct) { /* everything in memory--pop */ 261 if (nelem > 1 262 && ((stable_sort) 263 ? sradixsort(keylist, nelem, weights, REC_D) 264 : radixsort(keylist, nelem, weights, REC_D) )) 265 err(2, NULL); 266 if (nelem > 0) 267 append(keylist, nelem, depth, outfp, putline, ftbl); 268 break; /* pop */ 269 } 270 if (panic >= PANIC) { 271 if (!ntfiles) 272 fmerge(0, MSTART, filelist, mfct, geteasy, 273 outfp, putline, ftbl); 274 else 275 fmerge(0, base, filelist, ntfiles, geteasy, 276 outfp, putline, ftbl); 277 break; 278 279 } 280 total = maxb = lastb = 0; /* find if one bin dominates */ 281 for (i = 0; i < NBINS; i++) 282 if (sizes[i]) { 283 if (sizes[i] > sizes[maxb]) 284 maxb = i; 285 lastb = i; 286 total += sizes[i]; 287 } 288 if (sizes[maxb] < max((total / 2) , BUFSIZE)) 289 maxb = lastb; /* otherwise pop after last bin */ 290 fstack[base].lastb = lastb; 291 fstack[base].maxb = maxb; 292 293 /* start refining next level. */ 294 getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */ 295 for (i = 0; i < maxb; i++) { 296 if (!sizes[i]) /* bin empty; step ahead file offset */ 297 getnext(i, base, NULL,ntfiles, crec, bufend, 0); 298 else 299 fsort(i, depth + 1, base, filelist, ntfiles, 300 outfp, ftbl); 301 } 302 303 get = getnext; 304 305 if (lastb != maxb) { 306 if (prevfp != outfp) 307 tailfp[panic] = prevfp; 308 prevfp = ftmp(); 309 for (i = maxb + 1; i <= lastb; i++) 310 if (!sizes[i]) 311 getnext(i, base, NULL, ntfiles, crec, 312 bufend,0); 313 else 314 fsort(i, depth + 1, base, filelist, 315 ntfiles, prevfp, ftbl); 316 } 317 318 /* sort biggest (or last) bin at this level */ 319 depth++; 320 panic++; 321 binno = maxb; 322 top = base; 323 nfiles = ntfiles; /* so overwrite them */ 324 } 325 if (prevfp != outfp) { 326 concat(outfp, prevfp); 327 fclose(prevfp); 328 } 329 for (i = panic; i >= 0; --i) 330 if (tailfp[i]) { 331 concat(outfp, tailfp[i]); 332 fclose(tailfp[i]); 333 } 334 335 /* If on top level, free our structures */ 336 if (depth == 0) { 337 free(keylist), keylist = NULL; 338 free(buffer), buffer = NULL; 339 } 340 } 341 342 /* 343 * This is one pass of radix exchange, dumping the bins to disk. 344 */ 345 #define swap(a, b, t) t = a, a = b, b = t 346 void 347 onepass(a, depth, n, sizes, tr, fp) 348 const u_char **a; 349 int depth; 350 long n, sizes[]; 351 u_char *tr; 352 FILE *fp; 353 { 354 size_t tsizes[NBINS + 1]; 355 const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp; 356 static int histo[256]; 357 int *hp; 358 int c; 359 const u_char **an, *t, **aj; 360 const u_char **ak, *r; 361 362 memset(tsizes, 0, sizeof(tsizes)); 363 depth += sizeof(TRECHEADER); 364 an = &a[n]; 365 for (ak = a; ak < an; ak++) { 366 histo[c = tr[**ak]]++; 367 tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length; 368 } 369 370 bin[0] = a; 371 bpmax = bin + 256; 372 tp = top, hp = histo; 373 for (bp = bin; bp < bpmax; bp++) { 374 *tp++ = *(bp + 1) = *bp + (c = *hp); 375 *hp++ = 0; 376 if (c <= 1) 377 continue; 378 } 379 for (aj = a; aj < an; *aj = r, aj = bin[c + 1]) 380 for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;) 381 swap(*ak, r, t); 382 383 for (ak = a, c = 0; c < 256; c++) { 384 an = bin[c + 1]; 385 n = an - ak; 386 tsizes[c] += n * sizeof(TRECHEADER); 387 /* tell getnext how many elements in this bin, this segment. */ 388 EWRITE(&tsizes[c], sizeof(size_t), 1, fp); 389 sizes[c] += tsizes[c]; 390 for (; ak < an; ++ak) 391 putrec((const RECHEADER *) *ak, fp); 392 } 393 } 394