1 /* $NetBSD: fsort.c,v 1.31 2005/06/10 16:07:45 jmc Exp $ */ 2 3 /*- 4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Ben Harris and Jaromir Dolecek. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 /*- 40 * Copyright (c) 1993 41 * The Regents of the University of California. All rights reserved. 42 * 43 * This code is derived from software contributed to Berkeley by 44 * Peter McIlroy. 45 * 46 * Redistribution and use in source and binary forms, with or without 47 * modification, are permitted provided that the following conditions 48 * are met: 49 * 1. Redistributions of source code must retain the above copyright 50 * notice, this list of conditions and the following disclaimer. 51 * 2. Redistributions in binary form must reproduce the above copyright 52 * notice, this list of conditions and the following disclaimer in the 53 * documentation and/or other materials provided with the distribution. 54 * 3. Neither the name of the University nor the names of its contributors 55 * may be used to endorse or promote products derived from this software 56 * without specific prior written permission. 57 * 58 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 68 * SUCH DAMAGE. 69 */ 70 71 /* 72 * Read in the next bin. If it fits in one segment sort it; 73 * otherwise refine it by segment deeper by one character, 74 * and try again on smaller bins. Sort the final bin at this level 75 * of recursion to keep the head of fstack at 0. 76 * After PANIC passes, abort to merge sort. 77 */ 78 #include "sort.h" 79 #include "fsort.h" 80 81 #ifndef lint 82 __RCSID("$NetBSD: fsort.c,v 1.31 2005/06/10 16:07:45 jmc Exp $"); 83 __SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93"); 84 #endif /* not lint */ 85 86 #include <stdlib.h> 87 #include <string.h> 88 89 static const u_char **keylist = 0; 90 u_char *buffer = 0, *linebuf = 0; 91 size_t bufsize = DEFBUFSIZE; 92 size_t linebuf_size; 93 #define FSORTMAX 4 94 int PANIC = FSORTMAX; 95 96 struct tempfile fstack[MAXFCT]; 97 #define MSTART (MAXFCT - MERGE_FNUM) 98 #define CHECKFSTACK(n) \ 99 if (n >= MAXFCT) \ 100 errx(2, "fstack: too many temporary files; use -H or sort in pieces") 101 102 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1)) 103 104 void 105 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl) 106 int binno, depth, top; 107 struct filelist *filelist; 108 int nfiles; 109 FILE *outfp; 110 struct field *ftbl; 111 { 112 const u_char **keypos; 113 u_char *bufend; 114 u_char *weights; 115 int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0; 116 int c, nelem, base; 117 long sizes [NBINS + 1]; 118 get_func_t get; 119 struct recheader *crec; 120 struct field tfield[2]; 121 FILE *prevfp, *tailfp[FSORTMAX + 1]; 122 u_char *nbuffer; 123 124 memset(tailfp, 0, sizeof(tailfp)); 125 prevfp = outfp; 126 memset(tfield, 0, sizeof(tfield)); 127 if (ftbl[0].flags & R) 128 tfield[0].weights = Rascii; 129 else 130 tfield[0].weights = ascii; 131 tfield[0].icol.num = 1; 132 weights = ftbl[0].weights; 133 if (!buffer) { 134 buffer = malloc(bufsize); 135 keylist = malloc(MAXNUM * sizeof(u_char *)); 136 memset(keylist, 0, MAXNUM * sizeof(u_char *)); 137 if (!SINGL_FLD) { 138 linebuf_size = DEFLLEN; 139 if ((linebuf = malloc(linebuf_size)) == NULL) 140 errx(2, "cannot allocate memory"); 141 } 142 } 143 bufend = buffer + bufsize; 144 if (binno >= 0) { 145 base = top + nfiles; 146 get = getnext; 147 } else { 148 base = 0; 149 if (SINGL_FLD) 150 get = makeline; 151 else 152 get = makekey; 153 } 154 for (;;) { 155 memset(sizes, 0, sizeof(sizes)); 156 c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */ 157 if (binno == weights[REC_D] && 158 !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */ 159 rd_append(weights[REC_D], top, 160 nfiles, prevfp, buffer, bufend); 161 break; 162 } else if (binno == weights[REC_D]) { 163 depth = 0; /* start over on flat weights */ 164 ftbl = tfield; 165 weights = ftbl[0].weights; 166 } 167 keypos = keylist; /* XXXGCC -Wuninitialized m68k */ 168 crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */ 169 while (c != EOF) { 170 keypos = keylist; 171 nelem = 0; 172 crec = (RECHEADER *) buffer; 173 174 do_read: 175 while ((c = get(binno, top, filelist, nfiles, crec, 176 bufend, ftbl)) == 0) { 177 *keypos++ = crec->data + depth; 178 if (++nelem == MAXNUM) { 179 c = BUFFEND; 180 break; 181 } 182 crec =(RECHEADER *)((char *) crec + 183 SALIGN(crec->length) + sizeof(TRECHEADER)); 184 } 185 186 if (c == BUFFEND && nelem < MAXNUM 187 && bufsize < MAXBUFSIZE) { 188 const u_char **keyp; 189 u_char *oldb = buffer; 190 191 /* buffer was too small for data, allocate 192 * bigger buffer */ 193 nbuffer = realloc(buffer, bufsize * 2); 194 if (!nbuffer) { 195 err(2, "failed to realloc buffer to %lu bytes", 196 (unsigned long) bufsize * 2); 197 } 198 buffer = nbuffer; 199 bufsize *= 2; 200 bufend = buffer + bufsize; 201 202 /* patch up keylist[] */ 203 for (keyp = &keypos[-1]; keyp >= keylist; keyp--) 204 *keyp = buffer + (*keyp - oldb); 205 206 crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb)); 207 goto do_read; 208 } 209 210 if (c != BUFFEND && !ntfiles && !mfct) { 211 /* do not push */ 212 continue; 213 } 214 215 /* push */ 216 if (panic >= PANIC) { 217 fstack[MSTART + mfct].fp = ftmp(); 218 if ((stable_sort) 219 ? sradixsort(keylist, nelem, 220 weights, REC_D) 221 : radixsort(keylist, nelem, 222 weights, REC_D) ) 223 err(2, NULL); 224 append(keylist, nelem, depth, 225 fstack[MSTART + mfct].fp, putrec, 226 ftbl); 227 mfct++; 228 /* reduce number of open files */ 229 if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) { 230 /* 231 * Only copy extra incomplete crec 232 * data if there are any. 233 */ 234 int nodata = (bufend >= (u_char *)crec 235 && bufend <= crec->data); 236 size_t sz=0; 237 u_char *tmpbuf=NULL; 238 239 if (!nodata) { 240 sz = bufend - crec->data; 241 tmpbuf = malloc(sz); 242 memmove(tmpbuf, crec->data, sz); 243 } 244 245 CHECKFSTACK(base + ntfiles); 246 fstack[base + ntfiles].fp = ftmp(); 247 fmerge(0, MSTART, filelist, 248 mfct, geteasy, 249 fstack[base + ntfiles].fp, 250 putrec, ftbl); 251 ntfiles++; 252 mfct = 0; 253 254 if (!nodata) { 255 memmove(crec->data, tmpbuf, sz); 256 free(tmpbuf); 257 } 258 } 259 } else { 260 CHECKFSTACK(base + ntfiles); 261 fstack[base + ntfiles].fp = ftmp(); 262 onepass(keylist, depth, nelem, sizes, 263 weights, fstack[base + ntfiles].fp); 264 ntfiles++; 265 } 266 } 267 if (!ntfiles && !mfct) { /* everything in memory--pop */ 268 if (nelem > 1 269 && ((stable_sort) 270 ? sradixsort(keylist, nelem, weights, REC_D) 271 : radixsort(keylist, nelem, weights, REC_D) )) 272 err(2, NULL); 273 if (nelem > 0) 274 append(keylist, nelem, depth, outfp, putline, ftbl); 275 break; /* pop */ 276 } 277 if (panic >= PANIC) { 278 if (!ntfiles) 279 fmerge(0, MSTART, filelist, mfct, geteasy, 280 outfp, putline, ftbl); 281 else 282 fmerge(0, base, filelist, ntfiles, geteasy, 283 outfp, putline, ftbl); 284 break; 285 286 } 287 total = maxb = lastb = 0; /* find if one bin dominates */ 288 for (i = 0; i < NBINS; i++) 289 if (sizes[i]) { 290 if (sizes[i] > sizes[maxb]) 291 maxb = i; 292 lastb = i; 293 total += sizes[i]; 294 } 295 if (sizes[maxb] < max((total / 2) , BUFSIZE)) 296 maxb = lastb; /* otherwise pop after last bin */ 297 fstack[base].lastb = lastb; 298 fstack[base].maxb = maxb; 299 300 /* start refining next level. */ 301 getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */ 302 for (i = 0; i < maxb; i++) { 303 if (!sizes[i]) /* bin empty; step ahead file offset */ 304 getnext(i, base, NULL,ntfiles, crec, bufend, 0); 305 else 306 fsort(i, depth + 1, base, filelist, ntfiles, 307 outfp, ftbl); 308 } 309 310 get = getnext; 311 312 if (lastb != maxb) { 313 if (prevfp != outfp) 314 tailfp[panic] = prevfp; 315 prevfp = ftmp(); 316 for (i = maxb + 1; i <= lastb; i++) 317 if (!sizes[i]) 318 getnext(i, base, NULL, ntfiles, crec, 319 bufend,0); 320 else 321 fsort(i, depth + 1, base, filelist, 322 ntfiles, prevfp, ftbl); 323 } 324 325 /* sort biggest (or last) bin at this level */ 326 depth++; 327 panic++; 328 binno = maxb; 329 top = base; 330 nfiles = ntfiles; /* so overwrite them */ 331 } 332 if (prevfp != outfp) { 333 concat(outfp, prevfp); 334 fclose(prevfp); 335 } 336 for (i = panic; i >= 0; --i) 337 if (tailfp[i]) { 338 concat(outfp, tailfp[i]); 339 fclose(tailfp[i]); 340 } 341 342 /* If on top level, free our structures */ 343 if (depth == 0) { 344 free(keylist), keylist = NULL; 345 free(buffer), buffer = NULL; 346 } 347 } 348 349 /* 350 * This is one pass of radix exchange, dumping the bins to disk. 351 */ 352 #define swap(a, b, t) t = a, a = b, b = t 353 void 354 onepass(a, depth, n, sizes, tr, fp) 355 const u_char **a; 356 int depth; 357 long n, sizes[]; 358 u_char *tr; 359 FILE *fp; 360 { 361 size_t tsizes[NBINS + 1]; 362 const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp; 363 static int histo[256]; 364 int *hp; 365 int c; 366 const u_char **an, *t, **aj; 367 const u_char **ak, *r; 368 369 memset(tsizes, 0, sizeof(tsizes)); 370 depth += sizeof(TRECHEADER); 371 an = &a[n]; 372 for (ak = a; ak < an; ak++) { 373 histo[c = tr[**ak]]++; 374 tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length; 375 } 376 377 bin[0] = a; 378 bpmax = bin + 256; 379 tp = top, hp = histo; 380 for (bp = bin; bp < bpmax; bp++) { 381 *tp++ = *(bp + 1) = *bp + (c = *hp); 382 *hp++ = 0; 383 if (c <= 1) 384 continue; 385 } 386 for (aj = a; aj < an; *aj = r, aj = bin[c + 1]) 387 for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;) 388 swap(*ak, r, t); 389 390 for (ak = a, c = 0; c < 256; c++) { 391 an = bin[c + 1]; 392 n = an - ak; 393 tsizes[c] += n * sizeof(TRECHEADER); 394 /* tell getnext how many elements in this bin, this segment. */ 395 EWRITE(&tsizes[c], sizeof(size_t), 1, fp); 396 sizes[c] += tsizes[c]; 397 for (; ak < an; ++ak) 398 putrec((const RECHEADER *) *ak, fp); 399 } 400 } 401