1 /* $NetBSD: fsort.c,v 1.30 2004/02/15 11:54:17 jdolecek Exp $ */ 2 3 /*- 4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Ben Harris and Jaromir Dolecek. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 /*- 40 * Copyright (c) 1993 41 * The Regents of the University of California. All rights reserved. 42 * 43 * This code is derived from software contributed to Berkeley by 44 * Peter McIlroy. 45 * 46 * Redistribution and use in source and binary forms, with or without 47 * modification, are permitted provided that the following conditions 48 * are met: 49 * 1. Redistributions of source code must retain the above copyright 50 * notice, this list of conditions and the following disclaimer. 51 * 2. Redistributions in binary form must reproduce the above copyright 52 * notice, this list of conditions and the following disclaimer in the 53 * documentation and/or other materials provided with the distribution. 54 * 3. Neither the name of the University nor the names of its contributors 55 * may be used to endorse or promote products derived from this software 56 * without specific prior written permission. 57 * 58 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 68 * SUCH DAMAGE. 69 */ 70 71 /* 72 * Read in the next bin. If it fits in one segment sort it; 73 * otherwise refine it by segment deeper by one character, 74 * and try again on smaller bins. Sort the final bin at this level 75 * of recursion to keep the head of fstack at 0. 76 * After PANIC passes, abort to merge sort. 77 */ 78 #include "sort.h" 79 #include "fsort.h" 80 81 #ifndef lint 82 __RCSID("$NetBSD: fsort.c,v 1.30 2004/02/15 11:54:17 jdolecek Exp $"); 83 __SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93"); 84 #endif /* not lint */ 85 86 #include <stdlib.h> 87 #include <string.h> 88 89 static const u_char **keylist = 0; 90 u_char *buffer = 0, *linebuf = 0; 91 size_t bufsize = DEFBUFSIZE; 92 size_t linebuf_size; 93 #define FSORTMAX 4 94 int PANIC = FSORTMAX; 95 96 struct tempfile fstack[MAXFCT]; 97 #define MSTART (MAXFCT - MERGE_FNUM) 98 #define CHECKFSTACK(n) \ 99 if (n >= MAXFCT) \ 100 errx(2, "fstack: too many temporary files; use -H or sort in pieces") 101 102 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1)) 103 104 void 105 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl) 106 int binno, depth, top; 107 struct filelist *filelist; 108 int nfiles; 109 FILE *outfp; 110 struct field *ftbl; 111 { 112 const u_char **keypos; 113 u_char *bufend; 114 u_char *weights; 115 int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0; 116 int c, nelem, base; 117 long sizes [NBINS + 1]; 118 get_func_t get; 119 struct recheader *crec; 120 struct field tfield[2]; 121 FILE *prevfp, *tailfp[FSORTMAX + 1]; 122 u_char *nbuffer; 123 124 memset(tailfp, 0, sizeof(tailfp)); 125 prevfp = outfp; 126 memset(tfield, 0, sizeof(tfield)); 127 if (ftbl[0].flags & R) 128 tfield[0].weights = Rascii; 129 else 130 tfield[0].weights = ascii; 131 tfield[0].icol.num = 1; 132 weights = ftbl[0].weights; 133 if (!buffer) { 134 buffer = malloc(bufsize); 135 keylist = malloc(MAXNUM * sizeof(u_char *)); 136 memset(keylist, 0, MAXNUM * sizeof(u_char *)); 137 if (!SINGL_FLD) { 138 linebuf_size = DEFLLEN; 139 if ((linebuf = malloc(linebuf_size)) == NULL) 140 errx(2, "cannot allocate memory"); 141 } 142 } 143 bufend = buffer + bufsize; 144 if (binno >= 0) { 145 base = top + nfiles; 146 get = getnext; 147 } else { 148 base = 0; 149 if (SINGL_FLD) 150 get = makeline; 151 else 152 get = makekey; 153 } 154 for (;;) { 155 memset(sizes, 0, sizeof(sizes)); 156 c = ntfiles = 0; 157 if (binno == weights[REC_D] && 158 !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */ 159 rd_append(weights[REC_D], top, 160 nfiles, prevfp, buffer, bufend); 161 break; 162 } else if (binno == weights[REC_D]) { 163 depth = 0; /* start over on flat weights */ 164 ftbl = tfield; 165 weights = ftbl[0].weights; 166 } 167 while (c != EOF) { 168 keypos = keylist; 169 nelem = 0; 170 crec = (RECHEADER *) buffer; 171 172 do_read: 173 while ((c = get(binno, top, filelist, nfiles, crec, 174 bufend, ftbl)) == 0) { 175 *keypos++ = crec->data + depth; 176 if (++nelem == MAXNUM) { 177 c = BUFFEND; 178 break; 179 } 180 crec =(RECHEADER *)((char *) crec + 181 SALIGN(crec->length) + sizeof(TRECHEADER)); 182 } 183 184 if (c == BUFFEND && nelem < MAXNUM 185 && bufsize < MAXBUFSIZE) { 186 const u_char **keyp; 187 u_char *oldb = buffer; 188 189 /* buffer was too small for data, allocate 190 * bigger buffer */ 191 nbuffer = realloc(buffer, bufsize * 2); 192 if (!nbuffer) { 193 err(2, "failed to realloc buffer to %lu bytes", 194 (unsigned long) bufsize * 2); 195 } 196 buffer = nbuffer; 197 bufsize *= 2; 198 bufend = buffer + bufsize; 199 200 /* patch up keylist[] */ 201 for (keyp = &keypos[-1]; keyp >= keylist; keyp--) 202 *keyp = buffer + (*keyp - oldb); 203 204 crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb)); 205 goto do_read; 206 } 207 208 if (c != BUFFEND && !ntfiles && !mfct) { 209 /* do not push */ 210 continue; 211 } 212 213 /* push */ 214 if (panic >= PANIC) { 215 fstack[MSTART + mfct].fp = ftmp(); 216 if ((stable_sort) 217 ? sradixsort(keylist, nelem, 218 weights, REC_D) 219 : radixsort(keylist, nelem, 220 weights, REC_D) ) 221 err(2, NULL); 222 append(keylist, nelem, depth, 223 fstack[MSTART + mfct].fp, putrec, 224 ftbl); 225 mfct++; 226 /* reduce number of open files */ 227 if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) { 228 /* 229 * Only copy extra incomplete crec 230 * data if there are any. 231 */ 232 int nodata = (bufend >= (u_char *)crec 233 && bufend <= crec->data); 234 size_t sz=0; 235 u_char *tmpbuf=NULL; 236 237 if (!nodata) { 238 sz = bufend - crec->data; 239 tmpbuf = malloc(sz); 240 memmove(tmpbuf, crec->data, sz); 241 } 242 243 CHECKFSTACK(base + ntfiles); 244 fstack[base + ntfiles].fp = ftmp(); 245 fmerge(0, MSTART, filelist, 246 mfct, geteasy, 247 fstack[base + ntfiles].fp, 248 putrec, ftbl); 249 ntfiles++; 250 mfct = 0; 251 252 if (!nodata) { 253 memmove(crec->data, tmpbuf, sz); 254 free(tmpbuf); 255 } 256 } 257 } else { 258 CHECKFSTACK(base + ntfiles); 259 fstack[base + ntfiles].fp = ftmp(); 260 onepass(keylist, depth, nelem, sizes, 261 weights, fstack[base + ntfiles].fp); 262 ntfiles++; 263 } 264 } 265 if (!ntfiles && !mfct) { /* everything in memory--pop */ 266 if (nelem > 1 267 && ((stable_sort) 268 ? sradixsort(keylist, nelem, weights, REC_D) 269 : radixsort(keylist, nelem, weights, REC_D) )) 270 err(2, NULL); 271 if (nelem > 0) 272 append(keylist, nelem, depth, outfp, putline, ftbl); 273 break; /* pop */ 274 } 275 if (panic >= PANIC) { 276 if (!ntfiles) 277 fmerge(0, MSTART, filelist, mfct, geteasy, 278 outfp, putline, ftbl); 279 else 280 fmerge(0, base, filelist, ntfiles, geteasy, 281 outfp, putline, ftbl); 282 break; 283 284 } 285 total = maxb = lastb = 0; /* find if one bin dominates */ 286 for (i = 0; i < NBINS; i++) 287 if (sizes[i]) { 288 if (sizes[i] > sizes[maxb]) 289 maxb = i; 290 lastb = i; 291 total += sizes[i]; 292 } 293 if (sizes[maxb] < max((total / 2) , BUFSIZE)) 294 maxb = lastb; /* otherwise pop after last bin */ 295 fstack[base].lastb = lastb; 296 fstack[base].maxb = maxb; 297 298 /* start refining next level. */ 299 getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */ 300 for (i = 0; i < maxb; i++) { 301 if (!sizes[i]) /* bin empty; step ahead file offset */ 302 getnext(i, base, NULL,ntfiles, crec, bufend, 0); 303 else 304 fsort(i, depth + 1, base, filelist, ntfiles, 305 outfp, ftbl); 306 } 307 308 get = getnext; 309 310 if (lastb != maxb) { 311 if (prevfp != outfp) 312 tailfp[panic] = prevfp; 313 prevfp = ftmp(); 314 for (i = maxb + 1; i <= lastb; i++) 315 if (!sizes[i]) 316 getnext(i, base, NULL, ntfiles, crec, 317 bufend,0); 318 else 319 fsort(i, depth + 1, base, filelist, 320 ntfiles, prevfp, ftbl); 321 } 322 323 /* sort biggest (or last) bin at this level */ 324 depth++; 325 panic++; 326 binno = maxb; 327 top = base; 328 nfiles = ntfiles; /* so overwrite them */ 329 } 330 if (prevfp != outfp) { 331 concat(outfp, prevfp); 332 fclose(prevfp); 333 } 334 for (i = panic; i >= 0; --i) 335 if (tailfp[i]) { 336 concat(outfp, tailfp[i]); 337 fclose(tailfp[i]); 338 } 339 340 /* If on top level, free our structures */ 341 if (depth == 0) { 342 free(keylist), keylist = NULL; 343 free(buffer), buffer = NULL; 344 } 345 } 346 347 /* 348 * This is one pass of radix exchange, dumping the bins to disk. 349 */ 350 #define swap(a, b, t) t = a, a = b, b = t 351 void 352 onepass(a, depth, n, sizes, tr, fp) 353 const u_char **a; 354 int depth; 355 long n, sizes[]; 356 u_char *tr; 357 FILE *fp; 358 { 359 size_t tsizes[NBINS + 1]; 360 const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp; 361 static int histo[256]; 362 int *hp; 363 int c; 364 const u_char **an, *t, **aj; 365 const u_char **ak, *r; 366 367 memset(tsizes, 0, sizeof(tsizes)); 368 depth += sizeof(TRECHEADER); 369 an = &a[n]; 370 for (ak = a; ak < an; ak++) { 371 histo[c = tr[**ak]]++; 372 tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length; 373 } 374 375 bin[0] = a; 376 bpmax = bin + 256; 377 tp = top, hp = histo; 378 for (bp = bin; bp < bpmax; bp++) { 379 *tp++ = *(bp + 1) = *bp + (c = *hp); 380 *hp++ = 0; 381 if (c <= 1) 382 continue; 383 } 384 for (aj = a; aj < an; *aj = r, aj = bin[c + 1]) 385 for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;) 386 swap(*ak, r, t); 387 388 for (ak = a, c = 0; c < 256; c++) { 389 an = bin[c + 1]; 390 n = an - ak; 391 tsizes[c] += n * sizeof(TRECHEADER); 392 /* tell getnext how many elements in this bin, this segment. */ 393 EWRITE(&tsizes[c], sizeof(size_t), 1, fp); 394 sizes[c] += tsizes[c]; 395 for (; ak < an; ++ak) 396 putrec((const RECHEADER *) *ak, fp); 397 } 398 } 399