1 /* $NetBSD: fsort.c,v 1.24 2002/12/24 15:15:01 jdolecek Exp $ */ 2 3 /*- 4 * Copyright (c) 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Peter McIlroy. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39 /* 40 * Read in the next bin. If it fits in one segment sort it; 41 * otherwise refine it by segment deeper by one character, 42 * and try again on smaller bins. Sort the final bin at this level 43 * of recursion to keep the head of fstack at 0. 44 * After PANIC passes, abort to merge sort. 45 */ 46 #include "sort.h" 47 #include "fsort.h" 48 49 #ifndef lint 50 __RCSID("$NetBSD: fsort.c,v 1.24 2002/12/24 15:15:01 jdolecek Exp $"); 51 __SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93"); 52 #endif /* not lint */ 53 54 #include <stdlib.h> 55 #include <string.h> 56 57 static const u_char **keylist = 0; 58 u_char *buffer = 0, *linebuf = 0; 59 size_t bufsize = DEFBUFSIZE; 60 size_t linebuf_size; 61 #define FSORTMAX 4 62 int PANIC = FSORTMAX; 63 64 struct tempfile fstack[MAXFCT]; 65 #define MSTART (MAXFCT - MERGE_FNUM) 66 #define CHECKFSTACK(n) \ 67 if (n >= MAXFCT) \ 68 errx(2, "fstack: too many temporary files; use -H or sort in pieces") 69 70 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1)) 71 72 void 73 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl) 74 int binno, depth, top; 75 struct filelist *filelist; 76 int nfiles; 77 FILE *outfp; 78 struct field *ftbl; 79 { 80 const u_char **keypos; 81 u_char *bufend, *tmpbuf; 82 u_char *weights; 83 int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0; 84 int c, nelem, base; 85 long sizes [NBINS+1]; 86 get_func_t get; 87 struct recheader *crec; 88 struct field tfield[2]; 89 FILE *prevfp, *tailfp[FSORTMAX+1]; 90 91 memset(tailfp, 0, sizeof(tailfp)); 92 prevfp = outfp; 93 memset(tfield, 0, sizeof(tfield)); 94 if (ftbl[0].flags & R) 95 tfield[0].weights = Rascii; 96 else 97 tfield[0].weights = ascii; 98 tfield[0].icol.num = 1; 99 weights = ftbl[0].weights; 100 if (!buffer) { 101 buffer = malloc(bufsize); 102 keylist = malloc(MAXNUM * sizeof(u_char *)); 103 memset(keylist, 0, MAXNUM * sizeof(u_char *)); 104 if (!SINGL_FLD) { 105 linebuf_size = DEFLLEN; 106 if ((linebuf = malloc(linebuf_size)) == NULL) 107 errx(2, "cannot allocate memory"); 108 } 109 } 110 bufend = buffer + bufsize; 111 if (binno >= 0) { 112 base = top + nfiles; 113 get = getnext; 114 } else { 115 base = 0; 116 if (SINGL_FLD) 117 get = makeline; 118 else 119 get = makekey; 120 } 121 for (;;) { 122 memset(sizes, 0, sizeof(sizes)); 123 c = ntfiles = 0; 124 if (binno == weights[REC_D] && 125 !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */ 126 rd_append(weights[REC_D], top, 127 nfiles, prevfp, buffer, bufend); 128 break; 129 } else if (binno == weights[REC_D]) { 130 depth = 0; /* start over on flat weights */ 131 ftbl = tfield; 132 weights = ftbl[0].weights; 133 } 134 while (c != EOF) { 135 keypos = keylist; 136 nelem = 0; 137 crec = (RECHEADER *) buffer; 138 139 do_read: 140 while((c = get(binno, top, filelist, nfiles, crec, 141 bufend, ftbl)) == 0) { 142 *keypos++ = crec->data + depth; 143 if (++nelem == MAXNUM) { 144 c = BUFFEND; 145 break; 146 } 147 crec =(RECHEADER *) ((char *) crec + 148 SALIGN(crec->length) + sizeof(TRECHEADER)); 149 } 150 151 if (c == BUFFEND && nelem < MAXNUM 152 && bufsize < MAXBUFSIZE) { 153 const u_char **keyp; 154 u_char *oldb = buffer; 155 156 /* buffer was too small for data, allocate 157 * bigger buffer */ 158 bufsize *= 2; 159 buffer = realloc(buffer, bufsize); 160 if (!buffer) { 161 err(2, "failed to realloc buffer to %ld bytes", 162 (unsigned long) bufsize); 163 } 164 bufend = buffer + bufsize; 165 166 /* patch up keylist[] */ 167 for(keyp = &keypos[-1]; keyp >= keylist; keyp--) 168 *keyp = buffer + (*keyp - oldb); 169 170 crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb)); 171 goto do_read; 172 } 173 174 if (c != BUFFEND && !ntfiles && !mfct) { 175 /* do not push */ 176 continue; 177 } 178 179 /* push */ 180 if (panic >= PANIC) { 181 fstack[MSTART + mfct].fp = ftmp(); 182 if ((stable_sort) 183 ? sradixsort(keylist, nelem, 184 weights, REC_D) 185 : radixsort(keylist, nelem, 186 weights, REC_D) ) 187 err(2, NULL); 188 append(keylist, nelem, depth, 189 fstack[MSTART + mfct].fp, putrec, 190 ftbl); 191 mfct++; 192 /* reduce number of open files */ 193 if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) { 194 /* 195 * Only copy extra incomplete crec 196 * data if there are any. 197 */ 198 int nodata = (bufend >= (u_char *)crec 199 && bufend <= crec->data); 200 201 if (!nodata) { 202 tmpbuf = malloc(bufend - 203 crec->data); 204 memmove(tmpbuf, crec->data, 205 bufend - crec->data); 206 } 207 208 CHECKFSTACK(base + ntfiles); 209 fstack[base + ntfiles].fp = ftmp(); 210 fmerge(0, MSTART, filelist, 211 mfct, geteasy, 212 fstack[base + ntfiles].fp, 213 putrec, ftbl); 214 ntfiles++; 215 mfct = 0; 216 217 if (!nodata) { 218 memmove(crec->data, tmpbuf, 219 bufend - crec->data); 220 free(tmpbuf); 221 } 222 } 223 } else { 224 CHECKFSTACK(base + ntfiles); 225 fstack[base + ntfiles].fp = ftmp(); 226 onepass(keylist, depth, nelem, sizes, 227 weights, fstack[base + ntfiles].fp); 228 ntfiles++; 229 } 230 } 231 if (!ntfiles && !mfct) { /* everything in memory--pop */ 232 if (nelem > 1 233 && ((stable_sort) 234 ? sradixsort(keylist, nelem, weights, REC_D) 235 : radixsort(keylist, nelem, weights, REC_D) )) 236 err(2, NULL); 237 if (nelem > 0) 238 append(keylist, nelem, depth, outfp, putline, ftbl); 239 break; /* pop */ 240 } 241 if (panic >= PANIC) { 242 if (!ntfiles) 243 fmerge(0, MSTART, filelist, mfct, geteasy, 244 outfp, putline, ftbl); 245 else 246 fmerge(0, base, filelist, ntfiles, geteasy, 247 outfp, putline, ftbl); 248 break; 249 250 } 251 total = maxb = lastb = 0; /* find if one bin dominates */ 252 for (i = 0; i < NBINS; i++) 253 if (sizes[i]) { 254 if (sizes[i] > sizes[maxb]) 255 maxb = i; 256 lastb = i; 257 total += sizes[i]; 258 } 259 if (sizes[maxb] < max((total / 2) , BUFSIZE)) 260 maxb = lastb; /* otherwise pop after last bin */ 261 fstack[base].lastb = lastb; 262 fstack[base].maxb = maxb; 263 264 /* start refining next level. */ 265 getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */ 266 for (i = 0; i < maxb; i++) { 267 if (!sizes[i]) /* bin empty; step ahead file offset */ 268 getnext(i, base, NULL,ntfiles, crec, bufend, 0); 269 else 270 fsort(i, depth+1, base, filelist, ntfiles, 271 outfp, ftbl); 272 } 273 274 get = getnext; 275 276 if (lastb != maxb) { 277 if (prevfp != outfp) 278 tailfp[panic] = prevfp; 279 prevfp = ftmp(); 280 for (i = maxb+1; i <= lastb; i++) 281 if (!sizes[i]) 282 getnext(i, base, NULL, ntfiles, crec, 283 bufend,0); 284 else 285 fsort(i, depth+1, base, filelist, 286 ntfiles, prevfp, ftbl); 287 } 288 289 /* sort biggest (or last) bin at this level */ 290 depth++; 291 panic++; 292 binno = maxb; 293 top = base; 294 nfiles = ntfiles; /* so overwrite them */ 295 } 296 if (prevfp != outfp) { 297 concat(outfp, prevfp); 298 fclose(prevfp); 299 } 300 for (i = panic; i >= 0; --i) 301 if (tailfp[i]) { 302 concat(outfp, tailfp[i]); 303 fclose(tailfp[i]); 304 } 305 306 /* If on top level, free our structures */ 307 if (depth == 0) { 308 free(keylist), keylist = NULL; 309 free(buffer), buffer = NULL; 310 } 311 } 312 313 /* 314 * This is one pass of radix exchange, dumping the bins to disk. 315 */ 316 #define swap(a, b, t) t = a, a = b, b = t 317 void 318 onepass(a, depth, n, sizes, tr, fp) 319 const u_char **a; 320 int depth; 321 long n, sizes[]; 322 u_char *tr; 323 FILE *fp; 324 { 325 size_t tsizes[NBINS+1]; 326 const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp; 327 static int histo[256]; 328 int *hp; 329 int c; 330 const u_char **an, *t, **aj; 331 const u_char **ak, *r; 332 333 memset(tsizes, 0, sizeof(tsizes)); 334 depth += sizeof(TRECHEADER); 335 an = &a[n]; 336 for (ak = a; ak < an; ak++) { 337 histo[c = tr[**ak]]++; 338 tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length; 339 } 340 341 bin[0] = a; 342 bpmax = bin + 256; 343 tp = top, hp = histo; 344 for (bp = bin; bp < bpmax; bp++) { 345 *tp++ = *(bp+1) = *bp + (c = *hp); 346 *hp++ = 0; 347 if (c <= 1) 348 continue; 349 } 350 for (aj = a; aj < an; *aj = r, aj = bin[c+1]) 351 for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;) 352 swap(*ak, r, t); 353 354 for (ak = a, c = 0; c < 256; c++) { 355 an = bin[c+1]; 356 n = an - ak; 357 tsizes[c] += n * sizeof(TRECHEADER); 358 /* tell getnext how many elements in this bin, this segment. */ 359 EWRITE(&tsizes[c], sizeof(size_t), 1, fp); 360 sizes[c] += tsizes[c]; 361 for (; ak < an; ++ak) 362 putrec((const RECHEADER *) *ak, fp); 363 } 364 } 365