1 /* $NetBSD: fsort.c,v 1.25 2003/08/07 11:15:54 agc Exp $ */ 2 3 /*- 4 * Copyright (c) 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Peter McIlroy. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * Read in the next bin. If it fits in one segment sort it; 37 * otherwise refine it by segment deeper by one character, 38 * and try again on smaller bins. Sort the final bin at this level 39 * of recursion to keep the head of fstack at 0. 40 * After PANIC passes, abort to merge sort. 41 */ 42 #include "sort.h" 43 #include "fsort.h" 44 45 #ifndef lint 46 __RCSID("$NetBSD: fsort.c,v 1.25 2003/08/07 11:15:54 agc Exp $"); 47 __SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93"); 48 #endif /* not lint */ 49 50 #include <stdlib.h> 51 #include <string.h> 52 53 static const u_char **keylist = 0; 54 u_char *buffer = 0, *linebuf = 0; 55 size_t bufsize = DEFBUFSIZE; 56 size_t linebuf_size; 57 #define FSORTMAX 4 58 int PANIC = FSORTMAX; 59 60 struct tempfile fstack[MAXFCT]; 61 #define MSTART (MAXFCT - MERGE_FNUM) 62 #define CHECKFSTACK(n) \ 63 if (n >= MAXFCT) \ 64 errx(2, "fstack: too many temporary files; use -H or sort in pieces") 65 66 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1)) 67 68 void 69 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl) 70 int binno, depth, top; 71 struct filelist *filelist; 72 int nfiles; 73 FILE *outfp; 74 struct field *ftbl; 75 { 76 const u_char **keypos; 77 u_char *bufend, *tmpbuf; 78 u_char *weights; 79 int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0; 80 int c, nelem, base; 81 long sizes [NBINS+1]; 82 get_func_t get; 83 struct recheader *crec; 84 struct field tfield[2]; 85 FILE *prevfp, *tailfp[FSORTMAX+1]; 86 87 memset(tailfp, 0, sizeof(tailfp)); 88 prevfp = outfp; 89 memset(tfield, 0, sizeof(tfield)); 90 if (ftbl[0].flags & R) 91 tfield[0].weights = Rascii; 92 else 93 tfield[0].weights = ascii; 94 tfield[0].icol.num = 1; 95 weights = ftbl[0].weights; 96 if (!buffer) { 97 buffer = malloc(bufsize); 98 keylist = malloc(MAXNUM * sizeof(u_char *)); 99 memset(keylist, 0, MAXNUM * sizeof(u_char *)); 100 if (!SINGL_FLD) { 101 linebuf_size = DEFLLEN; 102 if ((linebuf = malloc(linebuf_size)) == NULL) 103 errx(2, "cannot allocate memory"); 104 } 105 } 106 bufend = buffer + bufsize; 107 if (binno >= 0) { 108 base = top + nfiles; 109 get = getnext; 110 } else { 111 base = 0; 112 if (SINGL_FLD) 113 get = makeline; 114 else 115 get = makekey; 116 } 117 for (;;) { 118 memset(sizes, 0, sizeof(sizes)); 119 c = ntfiles = 0; 120 if (binno == weights[REC_D] && 121 !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */ 122 rd_append(weights[REC_D], top, 123 nfiles, prevfp, buffer, bufend); 124 break; 125 } else if (binno == weights[REC_D]) { 126 depth = 0; /* start over on flat weights */ 127 ftbl = tfield; 128 weights = ftbl[0].weights; 129 } 130 while (c != EOF) { 131 keypos = keylist; 132 nelem = 0; 133 crec = (RECHEADER *) buffer; 134 135 do_read: 136 while((c = get(binno, top, filelist, nfiles, crec, 137 bufend, ftbl)) == 0) { 138 *keypos++ = crec->data + depth; 139 if (++nelem == MAXNUM) { 140 c = BUFFEND; 141 break; 142 } 143 crec =(RECHEADER *) ((char *) crec + 144 SALIGN(crec->length) + sizeof(TRECHEADER)); 145 } 146 147 if (c == BUFFEND && nelem < MAXNUM 148 && bufsize < MAXBUFSIZE) { 149 const u_char **keyp; 150 u_char *oldb = buffer; 151 152 /* buffer was too small for data, allocate 153 * bigger buffer */ 154 bufsize *= 2; 155 buffer = realloc(buffer, bufsize); 156 if (!buffer) { 157 err(2, "failed to realloc buffer to %ld bytes", 158 (unsigned long) bufsize); 159 } 160 bufend = buffer + bufsize; 161 162 /* patch up keylist[] */ 163 for(keyp = &keypos[-1]; keyp >= keylist; keyp--) 164 *keyp = buffer + (*keyp - oldb); 165 166 crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb)); 167 goto do_read; 168 } 169 170 if (c != BUFFEND && !ntfiles && !mfct) { 171 /* do not push */ 172 continue; 173 } 174 175 /* push */ 176 if (panic >= PANIC) { 177 fstack[MSTART + mfct].fp = ftmp(); 178 if ((stable_sort) 179 ? sradixsort(keylist, nelem, 180 weights, REC_D) 181 : radixsort(keylist, nelem, 182 weights, REC_D) ) 183 err(2, NULL); 184 append(keylist, nelem, depth, 185 fstack[MSTART + mfct].fp, putrec, 186 ftbl); 187 mfct++; 188 /* reduce number of open files */ 189 if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) { 190 /* 191 * Only copy extra incomplete crec 192 * data if there are any. 193 */ 194 int nodata = (bufend >= (u_char *)crec 195 && bufend <= crec->data); 196 197 if (!nodata) { 198 tmpbuf = malloc(bufend - 199 crec->data); 200 memmove(tmpbuf, crec->data, 201 bufend - crec->data); 202 } 203 204 CHECKFSTACK(base + ntfiles); 205 fstack[base + ntfiles].fp = ftmp(); 206 fmerge(0, MSTART, filelist, 207 mfct, geteasy, 208 fstack[base + ntfiles].fp, 209 putrec, ftbl); 210 ntfiles++; 211 mfct = 0; 212 213 if (!nodata) { 214 memmove(crec->data, tmpbuf, 215 bufend - crec->data); 216 free(tmpbuf); 217 } 218 } 219 } else { 220 CHECKFSTACK(base + ntfiles); 221 fstack[base + ntfiles].fp = ftmp(); 222 onepass(keylist, depth, nelem, sizes, 223 weights, fstack[base + ntfiles].fp); 224 ntfiles++; 225 } 226 } 227 if (!ntfiles && !mfct) { /* everything in memory--pop */ 228 if (nelem > 1 229 && ((stable_sort) 230 ? sradixsort(keylist, nelem, weights, REC_D) 231 : radixsort(keylist, nelem, weights, REC_D) )) 232 err(2, NULL); 233 if (nelem > 0) 234 append(keylist, nelem, depth, outfp, putline, ftbl); 235 break; /* pop */ 236 } 237 if (panic >= PANIC) { 238 if (!ntfiles) 239 fmerge(0, MSTART, filelist, mfct, geteasy, 240 outfp, putline, ftbl); 241 else 242 fmerge(0, base, filelist, ntfiles, geteasy, 243 outfp, putline, ftbl); 244 break; 245 246 } 247 total = maxb = lastb = 0; /* find if one bin dominates */ 248 for (i = 0; i < NBINS; i++) 249 if (sizes[i]) { 250 if (sizes[i] > sizes[maxb]) 251 maxb = i; 252 lastb = i; 253 total += sizes[i]; 254 } 255 if (sizes[maxb] < max((total / 2) , BUFSIZE)) 256 maxb = lastb; /* otherwise pop after last bin */ 257 fstack[base].lastb = lastb; 258 fstack[base].maxb = maxb; 259 260 /* start refining next level. */ 261 getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */ 262 for (i = 0; i < maxb; i++) { 263 if (!sizes[i]) /* bin empty; step ahead file offset */ 264 getnext(i, base, NULL,ntfiles, crec, bufend, 0); 265 else 266 fsort(i, depth+1, base, filelist, ntfiles, 267 outfp, ftbl); 268 } 269 270 get = getnext; 271 272 if (lastb != maxb) { 273 if (prevfp != outfp) 274 tailfp[panic] = prevfp; 275 prevfp = ftmp(); 276 for (i = maxb+1; i <= lastb; i++) 277 if (!sizes[i]) 278 getnext(i, base, NULL, ntfiles, crec, 279 bufend,0); 280 else 281 fsort(i, depth+1, base, filelist, 282 ntfiles, prevfp, ftbl); 283 } 284 285 /* sort biggest (or last) bin at this level */ 286 depth++; 287 panic++; 288 binno = maxb; 289 top = base; 290 nfiles = ntfiles; /* so overwrite them */ 291 } 292 if (prevfp != outfp) { 293 concat(outfp, prevfp); 294 fclose(prevfp); 295 } 296 for (i = panic; i >= 0; --i) 297 if (tailfp[i]) { 298 concat(outfp, tailfp[i]); 299 fclose(tailfp[i]); 300 } 301 302 /* If on top level, free our structures */ 303 if (depth == 0) { 304 free(keylist), keylist = NULL; 305 free(buffer), buffer = NULL; 306 } 307 } 308 309 /* 310 * This is one pass of radix exchange, dumping the bins to disk. 311 */ 312 #define swap(a, b, t) t = a, a = b, b = t 313 void 314 onepass(a, depth, n, sizes, tr, fp) 315 const u_char **a; 316 int depth; 317 long n, sizes[]; 318 u_char *tr; 319 FILE *fp; 320 { 321 size_t tsizes[NBINS+1]; 322 const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp; 323 static int histo[256]; 324 int *hp; 325 int c; 326 const u_char **an, *t, **aj; 327 const u_char **ak, *r; 328 329 memset(tsizes, 0, sizeof(tsizes)); 330 depth += sizeof(TRECHEADER); 331 an = &a[n]; 332 for (ak = a; ak < an; ak++) { 333 histo[c = tr[**ak]]++; 334 tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length; 335 } 336 337 bin[0] = a; 338 bpmax = bin + 256; 339 tp = top, hp = histo; 340 for (bp = bin; bp < bpmax; bp++) { 341 *tp++ = *(bp+1) = *bp + (c = *hp); 342 *hp++ = 0; 343 if (c <= 1) 344 continue; 345 } 346 for (aj = a; aj < an; *aj = r, aj = bin[c+1]) 347 for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;) 348 swap(*ak, r, t); 349 350 for (ak = a, c = 0; c < 256; c++) { 351 an = bin[c+1]; 352 n = an - ak; 353 tsizes[c] += n * sizeof(TRECHEADER); 354 /* tell getnext how many elements in this bin, this segment. */ 355 EWRITE(&tsizes[c], sizeof(size_t), 1, fp); 356 sizes[c] += tsizes[c]; 357 for (; ak < an; ++ak) 358 putrec((const RECHEADER *) *ak, fp); 359 } 360 } 361