xref: /netbsd-src/usr.bin/sort/fsort.c (revision 326b2259b73e878289ebd80cd9d20bc5aee35e99)
1 /*	$NetBSD: fsort.c,v 1.25 2003/08/07 11:15:54 agc Exp $	*/
2 
3 /*-
4  * Copyright (c) 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Peter McIlroy.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * Read in the next bin.  If it fits in one segment sort it;
37  * otherwise refine it by segment deeper by one character,
38  * and try again on smaller bins.  Sort the final bin at this level
39  * of recursion to keep the head of fstack at 0.
40  * After PANIC passes, abort to merge sort.
41  */
42 #include "sort.h"
43 #include "fsort.h"
44 
45 #ifndef lint
46 __RCSID("$NetBSD: fsort.c,v 1.25 2003/08/07 11:15:54 agc Exp $");
47 __SCCSID("@(#)fsort.c	8.1 (Berkeley) 6/6/93");
48 #endif /* not lint */
49 
50 #include <stdlib.h>
51 #include <string.h>
52 
53 static const u_char **keylist = 0;
54 u_char *buffer = 0, *linebuf = 0;
55 size_t bufsize = DEFBUFSIZE;
56 size_t linebuf_size;
57 #define FSORTMAX 4
58 int PANIC = FSORTMAX;
59 
60 struct tempfile fstack[MAXFCT];
61 #define MSTART		(MAXFCT - MERGE_FNUM)
62 #define	CHECKFSTACK(n)					\
63 	if (n >= MAXFCT)				\
64 		errx(2, "fstack: too many temporary files; use -H or sort in pieces")
65 
66 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1))
67 
68 void
69 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl)
70 	int binno, depth, top;
71 	struct filelist *filelist;
72 	int nfiles;
73 	FILE *outfp;
74 	struct field *ftbl;
75 {
76 	const u_char **keypos;
77 	u_char *bufend, *tmpbuf;
78 	u_char *weights;
79 	int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
80 	int c, nelem, base;
81 	long sizes [NBINS+1];
82 	get_func_t get;
83 	struct recheader *crec;
84 	struct field tfield[2];
85 	FILE *prevfp, *tailfp[FSORTMAX+1];
86 
87 	memset(tailfp, 0, sizeof(tailfp));
88 	prevfp = outfp;
89 	memset(tfield, 0, sizeof(tfield));
90 	if (ftbl[0].flags & R)
91 		tfield[0].weights = Rascii;
92 	else
93 		tfield[0].weights = ascii;
94 	tfield[0].icol.num = 1;
95 	weights = ftbl[0].weights;
96 	if (!buffer) {
97 		buffer = malloc(bufsize);
98 		keylist = malloc(MAXNUM * sizeof(u_char *));
99 		memset(keylist, 0, MAXNUM * sizeof(u_char *));
100 		if (!SINGL_FLD) {
101 			linebuf_size = DEFLLEN;
102 			if ((linebuf = malloc(linebuf_size)) == NULL)
103 				errx(2, "cannot allocate memory");
104 		}
105 	}
106 	bufend = buffer + bufsize;
107 	if (binno >= 0) {
108 		base = top + nfiles;
109 		get = getnext;
110 	} else {
111 		base = 0;
112 		if (SINGL_FLD)
113 			get = makeline;
114 		else
115 			get = makekey;
116 	}
117 	for (;;) {
118 		memset(sizes, 0, sizeof(sizes));
119 		c = ntfiles = 0;
120 		if (binno == weights[REC_D] &&
121 		    !(SINGL_FLD && ftbl[0].flags & F)) {	/* pop */
122 			rd_append(weights[REC_D], top,
123 			    nfiles, prevfp, buffer, bufend);
124 			break;
125 		} else if (binno == weights[REC_D]) {
126 			depth = 0;		/* start over on flat weights */
127 			ftbl = tfield;
128 			weights = ftbl[0].weights;
129 		}
130 		while (c != EOF) {
131 			keypos = keylist;
132 			nelem = 0;
133 			crec = (RECHEADER *) buffer;
134 
135 		   do_read:
136 			while((c = get(binno, top, filelist, nfiles, crec,
137 			    bufend, ftbl)) == 0) {
138 				*keypos++ = crec->data + depth;
139 				if (++nelem == MAXNUM) {
140 					c = BUFFEND;
141 					break;
142 				}
143 				crec =(RECHEADER *)	((char *) crec +
144 				SALIGN(crec->length) + sizeof(TRECHEADER));
145 			}
146 
147 			if (c == BUFFEND && nelem < MAXNUM
148 			    && bufsize < MAXBUFSIZE) {
149 				const u_char **keyp;
150 				u_char *oldb = buffer;
151 
152 				/* buffer was too small for data, allocate
153 				 * bigger buffer */
154 				bufsize *= 2;
155 				buffer = realloc(buffer, bufsize);
156 				if (!buffer) {
157 					err(2, "failed to realloc buffer to %ld bytes",
158 						(unsigned long) bufsize);
159 				}
160 				bufend = buffer + bufsize;
161 
162 				/* patch up keylist[] */
163 				for(keyp = &keypos[-1]; keyp >= keylist; keyp--)
164 					*keyp = buffer + (*keyp - oldb);
165 
166 				crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb));
167 				goto do_read;
168 			}
169 
170 			if (c != BUFFEND && !ntfiles && !mfct) {
171 				/* do not push */
172 				continue;
173 			}
174 
175 			/* push */
176 			if (panic >= PANIC) {
177 				fstack[MSTART + mfct].fp = ftmp();
178 				if ((stable_sort)
179 					? sradixsort(keylist, nelem,
180 						weights, REC_D)
181 					: radixsort(keylist, nelem,
182 						weights, REC_D) )
183 					err(2, NULL);
184 				append(keylist, nelem, depth,
185 				    fstack[MSTART + mfct].fp, putrec,
186 				    ftbl);
187 				mfct++;
188 				/* reduce number of open files */
189 				if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) {
190 					/*
191 					 * Only copy extra incomplete crec
192 					 * data if there are any.
193 					 */
194 					int nodata = (bufend >= (u_char *)crec
195 					    && bufend <= crec->data);
196 
197 					if (!nodata) {
198 						tmpbuf = malloc(bufend -
199 						    crec->data);
200 						memmove(tmpbuf, crec->data,
201 						    bufend - crec->data);
202 					}
203 
204 					CHECKFSTACK(base + ntfiles);
205 					fstack[base + ntfiles].fp = ftmp();
206 					fmerge(0, MSTART, filelist,
207 					    mfct, geteasy,
208 					    fstack[base + ntfiles].fp,
209 					    putrec, ftbl);
210 					ntfiles++;
211 					mfct = 0;
212 
213 					if (!nodata) {
214 						memmove(crec->data, tmpbuf,
215 						    bufend - crec->data);
216 						free(tmpbuf);
217 					}
218 				}
219 			} else {
220 				CHECKFSTACK(base + ntfiles);
221 				fstack[base + ntfiles].fp = ftmp();
222 				onepass(keylist, depth, nelem, sizes,
223 				    weights, fstack[base + ntfiles].fp);
224 				ntfiles++;
225 			}
226 		}
227 		if (!ntfiles && !mfct) {	/* everything in memory--pop */
228 			if (nelem > 1
229 			    && ((stable_sort)
230 				? sradixsort(keylist, nelem, weights, REC_D)
231 				: radixsort(keylist, nelem, weights, REC_D) ))
232 				err(2, NULL);
233 			if (nelem > 0)
234 			  append(keylist, nelem, depth, outfp, putline, ftbl);
235 			break;					/* pop */
236 		}
237 		if (panic >= PANIC) {
238 			if (!ntfiles)
239 				fmerge(0, MSTART, filelist, mfct, geteasy,
240 				    outfp, putline, ftbl);
241 			else
242 				fmerge(0, base, filelist, ntfiles, geteasy,
243 				    outfp, putline, ftbl);
244 			break;
245 
246 		}
247 		total = maxb = lastb = 0;	/* find if one bin dominates */
248 		for (i = 0; i < NBINS; i++)
249 			if (sizes[i]) {
250 				if (sizes[i] > sizes[maxb])
251 					maxb = i;
252 				lastb = i;
253 				total += sizes[i];
254 			}
255 		if (sizes[maxb] < max((total / 2) , BUFSIZE))
256 			maxb = lastb;	/* otherwise pop after last bin */
257 		fstack[base].lastb = lastb;
258 		fstack[base].maxb = maxb;
259 
260 		/* start refining next level. */
261 		getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */
262 		for (i = 0; i < maxb; i++) {
263 			if (!sizes[i])	/* bin empty; step ahead file offset */
264 				getnext(i, base, NULL,ntfiles, crec, bufend, 0);
265 			else
266 				fsort(i, depth+1, base, filelist, ntfiles,
267 					outfp, ftbl);
268 		}
269 
270 		get = getnext;
271 
272 		if (lastb != maxb) {
273 			if (prevfp != outfp)
274 				tailfp[panic] = prevfp;
275 			prevfp = ftmp();
276 			for (i = maxb+1; i <= lastb; i++)
277 				if (!sizes[i])
278 					getnext(i, base, NULL, ntfiles, crec,
279 					    bufend,0);
280 				else
281 					fsort(i, depth+1, base, filelist,
282 					    ntfiles, prevfp, ftbl);
283 		}
284 
285 		/* sort biggest (or last) bin at this level */
286 		depth++;
287 		panic++;
288 		binno = maxb;
289 		top = base;
290 		nfiles = ntfiles;		/* so overwrite them */
291 	}
292 	if (prevfp != outfp) {
293 		concat(outfp, prevfp);
294 		fclose(prevfp);
295 	}
296 	for (i = panic; i >= 0; --i)
297 		if (tailfp[i]) {
298 			concat(outfp, tailfp[i]);
299 			fclose(tailfp[i]);
300 		}
301 
302 	/* If on top level, free our structures */
303 	if (depth == 0) {
304 		free(keylist), keylist = NULL;
305 		free(buffer), buffer = NULL;
306 	}
307 }
308 
309 /*
310  * This is one pass of radix exchange, dumping the bins to disk.
311  */
312 #define swap(a, b, t) t = a, a = b, b = t
313 void
314 onepass(a, depth, n, sizes, tr, fp)
315 	const u_char **a;
316 	int depth;
317 	long n, sizes[];
318 	u_char *tr;
319 	FILE *fp;
320 {
321 	size_t tsizes[NBINS+1];
322 	const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp;
323 	static int histo[256];
324 	int *hp;
325 	int c;
326 	const u_char **an, *t, **aj;
327 	const u_char **ak, *r;
328 
329 	memset(tsizes, 0, sizeof(tsizes));
330 	depth += sizeof(TRECHEADER);
331 	an = &a[n];
332 	for (ak = a; ak < an; ak++) {
333 		histo[c = tr[**ak]]++;
334 		tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length;
335 	}
336 
337 	bin[0] = a;
338 	bpmax = bin + 256;
339 	tp = top, hp = histo;
340 	for (bp = bin; bp < bpmax; bp++) {
341 		*tp++ = *(bp+1) = *bp + (c = *hp);
342 		*hp++ = 0;
343 		if (c <= 1)
344 			continue;
345 	}
346 	for (aj = a; aj < an; *aj = r, aj = bin[c+1])
347 		for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;)
348 			swap(*ak, r, t);
349 
350 	for (ak = a, c = 0; c < 256; c++) {
351 		an = bin[c+1];
352 		n = an - ak;
353 		tsizes[c] += n * sizeof(TRECHEADER);
354 		/* tell getnext how many elements in this bin, this segment. */
355 		EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
356 		sizes[c] += tsizes[c];
357 		for (; ak < an; ++ak)
358 			putrec((const RECHEADER *) *ak, fp);
359 	}
360 }
361