xref: /netbsd-src/usr.bin/sort/fsort.c (revision d710132b4b8ce7f7cccaaf660cb16aa16b4077a0)
1 /*	$NetBSD: fsort.c,v 1.24 2002/12/24 15:15:01 jdolecek Exp $	*/
2 
3 /*-
4  * Copyright (c) 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Peter McIlroy.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 /*
40  * Read in the next bin.  If it fits in one segment sort it;
41  * otherwise refine it by segment deeper by one character,
42  * and try again on smaller bins.  Sort the final bin at this level
43  * of recursion to keep the head of fstack at 0.
44  * After PANIC passes, abort to merge sort.
45  */
46 #include "sort.h"
47 #include "fsort.h"
48 
49 #ifndef lint
50 __RCSID("$NetBSD: fsort.c,v 1.24 2002/12/24 15:15:01 jdolecek Exp $");
51 __SCCSID("@(#)fsort.c	8.1 (Berkeley) 6/6/93");
52 #endif /* not lint */
53 
54 #include <stdlib.h>
55 #include <string.h>
56 
57 static const u_char **keylist = 0;
58 u_char *buffer = 0, *linebuf = 0;
59 size_t bufsize = DEFBUFSIZE;
60 size_t linebuf_size;
61 #define FSORTMAX 4
62 int PANIC = FSORTMAX;
63 
64 struct tempfile fstack[MAXFCT];
65 #define MSTART		(MAXFCT - MERGE_FNUM)
66 #define	CHECKFSTACK(n)					\
67 	if (n >= MAXFCT)				\
68 		errx(2, "fstack: too many temporary files; use -H or sort in pieces")
69 
70 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1))
71 
72 void
73 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl)
74 	int binno, depth, top;
75 	struct filelist *filelist;
76 	int nfiles;
77 	FILE *outfp;
78 	struct field *ftbl;
79 {
80 	const u_char **keypos;
81 	u_char *bufend, *tmpbuf;
82 	u_char *weights;
83 	int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
84 	int c, nelem, base;
85 	long sizes [NBINS+1];
86 	get_func_t get;
87 	struct recheader *crec;
88 	struct field tfield[2];
89 	FILE *prevfp, *tailfp[FSORTMAX+1];
90 
91 	memset(tailfp, 0, sizeof(tailfp));
92 	prevfp = outfp;
93 	memset(tfield, 0, sizeof(tfield));
94 	if (ftbl[0].flags & R)
95 		tfield[0].weights = Rascii;
96 	else
97 		tfield[0].weights = ascii;
98 	tfield[0].icol.num = 1;
99 	weights = ftbl[0].weights;
100 	if (!buffer) {
101 		buffer = malloc(bufsize);
102 		keylist = malloc(MAXNUM * sizeof(u_char *));
103 		memset(keylist, 0, MAXNUM * sizeof(u_char *));
104 		if (!SINGL_FLD) {
105 			linebuf_size = DEFLLEN;
106 			if ((linebuf = malloc(linebuf_size)) == NULL)
107 				errx(2, "cannot allocate memory");
108 		}
109 	}
110 	bufend = buffer + bufsize;
111 	if (binno >= 0) {
112 		base = top + nfiles;
113 		get = getnext;
114 	} else {
115 		base = 0;
116 		if (SINGL_FLD)
117 			get = makeline;
118 		else
119 			get = makekey;
120 	}
121 	for (;;) {
122 		memset(sizes, 0, sizeof(sizes));
123 		c = ntfiles = 0;
124 		if (binno == weights[REC_D] &&
125 		    !(SINGL_FLD && ftbl[0].flags & F)) {	/* pop */
126 			rd_append(weights[REC_D], top,
127 			    nfiles, prevfp, buffer, bufend);
128 			break;
129 		} else if (binno == weights[REC_D]) {
130 			depth = 0;		/* start over on flat weights */
131 			ftbl = tfield;
132 			weights = ftbl[0].weights;
133 		}
134 		while (c != EOF) {
135 			keypos = keylist;
136 			nelem = 0;
137 			crec = (RECHEADER *) buffer;
138 
139 		   do_read:
140 			while((c = get(binno, top, filelist, nfiles, crec,
141 			    bufend, ftbl)) == 0) {
142 				*keypos++ = crec->data + depth;
143 				if (++nelem == MAXNUM) {
144 					c = BUFFEND;
145 					break;
146 				}
147 				crec =(RECHEADER *)	((char *) crec +
148 				SALIGN(crec->length) + sizeof(TRECHEADER));
149 			}
150 
151 			if (c == BUFFEND && nelem < MAXNUM
152 			    && bufsize < MAXBUFSIZE) {
153 				const u_char **keyp;
154 				u_char *oldb = buffer;
155 
156 				/* buffer was too small for data, allocate
157 				 * bigger buffer */
158 				bufsize *= 2;
159 				buffer = realloc(buffer, bufsize);
160 				if (!buffer) {
161 					err(2, "failed to realloc buffer to %ld bytes",
162 						(unsigned long) bufsize);
163 				}
164 				bufend = buffer + bufsize;
165 
166 				/* patch up keylist[] */
167 				for(keyp = &keypos[-1]; keyp >= keylist; keyp--)
168 					*keyp = buffer + (*keyp - oldb);
169 
170 				crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb));
171 				goto do_read;
172 			}
173 
174 			if (c != BUFFEND && !ntfiles && !mfct) {
175 				/* do not push */
176 				continue;
177 			}
178 
179 			/* push */
180 			if (panic >= PANIC) {
181 				fstack[MSTART + mfct].fp = ftmp();
182 				if ((stable_sort)
183 					? sradixsort(keylist, nelem,
184 						weights, REC_D)
185 					: radixsort(keylist, nelem,
186 						weights, REC_D) )
187 					err(2, NULL);
188 				append(keylist, nelem, depth,
189 				    fstack[MSTART + mfct].fp, putrec,
190 				    ftbl);
191 				mfct++;
192 				/* reduce number of open files */
193 				if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) {
194 					/*
195 					 * Only copy extra incomplete crec
196 					 * data if there are any.
197 					 */
198 					int nodata = (bufend >= (u_char *)crec
199 					    && bufend <= crec->data);
200 
201 					if (!nodata) {
202 						tmpbuf = malloc(bufend -
203 						    crec->data);
204 						memmove(tmpbuf, crec->data,
205 						    bufend - crec->data);
206 					}
207 
208 					CHECKFSTACK(base + ntfiles);
209 					fstack[base + ntfiles].fp = ftmp();
210 					fmerge(0, MSTART, filelist,
211 					    mfct, geteasy,
212 					    fstack[base + ntfiles].fp,
213 					    putrec, ftbl);
214 					ntfiles++;
215 					mfct = 0;
216 
217 					if (!nodata) {
218 						memmove(crec->data, tmpbuf,
219 						    bufend - crec->data);
220 						free(tmpbuf);
221 					}
222 				}
223 			} else {
224 				CHECKFSTACK(base + ntfiles);
225 				fstack[base + ntfiles].fp = ftmp();
226 				onepass(keylist, depth, nelem, sizes,
227 				    weights, fstack[base + ntfiles].fp);
228 				ntfiles++;
229 			}
230 		}
231 		if (!ntfiles && !mfct) {	/* everything in memory--pop */
232 			if (nelem > 1
233 			    && ((stable_sort)
234 				? sradixsort(keylist, nelem, weights, REC_D)
235 				: radixsort(keylist, nelem, weights, REC_D) ))
236 				err(2, NULL);
237 			if (nelem > 0)
238 			  append(keylist, nelem, depth, outfp, putline, ftbl);
239 			break;					/* pop */
240 		}
241 		if (panic >= PANIC) {
242 			if (!ntfiles)
243 				fmerge(0, MSTART, filelist, mfct, geteasy,
244 				    outfp, putline, ftbl);
245 			else
246 				fmerge(0, base, filelist, ntfiles, geteasy,
247 				    outfp, putline, ftbl);
248 			break;
249 
250 		}
251 		total = maxb = lastb = 0;	/* find if one bin dominates */
252 		for (i = 0; i < NBINS; i++)
253 			if (sizes[i]) {
254 				if (sizes[i] > sizes[maxb])
255 					maxb = i;
256 				lastb = i;
257 				total += sizes[i];
258 			}
259 		if (sizes[maxb] < max((total / 2) , BUFSIZE))
260 			maxb = lastb;	/* otherwise pop after last bin */
261 		fstack[base].lastb = lastb;
262 		fstack[base].maxb = maxb;
263 
264 		/* start refining next level. */
265 		getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */
266 		for (i = 0; i < maxb; i++) {
267 			if (!sizes[i])	/* bin empty; step ahead file offset */
268 				getnext(i, base, NULL,ntfiles, crec, bufend, 0);
269 			else
270 				fsort(i, depth+1, base, filelist, ntfiles,
271 					outfp, ftbl);
272 		}
273 
274 		get = getnext;
275 
276 		if (lastb != maxb) {
277 			if (prevfp != outfp)
278 				tailfp[panic] = prevfp;
279 			prevfp = ftmp();
280 			for (i = maxb+1; i <= lastb; i++)
281 				if (!sizes[i])
282 					getnext(i, base, NULL, ntfiles, crec,
283 					    bufend,0);
284 				else
285 					fsort(i, depth+1, base, filelist,
286 					    ntfiles, prevfp, ftbl);
287 		}
288 
289 		/* sort biggest (or last) bin at this level */
290 		depth++;
291 		panic++;
292 		binno = maxb;
293 		top = base;
294 		nfiles = ntfiles;		/* so overwrite them */
295 	}
296 	if (prevfp != outfp) {
297 		concat(outfp, prevfp);
298 		fclose(prevfp);
299 	}
300 	for (i = panic; i >= 0; --i)
301 		if (tailfp[i]) {
302 			concat(outfp, tailfp[i]);
303 			fclose(tailfp[i]);
304 		}
305 
306 	/* If on top level, free our structures */
307 	if (depth == 0) {
308 		free(keylist), keylist = NULL;
309 		free(buffer), buffer = NULL;
310 	}
311 }
312 
313 /*
314  * This is one pass of radix exchange, dumping the bins to disk.
315  */
316 #define swap(a, b, t) t = a, a = b, b = t
317 void
318 onepass(a, depth, n, sizes, tr, fp)
319 	const u_char **a;
320 	int depth;
321 	long n, sizes[];
322 	u_char *tr;
323 	FILE *fp;
324 {
325 	size_t tsizes[NBINS+1];
326 	const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp;
327 	static int histo[256];
328 	int *hp;
329 	int c;
330 	const u_char **an, *t, **aj;
331 	const u_char **ak, *r;
332 
333 	memset(tsizes, 0, sizeof(tsizes));
334 	depth += sizeof(TRECHEADER);
335 	an = &a[n];
336 	for (ak = a; ak < an; ak++) {
337 		histo[c = tr[**ak]]++;
338 		tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length;
339 	}
340 
341 	bin[0] = a;
342 	bpmax = bin + 256;
343 	tp = top, hp = histo;
344 	for (bp = bin; bp < bpmax; bp++) {
345 		*tp++ = *(bp+1) = *bp + (c = *hp);
346 		*hp++ = 0;
347 		if (c <= 1)
348 			continue;
349 	}
350 	for (aj = a; aj < an; *aj = r, aj = bin[c+1])
351 		for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;)
352 			swap(*ak, r, t);
353 
354 	for (ak = a, c = 0; c < 256; c++) {
355 		an = bin[c+1];
356 		n = an - ak;
357 		tsizes[c] += n * sizeof(TRECHEADER);
358 		/* tell getnext how many elements in this bin, this segment. */
359 		EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
360 		sizes[c] += tsizes[c];
361 		for (; ak < an; ++ak)
362 			putrec((const RECHEADER *) *ak, fp);
363 	}
364 }
365