xref: /netbsd-src/usr.bin/sort/msort.c (revision 274254cdae52594c1aa480a736aef78313d15c9c)
1 /*	$NetBSD: msort.c,v 1.18 2008/04/28 20:24:15 martin Exp $	*/
2 
3 /*-
4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Ben Harris and Jaromir Dolecek.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (c) 1993
34  *	The Regents of the University of California.  All rights reserved.
35  *
36  * This code is derived from software contributed to Berkeley by
37  * Peter McIlroy.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. Neither the name of the University nor the names of its contributors
48  *    may be used to endorse or promote products derived from this software
49  *    without specific prior written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61  * SUCH DAMAGE.
62  */
63 
64 #include "sort.h"
65 #include "fsort.h"
66 
67 #ifndef lint
68 __RCSID("$NetBSD: msort.c,v 1.18 2008/04/28 20:24:15 martin Exp $");
69 __SCCSID("@(#)msort.c	8.1 (Berkeley) 6/6/93");
70 #endif /* not lint */
71 
72 #include <stdlib.h>
73 #include <string.h>
74 #include <unistd.h>
75 
76 /* Subroutines using comparisons: merge sort and check order */
77 #define DELETE (1)
78 
79 typedef struct mfile {
80 	u_char *end;
81 	short flno;
82 	struct recheader rec[1];
83 } MFILE;
84 
85 static u_char *wts, *wts1 = NULL;
86 
87 static int cmp __P((RECHEADER *, RECHEADER *));
88 static int insert __P((struct mfile **, struct mfile **, int, int));
89 static void merge(int, int, get_func_t, FILE *, put_func_t, struct field *);
90 
91 void
92 fmerge(binno, top, filelist, nfiles, get, outfp, fput, ftbl)
93 	int binno, top;
94 	struct filelist *filelist;
95 	int nfiles;
96 	get_func_t get;
97 	FILE *outfp;
98 	put_func_t fput;
99 	struct field *ftbl;
100 {
101 	FILE *tout;
102 	int i, j, last;
103 	put_func_t put;
104 	struct tempfile *l_fstack;
105 
106 	wts = ftbl->weights;
107 	if (!UNIQUE && SINGL_FLD && ftbl->flags & F)
108 		wts1 = (ftbl->flags & R) ? Rascii : ascii;
109 
110 	if (!buffer) {
111 		buffer = malloc(bufsize);
112 		if (!buffer)
113 			err(2, "fmerge(): malloc");
114 		memset(buffer, 0, bufsize);
115 
116 		if (!linebuf && !SINGL_FLD) {
117 			linebuf_size = DEFLLEN;
118 			linebuf = malloc(linebuf_size);
119 			memset(linebuf, 0, linebuf_size);
120 		}
121 	}
122 
123 	if (binno >= 0)
124 		l_fstack = fstack + top;
125 	else
126 		l_fstack = fstack;
127 
128 	while (nfiles) {
129 		put = putrec;
130 		for (j = 0; j < nfiles; j += MERGE_FNUM) {
131 			if (nfiles <= MERGE_FNUM) {
132 				tout = outfp;
133 				put = fput;
134 			}
135 			else
136 				tout = ftmp();
137 			last = min(MERGE_FNUM, nfiles - j);
138 			if (binno < 0) {
139 				for (i = 0; i < last; i++)
140 					if (!(l_fstack[i+MAXFCT-1-MERGE_FNUM].fp =
141 					    fopen(filelist->names[j+i], "r")))
142 						err(2, "%s",
143 							filelist->names[j+i]);
144 				merge(MAXFCT-1-MERGE_FNUM, last, get, tout, put, ftbl);
145 			} else {
146 				for (i = 0; i< last; i++)
147 					rewind(l_fstack[i+j].fp);
148 				merge(top+j, last, get, tout, put, ftbl);
149 			}
150 			if (nfiles > MERGE_FNUM)
151 				l_fstack[j/MERGE_FNUM].fp = tout;
152 		}
153 		nfiles = (nfiles + (MERGE_FNUM - 1)) / MERGE_FNUM;
154 		if (nfiles == 1)
155 			nfiles = 0;
156 		if (binno < 0) {
157 			binno = 0;
158 			get = geteasy;
159 			top = 0;
160 		}
161 	}
162 }
163 
164 static void
165 merge(infl0, nfiles, get, outfp, put, ftbl)
166 	int infl0, nfiles;
167 	get_func_t get;
168 	put_func_t put;
169 	FILE *outfp;
170 	struct field *ftbl;
171 {
172 	int c, i, j, nf = nfiles;
173 	struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
174 	size_t availsz = bufsize;
175 	static void *bufs[MERGE_FNUM + 1];
176 	static size_t bufs_sz[MERGE_FNUM + 1];
177 
178 	/*
179 	 * We need nfiles + 1 buffers. One is 'buffer', the
180 	 * rest needs to be allocated.
181 	 */
182 	bufs[0] = buffer;
183 	bufs_sz[0] = bufsize;
184 	for (i = 1; i < nfiles + 1; i++) {
185 		if (bufs[i])
186 			continue;
187 
188 		bufs[i] = malloc(DEFLLEN);
189 		if (!bufs[i])
190 			err(2, "merge: malloc");
191 		memset(bufs[i], 0, DEFLLEN);
192 		bufs_sz[i] = DEFLLEN;
193 	}
194 
195 	for (i = j = 0; i < nfiles; i++, j++) {
196 		cfile = (struct mfile *) bufs[j];
197 		cfile->flno = infl0 + j;
198 		cfile->end = (u_char *) bufs[j] + bufs_sz[j];
199 		for (c = 1; c == 1;) {
200 			if (EOF == (c = get(cfile->flno, 0, NULL, nfiles,
201 			   cfile->rec, cfile->end, ftbl))) {
202 				--i;
203 				--nfiles;
204 				break;
205 			}
206 
207 			if (c == BUFFEND) {
208 				cfile = realloc(bufs[j], bufs_sz[j]);
209 				if (!cfile)
210 					err(2, "merge: realloc");
211 
212 				bufs[j] = (void *) cfile;
213 				bufs_sz[j] *= 2;
214 				cfile->end = (u_char *)cfile + bufs_sz[j];
215 
216 				c = 1;
217 				continue;
218 			}
219 
220 			if (i)
221 				c = insert(flist, &cfile, i, !DELETE);
222 			else
223 				flist[0] = cfile;
224 		}
225 	}
226 
227 	cfile = (struct mfile *) bufs[nf];
228 	cfile->flno = flist[0]->flno;
229 	cfile->end = (u_char *) cfile + bufs_sz[nf];
230 	while (nfiles) {
231 		for (c = 1; c == 1;) {
232 			if (EOF == (c = get(cfile->flno, 0, NULL, nfiles,
233 			   cfile->rec, cfile->end, ftbl))) {
234 				put(flist[0]->rec, outfp);
235 				if (--nfiles > 0) {
236 					flist++;
237 					cfile->flno = flist[0]->flno;
238 				}
239 				break;
240 			}
241 			if (c == BUFFEND) {
242 				char *oldbuf = (char *) cfile;
243 				availsz = (char *) cfile->end - oldbuf;
244 				availsz *= 2;
245 				cfile = realloc(oldbuf, availsz);
246 				if (!cfile)
247 					err(2, "merge: realloc");
248 
249 				for (i = 0; i < nf + 1; i++) {
250 					if (bufs[i] == oldbuf) {
251 						bufs[i] = (char *)cfile;
252 						bufs_sz[i] = availsz;
253 						break;
254 					}
255 				}
256 
257 				cfile->end = (u_char *)cfile + availsz;
258 				c = 1;
259 				continue;
260 			}
261 
262 			if (!(c = insert(flist, &cfile, nfiles, DELETE)))
263 				put(cfile->rec, outfp);
264 		}
265 	}
266 
267 	if (bufs_sz[0] > bufsize) {
268 		buffer = bufs[0];
269 		bufsize = bufs_sz[0];
270 	}
271 }
272 
273 /*
274  * if delete: inserts *rec in flist, deletes flist[0], and leaves it in *rec;
275  * otherwise just inserts *rec in flist.
276  */
277 static int
278 insert(flist, rec, ttop, delete)
279 	struct mfile **flist, **rec;
280 	int delete, ttop;			/* delete = 0 or 1 */
281 {
282 	struct mfile *tmprec = *rec;
283 	int mid, top = ttop, bot = 0, cmpv = 1;
284 
285 	for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
286 		cmpv = cmp(tmprec->rec, flist[mid]->rec);
287 		if (cmpv < 0)
288 			top = mid;
289 		else if (cmpv > 0)
290 			bot = mid;
291 		else {
292 			if (UNIQUE)
293 				break;
294 
295 			if (stable_sort) {
296 				/*
297 				 * Apply sort by fileno, to give priority
298 				 * to earlier specified files, hence providing
299 				 * more stable sort.
300 				 * If fileno is same, the new record should
301 				 * be put _after_ the previous entry.
302 				 */
303 				cmpv = tmprec->flno - flist[mid]->flno;
304 				if (cmpv >= 0)
305 					bot = mid;
306 				else /* cmpv == 0 */
307 					bot = mid - 1;
308 			} else {
309 				/* non-stable sort */
310 				bot = mid - 1;
311 			}
312 
313 			break;
314 		}
315 	}
316 
317 	if (delete) {
318 		if (UNIQUE) {
319 			if (!bot && cmpv)
320 				cmpv = cmp(tmprec->rec, flist[0]->rec);
321 			if (!cmpv)
322 				return (1);
323 		}
324 		tmprec = flist[0];
325 		if (bot)
326 			memmove(flist, flist + 1, bot * sizeof(MFILE **));
327 		flist[bot] = *rec;
328 		*rec = tmprec;
329 		(*rec)->flno = flist[0]->flno;
330 		return (0);
331 	} else {
332 		if (!bot && !(UNIQUE && !cmpv)) {
333 			cmpv = cmp(tmprec->rec, flist[0]->rec);
334 			if (cmpv < 0)
335 				bot = -1;
336 		}
337 		if (UNIQUE && !cmpv)
338 			return (1);
339 		bot++;
340 		memmove(flist + bot + 1, flist + bot,
341 		    (ttop - bot) * sizeof(MFILE **));
342 		flist[bot] = *rec;
343 		return (0);
344 	}
345 }
346 
347 /*
348  * check order on one file
349  */
350 void
351 order(filelist, get, ftbl)
352 	struct filelist *filelist;
353 	get_func_t get;
354 	struct field *ftbl;
355 {
356 	u_char *crec_end, *prec_end, *trec_end;
357 	int c;
358 	RECHEADER *crec, *prec, *trec;
359 
360 	if (!SINGL_FLD)
361 		linebuf = malloc(DEFLLEN);
362 	buffer = malloc(2 * (DEFLLEN + sizeof(TRECHEADER)));
363 	crec = (RECHEADER *) buffer;
364 	crec_end = buffer + DEFLLEN + sizeof(TRECHEADER);
365 	prec = (RECHEADER *) (buffer + DEFLLEN + sizeof(TRECHEADER));
366 	prec_end = buffer + 2*(DEFLLEN + sizeof(TRECHEADER));
367 	wts = ftbl->weights;
368 	if (SINGL_FLD && (ftbl->flags & F))
369 		wts1 = (ftbl->flags & R) ? Rascii : ascii;
370 	else
371 		wts1 = NULL;
372 	if (0 == get(-1, 0, filelist, 1, prec, prec_end, ftbl))
373 	while (0 == get(-1, 0, filelist, 1, crec, crec_end, ftbl)) {
374 		if (0 < (c = cmp(prec, crec))) {
375 			crec->data[crec->length-1] = 0;
376 			errx(1, "found disorder: %s", crec->data+crec->offset);
377 		}
378 		if (UNIQUE && !c) {
379 			crec->data[crec->length-1] = 0;
380 			errx(1, "found non-uniqueness: %s",
381 			    crec->data+crec->offset);
382 		}
383 		/*
384 		 * Swap pointers so that this record is on place pointed
385 		 * to by prec and new record is read to place pointed to by
386 		 * crec.
387 		 */
388 		trec = prec;
389 		prec = crec;
390 		crec = trec;
391 		trec_end = prec_end;
392 		prec_end = crec_end;
393 		crec_end = trec_end;
394 	}
395 	exit(0);
396 }
397 
398 static int
399 cmp(rec1, rec2)
400 	RECHEADER *rec1, *rec2;
401 {
402 	int r;
403 	u_char *pos1, *pos2, *end;
404 	u_char *cwts;
405 	for (cwts = wts; cwts; cwts = (cwts == wts1 ? NULL : wts1)) {
406 		pos1 = rec1->data;
407 		pos2 = rec2->data;
408 		if (!SINGL_FLD && (UNIQUE || stable_sort))
409 			end = pos1 + min(rec1->offset, rec2->offset);
410 		else
411 			end = pos1 + min(rec1->length, rec2->length);
412 
413 		for (; pos1 < end; ) {
414 			if ((r = cwts[*pos1++] - cwts[*pos2++]))
415 				return (r);
416 		}
417 	}
418 	return (0);
419 }
420