xref: /netbsd-src/usr.bin/sort/msort.c (revision e5548b402ae4c44fb816de42c7bba9581ce23ef5)
1 /*	$NetBSD: msort.c,v 1.17 2004/02/17 19:09:36 jdolecek Exp $	*/
2 
3 /*-
4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Ben Harris and Jaromir Dolecek.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *        This product includes software developed by the NetBSD
21  *        Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 /*-
40  * Copyright (c) 1993
41  *	The Regents of the University of California.  All rights reserved.
42  *
43  * This code is derived from software contributed to Berkeley by
44  * Peter McIlroy.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. Neither the name of the University nor the names of its contributors
55  *    may be used to endorse or promote products derived from this software
56  *    without specific prior written permission.
57  *
58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 #include "sort.h"
72 #include "fsort.h"
73 
74 #ifndef lint
75 __RCSID("$NetBSD: msort.c,v 1.17 2004/02/17 19:09:36 jdolecek Exp $");
76 __SCCSID("@(#)msort.c	8.1 (Berkeley) 6/6/93");
77 #endif /* not lint */
78 
79 #include <stdlib.h>
80 #include <string.h>
81 #include <unistd.h>
82 
83 /* Subroutines using comparisons: merge sort and check order */
84 #define DELETE (1)
85 
86 typedef struct mfile {
87 	u_char *end;
88 	short flno;
89 	struct recheader rec[1];
90 } MFILE;
91 
92 static u_char *wts, *wts1 = NULL;
93 
94 static int cmp __P((RECHEADER *, RECHEADER *));
95 static int insert __P((struct mfile **, struct mfile **, int, int));
96 static void merge(int, int, get_func_t, FILE *, put_func_t, struct field *);
97 
98 void
99 fmerge(binno, top, filelist, nfiles, get, outfp, fput, ftbl)
100 	int binno, top;
101 	struct filelist *filelist;
102 	int nfiles;
103 	get_func_t get;
104 	FILE *outfp;
105 	put_func_t fput;
106 	struct field *ftbl;
107 {
108 	FILE *tout;
109 	int i, j, last;
110 	put_func_t put;
111 	struct tempfile *l_fstack;
112 
113 	wts = ftbl->weights;
114 	if (!UNIQUE && SINGL_FLD && ftbl->flags & F)
115 		wts1 = (ftbl->flags & R) ? Rascii : ascii;
116 
117 	if (!buffer) {
118 		buffer = malloc(bufsize);
119 		if (!buffer)
120 			err(2, "fmerge(): malloc");
121 		memset(buffer, 0, bufsize);
122 
123 		if (!linebuf && !SINGL_FLD) {
124 			linebuf_size = DEFLLEN;
125 			linebuf = malloc(linebuf_size);
126 			memset(linebuf, 0, linebuf_size);
127 		}
128 	}
129 
130 	if (binno >= 0)
131 		l_fstack = fstack + top;
132 	else
133 		l_fstack = fstack;
134 
135 	while (nfiles) {
136 		put = putrec;
137 		for (j = 0; j < nfiles; j += MERGE_FNUM) {
138 			if (nfiles <= MERGE_FNUM) {
139 				tout = outfp;
140 				put = fput;
141 			}
142 			else
143 				tout = ftmp();
144 			last = min(MERGE_FNUM, nfiles - j);
145 			if (binno < 0) {
146 				for (i = 0; i < last; i++)
147 					if (!(l_fstack[i+MAXFCT-1-MERGE_FNUM].fp =
148 					    fopen(filelist->names[j+i], "r")))
149 						err(2, "%s",
150 							filelist->names[j+i]);
151 				merge(MAXFCT-1-MERGE_FNUM, last, get, tout, put, ftbl);
152 			} else {
153 				for (i = 0; i< last; i++)
154 					rewind(l_fstack[i+j].fp);
155 				merge(top+j, last, get, tout, put, ftbl);
156 			}
157 			if (nfiles > MERGE_FNUM)
158 				l_fstack[j/MERGE_FNUM].fp = tout;
159 		}
160 		nfiles = (nfiles + (MERGE_FNUM - 1)) / MERGE_FNUM;
161 		if (nfiles == 1)
162 			nfiles = 0;
163 		if (binno < 0) {
164 			binno = 0;
165 			get = geteasy;
166 			top = 0;
167 		}
168 	}
169 }
170 
171 static void
172 merge(infl0, nfiles, get, outfp, put, ftbl)
173 	int infl0, nfiles;
174 	get_func_t get;
175 	put_func_t put;
176 	FILE *outfp;
177 	struct field *ftbl;
178 {
179 	int c, i, j, nf = nfiles;
180 	struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
181 	size_t availsz = bufsize;
182 	static void *bufs[MERGE_FNUM + 1];
183 	static size_t bufs_sz[MERGE_FNUM + 1];
184 
185 	/*
186 	 * We need nfiles + 1 buffers. One is 'buffer', the
187 	 * rest needs to be allocated.
188 	 */
189 	bufs[0] = buffer;
190 	bufs_sz[0] = bufsize;
191 	for (i = 1; i < nfiles + 1; i++) {
192 		if (bufs[i])
193 			continue;
194 
195 		bufs[i] = malloc(DEFLLEN);
196 		if (!bufs[i])
197 			err(2, "merge: malloc");
198 		memset(bufs[i], 0, DEFLLEN);
199 		bufs_sz[i] = DEFLLEN;
200 	}
201 
202 	for (i = j = 0; i < nfiles; i++, j++) {
203 		cfile = (struct mfile *) bufs[j];
204 		cfile->flno = infl0 + j;
205 		cfile->end = (u_char *) bufs[j] + bufs_sz[j];
206 		for (c = 1; c == 1;) {
207 			if (EOF == (c = get(cfile->flno, 0, NULL, nfiles,
208 			   cfile->rec, cfile->end, ftbl))) {
209 				--i;
210 				--nfiles;
211 				break;
212 			}
213 
214 			if (c == BUFFEND) {
215 				cfile = realloc(bufs[j], bufs_sz[j]);
216 				if (!cfile)
217 					err(2, "merge: realloc");
218 
219 				bufs[j] = (void *) cfile;
220 				bufs_sz[j] *= 2;
221 				cfile->end = (u_char *)cfile + bufs_sz[j];
222 
223 				c = 1;
224 				continue;
225 			}
226 
227 			if (i)
228 				c = insert(flist, &cfile, i, !DELETE);
229 			else
230 				flist[0] = cfile;
231 		}
232 	}
233 
234 	cfile = (struct mfile *) bufs[nf];
235 	cfile->flno = flist[0]->flno;
236 	cfile->end = (u_char *) cfile + bufs_sz[nf];
237 	while (nfiles) {
238 		for (c = 1; c == 1;) {
239 			if (EOF == (c = get(cfile->flno, 0, NULL, nfiles,
240 			   cfile->rec, cfile->end, ftbl))) {
241 				put(flist[0]->rec, outfp);
242 				if (--nfiles > 0) {
243 					flist++;
244 					cfile->flno = flist[0]->flno;
245 				}
246 				break;
247 			}
248 			if (c == BUFFEND) {
249 				char *oldbuf = (char *) cfile;
250 				availsz = (char *) cfile->end - oldbuf;
251 				availsz *= 2;
252 				cfile = realloc(oldbuf, availsz);
253 				if (!cfile)
254 					err(2, "merge: realloc");
255 
256 				for (i = 0; i < nf + 1; i++) {
257 					if (bufs[i] == oldbuf) {
258 						bufs[i] = (char *)cfile;
259 						bufs_sz[i] = availsz;
260 						break;
261 					}
262 				}
263 
264 				cfile->end = (u_char *)cfile + availsz;
265 				c = 1;
266 				continue;
267 			}
268 
269 			if (!(c = insert(flist, &cfile, nfiles, DELETE)))
270 				put(cfile->rec, outfp);
271 		}
272 	}
273 
274 	if (bufs_sz[0] > bufsize) {
275 		buffer = bufs[0];
276 		bufsize = bufs_sz[0];
277 	}
278 }
279 
280 /*
281  * if delete: inserts *rec in flist, deletes flist[0], and leaves it in *rec;
282  * otherwise just inserts *rec in flist.
283  */
284 static int
285 insert(flist, rec, ttop, delete)
286 	struct mfile **flist, **rec;
287 	int delete, ttop;			/* delete = 0 or 1 */
288 {
289 	struct mfile *tmprec = *rec;
290 	int mid, top = ttop, bot = 0, cmpv = 1;
291 
292 	for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
293 		cmpv = cmp(tmprec->rec, flist[mid]->rec);
294 		if (cmpv < 0)
295 			top = mid;
296 		else if (cmpv > 0)
297 			bot = mid;
298 		else {
299 			if (UNIQUE)
300 				break;
301 
302 			if (stable_sort) {
303 				/*
304 				 * Apply sort by fileno, to give priority
305 				 * to earlier specified files, hence providing
306 				 * more stable sort.
307 				 * If fileno is same, the new record should
308 				 * be put _after_ the previous entry.
309 				 */
310 				cmpv = tmprec->flno - flist[mid]->flno;
311 				if (cmpv >= 0)
312 					bot = mid;
313 				else /* cmpv == 0 */
314 					bot = mid - 1;
315 			} else {
316 				/* non-stable sort */
317 				bot = mid - 1;
318 			}
319 
320 			break;
321 		}
322 	}
323 
324 	if (delete) {
325 		if (UNIQUE) {
326 			if (!bot && cmpv)
327 				cmpv = cmp(tmprec->rec, flist[0]->rec);
328 			if (!cmpv)
329 				return (1);
330 		}
331 		tmprec = flist[0];
332 		if (bot)
333 			memmove(flist, flist + 1, bot * sizeof(MFILE **));
334 		flist[bot] = *rec;
335 		*rec = tmprec;
336 		(*rec)->flno = flist[0]->flno;
337 		return (0);
338 	} else {
339 		if (!bot && !(UNIQUE && !cmpv)) {
340 			cmpv = cmp(tmprec->rec, flist[0]->rec);
341 			if (cmpv < 0)
342 				bot = -1;
343 		}
344 		if (UNIQUE && !cmpv)
345 			return (1);
346 		bot++;
347 		memmove(flist + bot + 1, flist + bot,
348 		    (ttop - bot) * sizeof(MFILE **));
349 		flist[bot] = *rec;
350 		return (0);
351 	}
352 }
353 
354 /*
355  * check order on one file
356  */
357 void
358 order(filelist, get, ftbl)
359 	struct filelist *filelist;
360 	get_func_t get;
361 	struct field *ftbl;
362 {
363 	u_char *crec_end, *prec_end, *trec_end;
364 	int c;
365 	RECHEADER *crec, *prec, *trec;
366 
367 	if (!SINGL_FLD)
368 		linebuf = malloc(DEFLLEN);
369 	buffer = malloc(2 * (DEFLLEN + sizeof(TRECHEADER)));
370 	crec = (RECHEADER *) buffer;
371 	crec_end = buffer + DEFLLEN + sizeof(TRECHEADER);
372 	prec = (RECHEADER *) (buffer + DEFLLEN + sizeof(TRECHEADER));
373 	prec_end = buffer + 2*(DEFLLEN + sizeof(TRECHEADER));
374 	wts = ftbl->weights;
375 	if (SINGL_FLD && (ftbl->flags & F))
376 		wts1 = (ftbl->flags & R) ? Rascii : ascii;
377 	else
378 		wts1 = NULL;
379 	if (0 == get(-1, 0, filelist, 1, prec, prec_end, ftbl))
380 	while (0 == get(-1, 0, filelist, 1, crec, crec_end, ftbl)) {
381 		if (0 < (c = cmp(prec, crec))) {
382 			crec->data[crec->length-1] = 0;
383 			errx(1, "found disorder: %s", crec->data+crec->offset);
384 		}
385 		if (UNIQUE && !c) {
386 			crec->data[crec->length-1] = 0;
387 			errx(1, "found non-uniqueness: %s",
388 			    crec->data+crec->offset);
389 		}
390 		/*
391 		 * Swap pointers so that this record is on place pointed
392 		 * to by prec and new record is read to place pointed to by
393 		 * crec.
394 		 */
395 		trec = prec;
396 		prec = crec;
397 		crec = trec;
398 		trec_end = prec_end;
399 		prec_end = crec_end;
400 		crec_end = trec_end;
401 	}
402 	exit(0);
403 }
404 
405 static int
406 cmp(rec1, rec2)
407 	RECHEADER *rec1, *rec2;
408 {
409 	int r;
410 	u_char *pos1, *pos2, *end;
411 	u_char *cwts;
412 	for (cwts = wts; cwts; cwts = (cwts == wts1 ? NULL : wts1)) {
413 		pos1 = rec1->data;
414 		pos2 = rec2->data;
415 		if (!SINGL_FLD && (UNIQUE || stable_sort))
416 			end = pos1 + min(rec1->offset, rec2->offset);
417 		else
418 			end = pos1 + min(rec1->length, rec2->length);
419 
420 		for (; pos1 < end; ) {
421 			if ((r = cwts[*pos1++] - cwts[*pos2++]))
422 				return (r);
423 		}
424 	}
425 	return (0);
426 }
427