xref: /minix3/usr.bin/sort/msort.c (revision 0fbbaa43e914d38ef3af549125d31574117d1ebf)
1 /*	$NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $	*/
2 
3 /*-
4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Ben Harris and Jaromir Dolecek.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (c) 1993
34  *	The Regents of the University of California.  All rights reserved.
35  *
36  * This code is derived from software contributed to Berkeley by
37  * Peter McIlroy.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. Neither the name of the University nor the names of its contributors
48  *    may be used to endorse or promote products derived from this software
49  *    without specific prior written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61  * SUCH DAMAGE.
62  */
63 
64 #include "sort.h"
65 #include "fsort.h"
66 
67 __RCSID("$NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $");
68 
69 #include <stdlib.h>
70 #include <string.h>
71 #include <unistd.h>
72 #include <util.h>
73 
74 /* Subroutines using comparisons: merge sort and check order */
75 #define DELETE (1)
76 
77 typedef struct mfile {
78 	FILE         *fp;
79 	get_func_t   get;
80 	RECHEADER    *rec;
81 	u_char       *end;
82 } MFILE;
83 
84 static int cmp(RECHEADER *, RECHEADER *);
85 static int insert(struct mfile **, struct mfile *, int, int);
86 static void merge_sort_fstack(FILE *, put_func_t, struct field *);
87 
88 /*
89  * Number of files merge() can merge in one pass.
90  */
91 #define MERGE_FNUM      16
92 
93 static struct mfile fstack[MERGE_FNUM];
94 static struct mfile fstack_1[MERGE_FNUM];
95 static struct mfile fstack_2[MERGE_FNUM];
96 static int fstack_count, fstack_1_count, fstack_2_count;
97 
98 void
save_for_merge(FILE * fp,get_func_t get,struct field * ftbl)99 save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
100 {
101 	FILE *mfp, *mfp1, *mfp2;
102 
103 	if (fstack_count == MERGE_FNUM) {
104 		/* Must reduce the number of temporary files */
105 		mfp = ftmp();
106 		merge_sort_fstack(mfp, putrec, ftbl);
107 		/* Save output in next layer */
108 		if (fstack_1_count == MERGE_FNUM) {
109 			mfp1 = ftmp();
110 			memcpy(fstack, fstack_1, sizeof fstack);
111 			merge_sort_fstack(mfp1, putrec, ftbl);
112 			if (fstack_2_count == MERGE_FNUM) {
113 				/* More than 4096 files! */
114 				mfp2 = ftmp();
115 				memcpy(fstack, fstack_2, sizeof fstack);
116 				merge_sort_fstack(mfp2, putrec, ftbl);
117 				fstack_2[0].fp = mfp2;
118 				fstack_2_count = 1;
119 			}
120 			fstack_2[fstack_2_count].fp = mfp1;
121 			fstack_2[fstack_2_count].get = geteasy;
122 			fstack_2_count++;
123 			fstack_1_count = 0;
124 		}
125 		fstack_1[fstack_1_count].fp = mfp;
126 		fstack_1[fstack_1_count].get = geteasy;
127 		fstack_1_count++;
128 		fstack_count = 0;
129 	}
130 
131 	fstack[fstack_count].fp = fp;
132 	fstack[fstack_count++].get = get;
133 }
134 
135 void
fmerge(struct filelist * filelist,int nfiles,FILE * outfp,struct field * ftbl)136 fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
137 {
138 	get_func_t get = SINGL_FLD ? makeline : makekey;
139 	FILE *fp;
140 	int i;
141 
142 	for (i = 0; i < nfiles; i++) {
143 #if defined(__minix)
144 	/* LSC FIXME: Not very pretty, but reduce the diff */
145 #include "pathnames.h"
146 		if (!strcmp(filelist->names[0], _PATH_STDIN))
147 			fp = stdin;
148 		else
149 #endif /* defined(__minix) */
150 		fp = fopen(filelist->names[i], "r");
151 		if (fp == NULL)
152 			err(2, "%s", filelist->names[i]);
153 		save_for_merge(fp, get, ftbl);
154 	}
155 
156 	merge_sort(outfp, putline, ftbl);
157 }
158 
159 void
merge_sort(FILE * outfp,put_func_t put,struct field * ftbl)160 merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
161 {
162 	int count = fstack_1_count + fstack_2_count;
163 	FILE *mfp;
164 	int i;
165 
166 	if (count == 0) {
167 		/* All files in initial array */
168 		merge_sort_fstack(outfp, put, ftbl);
169 		return;
170 	}
171 
172 	count += fstack_count;
173 
174 	/* Too many files for one merge sort */
175 	for (;;) {
176 		/* Sort latest 16 files */
177 		i = count;
178 		if (i > MERGE_FNUM)
179 			i = MERGE_FNUM;
180 		while (fstack_count > 0)
181 			fstack[--i] = fstack[--fstack_count];
182 		while (i > 0 && fstack_1_count > 0)
183 			fstack[--i] = fstack_1[--fstack_1_count];
184 		while (i > 0)
185 			fstack[--i] = fstack_2[--fstack_2_count];
186 		if (count <= MERGE_FNUM) {
187 			/* Got all the data */
188 			fstack_count = count;
189 			merge_sort_fstack(outfp, put, ftbl);
190 			return;
191 		}
192 		mfp = ftmp();
193 		fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
194 		merge_sort_fstack(mfp, putrec, ftbl);
195 		fstack[0].fp = mfp;
196 		fstack[0].get = geteasy;
197 		fstack_count = 1;
198 		count -= MERGE_FNUM - 1;
199 	}
200 }
201 
202 static void
merge_sort_fstack(FILE * outfp,put_func_t put,struct field * ftbl)203 merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
204 {
205 	struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
206 	RECHEADER *new_rec;
207 	u_char *new_end;
208 	void *tmp;
209 	int c, i, nfiles;
210 	size_t sz;
211 
212 	/* Read one record from each file (read again if a duplicate) */
213 	for (nfiles = i = 0; i < fstack_count; i++) {
214 		cfile = &fstack[i];
215 		if (cfile->rec == NULL) {
216 			cfile->rec = allocrec(NULL, DEFLLEN);
217 			cfile->end = (u_char *)cfile->rec + DEFLLEN;
218 		}
219 		rewind(cfile->fp);
220 
221 		for (;;) {
222 			c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
223 			if (c == EOF)
224 				break;
225 
226 			if (c == BUFFEND) {
227 				/* Double buffer size */
228 				sz = (cfile->end - (u_char *)cfile->rec) * 2;
229 				cfile->rec = allocrec(cfile->rec, sz);
230 				cfile->end = (u_char *)cfile->rec + sz;
231 				continue;
232 			}
233 
234 			if (nfiles != 0) {
235 				if (insert(flist, cfile, nfiles, !DELETE))
236 					/* Duplicate removed */
237 					continue;
238 			} else
239 				flist[0] = cfile;
240 			nfiles++;
241 			break;
242 		}
243 	}
244 
245 	if (nfiles == 0)
246 		return;
247 
248 	/*
249 	 * We now loop reading a new record from the file with the
250 	 * 'sorted first' existing record.
251 	 * As each record is added, the 'first' record is written to the
252 	 * output file - maintaining one record from each file in the sorted
253 	 * list.
254 	 */
255 	new_rec = allocrec(NULL, DEFLLEN);
256 	new_end = (u_char *)new_rec + DEFLLEN;
257 	for (;;) {
258 		cfile = flist[0];
259 		c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
260 		if (c == EOF) {
261 			/* Write out last record from now-empty input */
262 			put(cfile->rec, outfp);
263 			if (--nfiles == 0)
264 				break;
265 			/* Replace from file with now-first sorted record. */
266 			/* (Moving base 'flist' saves copying everything!) */
267 			flist++;
268 			continue;
269 		}
270 		if (c == BUFFEND) {
271 			/* Buffer not large enough - double in size */
272 			sz = (new_end - (u_char *)new_rec) * 2;
273 			new_rec = allocrec(new_rec, sz);
274 			new_end = (u_char *)new_rec +sz;
275 			continue;
276 		}
277 
278 		/* Swap in new buffer, saving old */
279 		tmp = cfile->rec;
280 		cfile->rec = new_rec;
281 		new_rec = tmp;
282 		tmp = cfile->end;
283 		cfile->end = new_end;
284 		new_end = tmp;
285 
286 		/* Add into sort, removing the original first entry */
287 		c = insert(flist, cfile, nfiles, DELETE);
288 		if (c != 0 || (UNIQUE && cfile == flist[0]
289 			    && cmp(new_rec, cfile->rec) == 0)) {
290 			/* Was an unwanted duplicate, restore buffer */
291 			tmp = cfile->rec;
292 			cfile->rec = new_rec;
293 			new_rec = tmp;
294 			tmp = cfile->end;
295 			cfile->end = new_end;
296 			new_end = tmp;
297 			continue;
298 		}
299 
300 		/* Write out 'old' record */
301 		put(new_rec, outfp);
302 	}
303 
304 	free(new_rec);
305 }
306 
307 /*
308  * if delete: inserts rec in flist, deletes flist[0];
309  * otherwise just inserts *rec in flist.
310  * Returns 1 if record is a duplicate to be ignored.
311  */
312 static int
insert(struct mfile ** flist,struct mfile * rec,int ttop,int delete)313 insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
314 {
315 	int mid, top = ttop, bot = 0, cmpv = 1;
316 
317 	for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
318 		cmpv = cmp(rec->rec, flist[mid]->rec);
319 		if (cmpv == 0 ) {
320 			if (UNIQUE)
321 				/* Duplicate key, read another record */
322 				/* NB: This doesn't guarantee to keep any
323 				 * particular record. */
324 				return 1;
325 			/*
326 			 * Apply sort by input file order.
327 			 * We could truncate the sort is the fileno are
328 			 * adjacent - but that is all too hard!
329 			 * The fileno cannot be equal, since we only have one
330 			 * record from each file (+ flist[0] which never
331 			 * comes here).
332 			 */
333 			cmpv = rec < flist[mid] ? -1 : 1;
334 			if (REVERSE)
335 				cmpv = -cmpv;
336 		}
337 		if (cmpv < 0)
338 			top = mid;
339 		else
340 			bot = mid;
341 	}
342 
343 	/* At this point we haven't yet compared against flist[0] */
344 
345 	if (delete) {
346 		/* flist[0] is ourselves, only the caller knows the old data */
347 		if (bot != 0) {
348 			memmove(flist, flist + 1, bot * sizeof(MFILE *));
349 			flist[bot] = rec;
350 		}
351 		return 0;
352 	}
353 
354 	/* Inserting original set of records */
355 
356 	if (bot == 0 && cmpv != 0) {
357 		/* Doesn't match flist[1], must compare with flist[0] */
358 		cmpv = cmp(rec->rec, flist[0]->rec);
359 		if (cmpv == 0 && UNIQUE)
360 			return 1;
361 		/* Add matching keys in file order (ie new is later) */
362 		if (cmpv < 0)
363 			bot = -1;
364 	}
365 	bot++;
366 	memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
367 	flist[bot] = rec;
368 	return 0;
369 }
370 
371 /*
372  * check order on one file
373  */
374 void
order(struct filelist * filelist,struct field * ftbl)375 order(struct filelist *filelist, struct field *ftbl)
376 {
377 	get_func_t get = SINGL_FLD ? makeline : makekey;
378 	RECHEADER *crec, *prec, *trec;
379 	u_char *crec_end, *prec_end, *trec_end;
380 	FILE *fp;
381 	int c;
382 
383 #if defined(__minix)
384 	if (!strcmp(filelist->names[0], _PATH_STDIN))
385 		fp = stdin;
386 	else
387 #endif /* defined(__minix) */
388 	fp = fopen(filelist->names[0], "r");
389 	if (fp == NULL)
390 		err(2, "%s", filelist->names[0]);
391 
392 	crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
393 	crec_end = crec->data + DEFLLEN;
394 	prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
395 	prec_end = prec->data + DEFLLEN;
396 
397 	/* XXX this does exit(0) for overlong lines */
398 	if (get(fp, prec, prec_end, ftbl) != 0)
399 		exit(0);
400 	while (get(fp, crec, crec_end, ftbl) == 0) {
401 		if (0 < (c = cmp(prec, crec))) {
402 			crec->data[crec->length-1] = 0;
403 			errx(1, "found disorder: %s", crec->data+crec->offset);
404 		}
405 		if (UNIQUE && !c) {
406 			crec->data[crec->length-1] = 0;
407 			errx(1, "found non-uniqueness: %s",
408 			    crec->data+crec->offset);
409 		}
410 		/*
411 		 * Swap pointers so that this record is on place pointed
412 		 * to by prec and new record is read to place pointed to by
413 		 * crec.
414 		 */
415 		trec = prec;
416 		prec = crec;
417 		crec = trec;
418 		trec_end = prec_end;
419 		prec_end = crec_end;
420 		crec_end = trec_end;
421 	}
422 	exit(0);
423 }
424 
425 static int
cmp(RECHEADER * rec1,RECHEADER * rec2)426 cmp(RECHEADER *rec1, RECHEADER *rec2)
427 {
428 	int len;
429 	int r;
430 
431 	/* key is weights */
432 	len = min(rec1->keylen, rec2->keylen);
433 	r = memcmp(rec1->data, rec2->data, len);
434 	if (r == 0)
435 		r = rec1->keylen - rec2->keylen;
436 	if (REVERSE)
437 		r = -r;
438 	return r;
439 }
440