xref: /netbsd-src/usr.bin/sort/fields.c (revision 23c8222edbfb0f0932d88a8351d3a0cf817dfb9e)
1 /*	$NetBSD: fields.c,v 1.18 2004/03/14 21:12:14 heas Exp $	*/
2 
3 /*-
4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Ben Harris and Jaromir Dolecek.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *        This product includes software developed by the NetBSD
21  *        Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 /*-
40  * Copyright (c) 1993
41  *	The Regents of the University of California.  All rights reserved.
42  *
43  * This code is derived from software contributed to Berkeley by
44  * Peter McIlroy.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. Neither the name of the University nor the names of its contributors
55  *    may be used to endorse or promote products derived from this software
56  *    without specific prior written permission.
57  *
58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 /* Subroutines to generate sort keys. */
72 
73 #include "sort.h"
74 
75 #ifndef lint
76 __RCSID("$NetBSD: fields.c,v 1.18 2004/03/14 21:12:14 heas Exp $");
77 __SCCSID("@(#)fields.c	8.1 (Berkeley) 6/6/93");
78 #endif /* not lint */
79 
80 #define SKIP_BLANKS(ptr) {					\
81 	if (BLANK & d_mask[*(ptr)])				\
82 		while (BLANK & d_mask[*(++(ptr))]);		\
83 }
84 
85 #define NEXTCOL(pos) {						\
86 	if (!SEP_FLAG)						\
87 		while (BLANK & l_d_mask[*(++pos)]);		\
88 	while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\
89 }
90 
91 static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
92 static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));
93 
94 #define DECIMAL '.'
95 #define OFFSET 128
96 
97 u_char TENS[10];	/* TENS[0] = REC_D <= 128 ? 130 - '0' : 2 -'0'... */
98 u_char NEGTENS[10];	/* NEGTENS[0] = REC_D <= 128 ? 126 + '0' : 252 +'0' */
99 u_char *OFF_TENS, *OFF_NTENS;	/* TENS - '0', NEGTENS - '0' */
100 u_char fnum[NBINS], rnum[NBINS];
101 
102 /*
103  * constructs sort key with leading recheader, followed by the key,
104  * followed by the original line.
105  */
106 length_t
107 enterkey(keybuf, line, size, fieldtable)
108 	RECHEADER *keybuf;	/* pointer to start of key */
109 	DBT *line;
110 	int size;
111 	struct field fieldtable[];
112 {
113 	int i;
114 	u_char *l_d_mask;
115 	u_char *lineend, *pos;
116 	u_char *endkey, *keypos;
117 	struct coldesc *clpos;
118 	int col = 1;
119 	struct field *ftpos;
120 	l_d_mask = d_mask;
121 	pos = (u_char *) line->data - 1;
122 	lineend = (u_char *) line->data + line->size-1;
123 				/* don't include rec_delimiter */
124 
125 	for (i = 0; i < ncols; i++) {
126 		clpos = clist + i;
127 		for (; (col < clpos->num) && (pos < lineend); col++) {
128 			NEXTCOL(pos);
129 		}
130 		if (pos >= lineend)
131 			break;
132 		clpos->start = SEP_FLAG ? pos + 1 : pos;
133 		NEXTCOL(pos);
134 		clpos->end = pos;
135 		col++;
136 		if (pos >= lineend) {
137 			clpos->end = lineend;
138 			i++;
139 			break;
140 		}
141 	}
142 	for (; i <= ncols; i++)
143 		clist[i].start = clist[i].end = lineend;
144 	if (clist[0].start < (u_char *) line->data)
145 		clist[0].start++;
146 
147 	keypos = keybuf->data;
148 	endkey = (u_char *) keybuf + size - line->size;
149 	for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++)
150 		if ((keypos = enterfield(keypos, endkey, ftpos,
151 		    fieldtable->flags)) == NULL)
152 			return (1);
153 
154 	keybuf->offset = keypos - keybuf->data;
155 	keybuf->length = keybuf->offset + line->size;
156 	if (keybuf->length + sizeof(TRECHEADER) > size) {
157 		/* line too long for buffer */
158 		return (1);
159 	}
160 
161 	/*
162 	 * Make [s]radixsort() only sort by relevant part of key if:
163 	 * 1. we want to choose unique items by relevant field[s]
164 	 * 2. we want stable sort and so the items should be sorted only by
165 	 *    the relevant field[s]
166 	 */
167 	if (UNIQUE || (stable_sort && keybuf->offset < line->size))
168 		keypos[-1] = REC_D;
169 
170 	memcpy(keybuf->data + keybuf->offset, line->data, line->size);
171 	return (0);
172 }
173 
174 /*
175  * constructs a field (as defined by -k) within a key
176  */
177 static u_char *
178 enterfield(tablepos, endkey, cur_fld, gflags)
179 	struct field *cur_fld;
180 	u_char *tablepos, *endkey;
181 	int gflags;
182 {
183 	u_char *start, *end, *lineend, *mask, *lweight;
184 	struct column icol, tcol;
185 	u_int flags;
186 	u_int Rflag;
187 
188 	icol = cur_fld->icol;
189 	tcol = cur_fld->tcol;
190 	flags = cur_fld->flags;
191 	start = icol.p->start;
192 	lineend = clist[ncols].end;
193 	if (flags & BI)
194 		SKIP_BLANKS(start);
195 	start += icol.indent;
196 	start = min(start, lineend);
197 
198 	if (!tcol.num)
199 		end = lineend;
200 	else {
201 		if (tcol.indent) {
202 			end = tcol.p->start;
203 			if (flags & BT)
204 				SKIP_BLANKS(end);
205 			end += tcol.indent;
206 			end = min(end, lineend);
207 		} else
208 			end = tcol.p->end;
209 	}
210 
211 	if (flags & N) {
212 		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
213 		return number(tablepos, endkey, start, end, Rflag);
214 	}
215 
216 	mask = cur_fld->mask;
217 	lweight = cur_fld->weights;
218 	for (; start < end; start++)
219 		if (mask[*start]) {
220 			if (*start <= 1) {
221 				if (tablepos+2 >= endkey)
222 					return (NULL);
223 				*tablepos++ = lweight[1];
224 				*tablepos++ = lweight[*start ? 2 : 1];
225 			} else {
226 				if (tablepos+1 >= endkey)
227 					return (NULL);
228 				*tablepos++ = lweight[*start];
229 			}
230 		}
231 	*tablepos++ = lweight[0];
232 	return (tablepos == endkey ? NULL : tablepos);
233 }
234 
235 /* Uses the first bin to assign sign, expsign, 0, and the first
236  * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
237  *   When sorting in forward order:
238  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
239  * else use (0-99)->(2-102).
240  * If the exponent is >=61, use another byte for each additional 253
241  * in the exponent. Cutoff is at 567.
242  * To avoid confusing the exponent and the mantissa, use a field delimiter
243  * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
244  * only time a field delimiter can come in that position.
245  * Reverse order is done analagously.
246  */
247 
248 static u_char *
249 number(pos, bufend, line, lineend, Rflag)
250 	u_char *line, *pos, *bufend, *lineend;
251 	int Rflag;
252 {
253 	int or_sign, parity = 0;
254 	int expincr = 1, exponent = -1;
255 	int bite, expsign = 1, sign = 1, zeroskip = 0;
256 	u_char lastvalue='0', *nonzero=NULL, *tline, *C_TENS;
257 	u_char *nweights;
258 
259 	if (Rflag)
260 		nweights = rnum;
261 	else
262 		nweights = fnum;
263 	if (pos > bufend - 8)
264 		return (NULL);
265 	/*
266 	 * or_sign sets the sort direction:
267 	 *	(-r: +/-)(sign: +/-)(expsign: +/-)
268 	 */
269 	or_sign = sign ^ expsign ^ Rflag;
270 	SKIP_BLANKS(line);
271 	if (*line == '-') {	/* set the sign */
272 		or_sign ^= 1;
273 		sign = 0;
274 		line++;
275 	}
276 	/* eat initial zeroes */
277 	for (; *line == '0' && line < lineend; line++)
278 		zeroskip = 1;
279 	/* calculate exponents < 0 */
280 	if (*line == DECIMAL) {
281 		exponent = 1;
282 		while (*++line == '0' && line < lineend)
283 			exponent++;
284 		expincr = 0;
285 		expsign = 0;
286 	}
287 	/* next character better be a digit */
288 	if (*line < '1' || *line > '9' || line >= lineend) {
289 		/* only exit if we didn't skip any zero number */
290 		if (!zeroskip) {
291 			*pos++ = nweights[127];
292 			return (pos);
293 		}
294 	}
295 	if (expincr) {
296 		for (tline = line-1; *++tline >= '0' &&
297 		    *tline <= '9' && tline < lineend;)
298 			exponent++;
299 	}
300 	if (exponent > 567) {
301 		*pos++ = nweights[sign ? (expsign ? 254 : 128)
302 					: (expsign ? 0 : 126)];
303 		warnx("exponent out of bounds");
304 		return (pos);
305 	}
306 	bite = min(exponent, 61);
307 	*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
308 				: (expsign ? 64-bite : 64+bite)];
309 	if (bite >= 61) {
310 		do {
311 			exponent -= bite;
312 			bite = min(exponent, 254);
313 			*pos++ = nweights[or_sign ? 254-bite : bite];
314 		} while (bite == 254);
315 	}
316 	C_TENS = or_sign ? OFF_NTENS : OFF_TENS;
317 	for (; line < lineend; line++) {
318 		if (*line >= '0' && *line <= '9') {
319 			if (parity) {
320 				*pos++ = C_TENS[lastvalue] + (or_sign ? - *line
321 						: *line);
322 				if (pos == bufend)
323 					return (NULL);
324 				if (*line != '0' || lastvalue != '0')
325 					nonzero = pos;
326 			} else
327 				lastvalue = *line;
328 			parity ^= 1;
329 		} else if (*line == DECIMAL) {
330 			if (!expincr)	/* a decimal already occurred once */
331 				break;
332 			expincr = 0;
333 		} else
334 			break;
335 	}
336 	if ((parity && lastvalue != '0') || !nonzero) {
337 		*pos++ = or_sign ? OFF_NTENS[lastvalue] - '0' :
338 					OFF_TENS[lastvalue] + '0';
339 	} else
340 		pos = nonzero;
341 	if (pos > bufend-1)
342 		return (NULL);
343 	*pos++ = or_sign ? nweights[254] : nweights[0];
344 	return (pos);
345 }
346 
347 /* This forces a gap around the record delimiter
348  * Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
349  * rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
350  */
351 void
352 num_init()
353 {
354 	int i;
355 	TENS[0] = REC_D <=128 ? 130 - '0' : 2 - '0';
356 	NEGTENS[0] = REC_D <=128 ? 126 + '0' : 254 + '0';
357 	OFF_TENS = TENS - '0';
358 	OFF_NTENS = NEGTENS - '0';
359 	for (i = 1; i < 10; i++) {
360 		TENS[i] = TENS[i - 1] + 10;
361 		NEGTENS[i] = NEGTENS[i - 1] - 10;
362 	}
363 	for (i = 0; i < REC_D; i++) {
364 		fnum[i] = i;
365 		rnum[255 - i] = i;
366 	}
367 	for (i = REC_D; i <255; i++) {
368 		fnum[i] = i + 1;
369 		rnum[255 - i] = i - 1;
370 	}
371 }
372