xref: /netbsd-src/usr.bin/sort/fields.c (revision de4fa6c51a9708fc05f88b618fa6fad87c9508ec)
1 /*	$NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $	*/
2 
3 /*-
4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Ben Harris and Jaromir Dolecek.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (c) 1993
34  *	The Regents of the University of California.  All rights reserved.
35  *
36  * This code is derived from software contributed to Berkeley by
37  * Peter McIlroy.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. Neither the name of the University nor the names of its contributors
48  *    may be used to endorse or promote products derived from this software
49  *    without specific prior written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61  * SUCH DAMAGE.
62  */
63 
64 /* Subroutines to generate sort keys. */
65 
66 #include "sort.h"
67 
68 #ifndef lint
69 __RCSID("$NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $");
70 __SCCSID("@(#)fields.c	8.1 (Berkeley) 6/6/93");
71 #endif /* not lint */
72 
73 #define SKIP_BLANKS(ptr) {					\
74 	if (BLANK & d_mask[*(ptr)])				\
75 		while (BLANK & d_mask[*(++(ptr))]);		\
76 }
77 
78 #define NEXTCOL(pos) {						\
79 	if (!SEP_FLAG)						\
80 		while (BLANK & l_d_mask[*(++pos)]);		\
81 	while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\
82 }
83 
84 static u_char *enterfield(u_char *, const u_char *, struct field *, int);
85 static u_char *number(u_char *, const u_char *, u_char *, u_char *, int);
86 
87 #define DECIMAL_POINT '.'
88 
89 /*
90  * constructs sort key with leading recheader, followed by the key,
91  * followed by the original line.
92  */
93 length_t
94 enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data,
95     size_t line_size, struct field fieldtable[])
96 	/* keybuf:	 pointer to start of key */
97 {
98 	int i;
99 	u_char *l_d_mask;
100 	u_char *lineend, *pos;
101 	const u_char *endkey;
102 	u_char *keypos;
103 	struct coldesc *clpos;
104 	int col = 1;
105 	struct field *ftpos;
106 
107 	l_d_mask = d_mask;
108 	pos = line_data - 1;
109 	lineend = line_data + line_size-1;
110 				/* don't include rec_delimiter */
111 
112 	for (i = 0; i < ncols; i++) {
113 		clpos = clist + i;
114 		for (; (col < clpos->num) && (pos < lineend); col++) {
115 			NEXTCOL(pos);
116 		}
117 		if (pos >= lineend)
118 			break;
119 		clpos->start = SEP_FLAG ? pos + 1 : pos;
120 		NEXTCOL(pos);
121 		clpos->end = pos;
122 		col++;
123 		if (pos >= lineend) {
124 			clpos->end = lineend;
125 			i++;
126 			break;
127 		}
128 	}
129 	for (; i <= ncols; i++)
130 		clist[i].start = clist[i].end = lineend;
131 	if (clist[0].start < line_data)
132 		clist[0].start++;
133 
134 	/*
135 	 * We write the sort keys (concatenated) followed by the
136 	 * original line data (for output) as the 'keybuf' data.
137 	 * keybuf->length is the number of key bytes + data bytes.
138 	 * keybuf->offset is the number of key bytes.
139 	 * We add a record separator weight after the key in case
140 	 * (as is usual) we need to preserve the order of equal lines,
141 	 * and for 'sort -u'.
142 	 * The key itself will have had the correct weight applied.
143 	 */
144 	keypos = keybuf->data;
145 	endkey = keybuf_end - line_size - 1;
146 	if (endkey <= keypos)
147 		/* No room for any key bytes */
148 		return 1;
149 
150 	for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) {
151 		if ((keypos = enterfield(keypos, endkey, ftpos,
152 		    fieldtable->flags)) == NULL)
153 			return (1);
154 	}
155 	*keypos++ = 0;
156 
157 	keybuf->offset = keypos - keybuf->data;
158 	keybuf->length = keybuf->offset + line_size;
159 
160 	memcpy(keypos, line_data, line_size);
161 	return (0);
162 }
163 
164 /*
165  * constructs a field (as defined by -k) within a key
166  */
167 static u_char *
168 enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld,
169     int gflags)
170 {
171 	u_char *start, *end, *lineend, *mask, *lweight;
172 	struct column icol, tcol;
173 	u_int flags;
174 
175 	icol = cur_fld->icol;
176 	tcol = cur_fld->tcol;
177 	flags = cur_fld->flags;
178 	start = icol.p->start;
179 	lineend = clist[ncols].end;
180 	if (flags & BI)
181 		SKIP_BLANKS(start);
182 	start += icol.indent;
183 	start = min(start, lineend);
184 
185 	if (!tcol.num)
186 		end = lineend;
187 	else {
188 		if (tcol.indent) {
189 			end = tcol.p->start;
190 			if (flags & BT)
191 				SKIP_BLANKS(end);
192 			end += tcol.indent;
193 			end = min(end, lineend);
194 		} else
195 			end = tcol.p->end;
196 	}
197 
198 	if (flags & N)
199 		return number(tablepos, endkey, start, end, flags);
200 
201 	/* Bound check space - assuming nothing is skipped */
202 	if (tablepos + (end - start) + 1 >= endkey)
203 		return NULL;
204 
205 	mask = cur_fld->mask;
206 	lweight = cur_fld->weights;
207 	for (; start < end; start++) {
208 		if (!mask || mask[*start]) {
209 			*tablepos++ = lweight[*start];
210 		}
211 	}
212 	/* Add extra byte to sort short keys correctly */
213 	*tablepos++ = flags & R ? 255 : 1;
214 	return tablepos;
215 }
216 
217 /*
218  * Numbers are converted to a floating point format (exponent & mantissa)
219  * so that they compare correctly as sequence of unsigned bytes.
220  * The output cannot contain a 0x00 byte (the record separator).
221  * Bytes 0x01 and 0xff are used to terminate positive and negative numbers
222  * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
223  *
224  * The first byte contain the overall sign, exponent sign and some of the
225  * exponent. These have to be ordered (-ve value, decreasing exponent),
226  * zero, (+ve value, increasing exponent).
227  * After excluding 0, 1, 0xff and 0x80 (used for zero) there are 61
228  * exponent values available, this isn't quite enough and the highest
229  * values are used to encode large exponents in multiple bytes.
230  *
231  * An exponent of zero has value 0xc0 for +ve numbers and 0x40 for -ves.
232  *
233  * The mantissa is stored 2 digits per byte offset by 0x40, for negative
234  * numbers the order must be reversed (they are subtracted from 0x100).
235  *
236  * Reverse sorts are done by inverting the sign of the number.
237  *
238  * We don't have to worry about REC_D, the key is terminated by 0x00.
239  */
240 
241 #define SIGNED(reverse, value) ((reverse) ? 0x100 - (value) : (value))
242 
243 /* Large exponents are encoded EXP_EXC_BITS per byte */
244 #define EXP_ENC_BITS 7
245 #define EXP_ENC_VAL  (1 << EXP_ENC_BITS)
246 #define EXP_ENC_MASK (EXP_ENC_VAL - 1)
247 #define MAX_EXP_ENC  ((int)(sizeof(int) * 8 + (EXP_ENC_BITS-1))/EXP_ENC_BITS)
248 
249 static u_char *
250 number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
251     int reverse)
252 {
253 	int exponent = -1;
254 	int had_dp = 0;
255 	u_char *tline;
256 	char ch;
257 	unsigned int val;
258 	u_char *last_nz_pos;
259 
260 	reverse &= R;
261 
262 	/* Give ourselves space for the key terminator */
263 	bufend--;
264 
265 	/* Ensure we have enough space for the exponent */
266 	if (pos + 1 + MAX_EXP_ENC > bufend)
267 		return (NULL);
268 
269 	SKIP_BLANKS(line);
270 	if (*line == '-') {	/* set the sign */
271 		reverse ^= R;
272 		line++;
273 	}
274 	/* eat initial zeroes */
275 	for (; *line == '0' && line < lineend; line++)
276 		continue;
277 
278 	/* calculate exponents */
279 	if (*line == DECIMAL_POINT) {
280 		/* Decimal fraction */
281 		had_dp = 1;
282 		while (*++line == '0' && line < lineend)
283 			exponent--;
284 	} else {
285 		/* Large (absolute) value, count digits */
286 		for (tline = line; *tline >= '0' &&
287 		    *tline <= '9' && tline < lineend; tline++)
288 			exponent++;
289 	}
290 
291 	/* If the first/next character isn't a digit, value is zero */
292 	if (*line < '1' || *line > '9' || line >= lineend) {
293 		/* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */
294 		/* XXX what about NaN, NAN, inf and INF */
295 		*pos++ = 0x80;
296 		return pos;
297 	}
298 
299 	/* Maybe here we should allow for e+12 (etc) */
300 
301 	/* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
302 	exponent += 0xc0;
303 
304 	if (exponent > 0x80 + MAX_EXP_ENC && exponent < 0x100 - MAX_EXP_ENC) {
305 		/* Value ok for simple encoding */
306 		*pos++ = SIGNED(reverse, exponent);
307 	} else {
308 		/* Out or range for a single byte */
309 		int c, t;
310 		exponent -= 0xc0;
311 		t = exponent > 0 ? exponent : -exponent;
312 		/* Count how many 7-bit blocks are needed */
313 		for (c = 0; ; c++) {
314 			t /= EXP_ENC_VAL;
315 			if (t == 0)
316 				break;
317 		}
318 		/* 'c' better be 0..4 here - but probably 0..2 */
319 		t = c;
320 		/* Offset just outside valid range */
321 		t += 0x40 - MAX_EXP_ENC;
322 		if (exponent < 0)
323 			t = -t;
324 		t += 0xc0;
325 		*pos++ = SIGNED(reverse, t);
326 		/* now add each 7-bit block (offset 0x40..0xbf) */
327 		for (; c >= 0; c--) {
328 			t = exponent >> (c * EXP_ENC_BITS);
329 			t = (t & EXP_ENC_MASK) + 0x40;
330 			*pos++ = SIGNED(reverse, t);
331 		}
332 	}
333 
334 	/* Now add mantissa, 2 digits per byte */
335 	for (last_nz_pos = pos; line < lineend; ) {
336 		if (pos >= bufend)
337 			return NULL;
338 		ch = *line++;
339 		val = (ch - '0') * 10;
340 		if (val > 90) {
341 			if (ch == DECIMAL_POINT && !had_dp) {
342 				had_dp = 1;
343 				continue;
344 			}
345 			break;
346 		}
347 		while (line < lineend) {
348 			ch = *line++;
349 			if (ch == DECIMAL_POINT && !had_dp) {
350 				had_dp = 1;
351 				continue;
352 			}
353 			if (ch < '0' || ch > '9')
354 				line = lineend;
355 			else
356 				val += ch - '0';
357 			break;
358 		}
359 		*pos++ = SIGNED(reverse, val + 0x40);
360 		if (val != 0)
361 			last_nz_pos = pos;
362 	}
363 
364 	/* Add key terminator, deleting any trailing "00" */
365 	*last_nz_pos++ = SIGNED(reverse, 1);
366 
367 	return (last_nz_pos);
368 }
369