xref: /netbsd-src/usr.bin/sort/fields.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: fields.c,v 1.31 2009/11/06 18:34:22 joerg Exp $	*/
2 
3 /*-
4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Ben Harris and Jaromir Dolecek.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (c) 1993
34  *	The Regents of the University of California.  All rights reserved.
35  *
36  * This code is derived from software contributed to Berkeley by
37  * Peter McIlroy.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. Neither the name of the University nor the names of its contributors
48  *    may be used to endorse or promote products derived from this software
49  *    without specific prior written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61  * SUCH DAMAGE.
62  */
63 
64 /* Subroutines to generate sort keys. */
65 
66 #include "sort.h"
67 
68 __RCSID("$NetBSD: fields.c,v 1.31 2009/11/06 18:34:22 joerg Exp $");
69 
70 #define SKIP_BLANKS(ptr) {					\
71 	if (BLANK & d_mask[*(ptr)])				\
72 		while (BLANK & d_mask[*(++(ptr))]);		\
73 }
74 
75 #define NEXTCOL(pos) {						\
76 	if (!SEP_FLAG)						\
77 		while (BLANK & l_d_mask[*(++pos)]);		\
78 	while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\
79 }
80 
81 static u_char *enterfield(u_char *, const u_char *, struct field *, int);
82 static u_char *number(u_char *, const u_char *, u_char *, u_char *, int);
83 
84 #define DECIMAL_POINT '.'
85 
86 /*
87  * constructs sort key with leading recheader, followed by the key,
88  * followed by the original line.
89  */
90 length_t
91 enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data,
92     size_t line_size, struct field fieldtable[])
93 	/* keybuf:	 pointer to start of key */
94 {
95 	int i;
96 	u_char *l_d_mask;
97 	u_char *lineend, *pos;
98 	const u_char *endkey;
99 	u_char *keypos;
100 	struct coldesc *clpos;
101 	int col = 1;
102 	struct field *ftpos;
103 
104 	l_d_mask = d_mask;
105 	pos = line_data - 1;
106 	lineend = line_data + line_size-1;
107 				/* don't include rec_delimiter */
108 
109 	for (i = 0; i < ncols; i++) {
110 		clpos = clist + i;
111 		for (; (col < clpos->num) && (pos < lineend); col++) {
112 			NEXTCOL(pos);
113 		}
114 		if (pos >= lineend)
115 			break;
116 		clpos->start = SEP_FLAG ? pos + 1 : pos;
117 		NEXTCOL(pos);
118 		clpos->end = pos;
119 		col++;
120 		if (pos >= lineend) {
121 			clpos->end = lineend;
122 			i++;
123 			break;
124 		}
125 	}
126 	for (; i <= ncols; i++)
127 		clist[i].start = clist[i].end = lineend;
128 	if (clist[0].start < line_data)
129 		clist[0].start++;
130 
131 	/*
132 	 * We write the sort keys (concatenated) followed by the
133 	 * original line data (for output) as the 'keybuf' data.
134 	 * keybuf->length is the number of key bytes + data bytes.
135 	 * keybuf->offset is the number of key bytes.
136 	 * We add a record separator weight after the key in case
137 	 * (as is usual) we need to preserve the order of equal lines,
138 	 * and for 'sort -u'.
139 	 * The key itself will have had the correct weight applied.
140 	 */
141 	keypos = keybuf->data;
142 	endkey = keybuf_end - line_size - 1;
143 	if (endkey <= keypos)
144 		/* No room for any key bytes */
145 		return 1;
146 
147 	for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) {
148 		if ((keypos = enterfield(keypos, endkey, ftpos,
149 		    fieldtable->flags)) == NULL)
150 			return (1);
151 	}
152 
153 	keybuf->offset = keypos - keybuf->data;
154 	keybuf->length = keybuf->offset + line_size;
155 
156 	/*
157 	 * Posix requires that equal keys be further sorted by the
158 	 * entire original record.
159 	 * NetBSD has (at least for some time) kept equal keys in
160 	 * their original order.
161 	 * For 'sort -u' posix_sort is unset.
162 	 */
163 	keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
164 
165 	memcpy(keypos, line_data, line_size);
166 	return (0);
167 }
168 
169 /*
170  * constructs a field (as defined by -k) within a key
171  */
172 static u_char *
173 enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld,
174     int gflags)
175 {
176 	u_char *start, *end, *lineend, *mask, *lweight;
177 	struct column icol, tcol;
178 	u_int flags;
179 
180 	icol = cur_fld->icol;
181 	tcol = cur_fld->tcol;
182 	flags = cur_fld->flags;
183 	start = icol.p->start;
184 	lineend = clist[ncols].end;
185 	if (flags & BI)
186 		SKIP_BLANKS(start);
187 	start += icol.indent;
188 	start = min(start, lineend);
189 
190 	if (!tcol.num)
191 		end = lineend;
192 	else {
193 		if (tcol.indent) {
194 			end = tcol.p->start;
195 			if (flags & BT)
196 				SKIP_BLANKS(end);
197 			end += tcol.indent;
198 			end = min(end, lineend);
199 		} else
200 			end = tcol.p->end;
201 	}
202 
203 	if (flags & N)
204 		return number(tablepos, endkey, start, end, flags);
205 
206 	/* Bound check space - assuming nothing is skipped */
207 	if (tablepos + (end - start) + 1 >= endkey)
208 		return NULL;
209 
210 	mask = cur_fld->mask;
211 	lweight = cur_fld->weights;
212 	for (; start < end; start++) {
213 		if (!mask || mask[*start]) {
214 			*tablepos++ = lweight[*start];
215 		}
216 	}
217 	/* Add extra byte (absent from lweight) to sort short keys correctly */
218 	*tablepos++ = lweight[REC_D];
219 	return tablepos;
220 }
221 
222 /*
223  * Numbers are converted to a floating point format (exponent & mantissa)
224  * so that they compare correctly as sequence of unsigned bytes.
225  * Bytes 0x00 and 0xff are used to terminate positive and negative numbers
226  * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
227  *
228  * The first byte contain the overall sign, exponent sign and some of the
229  * exponent. These have to be ordered (-ve value, decreasing exponent),
230  * zero, (+ve value, increasing exponent).
231  *
232  * The first byte is 0x80 for zero, 0xc0 for +ve with exponent 0.
233  * -ve values are the 1's compliments (so 0x7f isn't used!).
234  *
235  * This only leaves 63 byte values for +ve exponents - which isn't enough.
236  * The largest 4 exponent values are used to hold a byte count of the
237  * number of following bytes that contain 8 exponent bits per byte,
238  * This lets us sort exponents from -2^31 to +2^31.
239  *
240  * The mantissa is stored 2 digits per byte offset by 0x40, for negative
241  * numbers the order must be reversed (they are bit inverted).
242  *
243  * Reverse sorts are done by inverting the sign of the number.
244  */
245 #define MAX_EXP_ENC  ((int)sizeof(int))
246 
247 static u_char *
248 number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
249     int reverse)
250 {
251 	int exponent = -1;
252 	int had_dp = 0;
253 	u_char *tline;
254 	char ch;
255 	unsigned int val;
256 	u_char *last_nz_pos;
257 	u_char negate;
258 
259 	if (reverse & R)
260 		negate = 0xff;
261 	else
262 		negate = 0;
263 
264 	/* Give ourselves space for the key terminator */
265 	bufend--;
266 
267 	/* Ensure we have enough space for the exponent */
268 	if (pos + 1 + MAX_EXP_ENC > bufend)
269 		return (NULL);
270 
271 	SKIP_BLANKS(line);
272 	if (*line == '-') {	/* set the sign */
273 		negate ^= 0xff;
274 		line++;
275 	}
276 	/* eat initial zeroes */
277 	for (; *line == '0' && line < lineend; line++)
278 		continue;
279 
280 	/* calculate exponents */
281 	if (*line == DECIMAL_POINT) {
282 		/* Decimal fraction */
283 		had_dp = 1;
284 		while (*++line == '0' && line < lineend)
285 			exponent--;
286 	} else {
287 		/* Large (absolute) value, count digits */
288 		for (tline = line; *tline >= '0' &&
289 		    *tline <= '9' && tline < lineend; tline++)
290 			exponent++;
291 	}
292 
293 	/* If the first/next character isn't a digit, value is zero */
294 	if (*line < '1' || *line > '9' || line >= lineend) {
295 		/* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */
296 		/* XXX what about NaN, NAN, inf and INF */
297 		*pos++ = 0x80;
298 		return pos;
299 	}
300 
301 	/* Maybe here we should allow for e+12 (etc) */
302 
303 	if (exponent < 0x40 - MAX_EXP_ENC && -exponent < 0x40 - MAX_EXP_ENC) {
304 		/* Value ok for simple encoding */
305 		/* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
306 		exponent += 0xc0;
307 		*pos++ = negate ^ exponent;
308 	} else {
309 		/* Out or range for a single byte */
310 		int c, t;
311 		t = exponent > 0 ? exponent : -exponent;
312 		/* Count how many 8-bit bytes are needed */
313 		for (c = 0; ; c++) {
314 			t >>= 8;
315 			if (t == 0)
316 				break;
317 		}
318 		/* 'c' better be 0..3 here - but probably 0..1 */
319 		/* Offset just outside valid range */
320 		t = c + 0x40 - MAX_EXP_ENC;
321 		if (exponent < 0)
322 			t = -t;
323 		*pos++ = negate ^ (t + 0xc0);
324 		/* now add each byte, most significant first */
325 		for (; c >= 0; c--)
326 			*pos++ = negate ^ (exponent >> (c * 8));
327 	}
328 
329 	/* Finally add mantissa, 2 digits per byte */
330 	for (last_nz_pos = pos; line < lineend; ) {
331 		if (pos >= bufend)
332 			return NULL;
333 		ch = *line++;
334 		val = (ch - '0') * 10;
335 		if (val > 90) {
336 			if (ch == DECIMAL_POINT && !had_dp) {
337 				had_dp = 1;
338 				continue;
339 			}
340 			break;
341 		}
342 		while (line < lineend) {
343 			ch = *line++;
344 			if (ch == DECIMAL_POINT && !had_dp) {
345 				had_dp = 1;
346 				continue;
347 			}
348 			if (ch < '0' || ch > '9')
349 				line = lineend;
350 			else
351 				val += ch - '0';
352 			break;
353 		}
354 		*pos++ = negate ^ (val + 0x40);
355 		if (val != 0)
356 			last_nz_pos = pos;
357 	}
358 
359 	/* Add key terminator, deleting any trailing "00" */
360 	*last_nz_pos++ = negate;
361 
362 	return (last_nz_pos);
363 }
364