1 /* $NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $ */ 2 3 /*- 4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Ben Harris and Jaromir Dolecek. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1993 34 * The Regents of the University of California. All rights reserved. 35 * 36 * This code is derived from software contributed to Berkeley by 37 * Peter McIlroy. 38 * 39 * Redistribution and use in source and binary forms, with or without 40 * modification, are permitted provided that the following conditions 41 * are met: 42 * 1. Redistributions of source code must retain the above copyright 43 * notice, this list of conditions and the following disclaimer. 44 * 2. Redistributions in binary form must reproduce the above copyright 45 * notice, this list of conditions and the following disclaimer in the 46 * documentation and/or other materials provided with the distribution. 47 * 3. Neither the name of the University nor the names of its contributors 48 * may be used to endorse or promote products derived from this software 49 * without specific prior written permission. 50 * 51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 61 * SUCH DAMAGE. 62 */ 63 64 /* Subroutines to generate sort keys. */ 65 66 #include "sort.h" 67 68 #ifndef lint 69 __RCSID("$NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $"); 70 __SCCSID("@(#)fields.c 8.1 (Berkeley) 6/6/93"); 71 #endif /* not lint */ 72 73 #define SKIP_BLANKS(ptr) { \ 74 if (BLANK & d_mask[*(ptr)]) \ 75 while (BLANK & d_mask[*(++(ptr))]); \ 76 } 77 78 #define NEXTCOL(pos) { \ 79 if (!SEP_FLAG) \ 80 while (BLANK & l_d_mask[*(++pos)]); \ 81 while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\ 82 } 83 84 static u_char *enterfield(u_char *, const u_char *, struct field *, int); 85 static u_char *number(u_char *, const u_char *, u_char *, u_char *, int); 86 87 #define DECIMAL_POINT '.' 88 89 /* 90 * constructs sort key with leading recheader, followed by the key, 91 * followed by the original line. 92 */ 93 length_t 94 enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data, 95 size_t line_size, struct field fieldtable[]) 96 /* keybuf: pointer to start of key */ 97 { 98 int i; 99 u_char *l_d_mask; 100 u_char *lineend, *pos; 101 const u_char *endkey; 102 u_char *keypos; 103 struct coldesc *clpos; 104 int col = 1; 105 struct field *ftpos; 106 107 l_d_mask = d_mask; 108 pos = line_data - 1; 109 lineend = line_data + line_size-1; 110 /* don't include rec_delimiter */ 111 112 for (i = 0; i < ncols; i++) { 113 clpos = clist + i; 114 for (; (col < clpos->num) && (pos < lineend); col++) { 115 NEXTCOL(pos); 116 } 117 if (pos >= lineend) 118 break; 119 clpos->start = SEP_FLAG ? pos + 1 : pos; 120 NEXTCOL(pos); 121 clpos->end = pos; 122 col++; 123 if (pos >= lineend) { 124 clpos->end = lineend; 125 i++; 126 break; 127 } 128 } 129 for (; i <= ncols; i++) 130 clist[i].start = clist[i].end = lineend; 131 if (clist[0].start < line_data) 132 clist[0].start++; 133 134 /* 135 * We write the sort keys (concatenated) followed by the 136 * original line data (for output) as the 'keybuf' data. 137 * keybuf->length is the number of key bytes + data bytes. 138 * keybuf->offset is the number of key bytes. 139 * We add a record separator weight after the key in case 140 * (as is usual) we need to preserve the order of equal lines, 141 * and for 'sort -u'. 142 * The key itself will have had the correct weight applied. 143 */ 144 keypos = keybuf->data; 145 endkey = keybuf_end - line_size - 1; 146 if (endkey <= keypos) 147 /* No room for any key bytes */ 148 return 1; 149 150 for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) { 151 if ((keypos = enterfield(keypos, endkey, ftpos, 152 fieldtable->flags)) == NULL) 153 return (1); 154 } 155 *keypos++ = 0; 156 157 keybuf->offset = keypos - keybuf->data; 158 keybuf->length = keybuf->offset + line_size; 159 160 memcpy(keypos, line_data, line_size); 161 return (0); 162 } 163 164 /* 165 * constructs a field (as defined by -k) within a key 166 */ 167 static u_char * 168 enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld, 169 int gflags) 170 { 171 u_char *start, *end, *lineend, *mask, *lweight; 172 struct column icol, tcol; 173 u_int flags; 174 175 icol = cur_fld->icol; 176 tcol = cur_fld->tcol; 177 flags = cur_fld->flags; 178 start = icol.p->start; 179 lineend = clist[ncols].end; 180 if (flags & BI) 181 SKIP_BLANKS(start); 182 start += icol.indent; 183 start = min(start, lineend); 184 185 if (!tcol.num) 186 end = lineend; 187 else { 188 if (tcol.indent) { 189 end = tcol.p->start; 190 if (flags & BT) 191 SKIP_BLANKS(end); 192 end += tcol.indent; 193 end = min(end, lineend); 194 } else 195 end = tcol.p->end; 196 } 197 198 if (flags & N) 199 return number(tablepos, endkey, start, end, flags); 200 201 /* Bound check space - assuming nothing is skipped */ 202 if (tablepos + (end - start) + 1 >= endkey) 203 return NULL; 204 205 mask = cur_fld->mask; 206 lweight = cur_fld->weights; 207 for (; start < end; start++) { 208 if (!mask || mask[*start]) { 209 *tablepos++ = lweight[*start]; 210 } 211 } 212 /* Add extra byte to sort short keys correctly */ 213 *tablepos++ = flags & R ? 255 : 1; 214 return tablepos; 215 } 216 217 /* 218 * Numbers are converted to a floating point format (exponent & mantissa) 219 * so that they compare correctly as sequence of unsigned bytes. 220 * The output cannot contain a 0x00 byte (the record separator). 221 * Bytes 0x01 and 0xff are used to terminate positive and negative numbers 222 * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12. 223 * 224 * The first byte contain the overall sign, exponent sign and some of the 225 * exponent. These have to be ordered (-ve value, decreasing exponent), 226 * zero, (+ve value, increasing exponent). 227 * After excluding 0, 1, 0xff and 0x80 (used for zero) there are 61 228 * exponent values available, this isn't quite enough and the highest 229 * values are used to encode large exponents in multiple bytes. 230 * 231 * An exponent of zero has value 0xc0 for +ve numbers and 0x40 for -ves. 232 * 233 * The mantissa is stored 2 digits per byte offset by 0x40, for negative 234 * numbers the order must be reversed (they are subtracted from 0x100). 235 * 236 * Reverse sorts are done by inverting the sign of the number. 237 * 238 * We don't have to worry about REC_D, the key is terminated by 0x00. 239 */ 240 241 #define SIGNED(reverse, value) ((reverse) ? 0x100 - (value) : (value)) 242 243 /* Large exponents are encoded EXP_EXC_BITS per byte */ 244 #define EXP_ENC_BITS 7 245 #define EXP_ENC_VAL (1 << EXP_ENC_BITS) 246 #define EXP_ENC_MASK (EXP_ENC_VAL - 1) 247 #define MAX_EXP_ENC ((int)(sizeof(int) * 8 + (EXP_ENC_BITS-1))/EXP_ENC_BITS) 248 249 static u_char * 250 number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend, 251 int reverse) 252 { 253 int exponent = -1; 254 int had_dp = 0; 255 u_char *tline; 256 char ch; 257 unsigned int val; 258 u_char *last_nz_pos; 259 260 reverse &= R; 261 262 /* Give ourselves space for the key terminator */ 263 bufend--; 264 265 /* Ensure we have enough space for the exponent */ 266 if (pos + 1 + MAX_EXP_ENC > bufend) 267 return (NULL); 268 269 SKIP_BLANKS(line); 270 if (*line == '-') { /* set the sign */ 271 reverse ^= R; 272 line++; 273 } 274 /* eat initial zeroes */ 275 for (; *line == '0' && line < lineend; line++) 276 continue; 277 278 /* calculate exponents */ 279 if (*line == DECIMAL_POINT) { 280 /* Decimal fraction */ 281 had_dp = 1; 282 while (*++line == '0' && line < lineend) 283 exponent--; 284 } else { 285 /* Large (absolute) value, count digits */ 286 for (tline = line; *tline >= '0' && 287 *tline <= '9' && tline < lineend; tline++) 288 exponent++; 289 } 290 291 /* If the first/next character isn't a digit, value is zero */ 292 if (*line < '1' || *line > '9' || line >= lineend) { 293 /* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */ 294 /* XXX what about NaN, NAN, inf and INF */ 295 *pos++ = 0x80; 296 return pos; 297 } 298 299 /* Maybe here we should allow for e+12 (etc) */ 300 301 /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */ 302 exponent += 0xc0; 303 304 if (exponent > 0x80 + MAX_EXP_ENC && exponent < 0x100 - MAX_EXP_ENC) { 305 /* Value ok for simple encoding */ 306 *pos++ = SIGNED(reverse, exponent); 307 } else { 308 /* Out or range for a single byte */ 309 int c, t; 310 exponent -= 0xc0; 311 t = exponent > 0 ? exponent : -exponent; 312 /* Count how many 7-bit blocks are needed */ 313 for (c = 0; ; c++) { 314 t /= EXP_ENC_VAL; 315 if (t == 0) 316 break; 317 } 318 /* 'c' better be 0..4 here - but probably 0..2 */ 319 t = c; 320 /* Offset just outside valid range */ 321 t += 0x40 - MAX_EXP_ENC; 322 if (exponent < 0) 323 t = -t; 324 t += 0xc0; 325 *pos++ = SIGNED(reverse, t); 326 /* now add each 7-bit block (offset 0x40..0xbf) */ 327 for (; c >= 0; c--) { 328 t = exponent >> (c * EXP_ENC_BITS); 329 t = (t & EXP_ENC_MASK) + 0x40; 330 *pos++ = SIGNED(reverse, t); 331 } 332 } 333 334 /* Now add mantissa, 2 digits per byte */ 335 for (last_nz_pos = pos; line < lineend; ) { 336 if (pos >= bufend) 337 return NULL; 338 ch = *line++; 339 val = (ch - '0') * 10; 340 if (val > 90) { 341 if (ch == DECIMAL_POINT && !had_dp) { 342 had_dp = 1; 343 continue; 344 } 345 break; 346 } 347 while (line < lineend) { 348 ch = *line++; 349 if (ch == DECIMAL_POINT && !had_dp) { 350 had_dp = 1; 351 continue; 352 } 353 if (ch < '0' || ch > '9') 354 line = lineend; 355 else 356 val += ch - '0'; 357 break; 358 } 359 *pos++ = SIGNED(reverse, val + 0x40); 360 if (val != 0) 361 last_nz_pos = pos; 362 } 363 364 /* Add key terminator, deleting any trailing "00" */ 365 *last_nz_pos++ = SIGNED(reverse, 1); 366 367 return (last_nz_pos); 368 } 369