1 /* $NetBSD: fields.c,v 1.33 2013/01/20 10:12:58 apb Exp $ */ 2 3 /*- 4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Ben Harris and Jaromir Dolecek. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1993 34 * The Regents of the University of California. All rights reserved. 35 * 36 * This code is derived from software contributed to Berkeley by 37 * Peter McIlroy. 38 * 39 * Redistribution and use in source and binary forms, with or without 40 * modification, are permitted provided that the following conditions 41 * are met: 42 * 1. Redistributions of source code must retain the above copyright 43 * notice, this list of conditions and the following disclaimer. 44 * 2. Redistributions in binary form must reproduce the above copyright 45 * notice, this list of conditions and the following disclaimer in the 46 * documentation and/or other materials provided with the distribution. 47 * 3. Neither the name of the University nor the names of its contributors 48 * may be used to endorse or promote products derived from this software 49 * without specific prior written permission. 50 * 51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 61 * SUCH DAMAGE. 62 */ 63 64 /* Subroutines to generate sort keys. */ 65 66 #include "sort.h" 67 68 __RCSID("$NetBSD: fields.c,v 1.33 2013/01/20 10:12:58 apb Exp $"); 69 70 #define SKIP_BLANKS(ptr) { \ 71 if (BLANK & d_mask[*(ptr)]) \ 72 while (BLANK & d_mask[*(++(ptr))]); \ 73 } 74 75 #define NEXTCOL(pos) { \ 76 if (!SEP_FLAG) \ 77 while (BLANK & l_d_mask[*(++pos)]); \ 78 while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\ 79 } 80 81 static u_char *enterfield(u_char *, const u_char *, struct field *, int); 82 static u_char *number(u_char *, const u_char *, u_char *, u_char *, int); 83 static u_char *length(u_char *, const u_char *, u_char *, u_char *, int); 84 85 #if defined(__minix) 86 static u_char *numhex(u_char *, const u_char *, u_char *, u_char *, int); 87 #endif /* defined(__minix) */ 88 89 #define DECIMAL_POINT '.' 90 91 /* 92 * constructs sort key with leading recheader, followed by the key, 93 * followed by the original line. 94 */ 95 length_t 96 enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data, 97 size_t line_size, struct field fieldtable[]) 98 /* keybuf: pointer to start of key */ 99 { 100 int i; 101 u_char *l_d_mask; 102 u_char *lineend, *pos; 103 const u_char *endkey; 104 u_char *keypos; 105 struct coldesc *clpos; 106 int col = 1; 107 struct field *ftpos; 108 109 l_d_mask = d_mask; 110 pos = line_data - 1; 111 lineend = line_data + line_size-1; 112 /* don't include rec_delimiter */ 113 114 for (i = 0; i < ncols; i++) { 115 clpos = clist + i; 116 for (; (col < clpos->num) && (pos < lineend); col++) { 117 NEXTCOL(pos); 118 } 119 if (pos >= lineend) 120 break; 121 clpos->start = SEP_FLAG ? pos + 1 : pos; 122 NEXTCOL(pos); 123 clpos->end = pos; 124 col++; 125 if (pos >= lineend) { 126 clpos->end = lineend; 127 i++; 128 break; 129 } 130 } 131 for (; i <= ncols; i++) 132 clist[i].start = clist[i].end = lineend; 133 if (clist[0].start < line_data) 134 clist[0].start++; 135 136 /* 137 * We write the sort keys (concatenated) followed by the 138 * original line data (for output) as the 'keybuf' data. 139 * keybuf->length is the number of key bytes + data bytes. 140 * keybuf->offset is the number of key bytes. 141 * We add a record separator weight after the key in case 142 * (as is usual) we need to preserve the order of equal lines, 143 * and for 'sort -u'. 144 * The key itself will have had the correct weight applied. 145 */ 146 keypos = keybuf->data; 147 endkey = keybuf_end - line_size - 1; 148 if (endkey <= keypos) 149 /* No room for any key bytes */ 150 return 1; 151 152 for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) { 153 if ((keypos = enterfield(keypos, endkey, ftpos, 154 fieldtable->flags)) == NULL) 155 return (1); 156 } 157 158 keybuf->offset = keypos - keybuf->data; 159 keybuf->length = keybuf->offset + line_size; 160 161 /* 162 * Posix requires that equal keys be further sorted by the 163 * entire original record. 164 * NetBSD has (at least for some time) kept equal keys in 165 * their original order. 166 * For 'sort -u' posix_sort is unset. 167 */ 168 keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset; 169 170 memcpy(keypos, line_data, line_size); 171 return (0); 172 } 173 174 /* 175 * constructs a field (as defined by -k) within a key 176 */ 177 static u_char * 178 enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld, 179 int gflags) 180 { 181 u_char *start, *end, *lineend, *mask, *lweight; 182 struct column icol, tcol; 183 u_int flags; 184 185 icol = cur_fld->icol; 186 tcol = cur_fld->tcol; 187 flags = cur_fld->flags; 188 start = icol.p->start; 189 lineend = clist[ncols].end; 190 if (flags & BI) 191 SKIP_BLANKS(start); 192 start += icol.indent; 193 start = min(start, lineend); 194 195 if (!tcol.num) 196 end = lineend; 197 else { 198 if (tcol.indent) { 199 end = tcol.p->start; 200 if (flags & BT) 201 SKIP_BLANKS(end); 202 end += tcol.indent; 203 end = min(end, lineend); 204 } else 205 end = tcol.p->end; 206 } 207 208 if (flags & L) 209 return length(tablepos, endkey, start, end, flags); 210 if (flags & N) 211 return number(tablepos, endkey, start, end, flags); 212 #if defined(__minix) 213 if (flags & X) 214 return numhex(tablepos, endkey, start, end, flags); 215 #endif /* defined(__minix) */ 216 217 /* Bound check space - assuming nothing is skipped */ 218 if (tablepos + (end - start) + 1 >= endkey) 219 return NULL; 220 221 mask = cur_fld->mask; 222 lweight = cur_fld->weights; 223 for (; start < end; start++) { 224 if (!mask || mask[*start]) { 225 *tablepos++ = lweight[*start]; 226 } 227 } 228 /* Add extra byte (absent from lweight) to sort short keys correctly */ 229 *tablepos++ = lweight[REC_D]; 230 return tablepos; 231 } 232 233 /* 234 * Numbers are converted to a floating point format (exponent & mantissa) 235 * so that they compare correctly as sequence of unsigned bytes. 236 * Bytes 0x00 and 0xff are used to terminate positive and negative numbers 237 * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12. 238 * 239 * The first byte contain the overall sign, exponent sign and some of the 240 * exponent. These have to be ordered (-ve value, decreasing exponent), 241 * zero, (+ve value, increasing exponent). 242 * 243 * The first byte is 0x80 for zero, 0xc0 for +ve with exponent 0. 244 * -ve values are the 1's compliments (so 0x7f isn't used!). 245 * 246 * This only leaves 63 byte values for +ve exponents - which isn't enough. 247 * The largest 4 exponent values are used to hold a byte count of the 248 * number of following bytes that contain 8 exponent bits per byte, 249 * This lets us sort exponents from -2^31 to +2^31. 250 * 251 * The mantissa is stored 2 digits per byte offset by 0x40, for negative 252 * numbers the order must be reversed (they are bit inverted). 253 * 254 * Reverse sorts are done by inverting the sign of the number. 255 */ 256 #define MAX_EXP_ENC ((int)sizeof(int)) 257 258 static u_char * 259 number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend, 260 int reverse) 261 { 262 int exponent = -1; 263 int had_dp = 0; 264 u_char *tline; 265 char ch; 266 unsigned int val; 267 u_char *last_nz_pos; 268 u_char negate; 269 270 if (reverse & R) 271 negate = 0xff; 272 else 273 negate = 0; 274 275 /* Give ourselves space for the key terminator */ 276 bufend--; 277 278 /* Ensure we have enough space for the exponent */ 279 if (pos + 1 + MAX_EXP_ENC > bufend) 280 return (NULL); 281 282 SKIP_BLANKS(line); 283 if (*line == '-') { /* set the sign */ 284 negate ^= 0xff; 285 line++; 286 } else if (*line == '+') { 287 line++; 288 } 289 290 /* eat initial zeroes */ 291 for (; *line == '0' && line < lineend; line++) 292 continue; 293 294 /* calculate exponents */ 295 if (*line == DECIMAL_POINT) { 296 /* Decimal fraction */ 297 had_dp = 1; 298 while (*++line == '0' && line < lineend) 299 exponent--; 300 } else { 301 /* Large (absolute) value, count digits */ 302 for (tline = line; *tline >= '0' && 303 *tline <= '9' && tline < lineend; tline++) 304 exponent++; 305 } 306 307 /* If the first/next character isn't a digit, value is zero */ 308 if (*line < '1' || *line > '9' || line >= lineend) { 309 /* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */ 310 /* XXX what about NaN, NAN, inf and INF */ 311 *pos++ = 0x80; 312 return pos; 313 } 314 315 /* Maybe here we should allow for e+12 (etc) */ 316 317 if (exponent < 0x40 - MAX_EXP_ENC && -exponent < 0x40 - MAX_EXP_ENC) { 318 /* Value ok for simple encoding */ 319 /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */ 320 exponent += 0xc0; 321 *pos++ = negate ^ exponent; 322 } else { 323 /* Out or range for a single byte */ 324 int c, t; 325 t = exponent > 0 ? exponent : -exponent; 326 /* Count how many 8-bit bytes are needed */ 327 for (c = 0; ; c++) { 328 t >>= 8; 329 if (t == 0) 330 break; 331 } 332 /* 'c' better be 0..3 here - but probably 0..1 */ 333 /* Offset just outside valid range */ 334 t = c + 0x40 - MAX_EXP_ENC; 335 if (exponent < 0) 336 t = -t; 337 *pos++ = negate ^ (t + 0xc0); 338 /* now add each byte, most significant first */ 339 for (; c >= 0; c--) 340 *pos++ = negate ^ (exponent >> (c * 8)); 341 } 342 343 /* Finally add mantissa, 2 digits per byte */ 344 for (last_nz_pos = pos; line < lineend; ) { 345 if (pos >= bufend) 346 return NULL; 347 ch = *line++; 348 val = (ch - '0') * 10; 349 if (val > 90) { 350 if (ch == DECIMAL_POINT && !had_dp) { 351 had_dp = 1; 352 continue; 353 } 354 break; 355 } 356 while (line < lineend) { 357 ch = *line++; 358 if (ch == DECIMAL_POINT && !had_dp) { 359 had_dp = 1; 360 continue; 361 } 362 if (ch < '0' || ch > '9') 363 line = lineend; 364 else 365 val += ch - '0'; 366 break; 367 } 368 *pos++ = negate ^ (val + 0x40); 369 if (val != 0) 370 last_nz_pos = pos; 371 } 372 373 /* Add key terminator, deleting any trailing "00" */ 374 *last_nz_pos++ = negate; 375 376 return (last_nz_pos); 377 } 378 379 static u_char * 380 length(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend, 381 int flag) 382 { 383 u_char buf[32]; 384 int l; 385 SKIP_BLANKS(line); 386 l = snprintf((char *)buf, sizeof(buf), "%td", lineend - line); 387 return number(pos, bufend, buf, buf + l, flag); 388 } 389 390 #if defined(__minix) 391 static u_char * 392 numhex(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend, 393 int flag) 394 { 395 u_char buf[32]; 396 int64_t n = 0; 397 int l; 398 SKIP_BLANKS(line); 399 sscanf((const char *) pos, "%llx", &n); 400 l = snprintf((char *)buf, sizeof(buf), "%lld", n); 401 return number(pos, bufend, buf, buf + l, flag); 402 } 403 #endif /* defined(__minix) */ 404