1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include "fields.h" 29 30 /* 31 * fields 32 * 33 * Overview 34 * By a field, we mean the various delimited character sequences within each 35 * line of the input files. The sort key consists of an ordered sequence of 36 * fields, which need not include all possible fields for the given line. 37 * (Furthermore, not every line need contain sufficient fields for the fields 38 * given within the sort key. In fact, none of the lines in the input stream 39 * need contain sufficient fields.) 40 * 41 * There are two methods for specifying fields for sort(1); these are 42 * discussed in options.c. Here we discuss only the internal representation 43 * of fields, as used for constructing the collation vector for each line as 44 * defined by the sort key. 45 * 46 * Representation 47 * The sort key is a singly-linked list of field specifiers. At present, 48 * fields may belong to one of three species: alphabetical, numerical, or 49 * monthly; the species (f_species) then indicates the conversion function 50 * (f_convert) used to transform the raw characters of the character sequence 51 * to a collatable form. (In principle, this allows us to consider future 52 * field species such as hexadecimal.) 53 * 54 * Fields and offsets are numbered such that zero refers to the first field or 55 * character, respectively. Thus, the interpretation of a key specifier, m.n, 56 * is that the field begins at the nth character beyond the mth occurence of 57 * the key separator. If the blanks flag has been specified, then the field 58 * begins at the nth non-blank character past the mth key separator. If the 59 * key separator is unspecified, then the key separator is defined as one or 60 * more blank characters. 61 * 62 * In general, the various options afforded by sort may be broken into two 63 * categories: field species and field modifiers. For each field species, 64 * there is one or more conversion routines that take a delimited character 65 * sequence and convert it to a character sequence collatable by strcmp() or 66 * memcmp(). For field species that may be further modified, such as the 67 * fold-to-uppercase option for alphabetic fields, the conversion routine may 68 * be aware of how the modifier affects collation. Finally, the no-modifiers 69 * case may present an opportunity for a simplified, faster version. 70 * 71 * Code Structure 72 * The code paths for single-byte and multi-byte locales diverge significantly 73 * in fields.c. Most routines have an *_wide() version, which produces an 74 * equivalent effect for line records whose data field is composed of wide 75 * characters (wchar_t). However, the l_collated field of a line record is 76 * always composed of characters, so that the radix sorts provided in 77 * internal.c can work in both single- and multi-byte locales. Thus, in the 78 * various convert_*_wide() routines, the output is placed in l_collated, with 79 * a length multiplier of 4. 80 */ 81 82 #define BEFORE_NUMBER 0x0 83 #define IN_NUMBER 0x1 84 85 static char numerical_separator; 86 static char numerical_decimal; 87 static char monetary_separator; 88 static char monetary_decimal; 89 90 static wchar_t w_numerical_separator; 91 static wchar_t w_numerical_decimal; 92 static wchar_t w_monetary_separator; 93 static wchar_t w_monetary_decimal; 94 95 #define MONTHS_IN_YEAR 12 96 #define MAX_MON_LEN 20 97 98 enum { MO_NONE = 1, MO_OFFSET = 2 }; 99 100 static char *months[MONTHS_IN_YEAR]; 101 static size_t month_lengths[MONTHS_IN_YEAR]; 102 static wchar_t *w_months[MONTHS_IN_YEAR]; 103 static size_t w_month_lengths[MONTHS_IN_YEAR]; 104 105 #define DECIMAL_CHAR (numerical_decimal) 106 #define IS_BLANK(x) (isspace((uchar_t)(x)) && (x) != '\n') 107 #define IS_SEPARATOR(x) \ 108 ((numerical_separator != '\0' && (x) == numerical_separator) || \ 109 (monetary_separator != '\0' && (x) == monetary_separator)) 110 #define IS_DECIMAL(x) \ 111 ((x) == numerical_decimal || \ 112 (monetary_decimal != '\0' && (x) == monetary_decimal)) 113 #define W_DECIMAL_CHAR (w_numerical_decimal) 114 #define W_IS_BLANK(x) (iswspace(x) && (x) != L'\n') 115 #define W_IS_SEPARATOR(x) \ 116 ((numerical_separator != '\0' && (x) == w_numerical_separator) || \ 117 (monetary_separator != '\0' && (x) == w_monetary_separator)) 118 #define W_IS_DECIMAL(x) \ 119 (((x) == w_numerical_decimal) || \ 120 (monetary_decimal != '\0' && (x) == w_monetary_decimal)) 121 122 #define INTERFIELD_SEPARATOR '\0' 123 #define W_INTERFIELD_SEPARATOR L'\0' 124 125 #define INT_SIGN_FLIP_MASK 0x80000000 126 #define INT_SIGN_PASS_MASK 0x00000000 127 128 /* 129 * strx_ops_t, xfrm_len, and xfrm_cpy: In the case where we are sorting in the 130 * C locale, we want to avoid the expense of transforming strings to collatable 131 * forms since, by definition, an arbitrary string in the C locale is already in 132 * its collatable form. Therefore, we construct a small ops vector (the 133 * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into 134 * strlen()-like behaviour, and xfrm_cpy() to make strncpy() appear 135 * strxfrm()-like. 136 */ 137 /*ARGSUSED*/ 138 static size_t 139 xfrm_len(const char *s2, size_t len) 140 { 141 return (strxfrm(NULL, s2, 0) + 1); 142 } 143 144 /* 145 * The length represented by n includes a null character, so to return the 146 * correct length we subtract 1. Note that this function is only used by 147 * field_convert_alpha, and isn't for general use, as it assumes that n is the 148 * length of s2 plus a null character. 149 */ 150 static size_t 151 C_ncpy(char *s1, const char *s2, size_t n) 152 { 153 (void) strncpy(s1, s2, n); 154 return (n - 1); 155 } 156 157 /*ARGSUSED*/ 158 static size_t 159 C_len(const char *s, size_t len) 160 { 161 ASSERT(s != NULL); 162 return (len); 163 } 164 165 typedef struct _strx_ops { 166 size_t (*sx_len)(const char *, size_t); 167 size_t (*sx_xfrm)(char *, const char *, size_t); 168 } strx_ops_t; 169 170 static const strx_ops_t C_ops = { C_len, C_ncpy }; 171 static const strx_ops_t SB_ops = { xfrm_len, strxfrm }; 172 173 static const strx_ops_t *xfrm_ops; 174 175 static void 176 field_initialize_separator(void) 177 { 178 /* 179 * A locale need not define all of the cases below: only decimal_point 180 * must be defined. Furthermore, sort(1) has traditionally not used the 181 * positive_sign and negative_sign, grouping, or currency_symbols (or 182 * their numeric counterparts, if any). 183 */ 184 struct lconv *conv = localeconv(); 185 186 if (!xstreql(conv->thousands_sep, "")) { 187 numerical_separator = *conv->thousands_sep; 188 (void) mbtowc(&w_numerical_separator, conv->thousands_sep, 189 MB_CUR_MAX); 190 } else 191 numerical_separator = '\0'; 192 193 if (!xstreql(conv->mon_thousands_sep, "")) { 194 monetary_separator = *conv->mon_thousands_sep; 195 (void) mbtowc(&w_monetary_separator, conv->mon_thousands_sep, 196 MB_CUR_MAX); 197 } else 198 monetary_separator = '\0'; 199 200 if (!xstreql(conv->mon_decimal_point, "")) { 201 monetary_decimal = *conv->mon_decimal_point; 202 (void) mbtowc(&w_monetary_decimal, conv->mon_decimal_point, 203 MB_CUR_MAX); 204 } else 205 monetary_decimal = '\0'; 206 207 numerical_decimal = *conv->decimal_point; 208 (void) mbtowc(&w_numerical_decimal, conv->decimal_point, MB_CUR_MAX); 209 } 210 211 static void 212 field_initialize_month(int is_c_locale) 213 { 214 int i; 215 int j; 216 struct tm this_month; 217 const char *c_months[MONTHS_IN_YEAR] = { 218 "JAN", "FEB", "MAR", "APR", "MAY", "JUN", 219 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" 220 }; 221 222 char month_name[MAX_MON_LEN * MB_LEN_MAX]; 223 wchar_t w_month_name[MAX_MON_LEN]; 224 225 if (is_c_locale) { 226 for (i = 0; i < MONTHS_IN_YEAR; i++) { 227 months[i] = (char *)c_months[i]; 228 month_lengths[i] = strlen(c_months[i]); 229 } 230 /* 231 * We don't need to initialize the wide version of the month 232 * names. 233 */ 234 return; 235 } 236 237 (void) memset(&this_month, 0, sizeof (this_month)); 238 239 for (i = 0; i < MONTHS_IN_YEAR; i++) { 240 this_month.tm_mon = i; 241 242 (void) strftime(month_name, sizeof (month_name), 243 "%b", &this_month); 244 245 for (j = 0; j < strlen(month_name); j++) 246 month_name[j] = toupper(month_name[j]); 247 (void) mbstowcs(w_month_name, month_name, MAX_MON_LEN); 248 249 months[i] = strdup(month_name); 250 month_lengths[i] = strlen(month_name); 251 w_months[i] = wsdup(w_month_name); 252 w_month_lengths[i] = wslen(w_month_name); 253 } 254 } 255 256 void 257 field_initialize(sort_t *S) 258 { 259 field_initialize_month(S->m_c_locale); 260 field_initialize_separator(); 261 262 if (S->m_c_locale) 263 xfrm_ops = &C_ops; 264 else 265 xfrm_ops = &SB_ops; 266 } 267 268 field_t * 269 field_new(sort_t *S) 270 { 271 field_t *F = safe_realloc(NULL, sizeof (field_t)); 272 273 F->f_start_field = -1; 274 F->f_start_offset = -1; 275 F->f_end_field = -1; 276 F->f_end_offset = -1; 277 F->f_next = NULL; 278 279 if (S == NULL) { 280 F->f_species = ALPHA; 281 F->f_options = 0; 282 } else { 283 F->f_species = S->m_default_species; 284 F->f_options = S->m_field_options; 285 } 286 287 return (F); 288 } 289 290 void 291 field_delete(field_t *F) 292 { 293 free(F); 294 } 295 296 /* 297 * The recursive implementation of field_add_to_chain() given below is 298 * inappropriate if function calls are expensive, or a truly large number of 299 * fields are anticipated. 300 */ 301 void 302 field_add_to_chain(field_t **F, field_t *A) 303 { 304 if (*F == NULL) 305 *F = A; 306 else 307 field_add_to_chain(&((*F)->f_next), A); 308 } 309 310 #ifdef DEBUG 311 #ifndef _LP64 312 #define FIELD_FMT \ 313 "\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n" 314 #else /* !_LP64 */ 315 #define FIELD_FMT \ 316 "\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n" 317 #endif /* !_LP64 */ 318 319 /* 320 * field_print is used only for debugging purposes. 321 */ 322 void 323 field_print(field_t *F) 324 { 325 char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"}; 326 int status = 0; 327 328 (void) fprintf(stderr, "Type: %s", field_names[F->f_species]); 329 (void) fprintf(stderr, "\tOptions: "); 330 331 if (F->f_options & FIELD_REVERSE_COMPARISONS) { 332 (void) fprintf(stderr, "REVERSE"); 333 status++; 334 } 335 if (F->f_options & FIELD_DICTIONARY_ORDER) { 336 (void) fprintf(stderr, "DICTIONARY "); 337 status++; 338 } 339 if (F->f_options & FIELD_FOLD_UPPERCASE) { 340 (void) fprintf(stderr, "UPPERCASE "); 341 status++; 342 } 343 if (F->f_options & FIELD_IGNORE_NONPRINTABLES) { 344 (void) fprintf(stderr, "PRINTABLES "); 345 status++; 346 } 347 if (F->f_options & FIELD_IGNORE_BLANKS_START) { 348 (void) fprintf(stderr, "BLANKS_START "); 349 status++; 350 } 351 if (F->f_options & FIELD_IGNORE_BLANKS_END) { 352 (void) fprintf(stderr, "BLANKS_END "); 353 status++; 354 } 355 356 if (status == 0) 357 (void) fprintf(stderr, "NO_MODIFIERS"); 358 359 (void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset, 360 F->f_end_field, F->f_end_offset); 361 } 362 #endif /* DEBUG */ 363 364 static ssize_t 365 field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks) 366 { 367 char *S = L->l_data.sp; 368 char *T = S; 369 char *eol = S + L->l_data_length; 370 ssize_t field = is_end ? F->f_end_field : F->f_start_field; 371 ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset; 372 ssize_t ret; 373 374 ASSERT(is_end || field > -1); 375 376 if (is_end && field == -1) 377 return (L->l_data_length); 378 379 while (field-- > 0) { 380 while (T < eol && IS_BLANK(*T)) 381 T++; 382 383 while (T < eol && !IS_BLANK(*T)) 384 T++; 385 } 386 387 if ((!is_end || offset > 0) && is_blanks) { 388 while (IS_BLANK(*T)) 389 T++; 390 } 391 392 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) 393 return (L->l_data_length); 394 395 return (ret); 396 } 397 398 static void 399 field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end) 400 { 401 ASSERT(F->f_start_field > -1); 402 403 *start = field_boundary(F, L, 0, 404 F->f_options & FIELD_IGNORE_BLANKS_START); 405 *end = field_boundary(F, L, 1, 406 F->f_options & FIELD_IGNORE_BLANKS_END); 407 } 408 409 static ssize_t 410 field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks) 411 { 412 wchar_t *S = L->l_data.wp; 413 wchar_t *T = S; 414 wchar_t *eol = S + L->l_data_length; 415 ssize_t field = is_end ? F->f_end_field : F->f_start_field; 416 ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset; 417 ssize_t ret; 418 419 ASSERT(is_end || field > -1); 420 421 if (is_end && field == -1) 422 return (L->l_data_length); 423 424 while (field-- > 0) { 425 while (T < eol && W_IS_BLANK(*T)) 426 T++; 427 428 while (T < eol && !W_IS_BLANK(*T)) 429 T++; 430 } 431 432 if ((!is_end || offset > 0) && is_blanks) { 433 while (W_IS_BLANK(*T)) 434 T++; 435 } 436 437 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) 438 return (L->l_data_length); 439 440 return (ret); 441 } 442 443 static void 444 field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end) 445 { 446 ASSERT(F->f_start_field > -1); 447 448 *start = field_boundary_wide(F, L, 0, 449 F->f_options & FIELD_IGNORE_BLANKS_START); 450 *end = field_boundary_wide(F, L, 1, 451 F->f_options & FIELD_IGNORE_BLANKS_END); 452 } 453 454 static ssize_t 455 field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks, 456 vchar_t delimiter) 457 { 458 char *S = L->l_data.sp; 459 char *T = S; 460 char *eol = S + L->l_data_length; 461 ssize_t field = is_end ? F->f_end_field : F->f_start_field; 462 ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset; 463 ssize_t ret; 464 465 ASSERT(is_end || field > -1); 466 467 if (is_end && field == -1) 468 return (L->l_data_length); 469 470 while (field-- > 0) { 471 T = xstrnchr(T, delimiter.sc, eol - T); 472 if (T == NULL || T > eol) 473 return (L->l_data_length); 474 475 T++; 476 } 477 478 if ((!is_end || offset != 0) && is_blanks) { 479 while (IS_BLANK(*T)) 480 T++; 481 } 482 483 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) { 484 if (S[L->l_data_length - 1] == delimiter.sc) { 485 return (L->l_data_length - 1); 486 } else { 487 return (L->l_data_length); 488 } 489 } 490 491 if (is_end && offset == 0) 492 ret--; 493 494 return (ret); 495 } 496 497 /* 498 * field_delimit_tabbed() is called when a field separator has been defined 499 * using the -t option. The character at the offset, start, is either one or 500 * more character positions past the delimiter marking the start of the 501 * field, or at the end of the line. 502 */ 503 static void 504 field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end, 505 vchar_t delimiter) 506 { 507 ASSERT(F->f_start_field > -1); 508 509 *start = field_boundary_tabbed(F, L, 0, F->f_options & 510 FIELD_IGNORE_BLANKS_START, delimiter); 511 *end = field_boundary_tabbed(F, L, 1, F->f_options & 512 FIELD_IGNORE_BLANKS_END, delimiter); 513 } 514 515 static ssize_t 516 field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks, 517 vchar_t delimiter) 518 { 519 wchar_t *S = L->l_data.wp; 520 wchar_t *T = S; 521 wchar_t *eol = S + L->l_data_length; 522 ssize_t field = is_end ? F->f_end_field : F->f_start_field; 523 ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset; 524 ssize_t ret; 525 526 ASSERT(is_end || field > -1); 527 528 if (is_end && field == -1) 529 return (L->l_data_length); 530 531 while (field-- > 0) { 532 T = xwsnchr(T, delimiter.wc, eol - T); 533 if (T == NULL || T > eol) 534 return (L->l_data_length); 535 536 T++; 537 } 538 539 if ((!is_end || offset != 0) && is_blanks) { 540 while (W_IS_BLANK(*T)) 541 T++; 542 } 543 544 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) { 545 if (S[L->l_data_length - 1] == delimiter.wc) { 546 return (L->l_data_length - 1); 547 } else { 548 return (L->l_data_length); 549 } 550 } 551 552 if (is_end && offset == 0) 553 ret--; 554 555 return (ret); 556 } 557 558 static void 559 field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start, 560 ssize_t *end, vchar_t delimiter) 561 { 562 ASSERT(F->f_start_field > -1); 563 564 *start = field_boundary_tabbed_wide(F, L, 0, F->f_options & 565 FIELD_IGNORE_BLANKS_START, delimiter); 566 *end = field_boundary_tabbed_wide(F, L, 1, F->f_options & 567 FIELD_IGNORE_BLANKS_END, delimiter); 568 } 569 570 /*ARGSUSED*/ 571 ssize_t 572 field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter, 573 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 574 { 575 int j; 576 ssize_t val; 577 char month_candidate[MAX_MON_LEN * MB_LEN_MAX]; 578 ssize_t month_length = data_length; 579 ssize_t month_offset = data_offset; 580 581 if (sizeof (char) > L->l_collate_bufsize - coll_offset) 582 return (-1); 583 584 (void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX); 585 586 587 /* 588 * The month field formally begins with the first non-blank character. 589 */ 590 while (IS_BLANK(*(L->l_data.sp + month_offset))) { 591 month_offset++; 592 month_length--; 593 } 594 595 for (j = 0; j < MAX_MON_LEN && j < month_length; j++) 596 month_candidate[j] = toupper((L->l_data.sp + month_offset)[j]); 597 598 for (j = 0; j < MONTHS_IN_YEAR; j++) { 599 if (xstrneql(month_candidate, months[j], month_lengths[j])) { 600 *(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET; 601 return (1); 602 } 603 } 604 605 /* 606 * no matching month; copy string into field. required behaviour is 607 * that "month-free" keys sort before month-sortable keys, so insert 608 * a "will sort first" token. 609 */ 610 *(L->l_collate.sp + coll_offset) = '\0' + MO_NONE; 611 612 val = field_convert_alpha_simple(F, L, delimiter, data_offset, 613 data_length, coll_offset + 1); 614 615 if (val < 0) 616 return (-1); 617 else 618 return (val + 1); 619 } 620 621 /*ARGSUSED*/ 622 ssize_t 623 field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter, 624 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 625 { 626 ssize_t j; 627 ssize_t val; 628 wchar_t month_candidate[MAX_MON_LEN]; 629 wchar_t *month; 630 wchar_t *buffer = L->l_collate.wp + coll_offset; 631 ssize_t month_length = data_length; 632 ssize_t month_offset = data_offset; 633 634 if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) < 635 sizeof (wchar_t)) 636 return (-1); 637 638 (void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t)); 639 640 641 while (W_IS_BLANK(*(L->l_data.wp + month_offset))) { 642 month_offset++; 643 month_length--; 644 } 645 646 month = L->l_data.wp + month_offset; 647 648 for (j = 0; j < MAX_MON_LEN && j < month_length; j++) 649 month_candidate[j] = towupper(month[j]); 650 651 for (j = 0; j < MONTHS_IN_YEAR; j++) 652 if (xwcsneql(month_candidate, w_months[j], 653 w_month_lengths[j])) { 654 *buffer = L'\0' + j + MO_OFFSET; 655 return (1); 656 } 657 658 *buffer = L'\0' + MO_NONE; 659 660 val = field_convert_alpha_wide(F, L, delimiter, data_offset, 661 data_length, coll_offset + sizeof (wchar_t)); 662 663 if (val < 0) 664 return (-1); 665 else 666 return (val + 1); 667 } 668 669 /* 670 * field_convert_alpha() always fails with return value -1 if the converted 671 * string would cause l_collate_length to exceed l_collate_bufsize 672 */ 673 /*ARGSUSED*/ 674 ssize_t 675 field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter, 676 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 677 { 678 static char *compose; 679 static ssize_t compose_length; 680 681 ssize_t clength = 0; 682 ssize_t dlength; 683 ssize_t i; 684 685 if (compose_length < (data_length + 1)) { 686 compose_length = data_length + 1; 687 compose = safe_realloc(compose, compose_length * sizeof (char)); 688 } 689 690 for (i = data_offset; i < data_offset + data_length; i++) { 691 char t = (L->l_data.sp)[i]; 692 693 if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && 694 !isprint((uchar_t)t)) 695 continue; 696 697 if ((F->f_options & FIELD_DICTIONARY_ORDER) && 698 !isalnum((uchar_t)t) && !isspace((uchar_t)t)) 699 continue; 700 701 if (F->f_options & FIELD_FOLD_UPPERCASE) 702 t = toupper(t); 703 704 compose[clength++] = t; 705 } 706 compose[clength] = '\0'; 707 708 if ((dlength = xfrm_ops->sx_len(compose, clength)) < 709 L->l_collate_bufsize - coll_offset) 710 return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset, 711 compose, dlength + 1)); 712 else 713 return ((ssize_t)-1); 714 } 715 716 /*ARGSUSED*/ 717 ssize_t 718 field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter, 719 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 720 { 721 static char *compose; 722 static ssize_t compose_length; 723 724 ssize_t clength; 725 ssize_t dlength; 726 727 if (compose_length < (data_length + 1)) { 728 compose_length = data_length + 1; 729 compose = safe_realloc(compose, compose_length * sizeof (char)); 730 } 731 732 (void) memcpy(compose, L->l_data.sp + data_offset, data_length); 733 clength = data_length; 734 compose[clength] = '\0'; 735 736 if ((dlength = xfrm_ops->sx_len(compose, clength)) < 737 L->l_collate_bufsize - coll_offset) 738 return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset, 739 compose, dlength + 1)); 740 else 741 return ((ssize_t)-1); 742 } 743 744 /*ARGSUSED*/ 745 ssize_t 746 field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter, 747 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 748 { 749 wchar_t *compose = safe_realloc(NULL, (data_length + 1) * 750 sizeof (wchar_t)); 751 ssize_t clength = 0; 752 ssize_t dlength; 753 ssize_t i; 754 ssize_t ret; 755 756 for (i = data_offset; i < data_offset + data_length; i++) { 757 wchar_t t = (L->l_data.wp)[i]; 758 759 if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && !iswprint(t)) 760 continue; 761 762 if ((F->f_options & FIELD_DICTIONARY_ORDER) && !iswalnum(t) && 763 !iswspace(t)) 764 continue; 765 766 if (F->f_options & FIELD_FOLD_UPPERCASE) 767 t = towupper(t); 768 769 compose[clength++] = t; 770 } 771 compose[clength] = L'\0'; 772 773 dlength = wcsxfrm(NULL, compose, (size_t)0); 774 if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize - 775 coll_offset * sizeof (wchar_t)) { 776 ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose, 777 (size_t)dlength + 1); 778 } else { 779 ret = (ssize_t)-1; 780 } 781 782 safe_free(compose); 783 784 return (ret); 785 } 786 787 /* 788 * field_convert_numeric() converts the given field into a collatable numerical 789 * sequence. The sequence is ordered as { log, integer, separator, fraction }, 790 * with an optional sentinel component at the sequence end. 791 */ 792 /*ARGSUSED*/ 793 ssize_t 794 field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter, 795 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 796 { 797 char *number; 798 char *buffer = L->l_collate.sp + coll_offset; 799 ssize_t length; 800 801 char sign = '2'; 802 int log_ten; 803 char *digits = buffer + 1 + sizeof (int) / sizeof (char); 804 size_t j = 0; 805 size_t i; 806 807 int state = BEFORE_NUMBER; 808 809 number = L->l_data.sp + data_offset; 810 length = data_length; 811 812 /* 813 * Eat leading blanks, if any. 814 */ 815 for (i = 0; i < length; i++) 816 if (!IS_BLANK(number[i])) 817 break; 818 819 /* 820 * Test that there is sufficient size in the collation buffer for our 821 * number. In addition to the possible remaining characters in the 822 * field, we also require space for the sign (char), logarithm (int), 823 * separator (char), and as many as two string terminators (for reverse 824 * sorts). 825 */ 826 if (((length - i) + 4 * sizeof (char) + sizeof (int)) > 827 (L->l_collate_bufsize - coll_offset)) 828 return ((ssize_t)-1); 829 830 /* 831 * If negative, set sign. 832 */ 833 if (number[i] == '-') { 834 i++; 835 sign = '0'; 836 } 837 838 /* 839 * Scan integer part; eat leading zeros. 840 */ 841 for (; i < length; i++) { 842 if (IS_SEPARATOR(number[i])) 843 continue; 844 845 if (number[i] == '0' && !(state & IN_NUMBER)) 846 continue; 847 848 if (!isdigit((uchar_t)number[i])) 849 break; 850 851 state |= IN_NUMBER; 852 if (sign == '0') 853 digits[j++] = '0' + '9' - number[i]; 854 else 855 digits[j++] = number[i]; 856 } 857 858 if (i < length && IS_DECIMAL(number[i])) { 859 /* 860 * Integer part terminated by decimal. 861 */ 862 digits[j] = DECIMAL_CHAR; 863 log_ten = j++; 864 865 /* 866 * Scan fractional part. 867 */ 868 for (++i; i < length; i++) { 869 if (IS_SEPARATOR(number[i])) 870 continue; 871 872 if (!isdigit((uchar_t)number[i])) 873 break; 874 875 if (number[i] != '0') 876 state |= IN_NUMBER; 877 878 if (sign == '0') 879 digits[j++] = '0' + '9' - number[i]; 880 else 881 digits[j++] = number[i]; 882 } 883 884 if (sign == '0') 885 digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR); 886 } else { 887 /* 888 * Nondigit or end of string seen. 889 */ 890 log_ten = (int)j; 891 if (sign == '0') 892 digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR); 893 else 894 digits[j] = INTERFIELD_SEPARATOR; 895 } 896 897 if ((state & IN_NUMBER) == 0) { 898 /* 899 * A non-zero number was not detected; treat as defined zero. 900 */ 901 sign = '1'; 902 log_ten = 0; 903 digits[0] = '0'; 904 j = 1; 905 } 906 907 /* 908 * We subtract a constant from the log of negative values so that 909 * they will correctly precede positive values with a zero logarithm. 910 */ 911 if (sign == '0') { 912 if (j != 0) 913 log_ten = -log_ten - 2; 914 else 915 /* 916 * Special case for -0. 917 */ 918 log_ten = -1; 919 } 920 921 buffer[0] = sign; 922 923 /* 924 * Place logarithm in big-endian form. 925 */ 926 for (i = 0; i < sizeof (int); i++) 927 buffer[i + 1] = (log_ten << (i * NBBY)) 928 >> ((sizeof (int) - 1) * NBBY); 929 930 if (j + sizeof (char) + sizeof (int) < 931 L->l_collate_bufsize - coll_offset) 932 return (j + 1 + sizeof (int)); 933 else 934 return ((ssize_t)-1); 935 } 936 937 /*ARGSUSED*/ 938 ssize_t 939 field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter, 940 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 941 { 942 wchar_t *number; 943 wchar_t *buffer = L->l_collate.wp + coll_offset; 944 char *lbuffer; 945 ssize_t length; 946 947 wchar_t sign = L'2'; 948 int log_ten; 949 wchar_t *digits = buffer + 1 + sizeof (int)/sizeof (wchar_t); 950 size_t j = 0; 951 size_t i; 952 953 int state = BEFORE_NUMBER; 954 955 number = L->l_data.wp + data_offset; 956 length = data_length; 957 958 for (i = 0; i < length; i++) 959 if (!W_IS_BLANK(number[i])) 960 break; 961 962 if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) + 963 sizeof (int)) > (L->l_collate_bufsize - coll_offset)) 964 return ((ssize_t)-1); 965 966 if (number[i] == L'-') { 967 i++; 968 sign = L'0'; 969 } 970 971 for (; i < length; i++) { 972 if (W_IS_SEPARATOR(number[i])) 973 continue; 974 975 if (number[i] == L'0' && !(state & IN_NUMBER)) 976 continue; 977 978 if (!iswdigit(number[i])) 979 break; 980 981 state |= IN_NUMBER; 982 if (sign == L'0') 983 digits[j++] = L'0' + L'9' - number[i]; 984 else 985 digits[j++] = number[i]; 986 } 987 988 if (i < length && W_IS_DECIMAL(number[i])) { 989 digits[j] = W_DECIMAL_CHAR; 990 log_ten = j++; 991 992 for (++i; i < length; i++) { 993 if (W_IS_SEPARATOR(number[i])) 994 continue; 995 996 if (!iswdigit(number[i])) 997 break; 998 999 if (number[i] != L'0') 1000 state |= IN_NUMBER; 1001 1002 if (sign == L'0') 1003 digits[j++] = L'0' + L'9' - number[i]; 1004 else 1005 digits[j++] = number[i]; 1006 } 1007 1008 if (sign == L'0') 1009 digits[j++] = (wchar_t)(WCHAR_MAX - 1010 W_INTERFIELD_SEPARATOR); 1011 } else { 1012 log_ten = (int)j; 1013 if (sign == L'0') 1014 digits[j++] = (wchar_t)(WCHAR_MAX - 1015 W_INTERFIELD_SEPARATOR); 1016 else 1017 digits[j] = W_INTERFIELD_SEPARATOR; 1018 } 1019 1020 if ((state & IN_NUMBER) == 0) { 1021 sign = L'1'; 1022 log_ten = 0; 1023 digits[0] = L'0'; 1024 j = 1; 1025 } 1026 1027 if (sign == L'0') { 1028 if (j != 0) 1029 log_ten = -log_ten - 2; 1030 else 1031 log_ten = -1; 1032 } 1033 1034 buffer[0] = sign; 1035 /* 1036 * Place logarithm in big-endian form. 1037 */ 1038 lbuffer = (char *)(buffer + 1); 1039 for (i = 0; i < sizeof (int); i++) 1040 lbuffer[i] = (log_ten << (i * NBBY)) 1041 >> ((sizeof (int) - 1) * NBBY); 1042 1043 if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) < 1044 L->l_collate_bufsize - coll_offset * sizeof (wchar_t)) 1045 return (j + 1 + sizeof (int) / sizeof (wchar_t)); 1046 else 1047 return ((ssize_t)-1); 1048 } 1049 1050 /* 1051 * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour 1052 * when coll_offset exceeds l_collate_bufsize. 1053 */ 1054 ssize_t 1055 field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator) 1056 { 1057 ssize_t coll_offset = 0; 1058 ssize_t start, end, distance; 1059 field_t *cur_fieldp = F; 1060 1061 while (cur_fieldp != NULL) { 1062 /* 1063 * delimit field 1064 */ 1065 if (!field_separator.sc) 1066 field_delimit(cur_fieldp, L, &start, &end); 1067 else 1068 field_delimit_tabbed(cur_fieldp, L, &start, &end, 1069 field_separator); 1070 1071 distance = 0; 1072 if (end - start > 0 || 1073 (end - start == 0 && F->f_species == NUMERIC)) { 1074 /* 1075 * Convert field, appending to collated field of line 1076 * record. 1077 */ 1078 distance = cur_fieldp->f_convert(cur_fieldp, L, 1079 field_separator, start, end - start, coll_offset); 1080 1081 /* 1082 * branch should execute comparatively rarely 1083 */ 1084 if (distance == -1) { 1085 if (flags & FCV_REALLOC) { 1086 ASSERT(L->l_collate_bufsize > 0); 1087 L->l_collate_bufsize *= 2; 1088 L->l_collate.sp = 1089 safe_realloc(L->l_collate.sp, 1090 L->l_collate_bufsize); 1091 1092 __S(stats_incr_convert_reallocs()); 1093 continue; 1094 } else { 1095 /* 1096 * FCV_FAIL has been set. 1097 */ 1098 return (-1); 1099 } 1100 } 1101 } 1102 1103 if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) { 1104 xstrninv(L->l_collate.sp, coll_offset, distance); 1105 *(L->l_collate.sp + coll_offset + distance) = 1106 (char)(UCHAR_MAX - INTERFIELD_SEPARATOR); 1107 distance++; 1108 } 1109 1110 ASSERT(distance >= 0); 1111 coll_offset += distance; 1112 if (coll_offset >= L->l_collate_bufsize) { 1113 if (flags & FCV_REALLOC) { 1114 ASSERT(L->l_collate_bufsize > 0); 1115 L->l_collate_bufsize *= 2; 1116 L->l_collate.sp = safe_realloc(L->l_collate.sp, 1117 L->l_collate_bufsize); 1118 1119 __S(stats_incr_convert_reallocs()); 1120 } else { 1121 return (-1); 1122 } 1123 } 1124 *(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR; 1125 coll_offset++; 1126 1127 cur_fieldp = cur_fieldp->f_next; 1128 } 1129 1130 L->l_collate_length = coll_offset; 1131 1132 return (L->l_collate_length); 1133 } 1134 1135 ssize_t 1136 field_convert_wide(field_t *F, line_rec_t *L, int flags, 1137 vchar_t field_separator) 1138 { 1139 ssize_t coll_offset = 0; 1140 ssize_t start, end, distance; 1141 field_t *cur_fieldp = F; 1142 1143 while (cur_fieldp != NULL) { 1144 if (!field_separator.wc) 1145 field_delimit_wide(cur_fieldp, L, &start, &end); 1146 else 1147 field_delimit_tabbed_wide(cur_fieldp, L, &start, &end, 1148 field_separator); 1149 1150 distance = 0; 1151 if (end - start > 0 || 1152 end - start == 0 && F->f_species == NUMERIC) { 1153 distance = cur_fieldp->f_convert(cur_fieldp, L, 1154 field_separator, start, end - start, coll_offset); 1155 1156 if (distance == -1) { 1157 if (flags & FCV_REALLOC) { 1158 ASSERT(L->l_collate_bufsize > 0); 1159 L->l_collate_bufsize *= 2; 1160 L->l_collate.wp = safe_realloc( 1161 L->l_collate.wp, 1162 L->l_collate_bufsize); 1163 1164 __S(stats_incr_convert_reallocs()); 1165 continue; 1166 } else { 1167 return (-1); 1168 } 1169 } 1170 } 1171 1172 if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) { 1173 xwcsninv(L->l_collate.wp, coll_offset, distance); 1174 *(L->l_collate.wp + coll_offset + distance) = 1175 WCHAR_MAX - INTERFIELD_SEPARATOR; 1176 distance++; 1177 } 1178 1179 ASSERT(distance >= 0); 1180 coll_offset += distance; 1181 if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) { 1182 if (flags & FCV_REALLOC) { 1183 ASSERT(L->l_collate_bufsize > 0); 1184 L->l_collate_bufsize *= 2; 1185 L->l_collate.wp = safe_realloc(L->l_collate.wp, 1186 L->l_collate_bufsize); 1187 1188 __S(stats_incr_convert_reallocs()); 1189 } else { 1190 return (-1); 1191 } 1192 } 1193 *(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR; 1194 coll_offset++; 1195 1196 cur_fieldp = cur_fieldp->f_next; 1197 } 1198 1199 L->l_collate_length = coll_offset * sizeof (wchar_t); 1200 #ifdef _LITTLE_ENDIAN 1201 xwcsntomsb(L->l_collate.wp, coll_offset); 1202 #endif /* _LITTLE_ENDIAN */ 1203 1204 return (L->l_collate_length); 1205 } 1206 1207 /* 1208 * line_convert() and line_convert_wide() are called when the collation vector 1209 * of a given line has been exhausted, and we are performing the final, 1210 * full-line comparison required by the sort specification. Because we do not 1211 * have a guarantee that l_data is null-terminated, we create an explicitly 1212 * null-terminated copy suitable for transformation to a collatable form for the 1213 * current locale. 1214 */ 1215 static void 1216 line_convert(line_rec_t *L) 1217 { 1218 static ssize_t bufsize; 1219 static char *buffer; 1220 1221 if (L->l_raw_collate.sp != NULL) 1222 return; 1223 1224 if (L->l_data_length + 1 > bufsize) { 1225 buffer = safe_realloc(buffer, L->l_data_length + 1); 1226 bufsize = L->l_data_length + 1; 1227 } 1228 1229 (void) strncpy(buffer, L->l_data.sp, L->l_data_length); 1230 buffer[L->l_data_length] = '\0'; 1231 1232 L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp, 1233 xfrm_ops->sx_len(buffer, L->l_data_length) + 1); 1234 xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer, 1235 xfrm_ops->sx_len(buffer, L->l_data_length) + 1); 1236 1237 __S(stats_incr_line_conversions()); 1238 } 1239 1240 static void 1241 line_convert_wide(line_rec_t *L) 1242 { 1243 static wchar_t *buffer; 1244 static ssize_t bufsize; 1245 1246 ssize_t dlength; 1247 1248 if (L->l_raw_collate.wp != NULL) 1249 return; 1250 1251 if (L->l_data_length + 1 > bufsize) { 1252 buffer = safe_realloc(buffer, (L->l_data_length + 1) * 1253 sizeof (wchar_t)); 1254 bufsize = L->l_data_length + 1; 1255 } 1256 1257 (void) wcsncpy(buffer, L->l_data.wp, L->l_data_length); 1258 buffer[L->l_data_length] = L'\0'; 1259 1260 dlength = wcsxfrm(NULL, buffer, 0) + 1; 1261 L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp, dlength * 1262 sizeof (wchar_t)); 1263 (void) wcsxfrm(L->l_raw_collate.wp, buffer, dlength); 1264 1265 __S(stats_incr_line_conversions()); 1266 } 1267 1268 /* 1269 * Our convention for collation is 1270 * 1271 * A > B => r > 0, 1272 * A == B => r = 0, 1273 * A < B => r < 0 1274 * 1275 * This convention is consistent with the definition of memcmp(), strcmp(), and 1276 * strncmp() in the C locale. collated() and collated_wide() have two optional 1277 * behaviours, which can be activated by setting the appropriate values in 1278 * coll_flag: COLL_UNIQUE, which returns 0 if the l_collate fields of the line 1279 * records being compared are identical; COLL_DATA_ONLY, which ignores the 1280 * l_collate field for the current comparison; and COLL_REVERSE, which flips the 1281 * result for comparisons that fall through to an actual data comparison (since 1282 * the collated vector should already reflect reverse ordering from field 1283 * conversion). 1284 */ 1285 int 1286 collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag) 1287 { 1288 ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth; 1289 int r; 1290 int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK : 1291 INT_SIGN_PASS_MASK; 1292 ssize_t la, lb; 1293 1294 if (!(coll_flag & COLL_DATA_ONLY)) { 1295 if (ml > 0) { 1296 r = memcmp(A->l_collate.sp + depth, 1297 B->l_collate.sp + depth, ml); 1298 1299 if (r) 1300 return (r); 1301 } 1302 1303 if (A->l_collate_length < B->l_collate_length) 1304 return (-1); 1305 1306 if (A->l_collate_length > B->l_collate_length) 1307 return (1); 1308 } 1309 1310 /* 1311 * This is where we cut out, if we know that the current sort is over 1312 * the entire line. 1313 */ 1314 if (coll_flag & COLL_UNIQUE) 1315 return (0); 1316 1317 line_convert(A); 1318 line_convert(B); 1319 1320 la = strlen(A->l_raw_collate.sp); 1321 lb = strlen(B->l_raw_collate.sp); 1322 1323 r = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp, MIN(la, lb)); 1324 1325 if (r) 1326 return (r ^ mask); 1327 1328 if (la < lb) 1329 return (-1 ^ mask); 1330 1331 if (la > lb) 1332 return (1 ^ mask); 1333 1334 return (0); 1335 } 1336 1337 int 1338 collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag) 1339 { 1340 ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth; 1341 int r; 1342 int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK : 1343 INT_SIGN_PASS_MASK; 1344 ssize_t la, lb; 1345 1346 if (!(coll_flag & COLL_DATA_ONLY)) { 1347 if (ml > 0) { 1348 r = memcmp(A->l_collate.sp + depth, 1349 B->l_collate.sp + depth, ml); 1350 1351 if (r) 1352 return (r); 1353 } 1354 if (A->l_collate_length < B->l_collate_length) 1355 return (-1); 1356 1357 if (A->l_collate_length > B->l_collate_length) 1358 return (1); 1359 } 1360 1361 if (coll_flag & COLL_UNIQUE) 1362 return (0); 1363 1364 line_convert_wide(A); 1365 line_convert_wide(B); 1366 1367 la = wcslen(A->l_raw_collate.wp); 1368 lb = wcslen(B->l_raw_collate.wp); 1369 1370 r = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp, 1371 (size_t)MIN(la, lb)); 1372 1373 if (r) 1374 return (r ^ mask); 1375 1376 if (la < lb) 1377 return (-1 ^ mask); 1378 1379 if (la > lb) 1380 return (1 ^ mask); 1381 1382 return (0); 1383 } 1384