10Sstevel@tonic-gate /*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
54623Srm88369 * Common Development and Distribution License (the "License").
64623Srm88369 * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate *
80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate * See the License for the specific language governing permissions
110Sstevel@tonic-gate * and limitations under the License.
120Sstevel@tonic-gate *
130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate *
190Sstevel@tonic-gate * CDDL HEADER END
200Sstevel@tonic-gate */
21*12067SJohn.Beck@Sun.COM
220Sstevel@tonic-gate /*
23*12067SJohn.Beck@Sun.COM * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
240Sstevel@tonic-gate */
250Sstevel@tonic-gate
260Sstevel@tonic-gate #include "fields.h"
270Sstevel@tonic-gate
280Sstevel@tonic-gate /*
290Sstevel@tonic-gate * fields
300Sstevel@tonic-gate *
310Sstevel@tonic-gate * Overview
320Sstevel@tonic-gate * By a field, we mean the various delimited character sequences within each
330Sstevel@tonic-gate * line of the input files. The sort key consists of an ordered sequence of
340Sstevel@tonic-gate * fields, which need not include all possible fields for the given line.
350Sstevel@tonic-gate * (Furthermore, not every line need contain sufficient fields for the fields
360Sstevel@tonic-gate * given within the sort key. In fact, none of the lines in the input stream
370Sstevel@tonic-gate * need contain sufficient fields.)
380Sstevel@tonic-gate *
390Sstevel@tonic-gate * There are two methods for specifying fields for sort(1); these are
400Sstevel@tonic-gate * discussed in options.c. Here we discuss only the internal representation
410Sstevel@tonic-gate * of fields, as used for constructing the collation vector for each line as
420Sstevel@tonic-gate * defined by the sort key.
430Sstevel@tonic-gate *
440Sstevel@tonic-gate * Representation
450Sstevel@tonic-gate * The sort key is a singly-linked list of field specifiers. At present,
460Sstevel@tonic-gate * fields may belong to one of three species: alphabetical, numerical, or
470Sstevel@tonic-gate * monthly; the species (f_species) then indicates the conversion function
480Sstevel@tonic-gate * (f_convert) used to transform the raw characters of the character sequence
490Sstevel@tonic-gate * to a collatable form. (In principle, this allows us to consider future
500Sstevel@tonic-gate * field species such as hexadecimal.)
510Sstevel@tonic-gate *
 * Fields and offsets are numbered such that zero refers to the first field or
 * character, respectively.  Thus, the interpretation of a key specifier, m.n,
 * is that the field begins at the nth character beyond the mth occurrence of
 * the key separator.  If the blanks flag has been specified, then the field
 * begins at the nth non-blank character past the mth key separator.  If the
 * key separator is unspecified, then the key separator is defined as one or
 * more blank characters.
590Sstevel@tonic-gate *
600Sstevel@tonic-gate * In general, the various options afforded by sort may be broken into two
610Sstevel@tonic-gate * categories: field species and field modifiers. For each field species,
620Sstevel@tonic-gate * there is one or more conversion routines that take a delimited character
630Sstevel@tonic-gate * sequence and convert it to a character sequence collatable by strcmp() or
640Sstevel@tonic-gate * memcmp(). For field species that may be further modified, such as the
650Sstevel@tonic-gate * fold-to-uppercase option for alphabetic fields, the conversion routine may
660Sstevel@tonic-gate * be aware of how the modifier affects collation. Finally, the no-modifiers
670Sstevel@tonic-gate * case may present an opportunity for a simplified, faster version.
680Sstevel@tonic-gate *
690Sstevel@tonic-gate * Code Structure
700Sstevel@tonic-gate * The code paths for single-byte and multi-byte locales diverge significantly
710Sstevel@tonic-gate * in fields.c. Most routines have an *_wide() version, which produces an
720Sstevel@tonic-gate * equivalent effect for line records whose data field is composed of wide
730Sstevel@tonic-gate * characters (wchar_t). However, the l_collated field of a line record is
740Sstevel@tonic-gate * always composed of characters, so that the radix sorts provided in
750Sstevel@tonic-gate * internal.c can work in both single- and multi-byte locales. Thus, in the
760Sstevel@tonic-gate * various convert_*_wide() routines, the output is placed in l_collated, with
770Sstevel@tonic-gate * a length multiplier of 4.
780Sstevel@tonic-gate */
790Sstevel@tonic-gate
/*
 * Scanner state values.  Their consumers lie outside this part of the file.
 */
#define	BEFORE_NUMBER	0x0
#define	IN_NUMBER	0x1

/*
 * Locale punctuation, single-byte form.  These are filled in by
 * field_initialize_separator(); for the three optional characters a value of
 * '\0' means that the locale does not define one.
 */
static char numerical_separator;
static char numerical_decimal;
static char monetary_separator;
static char monetary_decimal;

/*
 * Locale punctuation, wide-character form.  Each is only meaningful when its
 * single-byte counterpart above is set, which is why the W_* predicates below
 * test the narrow variable before comparing against the wide one.
 */
static wchar_t w_numerical_separator;
static wchar_t w_numerical_decimal;
static wchar_t w_monetary_separator;
static wchar_t w_monetary_decimal;

#define	MONTHS_IN_YEAR	12
#define	MAX_MON_LEN	20

/*
 * Leading collation tokens for month keys:  MO_NONE marks a key that names no
 * month, and MO_OFFSET is added to the zero-based month index so that every
 * recognized month collates after MO_NONE.
 */
enum { MO_NONE = 1, MO_OFFSET = 2 };

/*
 * Upper-cased abbreviated month names and their lengths, in single-byte and
 * wide-character form.  Populated by field_initialize_month().
 */
static char *months[MONTHS_IN_YEAR];
static size_t month_lengths[MONTHS_IN_YEAR];
static wchar_t *w_months[MONTHS_IN_YEAR];
static size_t w_month_lengths[MONTHS_IN_YEAR];

/*
 * Single-byte character classification.  A newline is never a blank, and an
 * unset ('\0') separator or monetary decimal never matches.
 */
#define	DECIMAL_CHAR	(numerical_decimal)
#define	IS_BLANK(x)	(isspace((uchar_t)(x)) && (x) != '\n')
#define	IS_SEPARATOR(x)	\
	((numerical_separator != '\0' && (x) == numerical_separator) || \
	(monetary_separator != '\0' && (x) == monetary_separator))
#define	IS_DECIMAL(x)	\
	((x) == numerical_decimal || \
	(monetary_decimal != '\0' && (x) == monetary_decimal))

/*
 * Wide-character equivalents of the predicates above.
 */
#define	W_DECIMAL_CHAR	(w_numerical_decimal)
#define	W_IS_BLANK(x)	(iswspace(x) && (x) != L'\n')
#define	W_IS_SEPARATOR(x)	\
	((numerical_separator != '\0' && (x) == w_numerical_separator) || \
	(monetary_separator != '\0' && (x) == w_monetary_separator))
#define	W_IS_DECIMAL(x)	\
	(((x) == w_numerical_decimal) || \
	(monetary_decimal != '\0' && (x) == w_monetary_decimal))

#define	INTERFIELD_SEPARATOR	'\0'
#define	W_INTERFIELD_SEPARATOR	L'\0'

/*
 * Sign masks; their users lie outside this part of the file.
 */
#define	INT_SIGN_FLIP_MASK	0x80000000
#define	INT_SIGN_PASS_MASK	0x00000000
1250Sstevel@tonic-gate
/*
 * String transformation ops.  When sorting in the C locale a string is, by
 * definition, already in its collatable form, so the expense of strxfrm() can
 * be avoided.  Two interchangeable ops vectors are therefore provided:
 * C_ops (C_len() and C_ncpy(), a plain copy) and SB_ops (xfrm_len() and
 * strxfrm() itself).
 *
 * xfrm_len() reports the buffer size needed to hold the transformed form of
 * s2:  the length strxfrm() says it would produce, plus one for the
 * terminating null.  The len argument exists only to match the sx_len
 * signature and is ignored.
 */
/*ARGSUSED*/
static size_t
xfrm_len(const char *s2, size_t len)
{
	size_t transformed = strxfrm(NULL, s2, 0);

	return (transformed + 1);
}
1410Sstevel@tonic-gate
/*
 * C-locale stand-in for strxfrm():  copy s2 into s1 with strncpy() and report
 * the resulting string length.  The caller passes n as the length of s2 plus
 * one for the terminating null, so the length reported is n - 1.  Because of
 * that assumption this routine is only suitable for field_convert_alpha() and
 * is not a general-purpose copy.
 */
static size_t
C_ncpy(char *s1, const char *s2, size_t n)
{
	size_t length = n - 1;

	(void) strncpy(s1, s2, n);

	return (length);
}
1540Sstevel@tonic-gate
/*
 * C-locale length operation:  no transformation takes place, so the length
 * supplied by the caller is handed straight back.  The string itself is only
 * checked for being non-NULL.
 */
/*ARGSUSED*/
static size_t
C_len(const char *s, size_t len)
{
	ASSERT(s != NULL);

	return (len);
}
1620Sstevel@tonic-gate
1630Sstevel@tonic-gate typedef struct _strx_ops {
1640Sstevel@tonic-gate size_t (*sx_len)(const char *, size_t);
1650Sstevel@tonic-gate size_t (*sx_xfrm)(char *, const char *, size_t);
1660Sstevel@tonic-gate } strx_ops_t;
1670Sstevel@tonic-gate
1680Sstevel@tonic-gate static const strx_ops_t C_ops = { C_len, C_ncpy };
1690Sstevel@tonic-gate static const strx_ops_t SB_ops = { xfrm_len, strxfrm };
1700Sstevel@tonic-gate
1710Sstevel@tonic-gate static const strx_ops_t *xfrm_ops;
1720Sstevel@tonic-gate
1730Sstevel@tonic-gate static void
field_initialize_separator(void)1740Sstevel@tonic-gate field_initialize_separator(void)
1750Sstevel@tonic-gate {
1760Sstevel@tonic-gate /*
1770Sstevel@tonic-gate * A locale need not define all of the cases below: only decimal_point
1780Sstevel@tonic-gate * must be defined. Furthermore, sort(1) has traditionally not used the
1790Sstevel@tonic-gate * positive_sign and negative_sign, grouping, or currency_symbols (or
1800Sstevel@tonic-gate * their numeric counterparts, if any).
1810Sstevel@tonic-gate */
1820Sstevel@tonic-gate struct lconv *conv = localeconv();
1830Sstevel@tonic-gate
1840Sstevel@tonic-gate if (!xstreql(conv->thousands_sep, "")) {
1850Sstevel@tonic-gate numerical_separator = *conv->thousands_sep;
1860Sstevel@tonic-gate (void) mbtowc(&w_numerical_separator, conv->thousands_sep,
1870Sstevel@tonic-gate MB_CUR_MAX);
1880Sstevel@tonic-gate } else
1890Sstevel@tonic-gate numerical_separator = '\0';
1900Sstevel@tonic-gate
1910Sstevel@tonic-gate if (!xstreql(conv->mon_thousands_sep, "")) {
1920Sstevel@tonic-gate monetary_separator = *conv->mon_thousands_sep;
1930Sstevel@tonic-gate (void) mbtowc(&w_monetary_separator, conv->mon_thousands_sep,
1940Sstevel@tonic-gate MB_CUR_MAX);
1950Sstevel@tonic-gate } else
1960Sstevel@tonic-gate monetary_separator = '\0';
1970Sstevel@tonic-gate
1980Sstevel@tonic-gate if (!xstreql(conv->mon_decimal_point, "")) {
1990Sstevel@tonic-gate monetary_decimal = *conv->mon_decimal_point;
2000Sstevel@tonic-gate (void) mbtowc(&w_monetary_decimal, conv->mon_decimal_point,
2010Sstevel@tonic-gate MB_CUR_MAX);
2020Sstevel@tonic-gate } else
2030Sstevel@tonic-gate monetary_decimal = '\0';
2040Sstevel@tonic-gate
2050Sstevel@tonic-gate numerical_decimal = *conv->decimal_point;
2060Sstevel@tonic-gate (void) mbtowc(&w_numerical_decimal, conv->decimal_point, MB_CUR_MAX);
2070Sstevel@tonic-gate }
2080Sstevel@tonic-gate
2090Sstevel@tonic-gate static void
field_initialize_month(int is_c_locale)2100Sstevel@tonic-gate field_initialize_month(int is_c_locale)
2110Sstevel@tonic-gate {
2120Sstevel@tonic-gate int i;
2130Sstevel@tonic-gate int j;
2140Sstevel@tonic-gate struct tm this_month;
2150Sstevel@tonic-gate const char *c_months[MONTHS_IN_YEAR] = {
2160Sstevel@tonic-gate "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
2170Sstevel@tonic-gate "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
2180Sstevel@tonic-gate };
2190Sstevel@tonic-gate
2200Sstevel@tonic-gate char month_name[MAX_MON_LEN * MB_LEN_MAX];
2210Sstevel@tonic-gate wchar_t w_month_name[MAX_MON_LEN];
2220Sstevel@tonic-gate
2230Sstevel@tonic-gate if (is_c_locale) {
2240Sstevel@tonic-gate for (i = 0; i < MONTHS_IN_YEAR; i++) {
2250Sstevel@tonic-gate months[i] = (char *)c_months[i];
2260Sstevel@tonic-gate month_lengths[i] = strlen(c_months[i]);
2270Sstevel@tonic-gate }
2280Sstevel@tonic-gate /*
2290Sstevel@tonic-gate * We don't need to initialize the wide version of the month
2300Sstevel@tonic-gate * names.
2310Sstevel@tonic-gate */
2320Sstevel@tonic-gate return;
2330Sstevel@tonic-gate }
2340Sstevel@tonic-gate
2350Sstevel@tonic-gate (void) memset(&this_month, 0, sizeof (this_month));
2360Sstevel@tonic-gate
2370Sstevel@tonic-gate for (i = 0; i < MONTHS_IN_YEAR; i++) {
2380Sstevel@tonic-gate this_month.tm_mon = i;
2390Sstevel@tonic-gate
2400Sstevel@tonic-gate (void) strftime(month_name, sizeof (month_name),
2410Sstevel@tonic-gate "%b", &this_month);
2420Sstevel@tonic-gate
2430Sstevel@tonic-gate for (j = 0; j < strlen(month_name); j++)
2440Sstevel@tonic-gate month_name[j] = toupper(month_name[j]);
2450Sstevel@tonic-gate (void) mbstowcs(w_month_name, month_name, MAX_MON_LEN);
2460Sstevel@tonic-gate
2470Sstevel@tonic-gate months[i] = strdup(month_name);
2480Sstevel@tonic-gate month_lengths[i] = strlen(month_name);
2490Sstevel@tonic-gate w_months[i] = wsdup(w_month_name);
2500Sstevel@tonic-gate w_month_lengths[i] = wslen(w_month_name);
2510Sstevel@tonic-gate }
2520Sstevel@tonic-gate }
2530Sstevel@tonic-gate
2540Sstevel@tonic-gate void
field_initialize(sort_t * S)2550Sstevel@tonic-gate field_initialize(sort_t *S)
2560Sstevel@tonic-gate {
2570Sstevel@tonic-gate field_initialize_month(S->m_c_locale);
2580Sstevel@tonic-gate field_initialize_separator();
2590Sstevel@tonic-gate
2600Sstevel@tonic-gate if (S->m_c_locale)
2610Sstevel@tonic-gate xfrm_ops = &C_ops;
2620Sstevel@tonic-gate else
2630Sstevel@tonic-gate xfrm_ops = &SB_ops;
2640Sstevel@tonic-gate }
2650Sstevel@tonic-gate
2660Sstevel@tonic-gate field_t *
field_new(sort_t * S)2670Sstevel@tonic-gate field_new(sort_t *S)
2680Sstevel@tonic-gate {
2690Sstevel@tonic-gate field_t *F = safe_realloc(NULL, sizeof (field_t));
2700Sstevel@tonic-gate
2710Sstevel@tonic-gate F->f_start_field = -1;
2720Sstevel@tonic-gate F->f_start_offset = -1;
2730Sstevel@tonic-gate F->f_end_field = -1;
2740Sstevel@tonic-gate F->f_end_offset = -1;
2750Sstevel@tonic-gate F->f_next = NULL;
2760Sstevel@tonic-gate
2770Sstevel@tonic-gate if (S == NULL) {
2780Sstevel@tonic-gate F->f_species = ALPHA;
2790Sstevel@tonic-gate F->f_options = 0;
2800Sstevel@tonic-gate } else {
2810Sstevel@tonic-gate F->f_species = S->m_default_species;
2820Sstevel@tonic-gate F->f_options = S->m_field_options;
2830Sstevel@tonic-gate }
2840Sstevel@tonic-gate
2850Sstevel@tonic-gate return (F);
2860Sstevel@tonic-gate }
2870Sstevel@tonic-gate
2880Sstevel@tonic-gate void
field_delete(field_t * F)2890Sstevel@tonic-gate field_delete(field_t *F)
2900Sstevel@tonic-gate {
2910Sstevel@tonic-gate free(F);
2920Sstevel@tonic-gate }
2930Sstevel@tonic-gate
2940Sstevel@tonic-gate /*
2950Sstevel@tonic-gate * The recursive implementation of field_add_to_chain() given below is
2960Sstevel@tonic-gate * inappropriate if function calls are expensive, or a truly large number of
2970Sstevel@tonic-gate * fields are anticipated.
2980Sstevel@tonic-gate */
2990Sstevel@tonic-gate void
field_add_to_chain(field_t ** F,field_t * A)3000Sstevel@tonic-gate field_add_to_chain(field_t **F, field_t *A)
3010Sstevel@tonic-gate {
3020Sstevel@tonic-gate if (*F == NULL)
3030Sstevel@tonic-gate *F = A;
3040Sstevel@tonic-gate else
3050Sstevel@tonic-gate field_add_to_chain(&((*F)->f_next), A);
3060Sstevel@tonic-gate }
3070Sstevel@tonic-gate
#ifdef DEBUG
#ifndef _LP64
#define	FIELD_FMT \
"\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n"
#else /* !_LP64 */
#define	FIELD_FMT \
"\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n"
#endif /* !_LP64 */

/*
 * field_print is used only for debugging purposes.  It writes the species,
 * the set option flags (or NO_MODIFIERS if there are none) and the start and
 * end positions of F to stderr.  Every option name is followed by a space so
 * that consecutive names do not run together.
 */
void
field_print(field_t *F)
{
	char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"};
	int status = 0;

	(void) fprintf(stderr, "Type: %s", field_names[F->f_species]);
	(void) fprintf(stderr, "\tOptions: ");

	if (F->f_options & FIELD_REVERSE_COMPARISONS) {
		(void) fprintf(stderr, "REVERSE ");
		status++;
	}
	if (F->f_options & FIELD_DICTIONARY_ORDER) {
		(void) fprintf(stderr, "DICTIONARY ");
		status++;
	}
	if (F->f_options & FIELD_FOLD_UPPERCASE) {
		(void) fprintf(stderr, "UPPERCASE ");
		status++;
	}
	if (F->f_options & FIELD_IGNORE_NONPRINTABLES) {
		(void) fprintf(stderr, "PRINTABLES ");
		status++;
	}
	if (F->f_options & FIELD_IGNORE_BLANKS_START) {
		(void) fprintf(stderr, "BLANKS_START ");
		status++;
	}
	if (F->f_options & FIELD_IGNORE_BLANKS_END) {
		(void) fprintf(stderr, "BLANKS_END ");
		status++;
	}

	if (status == 0)
		(void) fprintf(stderr, "NO_MODIFIERS");

	(void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset,
	    F->f_end_field, F->f_end_offset);
}
#endif /* DEBUG */
3610Sstevel@tonic-gate
3620Sstevel@tonic-gate static ssize_t
field_boundary(field_t * F,line_rec_t * L,int is_end,int is_blanks)3630Sstevel@tonic-gate field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks)
3640Sstevel@tonic-gate {
3650Sstevel@tonic-gate char *S = L->l_data.sp;
3660Sstevel@tonic-gate char *T = S;
3670Sstevel@tonic-gate char *eol = S + L->l_data_length;
3680Sstevel@tonic-gate ssize_t field = is_end ? F->f_end_field : F->f_start_field;
3690Sstevel@tonic-gate ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
3700Sstevel@tonic-gate ssize_t ret;
3710Sstevel@tonic-gate
3720Sstevel@tonic-gate ASSERT(is_end || field > -1);
3730Sstevel@tonic-gate
3740Sstevel@tonic-gate if (is_end && field == -1)
3750Sstevel@tonic-gate return (L->l_data_length);
3760Sstevel@tonic-gate
3770Sstevel@tonic-gate while (field-- > 0) {
3780Sstevel@tonic-gate while (T < eol && IS_BLANK(*T))
3790Sstevel@tonic-gate T++;
3800Sstevel@tonic-gate
3810Sstevel@tonic-gate while (T < eol && !IS_BLANK(*T))
3820Sstevel@tonic-gate T++;
3830Sstevel@tonic-gate }
3840Sstevel@tonic-gate
3850Sstevel@tonic-gate if ((!is_end || offset > 0) && is_blanks) {
3860Sstevel@tonic-gate while (IS_BLANK(*T))
3870Sstevel@tonic-gate T++;
3880Sstevel@tonic-gate }
3890Sstevel@tonic-gate
3900Sstevel@tonic-gate if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
3910Sstevel@tonic-gate return (L->l_data_length);
3920Sstevel@tonic-gate
3930Sstevel@tonic-gate return (ret);
3940Sstevel@tonic-gate }
3950Sstevel@tonic-gate
3960Sstevel@tonic-gate static void
field_delimit(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end)3970Sstevel@tonic-gate field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
3980Sstevel@tonic-gate {
3990Sstevel@tonic-gate ASSERT(F->f_start_field > -1);
4000Sstevel@tonic-gate
4010Sstevel@tonic-gate *start = field_boundary(F, L, 0,
4020Sstevel@tonic-gate F->f_options & FIELD_IGNORE_BLANKS_START);
4030Sstevel@tonic-gate *end = field_boundary(F, L, 1,
4040Sstevel@tonic-gate F->f_options & FIELD_IGNORE_BLANKS_END);
4050Sstevel@tonic-gate }
4060Sstevel@tonic-gate
4070Sstevel@tonic-gate static ssize_t
field_boundary_wide(field_t * F,line_rec_t * L,int is_end,int is_blanks)4080Sstevel@tonic-gate field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks)
4090Sstevel@tonic-gate {
4100Sstevel@tonic-gate wchar_t *S = L->l_data.wp;
4110Sstevel@tonic-gate wchar_t *T = S;
4120Sstevel@tonic-gate wchar_t *eol = S + L->l_data_length;
4130Sstevel@tonic-gate ssize_t field = is_end ? F->f_end_field : F->f_start_field;
4140Sstevel@tonic-gate ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
4150Sstevel@tonic-gate ssize_t ret;
4160Sstevel@tonic-gate
4170Sstevel@tonic-gate ASSERT(is_end || field > -1);
4180Sstevel@tonic-gate
4190Sstevel@tonic-gate if (is_end && field == -1)
4200Sstevel@tonic-gate return (L->l_data_length);
4210Sstevel@tonic-gate
4220Sstevel@tonic-gate while (field-- > 0) {
4230Sstevel@tonic-gate while (T < eol && W_IS_BLANK(*T))
4240Sstevel@tonic-gate T++;
4250Sstevel@tonic-gate
4260Sstevel@tonic-gate while (T < eol && !W_IS_BLANK(*T))
4270Sstevel@tonic-gate T++;
4280Sstevel@tonic-gate }
4290Sstevel@tonic-gate
4300Sstevel@tonic-gate if ((!is_end || offset > 0) && is_blanks) {
4310Sstevel@tonic-gate while (W_IS_BLANK(*T))
4320Sstevel@tonic-gate T++;
4330Sstevel@tonic-gate }
4340Sstevel@tonic-gate
4350Sstevel@tonic-gate if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
4360Sstevel@tonic-gate return (L->l_data_length);
4370Sstevel@tonic-gate
4380Sstevel@tonic-gate return (ret);
4390Sstevel@tonic-gate }
4400Sstevel@tonic-gate
4410Sstevel@tonic-gate static void
field_delimit_wide(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end)4420Sstevel@tonic-gate field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
4430Sstevel@tonic-gate {
4440Sstevel@tonic-gate ASSERT(F->f_start_field > -1);
4450Sstevel@tonic-gate
4460Sstevel@tonic-gate *start = field_boundary_wide(F, L, 0,
4470Sstevel@tonic-gate F->f_options & FIELD_IGNORE_BLANKS_START);
4480Sstevel@tonic-gate *end = field_boundary_wide(F, L, 1,
4490Sstevel@tonic-gate F->f_options & FIELD_IGNORE_BLANKS_END);
4500Sstevel@tonic-gate }
4510Sstevel@tonic-gate
4520Sstevel@tonic-gate static ssize_t
field_boundary_tabbed(field_t * F,line_rec_t * L,int is_end,int is_blanks,vchar_t delimiter)4530Sstevel@tonic-gate field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks,
4540Sstevel@tonic-gate vchar_t delimiter)
4550Sstevel@tonic-gate {
4560Sstevel@tonic-gate char *S = L->l_data.sp;
4570Sstevel@tonic-gate char *T = S;
4580Sstevel@tonic-gate char *eol = S + L->l_data_length;
4590Sstevel@tonic-gate ssize_t field = is_end ? F->f_end_field : F->f_start_field;
4600Sstevel@tonic-gate ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
4610Sstevel@tonic-gate ssize_t ret;
4620Sstevel@tonic-gate
4630Sstevel@tonic-gate ASSERT(is_end || field > -1);
4640Sstevel@tonic-gate
4650Sstevel@tonic-gate if (is_end && field == -1)
4660Sstevel@tonic-gate return (L->l_data_length);
4670Sstevel@tonic-gate
4680Sstevel@tonic-gate while (field-- > 0) {
4690Sstevel@tonic-gate T = xstrnchr(T, delimiter.sc, eol - T);
4700Sstevel@tonic-gate if (T == NULL || T > eol)
4710Sstevel@tonic-gate return (L->l_data_length);
4720Sstevel@tonic-gate
4730Sstevel@tonic-gate T++;
4740Sstevel@tonic-gate }
4750Sstevel@tonic-gate
4760Sstevel@tonic-gate if ((!is_end || offset != 0) && is_blanks) {
4770Sstevel@tonic-gate while (IS_BLANK(*T))
4780Sstevel@tonic-gate T++;
4790Sstevel@tonic-gate }
4800Sstevel@tonic-gate
4814623Srm88369 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
482*12067SJohn.Beck@Sun.COM if (L->l_data_length <= 0)
483*12067SJohn.Beck@Sun.COM return (0);
4844623Srm88369 if (S[L->l_data_length - 1] == delimiter.sc) {
4854623Srm88369 return (L->l_data_length - 1);
4864623Srm88369 } else {
4874623Srm88369 return (L->l_data_length);
4884623Srm88369 }
4894623Srm88369 }
4900Sstevel@tonic-gate
4910Sstevel@tonic-gate if (is_end && offset == 0)
4920Sstevel@tonic-gate ret--;
4930Sstevel@tonic-gate
4940Sstevel@tonic-gate return (ret);
4950Sstevel@tonic-gate }
4960Sstevel@tonic-gate
4970Sstevel@tonic-gate /*
4980Sstevel@tonic-gate * field_delimit_tabbed() is called when a field separator has been defined
4990Sstevel@tonic-gate * using the -t option. The character at the offset, start, is either one or
5000Sstevel@tonic-gate * more character positions past the delimiter marking the start of the
5010Sstevel@tonic-gate * field, or at the end of the line.
5020Sstevel@tonic-gate */
5030Sstevel@tonic-gate static void
field_delimit_tabbed(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end,vchar_t delimiter)5040Sstevel@tonic-gate field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end,
5050Sstevel@tonic-gate vchar_t delimiter)
5060Sstevel@tonic-gate {
5070Sstevel@tonic-gate ASSERT(F->f_start_field > -1);
5080Sstevel@tonic-gate
5090Sstevel@tonic-gate *start = field_boundary_tabbed(F, L, 0, F->f_options &
5100Sstevel@tonic-gate FIELD_IGNORE_BLANKS_START, delimiter);
5110Sstevel@tonic-gate *end = field_boundary_tabbed(F, L, 1, F->f_options &
5120Sstevel@tonic-gate FIELD_IGNORE_BLANKS_END, delimiter);
5130Sstevel@tonic-gate }
5140Sstevel@tonic-gate
5150Sstevel@tonic-gate static ssize_t
field_boundary_tabbed_wide(field_t * F,line_rec_t * L,int is_end,int is_blanks,vchar_t delimiter)5160Sstevel@tonic-gate field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks,
5170Sstevel@tonic-gate vchar_t delimiter)
5180Sstevel@tonic-gate {
5190Sstevel@tonic-gate wchar_t *S = L->l_data.wp;
5200Sstevel@tonic-gate wchar_t *T = S;
5210Sstevel@tonic-gate wchar_t *eol = S + L->l_data_length;
5220Sstevel@tonic-gate ssize_t field = is_end ? F->f_end_field : F->f_start_field;
5230Sstevel@tonic-gate ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
5240Sstevel@tonic-gate ssize_t ret;
5250Sstevel@tonic-gate
5260Sstevel@tonic-gate ASSERT(is_end || field > -1);
5270Sstevel@tonic-gate
5280Sstevel@tonic-gate if (is_end && field == -1)
5290Sstevel@tonic-gate return (L->l_data_length);
5300Sstevel@tonic-gate
5310Sstevel@tonic-gate while (field-- > 0) {
5320Sstevel@tonic-gate T = xwsnchr(T, delimiter.wc, eol - T);
5330Sstevel@tonic-gate if (T == NULL || T > eol)
5340Sstevel@tonic-gate return (L->l_data_length);
5350Sstevel@tonic-gate
5360Sstevel@tonic-gate T++;
5370Sstevel@tonic-gate }
5380Sstevel@tonic-gate
5390Sstevel@tonic-gate if ((!is_end || offset != 0) && is_blanks) {
5400Sstevel@tonic-gate while (W_IS_BLANK(*T))
5410Sstevel@tonic-gate T++;
5420Sstevel@tonic-gate }
5430Sstevel@tonic-gate
5444623Srm88369 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
545*12067SJohn.Beck@Sun.COM if (L->l_data_length <= 0)
546*12067SJohn.Beck@Sun.COM return (0);
5474623Srm88369 if (S[L->l_data_length - 1] == delimiter.wc) {
5484623Srm88369 return (L->l_data_length - 1);
5494623Srm88369 } else {
5504623Srm88369 return (L->l_data_length);
5514623Srm88369 }
5524623Srm88369 }
5530Sstevel@tonic-gate
5540Sstevel@tonic-gate if (is_end && offset == 0)
5550Sstevel@tonic-gate ret--;
5560Sstevel@tonic-gate
5570Sstevel@tonic-gate return (ret);
5580Sstevel@tonic-gate }
5590Sstevel@tonic-gate
5600Sstevel@tonic-gate static void
field_delimit_tabbed_wide(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end,vchar_t delimiter)5610Sstevel@tonic-gate field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start,
5620Sstevel@tonic-gate ssize_t *end, vchar_t delimiter)
5630Sstevel@tonic-gate {
5640Sstevel@tonic-gate ASSERT(F->f_start_field > -1);
5650Sstevel@tonic-gate
5660Sstevel@tonic-gate *start = field_boundary_tabbed_wide(F, L, 0, F->f_options &
5670Sstevel@tonic-gate FIELD_IGNORE_BLANKS_START, delimiter);
5680Sstevel@tonic-gate *end = field_boundary_tabbed_wide(F, L, 1, F->f_options &
5690Sstevel@tonic-gate FIELD_IGNORE_BLANKS_END, delimiter);
5700Sstevel@tonic-gate }
5710Sstevel@tonic-gate
5720Sstevel@tonic-gate /*ARGSUSED*/
5730Sstevel@tonic-gate ssize_t
field_convert_month(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)5740Sstevel@tonic-gate field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter,
5750Sstevel@tonic-gate ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
5760Sstevel@tonic-gate {
5770Sstevel@tonic-gate int j;
5780Sstevel@tonic-gate ssize_t val;
5790Sstevel@tonic-gate char month_candidate[MAX_MON_LEN * MB_LEN_MAX];
5800Sstevel@tonic-gate ssize_t month_length = data_length;
5810Sstevel@tonic-gate ssize_t month_offset = data_offset;
5820Sstevel@tonic-gate
5830Sstevel@tonic-gate if (sizeof (char) > L->l_collate_bufsize - coll_offset)
5840Sstevel@tonic-gate return (-1);
5850Sstevel@tonic-gate
5860Sstevel@tonic-gate (void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX);
5870Sstevel@tonic-gate
5880Sstevel@tonic-gate
5890Sstevel@tonic-gate /*
5900Sstevel@tonic-gate * The month field formally begins with the first non-blank character.
5910Sstevel@tonic-gate */
5920Sstevel@tonic-gate while (IS_BLANK(*(L->l_data.sp + month_offset))) {
5930Sstevel@tonic-gate month_offset++;
5940Sstevel@tonic-gate month_length--;
5950Sstevel@tonic-gate }
5960Sstevel@tonic-gate
5970Sstevel@tonic-gate for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
5980Sstevel@tonic-gate month_candidate[j] = toupper((L->l_data.sp + month_offset)[j]);
5990Sstevel@tonic-gate
6000Sstevel@tonic-gate for (j = 0; j < MONTHS_IN_YEAR; j++) {
6010Sstevel@tonic-gate if (xstrneql(month_candidate, months[j], month_lengths[j])) {
6020Sstevel@tonic-gate *(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET;
6030Sstevel@tonic-gate return (1);
6040Sstevel@tonic-gate }
6050Sstevel@tonic-gate }
6060Sstevel@tonic-gate
6070Sstevel@tonic-gate /*
6080Sstevel@tonic-gate * no matching month; copy string into field. required behaviour is
6090Sstevel@tonic-gate * that "month-free" keys sort before month-sortable keys, so insert
6100Sstevel@tonic-gate * a "will sort first" token.
6110Sstevel@tonic-gate */
6120Sstevel@tonic-gate *(L->l_collate.sp + coll_offset) = '\0' + MO_NONE;
6130Sstevel@tonic-gate
6140Sstevel@tonic-gate val = field_convert_alpha_simple(F, L, delimiter, data_offset,
6150Sstevel@tonic-gate data_length, coll_offset + 1);
6160Sstevel@tonic-gate
6170Sstevel@tonic-gate if (val < 0)
6180Sstevel@tonic-gate return (-1);
6190Sstevel@tonic-gate else
6200Sstevel@tonic-gate return (val + 1);
6210Sstevel@tonic-gate }
6220Sstevel@tonic-gate
6230Sstevel@tonic-gate /*ARGSUSED*/
6240Sstevel@tonic-gate ssize_t
field_convert_month_wide(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)6250Sstevel@tonic-gate field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
6260Sstevel@tonic-gate ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
6270Sstevel@tonic-gate {
6280Sstevel@tonic-gate ssize_t j;
6290Sstevel@tonic-gate ssize_t val;
6300Sstevel@tonic-gate wchar_t month_candidate[MAX_MON_LEN];
6310Sstevel@tonic-gate wchar_t *month;
6320Sstevel@tonic-gate wchar_t *buffer = L->l_collate.wp + coll_offset;
6330Sstevel@tonic-gate ssize_t month_length = data_length;
6340Sstevel@tonic-gate ssize_t month_offset = data_offset;
6350Sstevel@tonic-gate
6360Sstevel@tonic-gate if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) <
6370Sstevel@tonic-gate sizeof (wchar_t))
6380Sstevel@tonic-gate return (-1);
6390Sstevel@tonic-gate
6400Sstevel@tonic-gate (void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t));
6410Sstevel@tonic-gate
6420Sstevel@tonic-gate
6430Sstevel@tonic-gate while (W_IS_BLANK(*(L->l_data.wp + month_offset))) {
6440Sstevel@tonic-gate month_offset++;
6450Sstevel@tonic-gate month_length--;
6460Sstevel@tonic-gate }
6470Sstevel@tonic-gate
6480Sstevel@tonic-gate month = L->l_data.wp + month_offset;
6490Sstevel@tonic-gate
6500Sstevel@tonic-gate for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
6510Sstevel@tonic-gate month_candidate[j] = towupper(month[j]);
6520Sstevel@tonic-gate
6530Sstevel@tonic-gate for (j = 0; j < MONTHS_IN_YEAR; j++)
6540Sstevel@tonic-gate if (xwcsneql(month_candidate, w_months[j],
6550Sstevel@tonic-gate w_month_lengths[j])) {
6560Sstevel@tonic-gate *buffer = L'\0' + j + MO_OFFSET;
6570Sstevel@tonic-gate return (1);
6580Sstevel@tonic-gate }
6590Sstevel@tonic-gate
6600Sstevel@tonic-gate *buffer = L'\0' + MO_NONE;
6610Sstevel@tonic-gate
6620Sstevel@tonic-gate val = field_convert_alpha_wide(F, L, delimiter, data_offset,
6630Sstevel@tonic-gate data_length, coll_offset + sizeof (wchar_t));
6640Sstevel@tonic-gate
6650Sstevel@tonic-gate if (val < 0)
6660Sstevel@tonic-gate return (-1);
6670Sstevel@tonic-gate else
6680Sstevel@tonic-gate return (val + 1);
6690Sstevel@tonic-gate }
6700Sstevel@tonic-gate
6710Sstevel@tonic-gate /*
6720Sstevel@tonic-gate * field_convert_alpha() always fails with return value -1 if the converted
6730Sstevel@tonic-gate * string would cause l_collate_length to exceed l_collate_bufsize
6740Sstevel@tonic-gate */
6750Sstevel@tonic-gate /*ARGSUSED*/
6760Sstevel@tonic-gate ssize_t
field_convert_alpha(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)6770Sstevel@tonic-gate field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter,
6780Sstevel@tonic-gate ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
6790Sstevel@tonic-gate {
6800Sstevel@tonic-gate static char *compose;
6810Sstevel@tonic-gate static ssize_t compose_length;
6820Sstevel@tonic-gate
6830Sstevel@tonic-gate ssize_t clength = 0;
6840Sstevel@tonic-gate ssize_t dlength;
6850Sstevel@tonic-gate ssize_t i;
6860Sstevel@tonic-gate
6870Sstevel@tonic-gate if (compose_length < (data_length + 1)) {
6880Sstevel@tonic-gate compose_length = data_length + 1;
6890Sstevel@tonic-gate compose = safe_realloc(compose, compose_length * sizeof (char));
6900Sstevel@tonic-gate }
6910Sstevel@tonic-gate
6920Sstevel@tonic-gate for (i = data_offset; i < data_offset + data_length; i++) {
6930Sstevel@tonic-gate char t = (L->l_data.sp)[i];
6940Sstevel@tonic-gate
6950Sstevel@tonic-gate if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) &&
6960Sstevel@tonic-gate !isprint((uchar_t)t))
6970Sstevel@tonic-gate continue;
6980Sstevel@tonic-gate
6990Sstevel@tonic-gate if ((F->f_options & FIELD_DICTIONARY_ORDER) &&
7000Sstevel@tonic-gate !isalnum((uchar_t)t) && !isspace((uchar_t)t))
7010Sstevel@tonic-gate continue;
7020Sstevel@tonic-gate
7030Sstevel@tonic-gate if (F->f_options & FIELD_FOLD_UPPERCASE)
7040Sstevel@tonic-gate t = toupper(t);
7050Sstevel@tonic-gate
7060Sstevel@tonic-gate compose[clength++] = t;
7070Sstevel@tonic-gate }
7080Sstevel@tonic-gate compose[clength] = '\0';
7090Sstevel@tonic-gate
7100Sstevel@tonic-gate if ((dlength = xfrm_ops->sx_len(compose, clength)) <
7110Sstevel@tonic-gate L->l_collate_bufsize - coll_offset)
7120Sstevel@tonic-gate return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
7134623Srm88369 compose, dlength + 1));
7140Sstevel@tonic-gate else
7150Sstevel@tonic-gate return ((ssize_t)-1);
7160Sstevel@tonic-gate }
7170Sstevel@tonic-gate
7180Sstevel@tonic-gate /*ARGSUSED*/
7190Sstevel@tonic-gate ssize_t
field_convert_alpha_simple(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)7200Sstevel@tonic-gate field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter,
7210Sstevel@tonic-gate ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
7220Sstevel@tonic-gate {
7230Sstevel@tonic-gate static char *compose;
7240Sstevel@tonic-gate static ssize_t compose_length;
7250Sstevel@tonic-gate
7260Sstevel@tonic-gate ssize_t clength;
7270Sstevel@tonic-gate ssize_t dlength;
7280Sstevel@tonic-gate
7290Sstevel@tonic-gate if (compose_length < (data_length + 1)) {
7300Sstevel@tonic-gate compose_length = data_length + 1;
7310Sstevel@tonic-gate compose = safe_realloc(compose, compose_length * sizeof (char));
7320Sstevel@tonic-gate }
7330Sstevel@tonic-gate
7340Sstevel@tonic-gate (void) memcpy(compose, L->l_data.sp + data_offset, data_length);
7350Sstevel@tonic-gate clength = data_length;
7360Sstevel@tonic-gate compose[clength] = '\0';
7370Sstevel@tonic-gate
7380Sstevel@tonic-gate if ((dlength = xfrm_ops->sx_len(compose, clength)) <
7390Sstevel@tonic-gate L->l_collate_bufsize - coll_offset)
7400Sstevel@tonic-gate return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
7414623Srm88369 compose, dlength + 1));
7420Sstevel@tonic-gate else
7430Sstevel@tonic-gate return ((ssize_t)-1);
7440Sstevel@tonic-gate }
7450Sstevel@tonic-gate
7460Sstevel@tonic-gate /*ARGSUSED*/
7470Sstevel@tonic-gate ssize_t
field_convert_alpha_wide(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)7480Sstevel@tonic-gate field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
7490Sstevel@tonic-gate ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
7500Sstevel@tonic-gate {
7510Sstevel@tonic-gate wchar_t *compose = safe_realloc(NULL, (data_length + 1) *
7520Sstevel@tonic-gate sizeof (wchar_t));
7530Sstevel@tonic-gate ssize_t clength = 0;
7540Sstevel@tonic-gate ssize_t dlength;
7550Sstevel@tonic-gate ssize_t i;
7560Sstevel@tonic-gate ssize_t ret;
7570Sstevel@tonic-gate
7580Sstevel@tonic-gate for (i = data_offset; i < data_offset + data_length; i++) {
7590Sstevel@tonic-gate wchar_t t = (L->l_data.wp)[i];
7600Sstevel@tonic-gate
7610Sstevel@tonic-gate if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && !iswprint(t))
7620Sstevel@tonic-gate continue;
7630Sstevel@tonic-gate
7640Sstevel@tonic-gate if ((F->f_options & FIELD_DICTIONARY_ORDER) && !iswalnum(t) &&
7650Sstevel@tonic-gate !iswspace(t))
7660Sstevel@tonic-gate continue;
7670Sstevel@tonic-gate
7680Sstevel@tonic-gate if (F->f_options & FIELD_FOLD_UPPERCASE)
7690Sstevel@tonic-gate t = towupper(t);
7700Sstevel@tonic-gate
7710Sstevel@tonic-gate compose[clength++] = t;
7720Sstevel@tonic-gate }
7730Sstevel@tonic-gate compose[clength] = L'\0';
7740Sstevel@tonic-gate
7750Sstevel@tonic-gate dlength = wcsxfrm(NULL, compose, (size_t)0);
7760Sstevel@tonic-gate if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize -
7770Sstevel@tonic-gate coll_offset * sizeof (wchar_t)) {
7780Sstevel@tonic-gate ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose,
7790Sstevel@tonic-gate (size_t)dlength + 1);
7800Sstevel@tonic-gate } else {
7810Sstevel@tonic-gate ret = (ssize_t)-1;
7820Sstevel@tonic-gate }
7830Sstevel@tonic-gate
7840Sstevel@tonic-gate safe_free(compose);
7850Sstevel@tonic-gate
7860Sstevel@tonic-gate return (ret);
7870Sstevel@tonic-gate }
7880Sstevel@tonic-gate
7890Sstevel@tonic-gate /*
7900Sstevel@tonic-gate * field_convert_numeric() converts the given field into a collatable numerical
7910Sstevel@tonic-gate * sequence. The sequence is ordered as { log, integer, separator, fraction },
7920Sstevel@tonic-gate * with an optional sentinel component at the sequence end.
7930Sstevel@tonic-gate */
7940Sstevel@tonic-gate /*ARGSUSED*/
7950Sstevel@tonic-gate ssize_t
field_convert_numeric(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)7960Sstevel@tonic-gate field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter,
7970Sstevel@tonic-gate ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
7980Sstevel@tonic-gate {
7990Sstevel@tonic-gate char *number;
8000Sstevel@tonic-gate char *buffer = L->l_collate.sp + coll_offset;
8010Sstevel@tonic-gate ssize_t length;
8020Sstevel@tonic-gate
8030Sstevel@tonic-gate char sign = '2';
8040Sstevel@tonic-gate int log_ten;
8050Sstevel@tonic-gate char *digits = buffer + 1 + sizeof (int) / sizeof (char);
8060Sstevel@tonic-gate size_t j = 0;
8070Sstevel@tonic-gate size_t i;
8080Sstevel@tonic-gate
8090Sstevel@tonic-gate int state = BEFORE_NUMBER;
8100Sstevel@tonic-gate
8110Sstevel@tonic-gate number = L->l_data.sp + data_offset;
8120Sstevel@tonic-gate length = data_length;
8130Sstevel@tonic-gate
8140Sstevel@tonic-gate /*
8150Sstevel@tonic-gate * Eat leading blanks, if any.
8160Sstevel@tonic-gate */
8170Sstevel@tonic-gate for (i = 0; i < length; i++)
8180Sstevel@tonic-gate if (!IS_BLANK(number[i]))
8190Sstevel@tonic-gate break;
8200Sstevel@tonic-gate
8210Sstevel@tonic-gate /*
8220Sstevel@tonic-gate * Test that there is sufficient size in the collation buffer for our
8230Sstevel@tonic-gate * number. In addition to the possible remaining characters in the
8240Sstevel@tonic-gate * field, we also require space for the sign (char), logarithm (int),
8250Sstevel@tonic-gate * separator (char), and as many as two string terminators (for reverse
8260Sstevel@tonic-gate * sorts).
8270Sstevel@tonic-gate */
8280Sstevel@tonic-gate if (((length - i) + 4 * sizeof (char) + sizeof (int)) >
8290Sstevel@tonic-gate (L->l_collate_bufsize - coll_offset))
8300Sstevel@tonic-gate return ((ssize_t)-1);
8310Sstevel@tonic-gate
8320Sstevel@tonic-gate /*
8330Sstevel@tonic-gate * If negative, set sign.
8340Sstevel@tonic-gate */
8350Sstevel@tonic-gate if (number[i] == '-') {
8360Sstevel@tonic-gate i++;
8370Sstevel@tonic-gate sign = '0';
8380Sstevel@tonic-gate }
8390Sstevel@tonic-gate
8400Sstevel@tonic-gate /*
8410Sstevel@tonic-gate * Scan integer part; eat leading zeros.
8420Sstevel@tonic-gate */
8430Sstevel@tonic-gate for (; i < length; i++) {
8440Sstevel@tonic-gate if (IS_SEPARATOR(number[i]))
8450Sstevel@tonic-gate continue;
8460Sstevel@tonic-gate
8470Sstevel@tonic-gate if (number[i] == '0' && !(state & IN_NUMBER))
8480Sstevel@tonic-gate continue;
8490Sstevel@tonic-gate
8500Sstevel@tonic-gate if (!isdigit((uchar_t)number[i]))
8510Sstevel@tonic-gate break;
8520Sstevel@tonic-gate
8530Sstevel@tonic-gate state |= IN_NUMBER;
8540Sstevel@tonic-gate if (sign == '0')
8550Sstevel@tonic-gate digits[j++] = '0' + '9' - number[i];
8560Sstevel@tonic-gate else
8570Sstevel@tonic-gate digits[j++] = number[i];
8580Sstevel@tonic-gate }
8590Sstevel@tonic-gate
8600Sstevel@tonic-gate if (i < length && IS_DECIMAL(number[i])) {
8610Sstevel@tonic-gate /*
8620Sstevel@tonic-gate * Integer part terminated by decimal.
8630Sstevel@tonic-gate */
8640Sstevel@tonic-gate digits[j] = DECIMAL_CHAR;
8650Sstevel@tonic-gate log_ten = j++;
8660Sstevel@tonic-gate
8670Sstevel@tonic-gate /*
8680Sstevel@tonic-gate * Scan fractional part.
8690Sstevel@tonic-gate */
8700Sstevel@tonic-gate for (++i; i < length; i++) {
8710Sstevel@tonic-gate if (IS_SEPARATOR(number[i]))
8720Sstevel@tonic-gate continue;
8730Sstevel@tonic-gate
8740Sstevel@tonic-gate if (!isdigit((uchar_t)number[i]))
8750Sstevel@tonic-gate break;
8760Sstevel@tonic-gate
8770Sstevel@tonic-gate if (number[i] != '0')
8780Sstevel@tonic-gate state |= IN_NUMBER;
8790Sstevel@tonic-gate
8800Sstevel@tonic-gate if (sign == '0')
8810Sstevel@tonic-gate digits[j++] = '0' + '9' - number[i];
8820Sstevel@tonic-gate else
8830Sstevel@tonic-gate digits[j++] = number[i];
8840Sstevel@tonic-gate }
8850Sstevel@tonic-gate
8860Sstevel@tonic-gate if (sign == '0')
8870Sstevel@tonic-gate digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
8880Sstevel@tonic-gate } else {
8890Sstevel@tonic-gate /*
8900Sstevel@tonic-gate * Nondigit or end of string seen.
8910Sstevel@tonic-gate */
8920Sstevel@tonic-gate log_ten = (int)j;
8930Sstevel@tonic-gate if (sign == '0')
8940Sstevel@tonic-gate digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
8950Sstevel@tonic-gate else
8960Sstevel@tonic-gate digits[j] = INTERFIELD_SEPARATOR;
8970Sstevel@tonic-gate }
8980Sstevel@tonic-gate
8990Sstevel@tonic-gate if ((state & IN_NUMBER) == 0) {
9000Sstevel@tonic-gate /*
9010Sstevel@tonic-gate * A non-zero number was not detected; treat as defined zero.
9020Sstevel@tonic-gate */
9030Sstevel@tonic-gate sign = '1';
9040Sstevel@tonic-gate log_ten = 0;
9050Sstevel@tonic-gate digits[0] = '0';
9060Sstevel@tonic-gate j = 1;
9070Sstevel@tonic-gate }
9080Sstevel@tonic-gate
9090Sstevel@tonic-gate /*
9100Sstevel@tonic-gate * We subtract a constant from the log of negative values so that
9110Sstevel@tonic-gate * they will correctly precede positive values with a zero logarithm.
9120Sstevel@tonic-gate */
9130Sstevel@tonic-gate if (sign == '0') {
9140Sstevel@tonic-gate if (j != 0)
9150Sstevel@tonic-gate log_ten = -log_ten - 2;
9160Sstevel@tonic-gate else
9170Sstevel@tonic-gate /*
9180Sstevel@tonic-gate * Special case for -0.
9190Sstevel@tonic-gate */
9200Sstevel@tonic-gate log_ten = -1;
9210Sstevel@tonic-gate }
9220Sstevel@tonic-gate
9230Sstevel@tonic-gate buffer[0] = sign;
9240Sstevel@tonic-gate
9250Sstevel@tonic-gate /*
9260Sstevel@tonic-gate * Place logarithm in big-endian form.
9270Sstevel@tonic-gate */
9280Sstevel@tonic-gate for (i = 0; i < sizeof (int); i++)
9290Sstevel@tonic-gate buffer[i + 1] = (log_ten << (i * NBBY))
9300Sstevel@tonic-gate >> ((sizeof (int) - 1) * NBBY);
9310Sstevel@tonic-gate
9320Sstevel@tonic-gate if (j + sizeof (char) + sizeof (int) <
9330Sstevel@tonic-gate L->l_collate_bufsize - coll_offset)
9340Sstevel@tonic-gate return (j + 1 + sizeof (int));
9350Sstevel@tonic-gate else
9360Sstevel@tonic-gate return ((ssize_t)-1);
9370Sstevel@tonic-gate }
9380Sstevel@tonic-gate
9390Sstevel@tonic-gate /*ARGSUSED*/
9400Sstevel@tonic-gate ssize_t
field_convert_numeric_wide(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)9410Sstevel@tonic-gate field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
9420Sstevel@tonic-gate ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
9430Sstevel@tonic-gate {
9440Sstevel@tonic-gate wchar_t *number;
9450Sstevel@tonic-gate wchar_t *buffer = L->l_collate.wp + coll_offset;
9460Sstevel@tonic-gate char *lbuffer;
9470Sstevel@tonic-gate ssize_t length;
9480Sstevel@tonic-gate
9490Sstevel@tonic-gate wchar_t sign = L'2';
9500Sstevel@tonic-gate int log_ten;
9510Sstevel@tonic-gate wchar_t *digits = buffer + 1 + sizeof (int)/sizeof (wchar_t);
9520Sstevel@tonic-gate size_t j = 0;
9530Sstevel@tonic-gate size_t i;
9540Sstevel@tonic-gate
9550Sstevel@tonic-gate int state = BEFORE_NUMBER;
9560Sstevel@tonic-gate
9570Sstevel@tonic-gate number = L->l_data.wp + data_offset;
9580Sstevel@tonic-gate length = data_length;
9590Sstevel@tonic-gate
9600Sstevel@tonic-gate for (i = 0; i < length; i++)
9610Sstevel@tonic-gate if (!W_IS_BLANK(number[i]))
9620Sstevel@tonic-gate break;
9630Sstevel@tonic-gate
9640Sstevel@tonic-gate if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) +
9650Sstevel@tonic-gate sizeof (int)) > (L->l_collate_bufsize - coll_offset))
9660Sstevel@tonic-gate return ((ssize_t)-1);
9670Sstevel@tonic-gate
9680Sstevel@tonic-gate if (number[i] == L'-') {
9690Sstevel@tonic-gate i++;
9700Sstevel@tonic-gate sign = L'0';
9710Sstevel@tonic-gate }
9720Sstevel@tonic-gate
9730Sstevel@tonic-gate for (; i < length; i++) {
9740Sstevel@tonic-gate if (W_IS_SEPARATOR(number[i]))
9750Sstevel@tonic-gate continue;
9760Sstevel@tonic-gate
9770Sstevel@tonic-gate if (number[i] == L'0' && !(state & IN_NUMBER))
9780Sstevel@tonic-gate continue;
9790Sstevel@tonic-gate
9800Sstevel@tonic-gate if (!iswdigit(number[i]))
9810Sstevel@tonic-gate break;
9820Sstevel@tonic-gate
9830Sstevel@tonic-gate state |= IN_NUMBER;
9840Sstevel@tonic-gate if (sign == L'0')
9850Sstevel@tonic-gate digits[j++] = L'0' + L'9' - number[i];
9860Sstevel@tonic-gate else
9870Sstevel@tonic-gate digits[j++] = number[i];
9880Sstevel@tonic-gate }
9890Sstevel@tonic-gate
9900Sstevel@tonic-gate if (i < length && W_IS_DECIMAL(number[i])) {
9910Sstevel@tonic-gate digits[j] = W_DECIMAL_CHAR;
9920Sstevel@tonic-gate log_ten = j++;
9930Sstevel@tonic-gate
9940Sstevel@tonic-gate for (++i; i < length; i++) {
9950Sstevel@tonic-gate if (W_IS_SEPARATOR(number[i]))
9960Sstevel@tonic-gate continue;
9970Sstevel@tonic-gate
9980Sstevel@tonic-gate if (!iswdigit(number[i]))
9990Sstevel@tonic-gate break;
10000Sstevel@tonic-gate
10010Sstevel@tonic-gate if (number[i] != L'0')
10020Sstevel@tonic-gate state |= IN_NUMBER;
10030Sstevel@tonic-gate
10040Sstevel@tonic-gate if (sign == L'0')
10050Sstevel@tonic-gate digits[j++] = L'0' + L'9' - number[i];
10060Sstevel@tonic-gate else
10070Sstevel@tonic-gate digits[j++] = number[i];
10080Sstevel@tonic-gate }
10090Sstevel@tonic-gate
10100Sstevel@tonic-gate if (sign == L'0')
10110Sstevel@tonic-gate digits[j++] = (wchar_t)(WCHAR_MAX -
10120Sstevel@tonic-gate W_INTERFIELD_SEPARATOR);
10130Sstevel@tonic-gate } else {
10140Sstevel@tonic-gate log_ten = (int)j;
10150Sstevel@tonic-gate if (sign == L'0')
10160Sstevel@tonic-gate digits[j++] = (wchar_t)(WCHAR_MAX -
10170Sstevel@tonic-gate W_INTERFIELD_SEPARATOR);
10180Sstevel@tonic-gate else
10190Sstevel@tonic-gate digits[j] = W_INTERFIELD_SEPARATOR;
10200Sstevel@tonic-gate }
10210Sstevel@tonic-gate
10220Sstevel@tonic-gate if ((state & IN_NUMBER) == 0) {
10230Sstevel@tonic-gate sign = L'1';
10240Sstevel@tonic-gate log_ten = 0;
10250Sstevel@tonic-gate digits[0] = L'0';
10260Sstevel@tonic-gate j = 1;
10270Sstevel@tonic-gate }
10280Sstevel@tonic-gate
10290Sstevel@tonic-gate if (sign == L'0') {
10300Sstevel@tonic-gate if (j != 0)
10310Sstevel@tonic-gate log_ten = -log_ten - 2;
10320Sstevel@tonic-gate else
10330Sstevel@tonic-gate log_ten = -1;
10340Sstevel@tonic-gate }
10350Sstevel@tonic-gate
10360Sstevel@tonic-gate buffer[0] = sign;
10370Sstevel@tonic-gate /*
10380Sstevel@tonic-gate * Place logarithm in big-endian form.
10390Sstevel@tonic-gate */
10400Sstevel@tonic-gate lbuffer = (char *)(buffer + 1);
10410Sstevel@tonic-gate for (i = 0; i < sizeof (int); i++)
10420Sstevel@tonic-gate lbuffer[i] = (log_ten << (i * NBBY))
10430Sstevel@tonic-gate >> ((sizeof (int) - 1) * NBBY);
10440Sstevel@tonic-gate
10450Sstevel@tonic-gate if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) <
10460Sstevel@tonic-gate L->l_collate_bufsize - coll_offset * sizeof (wchar_t))
10470Sstevel@tonic-gate return (j + 1 + sizeof (int) / sizeof (wchar_t));
10480Sstevel@tonic-gate else
10490Sstevel@tonic-gate return ((ssize_t)-1);
10500Sstevel@tonic-gate }
10510Sstevel@tonic-gate
10520Sstevel@tonic-gate /*
10530Sstevel@tonic-gate * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour
10540Sstevel@tonic-gate * when coll_offset exceeds l_collate_bufsize.
10550Sstevel@tonic-gate */
10560Sstevel@tonic-gate ssize_t
field_convert(field_t * F,line_rec_t * L,int flags,vchar_t field_separator)10570Sstevel@tonic-gate field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator)
10580Sstevel@tonic-gate {
10590Sstevel@tonic-gate ssize_t coll_offset = 0;
10600Sstevel@tonic-gate ssize_t start, end, distance;
10610Sstevel@tonic-gate field_t *cur_fieldp = F;
10620Sstevel@tonic-gate
10630Sstevel@tonic-gate while (cur_fieldp != NULL) {
10640Sstevel@tonic-gate /*
10650Sstevel@tonic-gate * delimit field
10660Sstevel@tonic-gate */
10670Sstevel@tonic-gate if (!field_separator.sc)
10680Sstevel@tonic-gate field_delimit(cur_fieldp, L, &start, &end);
10690Sstevel@tonic-gate else
10700Sstevel@tonic-gate field_delimit_tabbed(cur_fieldp, L, &start, &end,
10710Sstevel@tonic-gate field_separator);
10720Sstevel@tonic-gate
10730Sstevel@tonic-gate distance = 0;
10740Sstevel@tonic-gate if (end - start > 0 ||
10750Sstevel@tonic-gate (end - start == 0 && F->f_species == NUMERIC)) {
10760Sstevel@tonic-gate /*
10770Sstevel@tonic-gate * Convert field, appending to collated field of line
10780Sstevel@tonic-gate * record.
10790Sstevel@tonic-gate */
10800Sstevel@tonic-gate distance = cur_fieldp->f_convert(cur_fieldp, L,
10810Sstevel@tonic-gate field_separator, start, end - start, coll_offset);
10820Sstevel@tonic-gate
10830Sstevel@tonic-gate /*
10840Sstevel@tonic-gate * branch should execute comparatively rarely
10850Sstevel@tonic-gate */
10860Sstevel@tonic-gate if (distance == -1) {
10870Sstevel@tonic-gate if (flags & FCV_REALLOC) {
10880Sstevel@tonic-gate ASSERT(L->l_collate_bufsize > 0);
10890Sstevel@tonic-gate L->l_collate_bufsize *= 2;
10900Sstevel@tonic-gate L->l_collate.sp =
10910Sstevel@tonic-gate safe_realloc(L->l_collate.sp,
10920Sstevel@tonic-gate L->l_collate_bufsize);
10930Sstevel@tonic-gate
10940Sstevel@tonic-gate __S(stats_incr_convert_reallocs());
10950Sstevel@tonic-gate continue;
10960Sstevel@tonic-gate } else {
10970Sstevel@tonic-gate /*
10980Sstevel@tonic-gate * FCV_FAIL has been set.
10990Sstevel@tonic-gate */
11000Sstevel@tonic-gate return (-1);
11010Sstevel@tonic-gate }
11020Sstevel@tonic-gate }
11030Sstevel@tonic-gate }
11040Sstevel@tonic-gate
11050Sstevel@tonic-gate if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
11060Sstevel@tonic-gate xstrninv(L->l_collate.sp, coll_offset, distance);
11070Sstevel@tonic-gate *(L->l_collate.sp + coll_offset + distance) =
11080Sstevel@tonic-gate (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
11090Sstevel@tonic-gate distance++;
11100Sstevel@tonic-gate }
11110Sstevel@tonic-gate
11120Sstevel@tonic-gate ASSERT(distance >= 0);
11130Sstevel@tonic-gate coll_offset += distance;
11140Sstevel@tonic-gate if (coll_offset >= L->l_collate_bufsize) {
11150Sstevel@tonic-gate if (flags & FCV_REALLOC) {
11160Sstevel@tonic-gate ASSERT(L->l_collate_bufsize > 0);
11170Sstevel@tonic-gate L->l_collate_bufsize *= 2;
11180Sstevel@tonic-gate L->l_collate.sp = safe_realloc(L->l_collate.sp,
11190Sstevel@tonic-gate L->l_collate_bufsize);
11200Sstevel@tonic-gate
11210Sstevel@tonic-gate __S(stats_incr_convert_reallocs());
11220Sstevel@tonic-gate } else {
11230Sstevel@tonic-gate return (-1);
11240Sstevel@tonic-gate }
11250Sstevel@tonic-gate }
11260Sstevel@tonic-gate *(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR;
11270Sstevel@tonic-gate coll_offset++;
11280Sstevel@tonic-gate
11290Sstevel@tonic-gate cur_fieldp = cur_fieldp->f_next;
11300Sstevel@tonic-gate }
11310Sstevel@tonic-gate
11320Sstevel@tonic-gate L->l_collate_length = coll_offset;
11330Sstevel@tonic-gate
11340Sstevel@tonic-gate return (L->l_collate_length);
11350Sstevel@tonic-gate }
11360Sstevel@tonic-gate
11370Sstevel@tonic-gate ssize_t
field_convert_wide(field_t * F,line_rec_t * L,int flags,vchar_t field_separator)11380Sstevel@tonic-gate field_convert_wide(field_t *F, line_rec_t *L, int flags,
11390Sstevel@tonic-gate vchar_t field_separator)
11400Sstevel@tonic-gate {
11410Sstevel@tonic-gate ssize_t coll_offset = 0;
11420Sstevel@tonic-gate ssize_t start, end, distance;
11430Sstevel@tonic-gate field_t *cur_fieldp = F;
11440Sstevel@tonic-gate
11450Sstevel@tonic-gate while (cur_fieldp != NULL) {
11460Sstevel@tonic-gate if (!field_separator.wc)
11470Sstevel@tonic-gate field_delimit_wide(cur_fieldp, L, &start, &end);
11480Sstevel@tonic-gate else
11490Sstevel@tonic-gate field_delimit_tabbed_wide(cur_fieldp, L, &start, &end,
11500Sstevel@tonic-gate field_separator);
11510Sstevel@tonic-gate
11520Sstevel@tonic-gate distance = 0;
11530Sstevel@tonic-gate if (end - start > 0 ||
11540Sstevel@tonic-gate end - start == 0 && F->f_species == NUMERIC) {
11550Sstevel@tonic-gate distance = cur_fieldp->f_convert(cur_fieldp, L,
11560Sstevel@tonic-gate field_separator, start, end - start, coll_offset);
11570Sstevel@tonic-gate
11580Sstevel@tonic-gate if (distance == -1) {
11590Sstevel@tonic-gate if (flags & FCV_REALLOC) {
11600Sstevel@tonic-gate ASSERT(L->l_collate_bufsize > 0);
11610Sstevel@tonic-gate L->l_collate_bufsize *= 2;
11620Sstevel@tonic-gate L->l_collate.wp = safe_realloc(
11630Sstevel@tonic-gate L->l_collate.wp,
11640Sstevel@tonic-gate L->l_collate_bufsize);
11650Sstevel@tonic-gate
11660Sstevel@tonic-gate __S(stats_incr_convert_reallocs());
11670Sstevel@tonic-gate continue;
11680Sstevel@tonic-gate } else {
11690Sstevel@tonic-gate return (-1);
11700Sstevel@tonic-gate }
11710Sstevel@tonic-gate }
11720Sstevel@tonic-gate }
11730Sstevel@tonic-gate
11740Sstevel@tonic-gate if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
11750Sstevel@tonic-gate xwcsninv(L->l_collate.wp, coll_offset, distance);
11760Sstevel@tonic-gate *(L->l_collate.wp + coll_offset + distance) =
11770Sstevel@tonic-gate WCHAR_MAX - INTERFIELD_SEPARATOR;
11780Sstevel@tonic-gate distance++;
11790Sstevel@tonic-gate }
11800Sstevel@tonic-gate
11810Sstevel@tonic-gate ASSERT(distance >= 0);
11820Sstevel@tonic-gate coll_offset += distance;
11830Sstevel@tonic-gate if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) {
11840Sstevel@tonic-gate if (flags & FCV_REALLOC) {
11850Sstevel@tonic-gate ASSERT(L->l_collate_bufsize > 0);
11860Sstevel@tonic-gate L->l_collate_bufsize *= 2;
11870Sstevel@tonic-gate L->l_collate.wp = safe_realloc(L->l_collate.wp,
11880Sstevel@tonic-gate L->l_collate_bufsize);
11890Sstevel@tonic-gate
11900Sstevel@tonic-gate __S(stats_incr_convert_reallocs());
11910Sstevel@tonic-gate } else {
11920Sstevel@tonic-gate return (-1);
11930Sstevel@tonic-gate }
11940Sstevel@tonic-gate }
11950Sstevel@tonic-gate *(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR;
11960Sstevel@tonic-gate coll_offset++;
11970Sstevel@tonic-gate
11980Sstevel@tonic-gate cur_fieldp = cur_fieldp->f_next;
11990Sstevel@tonic-gate }
12000Sstevel@tonic-gate
12010Sstevel@tonic-gate L->l_collate_length = coll_offset * sizeof (wchar_t);
12020Sstevel@tonic-gate #ifdef _LITTLE_ENDIAN
12030Sstevel@tonic-gate xwcsntomsb(L->l_collate.wp, coll_offset);
12040Sstevel@tonic-gate #endif /* _LITTLE_ENDIAN */
12050Sstevel@tonic-gate
12060Sstevel@tonic-gate return (L->l_collate_length);
12070Sstevel@tonic-gate }
12080Sstevel@tonic-gate
12090Sstevel@tonic-gate /*
12100Sstevel@tonic-gate * line_convert() and line_convert_wide() are called when the collation vector
12110Sstevel@tonic-gate * of a given line has been exhausted, and we are performing the final,
12120Sstevel@tonic-gate * full-line comparison required by the sort specification. Because we do not
12130Sstevel@tonic-gate * have a guarantee that l_data is null-terminated, we create an explicitly
12140Sstevel@tonic-gate * null-terminated copy suitable for transformation to a collatable form for the
12150Sstevel@tonic-gate * current locale.
12160Sstevel@tonic-gate */
12170Sstevel@tonic-gate static void
line_convert(line_rec_t * L)12180Sstevel@tonic-gate line_convert(line_rec_t *L)
12190Sstevel@tonic-gate {
12200Sstevel@tonic-gate static ssize_t bufsize;
12210Sstevel@tonic-gate static char *buffer;
12220Sstevel@tonic-gate
12230Sstevel@tonic-gate if (L->l_raw_collate.sp != NULL)
12240Sstevel@tonic-gate return;
12250Sstevel@tonic-gate
12260Sstevel@tonic-gate if (L->l_data_length + 1 > bufsize) {
12270Sstevel@tonic-gate buffer = safe_realloc(buffer, L->l_data_length + 1);
12280Sstevel@tonic-gate bufsize = L->l_data_length + 1;
12290Sstevel@tonic-gate }
12300Sstevel@tonic-gate
12310Sstevel@tonic-gate (void) strncpy(buffer, L->l_data.sp, L->l_data_length);
12320Sstevel@tonic-gate buffer[L->l_data_length] = '\0';
12330Sstevel@tonic-gate
12340Sstevel@tonic-gate L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp,
12350Sstevel@tonic-gate xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
12360Sstevel@tonic-gate xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer,
12370Sstevel@tonic-gate xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
12380Sstevel@tonic-gate
12390Sstevel@tonic-gate __S(stats_incr_line_conversions());
12400Sstevel@tonic-gate }
12410Sstevel@tonic-gate
12420Sstevel@tonic-gate static void
line_convert_wide(line_rec_t * L)12430Sstevel@tonic-gate line_convert_wide(line_rec_t *L)
12440Sstevel@tonic-gate {
12450Sstevel@tonic-gate static wchar_t *buffer;
12460Sstevel@tonic-gate static ssize_t bufsize;
12470Sstevel@tonic-gate
12480Sstevel@tonic-gate ssize_t dlength;
12490Sstevel@tonic-gate
12500Sstevel@tonic-gate if (L->l_raw_collate.wp != NULL)
12510Sstevel@tonic-gate return;
12520Sstevel@tonic-gate
12530Sstevel@tonic-gate if (L->l_data_length + 1 > bufsize) {
12540Sstevel@tonic-gate buffer = safe_realloc(buffer, (L->l_data_length + 1) *
12550Sstevel@tonic-gate sizeof (wchar_t));
12560Sstevel@tonic-gate bufsize = L->l_data_length + 1;
12570Sstevel@tonic-gate }
12580Sstevel@tonic-gate
12590Sstevel@tonic-gate (void) wcsncpy(buffer, L->l_data.wp, L->l_data_length);
12600Sstevel@tonic-gate buffer[L->l_data_length] = L'\0';
12610Sstevel@tonic-gate
12620Sstevel@tonic-gate dlength = wcsxfrm(NULL, buffer, 0) + 1;
12630Sstevel@tonic-gate L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp, dlength *
12640Sstevel@tonic-gate sizeof (wchar_t));
12650Sstevel@tonic-gate (void) wcsxfrm(L->l_raw_collate.wp, buffer, dlength);
12660Sstevel@tonic-gate
12670Sstevel@tonic-gate __S(stats_incr_line_conversions());
12680Sstevel@tonic-gate }
12690Sstevel@tonic-gate
12700Sstevel@tonic-gate /*
12710Sstevel@tonic-gate * Our convention for collation is
12720Sstevel@tonic-gate *
12730Sstevel@tonic-gate * A > B => r > 0,
12740Sstevel@tonic-gate * A == B => r = 0,
12750Sstevel@tonic-gate * A < B => r < 0
12760Sstevel@tonic-gate *
12770Sstevel@tonic-gate * This convention is consistent with the definition of memcmp(), strcmp(), and
12780Sstevel@tonic-gate * strncmp() in the C locale. collated() and collated_wide() have two optional
12790Sstevel@tonic-gate * behaviours, which can be activated by setting the appropriate values in
12800Sstevel@tonic-gate * coll_flag: COLL_UNIQUE, which returns 0 if the l_collate fields of the line
12810Sstevel@tonic-gate * records being compared are identical; COLL_DATA_ONLY, which ignores the
12820Sstevel@tonic-gate * l_collate field for the current comparison; and COLL_REVERSE, which flips the
12830Sstevel@tonic-gate * result for comparisons that fall through to an actual data comparison (since
12840Sstevel@tonic-gate * the collated vector should already reflect reverse ordering from field
12850Sstevel@tonic-gate * conversion).
12860Sstevel@tonic-gate */
12870Sstevel@tonic-gate int
collated(line_rec_t * A,line_rec_t * B,ssize_t depth,flag_t coll_flag)12880Sstevel@tonic-gate collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
12890Sstevel@tonic-gate {
12900Sstevel@tonic-gate ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
12910Sstevel@tonic-gate int r;
12920Sstevel@tonic-gate int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
12930Sstevel@tonic-gate INT_SIGN_PASS_MASK;
12940Sstevel@tonic-gate ssize_t la, lb;
12950Sstevel@tonic-gate
12960Sstevel@tonic-gate if (!(coll_flag & COLL_DATA_ONLY)) {
12970Sstevel@tonic-gate if (ml > 0) {
12980Sstevel@tonic-gate r = memcmp(A->l_collate.sp + depth,
12990Sstevel@tonic-gate B->l_collate.sp + depth, ml);
13000Sstevel@tonic-gate
13010Sstevel@tonic-gate if (r)
13020Sstevel@tonic-gate return (r);
13030Sstevel@tonic-gate }
13040Sstevel@tonic-gate
13050Sstevel@tonic-gate if (A->l_collate_length < B->l_collate_length)
13060Sstevel@tonic-gate return (-1);
13070Sstevel@tonic-gate
13080Sstevel@tonic-gate if (A->l_collate_length > B->l_collate_length)
13090Sstevel@tonic-gate return (1);
13100Sstevel@tonic-gate }
13110Sstevel@tonic-gate
13120Sstevel@tonic-gate /*
13130Sstevel@tonic-gate * This is where we cut out, if we know that the current sort is over
13140Sstevel@tonic-gate * the entire line.
13150Sstevel@tonic-gate */
13160Sstevel@tonic-gate if (coll_flag & COLL_UNIQUE)
13170Sstevel@tonic-gate return (0);
13180Sstevel@tonic-gate
13190Sstevel@tonic-gate line_convert(A);
13200Sstevel@tonic-gate line_convert(B);
13210Sstevel@tonic-gate
13220Sstevel@tonic-gate la = strlen(A->l_raw_collate.sp);
13230Sstevel@tonic-gate lb = strlen(B->l_raw_collate.sp);
13240Sstevel@tonic-gate
13250Sstevel@tonic-gate r = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp, MIN(la, lb));
13260Sstevel@tonic-gate
13270Sstevel@tonic-gate if (r)
13280Sstevel@tonic-gate return (r ^ mask);
13290Sstevel@tonic-gate
13300Sstevel@tonic-gate if (la < lb)
13310Sstevel@tonic-gate return (-1 ^ mask);
13320Sstevel@tonic-gate
13330Sstevel@tonic-gate if (la > lb)
13340Sstevel@tonic-gate return (1 ^ mask);
13350Sstevel@tonic-gate
13360Sstevel@tonic-gate return (0);
13370Sstevel@tonic-gate }
13380Sstevel@tonic-gate
13390Sstevel@tonic-gate int
collated_wide(line_rec_t * A,line_rec_t * B,ssize_t depth,flag_t coll_flag)13400Sstevel@tonic-gate collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
13410Sstevel@tonic-gate {
13420Sstevel@tonic-gate ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
13430Sstevel@tonic-gate int r;
13440Sstevel@tonic-gate int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
13450Sstevel@tonic-gate INT_SIGN_PASS_MASK;
13460Sstevel@tonic-gate ssize_t la, lb;
13470Sstevel@tonic-gate
13480Sstevel@tonic-gate if (!(coll_flag & COLL_DATA_ONLY)) {
13490Sstevel@tonic-gate if (ml > 0) {
13500Sstevel@tonic-gate r = memcmp(A->l_collate.sp + depth,
13510Sstevel@tonic-gate B->l_collate.sp + depth, ml);
13520Sstevel@tonic-gate
13530Sstevel@tonic-gate if (r)
13540Sstevel@tonic-gate return (r);
13550Sstevel@tonic-gate }
13560Sstevel@tonic-gate if (A->l_collate_length < B->l_collate_length)
13570Sstevel@tonic-gate return (-1);
13580Sstevel@tonic-gate
13590Sstevel@tonic-gate if (A->l_collate_length > B->l_collate_length)
13600Sstevel@tonic-gate return (1);
13610Sstevel@tonic-gate }
13620Sstevel@tonic-gate
13630Sstevel@tonic-gate if (coll_flag & COLL_UNIQUE)
13640Sstevel@tonic-gate return (0);
13650Sstevel@tonic-gate
13660Sstevel@tonic-gate line_convert_wide(A);
13670Sstevel@tonic-gate line_convert_wide(B);
13680Sstevel@tonic-gate
13690Sstevel@tonic-gate la = wcslen(A->l_raw_collate.wp);
13700Sstevel@tonic-gate lb = wcslen(B->l_raw_collate.wp);
13710Sstevel@tonic-gate
13720Sstevel@tonic-gate r = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp,
13730Sstevel@tonic-gate (size_t)MIN(la, lb));
13740Sstevel@tonic-gate
13750Sstevel@tonic-gate if (r)
13760Sstevel@tonic-gate return (r ^ mask);
13770Sstevel@tonic-gate
13780Sstevel@tonic-gate if (la < lb)
13790Sstevel@tonic-gate return (-1 ^ mask);
13800Sstevel@tonic-gate
13810Sstevel@tonic-gate if (la > lb)
13820Sstevel@tonic-gate return (1 ^ mask);
13830Sstevel@tonic-gate
13840Sstevel@tonic-gate return (0);
13850Sstevel@tonic-gate }
1386