xref: /onnv-gate/usr/src/cmd/sort/common/fields.c (revision 12067:51a7904bad2f)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
54623Srm88369  * Common Development and Distribution License (the "License").
64623Srm88369  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
21*12067SJohn.Beck@Sun.COM 
220Sstevel@tonic-gate /*
23*12067SJohn.Beck@Sun.COM  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #include "fields.h"
270Sstevel@tonic-gate 
280Sstevel@tonic-gate /*
290Sstevel@tonic-gate  * fields
300Sstevel@tonic-gate  *
310Sstevel@tonic-gate  * Overview
320Sstevel@tonic-gate  *   By a field, we mean the various delimited character sequences within each
330Sstevel@tonic-gate  *   line of the input files.  The sort key consists of an ordered sequence of
340Sstevel@tonic-gate  *   fields, which need not include all possible fields for the given line.
350Sstevel@tonic-gate  *   (Furthermore, not every line need contain sufficient fields for the fields
360Sstevel@tonic-gate  *   given within the sort key.  In fact, none of the lines in the input stream
370Sstevel@tonic-gate  *   need contain sufficient fields.)
380Sstevel@tonic-gate  *
390Sstevel@tonic-gate  *   There are two methods for specifying fields for sort(1); these are
400Sstevel@tonic-gate  *   discussed in options.c.  Here we discuss only the internal representation
410Sstevel@tonic-gate  *   of fields, as used for constructing the collation vector for each line as
420Sstevel@tonic-gate  *   defined by the sort key.
430Sstevel@tonic-gate  *
440Sstevel@tonic-gate  * Representation
450Sstevel@tonic-gate  *   The sort key is a singly-linked list of field specifiers.  At present,
460Sstevel@tonic-gate  *   fields may belong to one of three species:  alphabetical, numerical, or
470Sstevel@tonic-gate  *   monthly; the species (f_species) then indicates the conversion function
480Sstevel@tonic-gate  *   (f_convert) used to transform the raw characters of the character sequence
490Sstevel@tonic-gate  *   to a collatable form.  (In principle, this allows us to consider future
500Sstevel@tonic-gate  *   field species such as hexadecimal.)
510Sstevel@tonic-gate  *
 *   Fields and offsets are numbered such that zero refers to the first field or
 *   character, respectively.  Thus, the interpretation of a key specifier, m.n,
 *   is that the field begins at the nth character beyond the mth occurrence of
 *   the key separator.  If the blanks flag has been specified, then the field
 *   begins at the nth non-blank character past the mth key separator.  If the
 *   key separator is unspecified, then the key separator is defined as one or
 *   more blank characters.
590Sstevel@tonic-gate  *
600Sstevel@tonic-gate  *   In general, the various options afforded by sort may be broken into two
610Sstevel@tonic-gate  *   categories:  field species and field modifiers.  For each field species,
620Sstevel@tonic-gate  *   there is one or more conversion routines that take a delimited character
630Sstevel@tonic-gate  *   sequence and convert it to a character sequence collatable by strcmp() or
640Sstevel@tonic-gate  *   memcmp().  For field species that may be further modified, such as the
650Sstevel@tonic-gate  *   fold-to-uppercase option for alphabetic fields, the conversion routine may
660Sstevel@tonic-gate  *   be aware of how the modifier affects collation.  Finally, the no-modifiers
670Sstevel@tonic-gate  *   case may present an opportunity for a simplified, faster version.
680Sstevel@tonic-gate  *
690Sstevel@tonic-gate  * Code Structure
700Sstevel@tonic-gate  *   The code paths for single-byte and multi-byte locales diverge significantly
710Sstevel@tonic-gate  *   in fields.c.  Most routines have an *_wide() version, which produces an
720Sstevel@tonic-gate  *   equivalent effect for line records whose data field is composed of wide
730Sstevel@tonic-gate  *   characters (wchar_t).  However, the l_collated field of a line record is
740Sstevel@tonic-gate  *   always composed of characters, so that the radix sorts provided in
750Sstevel@tonic-gate  *   internal.c can work in both single- and multi-byte locales.  Thus, in the
760Sstevel@tonic-gate  *   various convert_*_wide() routines, the output is placed in l_collated, with
770Sstevel@tonic-gate  *   a length multiplier of 4.
780Sstevel@tonic-gate  */
790Sstevel@tonic-gate 
/*
 * Scanner states.  They are not referenced in this part of the file;
 * presumably they are consumed by the numeric conversion routines.
 */
#define	BEFORE_NUMBER	0x0
#define	IN_NUMBER	0x1

/*
 * Month table geometry:  number of entries and the longest month
 * abbreviation (in characters) that we are prepared to store or compare.
 */
#define	MONTHS_IN_YEAR	12
#define	MAX_MON_LEN	20

/*
 * Leading collation byte for month keys:  MO_NONE tags a key that matched no
 * month name, while a matched month is written as its index plus MO_OFFSET,
 * so unmatched keys collate ahead of every month.
 */
enum { MO_NONE = 1, MO_OFFSET = 2 };

/*
 * Locale punctuation, loaded by field_initialize_separator().  For the
 * separators and the monetary decimal, '\0' in the narrow variable means the
 * locale defines no such character; the macros below test the narrow variable
 * even when comparing wide characters.
 */
static char	numerical_separator;
static char	numerical_decimal;
static char	monetary_separator;
static char	monetary_decimal;

static wchar_t	w_numerical_separator;
static wchar_t	w_numerical_decimal;
static wchar_t	w_monetary_separator;
static wchar_t	w_monetary_decimal;

/*
 * Upper-cased abbreviated month names and their lengths, in narrow and wide
 * form, loaded by field_initialize_month().
 */
static char	*months[MONTHS_IN_YEAR];
static size_t	month_lengths[MONTHS_IN_YEAR];
static wchar_t	*w_months[MONTHS_IN_YEAR];
static size_t	w_month_lengths[MONTHS_IN_YEAR];

/*
 * Character classification for single-byte data.  A blank is any whitespace
 * character other than newline.  Note that each macro may evaluate its
 * argument more than once.
 */
#define	DECIMAL_CHAR		(numerical_decimal)
#define	IS_BLANK(x)		(isspace((uchar_t)(x)) && (x) != '\n')
#define	IS_SEPARATOR(x)		\
	((numerical_separator != '\0' && (x) == numerical_separator) || \
	(monetary_separator != '\0' && (x) == monetary_separator))
#define	IS_DECIMAL(x)		\
	((x) == numerical_decimal || \
	(monetary_decimal != '\0' && (x) == monetary_decimal))

/*
 * Wide-character counterparts of the classification macros above.
 */
#define	W_DECIMAL_CHAR		(w_numerical_decimal)
#define	W_IS_BLANK(x)		(iswspace(x) && (x) != L'\n')
#define	W_IS_SEPARATOR(x)	\
	((numerical_separator != '\0' && (x) == w_numerical_separator) || \
	(monetary_separator != '\0' && (x) == w_monetary_separator))
#define	W_IS_DECIMAL(x)		\
	(((x) == w_numerical_decimal) || \
	(monetary_decimal != '\0' && (x) == w_monetary_decimal))

/*
 * Separator values and integer sign masks; not referenced in this part of
 * the file.
 */
#define	INTERFIELD_SEPARATOR '\0'
#define	W_INTERFIELD_SEPARATOR L'\0'

#define	INT_SIGN_FLIP_MASK 0x80000000
#define	INT_SIGN_PASS_MASK 0x00000000
1250Sstevel@tonic-gate 
/*
 * strx_ops_t, xfrm_len, and C_ncpy:  In the case where we are sorting in the
 * C locale, we want to avoid the expense of transforming strings to collatable
 * forms since, by definition, an arbitrary string in the C locale is already in
 * its collatable form.  Therefore, we construct a small ops vector (the
 * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into
 * strlen()-like behaviour, and C_ncpy() to make strncpy() appear
 * strxfrm()-like.
 */
/*
 * xfrm_len() reports how many bytes the strxfrm() form of s2 needs, counting
 * the terminating null.  The len argument exists only so that the signature
 * matches the sx_len slot; it is not consulted.
 */
/*ARGSUSED*/
static size_t
xfrm_len(const char *s2, size_t len)
{
	size_t transformed = strxfrm(NULL, s2, 0);

	return (transformed + 1);
}
1410Sstevel@tonic-gate 
/*
 * C_ncpy() makes strncpy() look like strxfrm() for the C locale:  it copies at
 * most n bytes of s2 into s1 and returns the length of the copied string.
 *
 * The length represented by n includes a null character, so to return the
 * correct length we subtract 1.  Note that this function is only used by
 * field_convert_alpha, and isn't for general use, as it assumes that n is the
 * length of s2 plus a null character.
 *
 * A zero n would make the subtraction wrap to SIZE_MAX, so in that case
 * nothing is copied and 0 is returned.
 */
static size_t
C_ncpy(char *s1, const char *s2, size_t n)
{
	if (n == 0)
		return (0);

	(void) strncpy(s1, s2, n);
	return (n - 1);
}
1540Sstevel@tonic-gate 
/*
 * C_len() is the C-locale length operation:  the caller's len is returned
 * unchanged.  The string itself is only checked for being non-NULL, via
 * ASSERT().
 */
/*ARGSUSED*/
static size_t
C_len(const char *s, size_t len)
{
	size_t collated_len = len;

	ASSERT(s != NULL);

	return (collated_len);
}
1620Sstevel@tonic-gate 
1630Sstevel@tonic-gate typedef struct _strx_ops {
1640Sstevel@tonic-gate 	size_t	(*sx_len)(const char *, size_t);
1650Sstevel@tonic-gate 	size_t	(*sx_xfrm)(char *, const char *, size_t);
1660Sstevel@tonic-gate } strx_ops_t;
1670Sstevel@tonic-gate 
1680Sstevel@tonic-gate static const strx_ops_t C_ops = { C_len, C_ncpy };
1690Sstevel@tonic-gate static const strx_ops_t SB_ops = { xfrm_len, strxfrm };
1700Sstevel@tonic-gate 
1710Sstevel@tonic-gate static const strx_ops_t *xfrm_ops;
1720Sstevel@tonic-gate 
1730Sstevel@tonic-gate static void
field_initialize_separator(void)1740Sstevel@tonic-gate field_initialize_separator(void)
1750Sstevel@tonic-gate {
1760Sstevel@tonic-gate 	/*
1770Sstevel@tonic-gate 	 * A locale need not define all of the cases below:  only decimal_point
1780Sstevel@tonic-gate 	 * must be defined.  Furthermore, sort(1) has traditionally not used the
1790Sstevel@tonic-gate 	 * positive_sign and negative_sign, grouping, or currency_symbols (or
1800Sstevel@tonic-gate 	 * their numeric counterparts, if any).
1810Sstevel@tonic-gate 	 */
1820Sstevel@tonic-gate 	struct lconv *conv = localeconv();
1830Sstevel@tonic-gate 
1840Sstevel@tonic-gate 	if (!xstreql(conv->thousands_sep, "")) {
1850Sstevel@tonic-gate 		numerical_separator = *conv->thousands_sep;
1860Sstevel@tonic-gate 		(void) mbtowc(&w_numerical_separator, conv->thousands_sep,
1870Sstevel@tonic-gate 		    MB_CUR_MAX);
1880Sstevel@tonic-gate 	} else
1890Sstevel@tonic-gate 		numerical_separator = '\0';
1900Sstevel@tonic-gate 
1910Sstevel@tonic-gate 	if (!xstreql(conv->mon_thousands_sep, "")) {
1920Sstevel@tonic-gate 		monetary_separator = *conv->mon_thousands_sep;
1930Sstevel@tonic-gate 		(void) mbtowc(&w_monetary_separator, conv->mon_thousands_sep,
1940Sstevel@tonic-gate 		    MB_CUR_MAX);
1950Sstevel@tonic-gate 	} else
1960Sstevel@tonic-gate 		monetary_separator = '\0';
1970Sstevel@tonic-gate 
1980Sstevel@tonic-gate 	if (!xstreql(conv->mon_decimal_point, "")) {
1990Sstevel@tonic-gate 		monetary_decimal = *conv->mon_decimal_point;
2000Sstevel@tonic-gate 		(void) mbtowc(&w_monetary_decimal, conv->mon_decimal_point,
2010Sstevel@tonic-gate 		    MB_CUR_MAX);
2020Sstevel@tonic-gate 	} else
2030Sstevel@tonic-gate 		monetary_decimal = '\0';
2040Sstevel@tonic-gate 
2050Sstevel@tonic-gate 	numerical_decimal = *conv->decimal_point;
2060Sstevel@tonic-gate 	(void) mbtowc(&w_numerical_decimal, conv->decimal_point, MB_CUR_MAX);
2070Sstevel@tonic-gate }
2080Sstevel@tonic-gate 
2090Sstevel@tonic-gate static void
field_initialize_month(int is_c_locale)2100Sstevel@tonic-gate field_initialize_month(int is_c_locale)
2110Sstevel@tonic-gate {
2120Sstevel@tonic-gate 	int i;
2130Sstevel@tonic-gate 	int j;
2140Sstevel@tonic-gate 	struct tm this_month;
2150Sstevel@tonic-gate 	const char *c_months[MONTHS_IN_YEAR] = {
2160Sstevel@tonic-gate 		"JAN", "FEB", "MAR", "APR", "MAY", "JUN",
2170Sstevel@tonic-gate 		"JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
2180Sstevel@tonic-gate 	};
2190Sstevel@tonic-gate 
2200Sstevel@tonic-gate 	char month_name[MAX_MON_LEN * MB_LEN_MAX];
2210Sstevel@tonic-gate 	wchar_t	w_month_name[MAX_MON_LEN];
2220Sstevel@tonic-gate 
2230Sstevel@tonic-gate 	if (is_c_locale) {
2240Sstevel@tonic-gate 		for (i = 0; i < MONTHS_IN_YEAR; i++) {
2250Sstevel@tonic-gate 			months[i] = (char *)c_months[i];
2260Sstevel@tonic-gate 			month_lengths[i] = strlen(c_months[i]);
2270Sstevel@tonic-gate 		}
2280Sstevel@tonic-gate 		/*
2290Sstevel@tonic-gate 		 * We don't need to initialize the wide version of the month
2300Sstevel@tonic-gate 		 * names.
2310Sstevel@tonic-gate 		 */
2320Sstevel@tonic-gate 		return;
2330Sstevel@tonic-gate 	}
2340Sstevel@tonic-gate 
2350Sstevel@tonic-gate 	(void) memset(&this_month, 0, sizeof (this_month));
2360Sstevel@tonic-gate 
2370Sstevel@tonic-gate 	for (i = 0; i < MONTHS_IN_YEAR; i++) {
2380Sstevel@tonic-gate 		this_month.tm_mon = i;
2390Sstevel@tonic-gate 
2400Sstevel@tonic-gate 		(void) strftime(month_name, sizeof (month_name),
2410Sstevel@tonic-gate 		    "%b", &this_month);
2420Sstevel@tonic-gate 
2430Sstevel@tonic-gate 		for (j = 0; j < strlen(month_name); j++)
2440Sstevel@tonic-gate 			month_name[j] = toupper(month_name[j]);
2450Sstevel@tonic-gate 		(void) mbstowcs(w_month_name, month_name, MAX_MON_LEN);
2460Sstevel@tonic-gate 
2470Sstevel@tonic-gate 		months[i] = strdup(month_name);
2480Sstevel@tonic-gate 		month_lengths[i] = strlen(month_name);
2490Sstevel@tonic-gate 		w_months[i] = wsdup(w_month_name);
2500Sstevel@tonic-gate 		w_month_lengths[i] = wslen(w_month_name);
2510Sstevel@tonic-gate 	}
2520Sstevel@tonic-gate }
2530Sstevel@tonic-gate 
2540Sstevel@tonic-gate void
field_initialize(sort_t * S)2550Sstevel@tonic-gate field_initialize(sort_t *S)
2560Sstevel@tonic-gate {
2570Sstevel@tonic-gate 	field_initialize_month(S->m_c_locale);
2580Sstevel@tonic-gate 	field_initialize_separator();
2590Sstevel@tonic-gate 
2600Sstevel@tonic-gate 	if (S->m_c_locale)
2610Sstevel@tonic-gate 		xfrm_ops = &C_ops;
2620Sstevel@tonic-gate 	else
2630Sstevel@tonic-gate 		xfrm_ops = &SB_ops;
2640Sstevel@tonic-gate }
2650Sstevel@tonic-gate 
2660Sstevel@tonic-gate field_t *
field_new(sort_t * S)2670Sstevel@tonic-gate field_new(sort_t *S)
2680Sstevel@tonic-gate {
2690Sstevel@tonic-gate 	field_t	*F = safe_realloc(NULL, sizeof (field_t));
2700Sstevel@tonic-gate 
2710Sstevel@tonic-gate 	F->f_start_field = -1;
2720Sstevel@tonic-gate 	F->f_start_offset = -1;
2730Sstevel@tonic-gate 	F->f_end_field = -1;
2740Sstevel@tonic-gate 	F->f_end_offset = -1;
2750Sstevel@tonic-gate 	F->f_next = NULL;
2760Sstevel@tonic-gate 
2770Sstevel@tonic-gate 	if (S == NULL) {
2780Sstevel@tonic-gate 		F->f_species = ALPHA;
2790Sstevel@tonic-gate 		F->f_options = 0;
2800Sstevel@tonic-gate 	} else {
2810Sstevel@tonic-gate 		F->f_species = S->m_default_species;
2820Sstevel@tonic-gate 		F->f_options = S->m_field_options;
2830Sstevel@tonic-gate 	}
2840Sstevel@tonic-gate 
2850Sstevel@tonic-gate 	return (F);
2860Sstevel@tonic-gate }
2870Sstevel@tonic-gate 
2880Sstevel@tonic-gate void
field_delete(field_t * F)2890Sstevel@tonic-gate field_delete(field_t *F)
2900Sstevel@tonic-gate {
2910Sstevel@tonic-gate 	free(F);
2920Sstevel@tonic-gate }
2930Sstevel@tonic-gate 
2940Sstevel@tonic-gate /*
2950Sstevel@tonic-gate  * The recursive implementation of field_add_to_chain() given below is
2960Sstevel@tonic-gate  * inappropriate if function calls are expensive, or a truly large number of
2970Sstevel@tonic-gate  * fields are anticipated.
2980Sstevel@tonic-gate  */
2990Sstevel@tonic-gate void
field_add_to_chain(field_t ** F,field_t * A)3000Sstevel@tonic-gate field_add_to_chain(field_t **F, field_t *A)
3010Sstevel@tonic-gate {
3020Sstevel@tonic-gate 	if (*F == NULL)
3030Sstevel@tonic-gate 		*F = A;
3040Sstevel@tonic-gate 	else
3050Sstevel@tonic-gate 		field_add_to_chain(&((*F)->f_next), A);
3060Sstevel@tonic-gate }
3070Sstevel@tonic-gate 
#ifdef DEBUG
#ifndef _LP64
#define	FIELD_FMT \
"\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n"
#else /* !_LP64 */
#define	FIELD_FMT \
"\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n"
#endif /* !_LP64 */

/*
 * field_print is used only for debugging purposes.  It writes the species,
 * the option flags that are set (or NO_MODIFIERS when none are), and the
 * start/end field and offset of F to stderr.
 *
 * Every option name is followed by a space so that consecutive names do not
 * run together.
 */
void
field_print(field_t *F)
{
	/* assumes f_species is 0, 1, 2 for ALPHA, MONTH, NUMERIC -- confirm */
	char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"};
	int status = 0;

	(void) fprintf(stderr, "Type: %s", field_names[F->f_species]);
	(void) fprintf(stderr, "\tOptions: ");

	if (F->f_options & FIELD_REVERSE_COMPARISONS) {
		(void) fprintf(stderr, "REVERSE ");
		status++;
	}
	if (F->f_options & FIELD_DICTIONARY_ORDER) {
		(void) fprintf(stderr, "DICTIONARY ");
		status++;
	}
	if (F->f_options & FIELD_FOLD_UPPERCASE) {
		(void) fprintf(stderr, "UPPERCASE ");
		status++;
	}
	if (F->f_options & FIELD_IGNORE_NONPRINTABLES) {
		(void) fprintf(stderr, "PRINTABLES ");
		status++;
	}
	if (F->f_options & FIELD_IGNORE_BLANKS_START) {
		(void) fprintf(stderr, "BLANKS_START ");
		status++;
	}
	if (F->f_options & FIELD_IGNORE_BLANKS_END) {
		(void) fprintf(stderr, "BLANKS_END ");
		status++;
	}

	if (status == 0)
		(void) fprintf(stderr, "NO_MODIFIERS");

	(void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset,
	    F->f_end_field, F->f_end_offset);
}
#endif /* DEBUG */
3610Sstevel@tonic-gate 
3620Sstevel@tonic-gate static ssize_t
field_boundary(field_t * F,line_rec_t * L,int is_end,int is_blanks)3630Sstevel@tonic-gate field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks)
3640Sstevel@tonic-gate {
3650Sstevel@tonic-gate 	char *S = L->l_data.sp;
3660Sstevel@tonic-gate 	char *T = S;
3670Sstevel@tonic-gate 	char *eol = S + L->l_data_length;
3680Sstevel@tonic-gate 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
3690Sstevel@tonic-gate 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
3700Sstevel@tonic-gate 	ssize_t ret;
3710Sstevel@tonic-gate 
3720Sstevel@tonic-gate 	ASSERT(is_end || field > -1);
3730Sstevel@tonic-gate 
3740Sstevel@tonic-gate 	if (is_end && field == -1)
3750Sstevel@tonic-gate 		return (L->l_data_length);
3760Sstevel@tonic-gate 
3770Sstevel@tonic-gate 	while (field-- > 0) {
3780Sstevel@tonic-gate 		while (T < eol && IS_BLANK(*T))
3790Sstevel@tonic-gate 			T++;
3800Sstevel@tonic-gate 
3810Sstevel@tonic-gate 		while (T < eol && !IS_BLANK(*T))
3820Sstevel@tonic-gate 			T++;
3830Sstevel@tonic-gate 	}
3840Sstevel@tonic-gate 
3850Sstevel@tonic-gate 	if ((!is_end || offset > 0) && is_blanks) {
3860Sstevel@tonic-gate 		while (IS_BLANK(*T))
3870Sstevel@tonic-gate 			T++;
3880Sstevel@tonic-gate 	}
3890Sstevel@tonic-gate 
3900Sstevel@tonic-gate 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
3910Sstevel@tonic-gate 		return (L->l_data_length);
3920Sstevel@tonic-gate 
3930Sstevel@tonic-gate 	return (ret);
3940Sstevel@tonic-gate }
3950Sstevel@tonic-gate 
3960Sstevel@tonic-gate static void
field_delimit(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end)3970Sstevel@tonic-gate field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
3980Sstevel@tonic-gate {
3990Sstevel@tonic-gate 	ASSERT(F->f_start_field > -1);
4000Sstevel@tonic-gate 
4010Sstevel@tonic-gate 	*start = field_boundary(F, L, 0,
4020Sstevel@tonic-gate 	    F->f_options & FIELD_IGNORE_BLANKS_START);
4030Sstevel@tonic-gate 	*end = field_boundary(F, L, 1,
4040Sstevel@tonic-gate 	    F->f_options & FIELD_IGNORE_BLANKS_END);
4050Sstevel@tonic-gate }
4060Sstevel@tonic-gate 
4070Sstevel@tonic-gate static ssize_t
field_boundary_wide(field_t * F,line_rec_t * L,int is_end,int is_blanks)4080Sstevel@tonic-gate field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks)
4090Sstevel@tonic-gate {
4100Sstevel@tonic-gate 	wchar_t *S = L->l_data.wp;
4110Sstevel@tonic-gate 	wchar_t *T = S;
4120Sstevel@tonic-gate 	wchar_t *eol = S + L->l_data_length;
4130Sstevel@tonic-gate 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
4140Sstevel@tonic-gate 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
4150Sstevel@tonic-gate 	ssize_t ret;
4160Sstevel@tonic-gate 
4170Sstevel@tonic-gate 	ASSERT(is_end || field > -1);
4180Sstevel@tonic-gate 
4190Sstevel@tonic-gate 	if (is_end && field == -1)
4200Sstevel@tonic-gate 		return (L->l_data_length);
4210Sstevel@tonic-gate 
4220Sstevel@tonic-gate 	while (field-- > 0) {
4230Sstevel@tonic-gate 		while (T < eol && W_IS_BLANK(*T))
4240Sstevel@tonic-gate 			T++;
4250Sstevel@tonic-gate 
4260Sstevel@tonic-gate 		while (T < eol && !W_IS_BLANK(*T))
4270Sstevel@tonic-gate 			T++;
4280Sstevel@tonic-gate 	}
4290Sstevel@tonic-gate 
4300Sstevel@tonic-gate 	if ((!is_end || offset > 0) && is_blanks) {
4310Sstevel@tonic-gate 		while (W_IS_BLANK(*T))
4320Sstevel@tonic-gate 			T++;
4330Sstevel@tonic-gate 	}
4340Sstevel@tonic-gate 
4350Sstevel@tonic-gate 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
4360Sstevel@tonic-gate 		return (L->l_data_length);
4370Sstevel@tonic-gate 
4380Sstevel@tonic-gate 	return (ret);
4390Sstevel@tonic-gate }
4400Sstevel@tonic-gate 
4410Sstevel@tonic-gate static void
field_delimit_wide(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end)4420Sstevel@tonic-gate field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
4430Sstevel@tonic-gate {
4440Sstevel@tonic-gate 	ASSERT(F->f_start_field > -1);
4450Sstevel@tonic-gate 
4460Sstevel@tonic-gate 	*start = field_boundary_wide(F, L, 0,
4470Sstevel@tonic-gate 	    F->f_options & FIELD_IGNORE_BLANKS_START);
4480Sstevel@tonic-gate 	*end = field_boundary_wide(F, L, 1,
4490Sstevel@tonic-gate 	    F->f_options & FIELD_IGNORE_BLANKS_END);
4500Sstevel@tonic-gate }
4510Sstevel@tonic-gate 
4520Sstevel@tonic-gate static ssize_t
field_boundary_tabbed(field_t * F,line_rec_t * L,int is_end,int is_blanks,vchar_t delimiter)4530Sstevel@tonic-gate field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks,
4540Sstevel@tonic-gate     vchar_t delimiter)
4550Sstevel@tonic-gate {
4560Sstevel@tonic-gate 	char *S = L->l_data.sp;
4570Sstevel@tonic-gate 	char *T = S;
4580Sstevel@tonic-gate 	char *eol = S + L->l_data_length;
4590Sstevel@tonic-gate 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
4600Sstevel@tonic-gate 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
4610Sstevel@tonic-gate 	ssize_t ret;
4620Sstevel@tonic-gate 
4630Sstevel@tonic-gate 	ASSERT(is_end || field > -1);
4640Sstevel@tonic-gate 
4650Sstevel@tonic-gate 	if (is_end && field == -1)
4660Sstevel@tonic-gate 		return (L->l_data_length);
4670Sstevel@tonic-gate 
4680Sstevel@tonic-gate 	while (field-- > 0) {
4690Sstevel@tonic-gate 		T = xstrnchr(T, delimiter.sc, eol - T);
4700Sstevel@tonic-gate 		if (T == NULL || T > eol)
4710Sstevel@tonic-gate 			return (L->l_data_length);
4720Sstevel@tonic-gate 
4730Sstevel@tonic-gate 		T++;
4740Sstevel@tonic-gate 	}
4750Sstevel@tonic-gate 
4760Sstevel@tonic-gate 	if ((!is_end || offset != 0) && is_blanks) {
4770Sstevel@tonic-gate 		while (IS_BLANK(*T))
4780Sstevel@tonic-gate 			T++;
4790Sstevel@tonic-gate 	}
4800Sstevel@tonic-gate 
4814623Srm88369 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
482*12067SJohn.Beck@Sun.COM 		if (L->l_data_length <= 0)
483*12067SJohn.Beck@Sun.COM 			return (0);
4844623Srm88369 		if (S[L->l_data_length - 1] == delimiter.sc) {
4854623Srm88369 			return (L->l_data_length - 1);
4864623Srm88369 		} else {
4874623Srm88369 			return (L->l_data_length);
4884623Srm88369 		}
4894623Srm88369 	}
4900Sstevel@tonic-gate 
4910Sstevel@tonic-gate 	if (is_end && offset == 0)
4920Sstevel@tonic-gate 		ret--;
4930Sstevel@tonic-gate 
4940Sstevel@tonic-gate 	return (ret);
4950Sstevel@tonic-gate }
4960Sstevel@tonic-gate 
4970Sstevel@tonic-gate /*
4980Sstevel@tonic-gate  * field_delimit_tabbed() is called when a field separator has been defined
4990Sstevel@tonic-gate  * using the -t option.  The character at the offset, start, is either one or
5000Sstevel@tonic-gate  * more character positions past the delimiter marking the start of the
5010Sstevel@tonic-gate  * field, or at the end of the line.
5020Sstevel@tonic-gate  */
5030Sstevel@tonic-gate static void
field_delimit_tabbed(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end,vchar_t delimiter)5040Sstevel@tonic-gate field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end,
5050Sstevel@tonic-gate     vchar_t delimiter)
5060Sstevel@tonic-gate {
5070Sstevel@tonic-gate 	ASSERT(F->f_start_field > -1);
5080Sstevel@tonic-gate 
5090Sstevel@tonic-gate 	*start = field_boundary_tabbed(F, L, 0, F->f_options &
5100Sstevel@tonic-gate 	    FIELD_IGNORE_BLANKS_START, delimiter);
5110Sstevel@tonic-gate 	*end = field_boundary_tabbed(F, L, 1, F->f_options &
5120Sstevel@tonic-gate 	    FIELD_IGNORE_BLANKS_END, delimiter);
5130Sstevel@tonic-gate }
5140Sstevel@tonic-gate 
5150Sstevel@tonic-gate static ssize_t
field_boundary_tabbed_wide(field_t * F,line_rec_t * L,int is_end,int is_blanks,vchar_t delimiter)5160Sstevel@tonic-gate field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks,
5170Sstevel@tonic-gate     vchar_t delimiter)
5180Sstevel@tonic-gate {
5190Sstevel@tonic-gate 	wchar_t *S = L->l_data.wp;
5200Sstevel@tonic-gate 	wchar_t *T = S;
5210Sstevel@tonic-gate 	wchar_t *eol = S + L->l_data_length;
5220Sstevel@tonic-gate 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
5230Sstevel@tonic-gate 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
5240Sstevel@tonic-gate 	ssize_t ret;
5250Sstevel@tonic-gate 
5260Sstevel@tonic-gate 	ASSERT(is_end || field > -1);
5270Sstevel@tonic-gate 
5280Sstevel@tonic-gate 	if (is_end && field == -1)
5290Sstevel@tonic-gate 		return (L->l_data_length);
5300Sstevel@tonic-gate 
5310Sstevel@tonic-gate 	while (field-- > 0) {
5320Sstevel@tonic-gate 		T = xwsnchr(T, delimiter.wc, eol - T);
5330Sstevel@tonic-gate 		if (T == NULL || T > eol)
5340Sstevel@tonic-gate 			return (L->l_data_length);
5350Sstevel@tonic-gate 
5360Sstevel@tonic-gate 		T++;
5370Sstevel@tonic-gate 	}
5380Sstevel@tonic-gate 
5390Sstevel@tonic-gate 	if ((!is_end || offset != 0) && is_blanks) {
5400Sstevel@tonic-gate 		while (W_IS_BLANK(*T))
5410Sstevel@tonic-gate 			T++;
5420Sstevel@tonic-gate 	}
5430Sstevel@tonic-gate 
5444623Srm88369 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
545*12067SJohn.Beck@Sun.COM 		if (L->l_data_length <= 0)
546*12067SJohn.Beck@Sun.COM 			return (0);
5474623Srm88369 		if (S[L->l_data_length - 1] == delimiter.wc) {
5484623Srm88369 			return (L->l_data_length - 1);
5494623Srm88369 		} else {
5504623Srm88369 			return (L->l_data_length);
5514623Srm88369 		}
5524623Srm88369 	}
5530Sstevel@tonic-gate 
5540Sstevel@tonic-gate 	if (is_end && offset == 0)
5550Sstevel@tonic-gate 		ret--;
5560Sstevel@tonic-gate 
5570Sstevel@tonic-gate 	return (ret);
5580Sstevel@tonic-gate }
5590Sstevel@tonic-gate 
5600Sstevel@tonic-gate static void
field_delimit_tabbed_wide(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end,vchar_t delimiter)5610Sstevel@tonic-gate field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start,
5620Sstevel@tonic-gate     ssize_t *end, vchar_t delimiter)
5630Sstevel@tonic-gate {
5640Sstevel@tonic-gate 	ASSERT(F->f_start_field > -1);
5650Sstevel@tonic-gate 
5660Sstevel@tonic-gate 	*start = field_boundary_tabbed_wide(F, L, 0, F->f_options &
5670Sstevel@tonic-gate 	    FIELD_IGNORE_BLANKS_START, delimiter);
5680Sstevel@tonic-gate 	*end = field_boundary_tabbed_wide(F, L, 1, F->f_options &
5690Sstevel@tonic-gate 	    FIELD_IGNORE_BLANKS_END, delimiter);
5700Sstevel@tonic-gate }
5710Sstevel@tonic-gate 
5720Sstevel@tonic-gate /*ARGSUSED*/
5730Sstevel@tonic-gate ssize_t
field_convert_month(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)5740Sstevel@tonic-gate field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter,
5750Sstevel@tonic-gate     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
5760Sstevel@tonic-gate {
5770Sstevel@tonic-gate 	int j;
5780Sstevel@tonic-gate 	ssize_t	val;
5790Sstevel@tonic-gate 	char month_candidate[MAX_MON_LEN * MB_LEN_MAX];
5800Sstevel@tonic-gate 	ssize_t month_length = data_length;
5810Sstevel@tonic-gate 	ssize_t month_offset = data_offset;
5820Sstevel@tonic-gate 
5830Sstevel@tonic-gate 	if (sizeof (char) > L->l_collate_bufsize - coll_offset)
5840Sstevel@tonic-gate 		return (-1);
5850Sstevel@tonic-gate 
5860Sstevel@tonic-gate 	(void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX);
5870Sstevel@tonic-gate 
5880Sstevel@tonic-gate 
5890Sstevel@tonic-gate 	/*
5900Sstevel@tonic-gate 	 * The month field formally begins with the first non-blank character.
5910Sstevel@tonic-gate 	 */
5920Sstevel@tonic-gate 	while (IS_BLANK(*(L->l_data.sp + month_offset))) {
5930Sstevel@tonic-gate 		month_offset++;
5940Sstevel@tonic-gate 		month_length--;
5950Sstevel@tonic-gate 	}
5960Sstevel@tonic-gate 
5970Sstevel@tonic-gate 	for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
5980Sstevel@tonic-gate 		month_candidate[j] = toupper((L->l_data.sp + month_offset)[j]);
5990Sstevel@tonic-gate 
6000Sstevel@tonic-gate 	for (j = 0; j < MONTHS_IN_YEAR; j++) {
6010Sstevel@tonic-gate 		if (xstrneql(month_candidate, months[j], month_lengths[j])) {
6020Sstevel@tonic-gate 			*(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET;
6030Sstevel@tonic-gate 			return (1);
6040Sstevel@tonic-gate 		}
6050Sstevel@tonic-gate 	}
6060Sstevel@tonic-gate 
6070Sstevel@tonic-gate 	/*
6080Sstevel@tonic-gate 	 * no matching month; copy string into field.  required behaviour is
6090Sstevel@tonic-gate 	 * that "month-free" keys sort before month-sortable keys, so insert
6100Sstevel@tonic-gate 	 * a "will sort first" token.
6110Sstevel@tonic-gate 	 */
6120Sstevel@tonic-gate 	*(L->l_collate.sp + coll_offset) = '\0' + MO_NONE;
6130Sstevel@tonic-gate 
6140Sstevel@tonic-gate 	val = field_convert_alpha_simple(F, L, delimiter, data_offset,
6150Sstevel@tonic-gate 	    data_length, coll_offset + 1);
6160Sstevel@tonic-gate 
6170Sstevel@tonic-gate 	if (val < 0)
6180Sstevel@tonic-gate 		return (-1);
6190Sstevel@tonic-gate 	else
6200Sstevel@tonic-gate 		return (val + 1);
6210Sstevel@tonic-gate }
6220Sstevel@tonic-gate 
6230Sstevel@tonic-gate /*ARGSUSED*/
6240Sstevel@tonic-gate ssize_t
field_convert_month_wide(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)6250Sstevel@tonic-gate field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
6260Sstevel@tonic-gate     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
6270Sstevel@tonic-gate {
6280Sstevel@tonic-gate 	ssize_t j;
6290Sstevel@tonic-gate 	ssize_t val;
6300Sstevel@tonic-gate 	wchar_t month_candidate[MAX_MON_LEN];
6310Sstevel@tonic-gate 	wchar_t *month;
6320Sstevel@tonic-gate 	wchar_t *buffer = L->l_collate.wp + coll_offset;
6330Sstevel@tonic-gate 	ssize_t month_length = data_length;
6340Sstevel@tonic-gate 	ssize_t month_offset = data_offset;
6350Sstevel@tonic-gate 
6360Sstevel@tonic-gate 	if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) <
6370Sstevel@tonic-gate 	    sizeof (wchar_t))
6380Sstevel@tonic-gate 		return (-1);
6390Sstevel@tonic-gate 
6400Sstevel@tonic-gate 	(void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t));
6410Sstevel@tonic-gate 
6420Sstevel@tonic-gate 
6430Sstevel@tonic-gate 	while (W_IS_BLANK(*(L->l_data.wp + month_offset))) {
6440Sstevel@tonic-gate 		month_offset++;
6450Sstevel@tonic-gate 		month_length--;
6460Sstevel@tonic-gate 	}
6470Sstevel@tonic-gate 
6480Sstevel@tonic-gate 	month = L->l_data.wp + month_offset;
6490Sstevel@tonic-gate 
6500Sstevel@tonic-gate 	for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
6510Sstevel@tonic-gate 		month_candidate[j] = towupper(month[j]);
6520Sstevel@tonic-gate 
6530Sstevel@tonic-gate 	for (j = 0; j < MONTHS_IN_YEAR; j++)
6540Sstevel@tonic-gate 		if (xwcsneql(month_candidate, w_months[j],
6550Sstevel@tonic-gate 		    w_month_lengths[j])) {
6560Sstevel@tonic-gate 			*buffer = L'\0' + j + MO_OFFSET;
6570Sstevel@tonic-gate 			return (1);
6580Sstevel@tonic-gate 		}
6590Sstevel@tonic-gate 
6600Sstevel@tonic-gate 	*buffer = L'\0' + MO_NONE;
6610Sstevel@tonic-gate 
6620Sstevel@tonic-gate 	val = field_convert_alpha_wide(F, L, delimiter, data_offset,
6630Sstevel@tonic-gate 	    data_length, coll_offset + sizeof (wchar_t));
6640Sstevel@tonic-gate 
6650Sstevel@tonic-gate 	if (val < 0)
6660Sstevel@tonic-gate 		return (-1);
6670Sstevel@tonic-gate 	else
6680Sstevel@tonic-gate 		return (val + 1);
6690Sstevel@tonic-gate }
6700Sstevel@tonic-gate 
6710Sstevel@tonic-gate /*
6720Sstevel@tonic-gate  * field_convert_alpha() always fails with return value -1 if the converted
6730Sstevel@tonic-gate  * string would cause l_collate_length to exceed l_collate_bufsize
6740Sstevel@tonic-gate  */
6750Sstevel@tonic-gate /*ARGSUSED*/
6760Sstevel@tonic-gate ssize_t
field_convert_alpha(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)6770Sstevel@tonic-gate field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter,
6780Sstevel@tonic-gate     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
6790Sstevel@tonic-gate {
6800Sstevel@tonic-gate 	static char *compose;
6810Sstevel@tonic-gate 	static ssize_t compose_length;
6820Sstevel@tonic-gate 
6830Sstevel@tonic-gate 	ssize_t	clength = 0;
6840Sstevel@tonic-gate 	ssize_t	dlength;
6850Sstevel@tonic-gate 	ssize_t	i;
6860Sstevel@tonic-gate 
6870Sstevel@tonic-gate 	if (compose_length < (data_length + 1)) {
6880Sstevel@tonic-gate 		compose_length = data_length + 1;
6890Sstevel@tonic-gate 		compose = safe_realloc(compose, compose_length * sizeof (char));
6900Sstevel@tonic-gate 	}
6910Sstevel@tonic-gate 
6920Sstevel@tonic-gate 	for (i = data_offset; i < data_offset + data_length; i++) {
6930Sstevel@tonic-gate 		char t = (L->l_data.sp)[i];
6940Sstevel@tonic-gate 
6950Sstevel@tonic-gate 		if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) &&
6960Sstevel@tonic-gate 		    !isprint((uchar_t)t))
6970Sstevel@tonic-gate 			continue;
6980Sstevel@tonic-gate 
6990Sstevel@tonic-gate 		if ((F->f_options & FIELD_DICTIONARY_ORDER) &&
7000Sstevel@tonic-gate 		    !isalnum((uchar_t)t) && !isspace((uchar_t)t))
7010Sstevel@tonic-gate 			continue;
7020Sstevel@tonic-gate 
7030Sstevel@tonic-gate 		if (F->f_options & FIELD_FOLD_UPPERCASE)
7040Sstevel@tonic-gate 			t = toupper(t);
7050Sstevel@tonic-gate 
7060Sstevel@tonic-gate 		compose[clength++] = t;
7070Sstevel@tonic-gate 	}
7080Sstevel@tonic-gate 	compose[clength] = '\0';
7090Sstevel@tonic-gate 
7100Sstevel@tonic-gate 	if ((dlength = xfrm_ops->sx_len(compose, clength)) <
7110Sstevel@tonic-gate 	    L->l_collate_bufsize - coll_offset)
7120Sstevel@tonic-gate 		return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
7134623Srm88369 		    compose, dlength + 1));
7140Sstevel@tonic-gate 	else
7150Sstevel@tonic-gate 		return ((ssize_t)-1);
7160Sstevel@tonic-gate }
7170Sstevel@tonic-gate 
7180Sstevel@tonic-gate /*ARGSUSED*/
7190Sstevel@tonic-gate ssize_t
field_convert_alpha_simple(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)7200Sstevel@tonic-gate field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter,
7210Sstevel@tonic-gate     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
7220Sstevel@tonic-gate {
7230Sstevel@tonic-gate 	static char *compose;
7240Sstevel@tonic-gate 	static ssize_t compose_length;
7250Sstevel@tonic-gate 
7260Sstevel@tonic-gate 	ssize_t	clength;
7270Sstevel@tonic-gate 	ssize_t	dlength;
7280Sstevel@tonic-gate 
7290Sstevel@tonic-gate 	if (compose_length < (data_length + 1)) {
7300Sstevel@tonic-gate 		compose_length = data_length + 1;
7310Sstevel@tonic-gate 		compose = safe_realloc(compose, compose_length * sizeof (char));
7320Sstevel@tonic-gate 	}
7330Sstevel@tonic-gate 
7340Sstevel@tonic-gate 	(void) memcpy(compose, L->l_data.sp + data_offset, data_length);
7350Sstevel@tonic-gate 	clength = data_length;
7360Sstevel@tonic-gate 	compose[clength] = '\0';
7370Sstevel@tonic-gate 
7380Sstevel@tonic-gate 	if ((dlength = xfrm_ops->sx_len(compose, clength)) <
7390Sstevel@tonic-gate 	    L->l_collate_bufsize - coll_offset)
7400Sstevel@tonic-gate 		return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
7414623Srm88369 		    compose, dlength + 1));
7420Sstevel@tonic-gate 	else
7430Sstevel@tonic-gate 		return ((ssize_t)-1);
7440Sstevel@tonic-gate }
7450Sstevel@tonic-gate 
7460Sstevel@tonic-gate /*ARGSUSED*/
7470Sstevel@tonic-gate ssize_t
field_convert_alpha_wide(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)7480Sstevel@tonic-gate field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
7490Sstevel@tonic-gate     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
7500Sstevel@tonic-gate {
7510Sstevel@tonic-gate 	wchar_t	*compose = safe_realloc(NULL, (data_length + 1) *
7520Sstevel@tonic-gate 	    sizeof (wchar_t));
7530Sstevel@tonic-gate 	ssize_t	clength = 0;
7540Sstevel@tonic-gate 	ssize_t	dlength;
7550Sstevel@tonic-gate 	ssize_t	i;
7560Sstevel@tonic-gate 	ssize_t ret;
7570Sstevel@tonic-gate 
7580Sstevel@tonic-gate 	for (i = data_offset; i < data_offset + data_length; i++) {
7590Sstevel@tonic-gate 		wchar_t	t = (L->l_data.wp)[i];
7600Sstevel@tonic-gate 
7610Sstevel@tonic-gate 		if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && !iswprint(t))
7620Sstevel@tonic-gate 			continue;
7630Sstevel@tonic-gate 
7640Sstevel@tonic-gate 		if ((F->f_options & FIELD_DICTIONARY_ORDER) && !iswalnum(t) &&
7650Sstevel@tonic-gate 		    !iswspace(t))
7660Sstevel@tonic-gate 			continue;
7670Sstevel@tonic-gate 
7680Sstevel@tonic-gate 		if (F->f_options & FIELD_FOLD_UPPERCASE)
7690Sstevel@tonic-gate 			t = towupper(t);
7700Sstevel@tonic-gate 
7710Sstevel@tonic-gate 		compose[clength++] = t;
7720Sstevel@tonic-gate 	}
7730Sstevel@tonic-gate 	compose[clength] = L'\0';
7740Sstevel@tonic-gate 
7750Sstevel@tonic-gate 	dlength = wcsxfrm(NULL, compose, (size_t)0);
7760Sstevel@tonic-gate 	if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize -
7770Sstevel@tonic-gate 	    coll_offset * sizeof (wchar_t)) {
7780Sstevel@tonic-gate 		ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose,
7790Sstevel@tonic-gate 		    (size_t)dlength + 1);
7800Sstevel@tonic-gate 	} else {
7810Sstevel@tonic-gate 		ret = (ssize_t)-1;
7820Sstevel@tonic-gate 	}
7830Sstevel@tonic-gate 
7840Sstevel@tonic-gate 	safe_free(compose);
7850Sstevel@tonic-gate 
7860Sstevel@tonic-gate 	return (ret);
7870Sstevel@tonic-gate }
7880Sstevel@tonic-gate 
7890Sstevel@tonic-gate /*
7900Sstevel@tonic-gate  * field_convert_numeric() converts the given field into a collatable numerical
7910Sstevel@tonic-gate  * sequence.  The sequence is ordered as { log, integer, separator, fraction },
7920Sstevel@tonic-gate  * with an optional sentinel component at the sequence end.
7930Sstevel@tonic-gate  */
7940Sstevel@tonic-gate /*ARGSUSED*/
7950Sstevel@tonic-gate ssize_t
field_convert_numeric(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)7960Sstevel@tonic-gate field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter,
7970Sstevel@tonic-gate     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
7980Sstevel@tonic-gate {
7990Sstevel@tonic-gate 	char *number;
8000Sstevel@tonic-gate 	char *buffer = L->l_collate.sp + coll_offset;
8010Sstevel@tonic-gate 	ssize_t length;
8020Sstevel@tonic-gate 
8030Sstevel@tonic-gate 	char sign = '2';
8040Sstevel@tonic-gate 	int log_ten;
8050Sstevel@tonic-gate 	char *digits = buffer + 1 + sizeof (int) / sizeof (char);
8060Sstevel@tonic-gate 	size_t j = 0;
8070Sstevel@tonic-gate 	size_t i;
8080Sstevel@tonic-gate 
8090Sstevel@tonic-gate 	int state = BEFORE_NUMBER;
8100Sstevel@tonic-gate 
8110Sstevel@tonic-gate 	number = L->l_data.sp + data_offset;
8120Sstevel@tonic-gate 	length = data_length;
8130Sstevel@tonic-gate 
8140Sstevel@tonic-gate 	/*
8150Sstevel@tonic-gate 	 * Eat leading blanks, if any.
8160Sstevel@tonic-gate 	 */
8170Sstevel@tonic-gate 	for (i = 0; i < length; i++)
8180Sstevel@tonic-gate 		if (!IS_BLANK(number[i]))
8190Sstevel@tonic-gate 			break;
8200Sstevel@tonic-gate 
8210Sstevel@tonic-gate 	/*
8220Sstevel@tonic-gate 	 * Test that there is sufficient size in the collation buffer for our
8230Sstevel@tonic-gate 	 * number.  In addition to the possible remaining characters in the
8240Sstevel@tonic-gate 	 * field, we also require space for the sign (char), logarithm (int),
8250Sstevel@tonic-gate 	 * separator (char), and as many as two string terminators (for reverse
8260Sstevel@tonic-gate 	 * sorts).
8270Sstevel@tonic-gate 	 */
8280Sstevel@tonic-gate 	if (((length - i) + 4 * sizeof (char) + sizeof (int)) >
8290Sstevel@tonic-gate 	    (L->l_collate_bufsize - coll_offset))
8300Sstevel@tonic-gate 		return ((ssize_t)-1);
8310Sstevel@tonic-gate 
8320Sstevel@tonic-gate 	/*
8330Sstevel@tonic-gate 	 * If negative, set sign.
8340Sstevel@tonic-gate 	 */
8350Sstevel@tonic-gate 	if (number[i] == '-') {
8360Sstevel@tonic-gate 		i++;
8370Sstevel@tonic-gate 		sign = '0';
8380Sstevel@tonic-gate 	}
8390Sstevel@tonic-gate 
8400Sstevel@tonic-gate 	/*
8410Sstevel@tonic-gate 	 * Scan integer part; eat leading zeros.
8420Sstevel@tonic-gate 	 */
8430Sstevel@tonic-gate 	for (; i < length; i++) {
8440Sstevel@tonic-gate 		if (IS_SEPARATOR(number[i]))
8450Sstevel@tonic-gate 			continue;
8460Sstevel@tonic-gate 
8470Sstevel@tonic-gate 		if (number[i] == '0' && !(state & IN_NUMBER))
8480Sstevel@tonic-gate 			continue;
8490Sstevel@tonic-gate 
8500Sstevel@tonic-gate 		if (!isdigit((uchar_t)number[i]))
8510Sstevel@tonic-gate 			break;
8520Sstevel@tonic-gate 
8530Sstevel@tonic-gate 		state |= IN_NUMBER;
8540Sstevel@tonic-gate 		if (sign == '0')
8550Sstevel@tonic-gate 			digits[j++] = '0' + '9' - number[i];
8560Sstevel@tonic-gate 		else
8570Sstevel@tonic-gate 			digits[j++] = number[i];
8580Sstevel@tonic-gate 	}
8590Sstevel@tonic-gate 
8600Sstevel@tonic-gate 	if (i < length && IS_DECIMAL(number[i])) {
8610Sstevel@tonic-gate 		/*
8620Sstevel@tonic-gate 		 * Integer part terminated by decimal.
8630Sstevel@tonic-gate 		 */
8640Sstevel@tonic-gate 		digits[j] = DECIMAL_CHAR;
8650Sstevel@tonic-gate 		log_ten = j++;
8660Sstevel@tonic-gate 
8670Sstevel@tonic-gate 		/*
8680Sstevel@tonic-gate 		 * Scan fractional part.
8690Sstevel@tonic-gate 		 */
8700Sstevel@tonic-gate 		for (++i; i < length; i++) {
8710Sstevel@tonic-gate 			if (IS_SEPARATOR(number[i]))
8720Sstevel@tonic-gate 				continue;
8730Sstevel@tonic-gate 
8740Sstevel@tonic-gate 			if (!isdigit((uchar_t)number[i]))
8750Sstevel@tonic-gate 				break;
8760Sstevel@tonic-gate 
8770Sstevel@tonic-gate 			if (number[i] != '0')
8780Sstevel@tonic-gate 				state |= IN_NUMBER;
8790Sstevel@tonic-gate 
8800Sstevel@tonic-gate 			if (sign == '0')
8810Sstevel@tonic-gate 				digits[j++] = '0' + '9' - number[i];
8820Sstevel@tonic-gate 			else
8830Sstevel@tonic-gate 				digits[j++] = number[i];
8840Sstevel@tonic-gate 		}
8850Sstevel@tonic-gate 
8860Sstevel@tonic-gate 		if (sign == '0')
8870Sstevel@tonic-gate 			digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
8880Sstevel@tonic-gate 	} else {
8890Sstevel@tonic-gate 		/*
8900Sstevel@tonic-gate 		 * Nondigit or end of string seen.
8910Sstevel@tonic-gate 		 */
8920Sstevel@tonic-gate 		log_ten = (int)j;
8930Sstevel@tonic-gate 		if (sign == '0')
8940Sstevel@tonic-gate 			digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
8950Sstevel@tonic-gate 		else
8960Sstevel@tonic-gate 			digits[j] = INTERFIELD_SEPARATOR;
8970Sstevel@tonic-gate 	}
8980Sstevel@tonic-gate 
8990Sstevel@tonic-gate 	if ((state & IN_NUMBER) == 0) {
9000Sstevel@tonic-gate 		/*
9010Sstevel@tonic-gate 		 * A non-zero number was not detected; treat as defined zero.
9020Sstevel@tonic-gate 		 */
9030Sstevel@tonic-gate 		sign = '1';
9040Sstevel@tonic-gate 		log_ten = 0;
9050Sstevel@tonic-gate 		digits[0] = '0';
9060Sstevel@tonic-gate 		j = 1;
9070Sstevel@tonic-gate 	}
9080Sstevel@tonic-gate 
9090Sstevel@tonic-gate 	/*
9100Sstevel@tonic-gate 	 * We subtract a constant from the log of negative values so that
9110Sstevel@tonic-gate 	 * they will correctly precede positive values with a zero logarithm.
9120Sstevel@tonic-gate 	 */
9130Sstevel@tonic-gate 	if (sign == '0') {
9140Sstevel@tonic-gate 		if (j != 0)
9150Sstevel@tonic-gate 			log_ten = -log_ten - 2;
9160Sstevel@tonic-gate 		else
9170Sstevel@tonic-gate 			/*
9180Sstevel@tonic-gate 			 * Special case for -0.
9190Sstevel@tonic-gate 			 */
9200Sstevel@tonic-gate 			log_ten = -1;
9210Sstevel@tonic-gate 	}
9220Sstevel@tonic-gate 
9230Sstevel@tonic-gate 	buffer[0] = sign;
9240Sstevel@tonic-gate 
9250Sstevel@tonic-gate 	/*
9260Sstevel@tonic-gate 	 * Place logarithm in big-endian form.
9270Sstevel@tonic-gate 	 */
9280Sstevel@tonic-gate 	for (i = 0; i < sizeof (int); i++)
9290Sstevel@tonic-gate 		buffer[i + 1] = (log_ten << (i * NBBY))
9300Sstevel@tonic-gate 		    >> ((sizeof (int) - 1) * NBBY);
9310Sstevel@tonic-gate 
9320Sstevel@tonic-gate 	if (j + sizeof (char) + sizeof (int) <
9330Sstevel@tonic-gate 	    L->l_collate_bufsize - coll_offset)
9340Sstevel@tonic-gate 		return (j + 1 + sizeof (int));
9350Sstevel@tonic-gate 	else
9360Sstevel@tonic-gate 		return ((ssize_t)-1);
9370Sstevel@tonic-gate }
9380Sstevel@tonic-gate 
9390Sstevel@tonic-gate /*ARGSUSED*/
9400Sstevel@tonic-gate ssize_t
field_convert_numeric_wide(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)9410Sstevel@tonic-gate field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
9420Sstevel@tonic-gate     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
9430Sstevel@tonic-gate {
9440Sstevel@tonic-gate 	wchar_t *number;
9450Sstevel@tonic-gate 	wchar_t *buffer = L->l_collate.wp + coll_offset;
9460Sstevel@tonic-gate 	char *lbuffer;
9470Sstevel@tonic-gate 	ssize_t length;
9480Sstevel@tonic-gate 
9490Sstevel@tonic-gate 	wchar_t	sign = L'2';
9500Sstevel@tonic-gate 	int log_ten;
9510Sstevel@tonic-gate 	wchar_t	*digits = buffer + 1 + sizeof (int)/sizeof (wchar_t);
9520Sstevel@tonic-gate 	size_t j = 0;
9530Sstevel@tonic-gate 	size_t i;
9540Sstevel@tonic-gate 
9550Sstevel@tonic-gate 	int state = BEFORE_NUMBER;
9560Sstevel@tonic-gate 
9570Sstevel@tonic-gate 	number = L->l_data.wp + data_offset;
9580Sstevel@tonic-gate 	length = data_length;
9590Sstevel@tonic-gate 
9600Sstevel@tonic-gate 	for (i = 0; i < length; i++)
9610Sstevel@tonic-gate 		if (!W_IS_BLANK(number[i]))
9620Sstevel@tonic-gate 			break;
9630Sstevel@tonic-gate 
9640Sstevel@tonic-gate 	if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) +
9650Sstevel@tonic-gate 	    sizeof (int)) > (L->l_collate_bufsize - coll_offset))
9660Sstevel@tonic-gate 		return ((ssize_t)-1);
9670Sstevel@tonic-gate 
9680Sstevel@tonic-gate 	if (number[i] == L'-') {
9690Sstevel@tonic-gate 		i++;
9700Sstevel@tonic-gate 		sign = L'0';
9710Sstevel@tonic-gate 	}
9720Sstevel@tonic-gate 
9730Sstevel@tonic-gate 	for (; i < length; i++) {
9740Sstevel@tonic-gate 		if (W_IS_SEPARATOR(number[i]))
9750Sstevel@tonic-gate 			continue;
9760Sstevel@tonic-gate 
9770Sstevel@tonic-gate 		if (number[i] == L'0' && !(state & IN_NUMBER))
9780Sstevel@tonic-gate 			continue;
9790Sstevel@tonic-gate 
9800Sstevel@tonic-gate 		if (!iswdigit(number[i]))
9810Sstevel@tonic-gate 			break;
9820Sstevel@tonic-gate 
9830Sstevel@tonic-gate 		state |= IN_NUMBER;
9840Sstevel@tonic-gate 		if (sign == L'0')
9850Sstevel@tonic-gate 			digits[j++] = L'0' + L'9' - number[i];
9860Sstevel@tonic-gate 		else
9870Sstevel@tonic-gate 			digits[j++] = number[i];
9880Sstevel@tonic-gate 	}
9890Sstevel@tonic-gate 
9900Sstevel@tonic-gate 	if (i < length && W_IS_DECIMAL(number[i])) {
9910Sstevel@tonic-gate 		digits[j] = W_DECIMAL_CHAR;
9920Sstevel@tonic-gate 		log_ten = j++;
9930Sstevel@tonic-gate 
9940Sstevel@tonic-gate 		for (++i; i < length; i++) {
9950Sstevel@tonic-gate 			if (W_IS_SEPARATOR(number[i]))
9960Sstevel@tonic-gate 				continue;
9970Sstevel@tonic-gate 
9980Sstevel@tonic-gate 			if (!iswdigit(number[i]))
9990Sstevel@tonic-gate 				break;
10000Sstevel@tonic-gate 
10010Sstevel@tonic-gate 			if (number[i] != L'0')
10020Sstevel@tonic-gate 				state |= IN_NUMBER;
10030Sstevel@tonic-gate 
10040Sstevel@tonic-gate 			if (sign == L'0')
10050Sstevel@tonic-gate 				digits[j++] = L'0' + L'9' - number[i];
10060Sstevel@tonic-gate 			else
10070Sstevel@tonic-gate 				digits[j++] = number[i];
10080Sstevel@tonic-gate 		}
10090Sstevel@tonic-gate 
10100Sstevel@tonic-gate 		if (sign == L'0')
10110Sstevel@tonic-gate 			digits[j++] = (wchar_t)(WCHAR_MAX -
10120Sstevel@tonic-gate 			    W_INTERFIELD_SEPARATOR);
10130Sstevel@tonic-gate 	} else {
10140Sstevel@tonic-gate 		log_ten = (int)j;
10150Sstevel@tonic-gate 		if (sign == L'0')
10160Sstevel@tonic-gate 			digits[j++] = (wchar_t)(WCHAR_MAX -
10170Sstevel@tonic-gate 			    W_INTERFIELD_SEPARATOR);
10180Sstevel@tonic-gate 		else
10190Sstevel@tonic-gate 			digits[j] = W_INTERFIELD_SEPARATOR;
10200Sstevel@tonic-gate 	}
10210Sstevel@tonic-gate 
10220Sstevel@tonic-gate 	if ((state & IN_NUMBER) == 0) {
10230Sstevel@tonic-gate 		sign = L'1';
10240Sstevel@tonic-gate 		log_ten = 0;
10250Sstevel@tonic-gate 		digits[0] = L'0';
10260Sstevel@tonic-gate 		j = 1;
10270Sstevel@tonic-gate 	}
10280Sstevel@tonic-gate 
10290Sstevel@tonic-gate 	if (sign == L'0') {
10300Sstevel@tonic-gate 		if (j != 0)
10310Sstevel@tonic-gate 			log_ten = -log_ten - 2;
10320Sstevel@tonic-gate 		else
10330Sstevel@tonic-gate 			log_ten = -1;
10340Sstevel@tonic-gate 	}
10350Sstevel@tonic-gate 
10360Sstevel@tonic-gate 	buffer[0] = sign;
10370Sstevel@tonic-gate 	/*
10380Sstevel@tonic-gate 	 * Place logarithm in big-endian form.
10390Sstevel@tonic-gate 	 */
10400Sstevel@tonic-gate 	lbuffer = (char *)(buffer + 1);
10410Sstevel@tonic-gate 	for (i = 0; i < sizeof (int); i++)
10420Sstevel@tonic-gate 		lbuffer[i] = (log_ten << (i * NBBY))
10430Sstevel@tonic-gate 		    >> ((sizeof (int) - 1) * NBBY);
10440Sstevel@tonic-gate 
10450Sstevel@tonic-gate 	if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) <
10460Sstevel@tonic-gate 	    L->l_collate_bufsize - coll_offset * sizeof (wchar_t))
10470Sstevel@tonic-gate 		return (j + 1 + sizeof (int) / sizeof (wchar_t));
10480Sstevel@tonic-gate 	else
10490Sstevel@tonic-gate 		return ((ssize_t)-1);
10500Sstevel@tonic-gate }
10510Sstevel@tonic-gate 
10520Sstevel@tonic-gate /*
10530Sstevel@tonic-gate  * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour
10540Sstevel@tonic-gate  * when coll_offset exceeds l_collate_bufsize.
10550Sstevel@tonic-gate  */
10560Sstevel@tonic-gate ssize_t
field_convert(field_t * F,line_rec_t * L,int flags,vchar_t field_separator)10570Sstevel@tonic-gate field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator)
10580Sstevel@tonic-gate {
10590Sstevel@tonic-gate 	ssize_t coll_offset = 0;
10600Sstevel@tonic-gate 	ssize_t	start, end, distance;
10610Sstevel@tonic-gate 	field_t *cur_fieldp = F;
10620Sstevel@tonic-gate 
10630Sstevel@tonic-gate 	while (cur_fieldp != NULL) {
10640Sstevel@tonic-gate 		/*
10650Sstevel@tonic-gate 		 * delimit field
10660Sstevel@tonic-gate 		 */
10670Sstevel@tonic-gate 		if (!field_separator.sc)
10680Sstevel@tonic-gate 			field_delimit(cur_fieldp, L, &start, &end);
10690Sstevel@tonic-gate 		else
10700Sstevel@tonic-gate 			field_delimit_tabbed(cur_fieldp, L, &start, &end,
10710Sstevel@tonic-gate 			    field_separator);
10720Sstevel@tonic-gate 
10730Sstevel@tonic-gate 		distance = 0;
10740Sstevel@tonic-gate 		if (end - start > 0 ||
10750Sstevel@tonic-gate 		    (end - start == 0 && F->f_species == NUMERIC)) {
10760Sstevel@tonic-gate 			/*
10770Sstevel@tonic-gate 			 * Convert field, appending to collated field of line
10780Sstevel@tonic-gate 			 * record.
10790Sstevel@tonic-gate 			 */
10800Sstevel@tonic-gate 			distance = cur_fieldp->f_convert(cur_fieldp, L,
10810Sstevel@tonic-gate 			    field_separator, start, end - start, coll_offset);
10820Sstevel@tonic-gate 
10830Sstevel@tonic-gate 			/*
10840Sstevel@tonic-gate 			 * branch should execute comparatively rarely
10850Sstevel@tonic-gate 			 */
10860Sstevel@tonic-gate 			if (distance == -1) {
10870Sstevel@tonic-gate 				if (flags & FCV_REALLOC) {
10880Sstevel@tonic-gate 					ASSERT(L->l_collate_bufsize > 0);
10890Sstevel@tonic-gate 					L->l_collate_bufsize *= 2;
10900Sstevel@tonic-gate 					L->l_collate.sp =
10910Sstevel@tonic-gate 					    safe_realloc(L->l_collate.sp,
10920Sstevel@tonic-gate 					    L->l_collate_bufsize);
10930Sstevel@tonic-gate 
10940Sstevel@tonic-gate 					__S(stats_incr_convert_reallocs());
10950Sstevel@tonic-gate 					continue;
10960Sstevel@tonic-gate 				} else {
10970Sstevel@tonic-gate 					/*
10980Sstevel@tonic-gate 					 * FCV_FAIL has been set.
10990Sstevel@tonic-gate 					 */
11000Sstevel@tonic-gate 					return (-1);
11010Sstevel@tonic-gate 				}
11020Sstevel@tonic-gate 			}
11030Sstevel@tonic-gate 		}
11040Sstevel@tonic-gate 
11050Sstevel@tonic-gate 		if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
11060Sstevel@tonic-gate 			xstrninv(L->l_collate.sp, coll_offset, distance);
11070Sstevel@tonic-gate 			*(L->l_collate.sp + coll_offset + distance) =
11080Sstevel@tonic-gate 			    (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
11090Sstevel@tonic-gate 			distance++;
11100Sstevel@tonic-gate 		}
11110Sstevel@tonic-gate 
11120Sstevel@tonic-gate 		ASSERT(distance >= 0);
11130Sstevel@tonic-gate 		coll_offset += distance;
11140Sstevel@tonic-gate 		if (coll_offset >= L->l_collate_bufsize) {
11150Sstevel@tonic-gate 			if (flags & FCV_REALLOC) {
11160Sstevel@tonic-gate 				ASSERT(L->l_collate_bufsize > 0);
11170Sstevel@tonic-gate 				L->l_collate_bufsize *= 2;
11180Sstevel@tonic-gate 				L->l_collate.sp = safe_realloc(L->l_collate.sp,
11190Sstevel@tonic-gate 				    L->l_collate_bufsize);
11200Sstevel@tonic-gate 
11210Sstevel@tonic-gate 				__S(stats_incr_convert_reallocs());
11220Sstevel@tonic-gate 			} else {
11230Sstevel@tonic-gate 				return (-1);
11240Sstevel@tonic-gate 			}
11250Sstevel@tonic-gate 		}
11260Sstevel@tonic-gate 		*(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR;
11270Sstevel@tonic-gate 		coll_offset++;
11280Sstevel@tonic-gate 
11290Sstevel@tonic-gate 		cur_fieldp = cur_fieldp->f_next;
11300Sstevel@tonic-gate 	}
11310Sstevel@tonic-gate 
11320Sstevel@tonic-gate 	L->l_collate_length = coll_offset;
11330Sstevel@tonic-gate 
11340Sstevel@tonic-gate 	return (L->l_collate_length);
11350Sstevel@tonic-gate }
11360Sstevel@tonic-gate 
11370Sstevel@tonic-gate ssize_t
field_convert_wide(field_t * F,line_rec_t * L,int flags,vchar_t field_separator)11380Sstevel@tonic-gate field_convert_wide(field_t *F, line_rec_t *L, int flags,
11390Sstevel@tonic-gate     vchar_t field_separator)
11400Sstevel@tonic-gate {
11410Sstevel@tonic-gate 	ssize_t coll_offset = 0;
11420Sstevel@tonic-gate 	ssize_t	start, end, distance;
11430Sstevel@tonic-gate 	field_t *cur_fieldp = F;
11440Sstevel@tonic-gate 
11450Sstevel@tonic-gate 	while (cur_fieldp != NULL) {
11460Sstevel@tonic-gate 		if (!field_separator.wc)
11470Sstevel@tonic-gate 			field_delimit_wide(cur_fieldp, L, &start, &end);
11480Sstevel@tonic-gate 		else
11490Sstevel@tonic-gate 			field_delimit_tabbed_wide(cur_fieldp, L, &start, &end,
11500Sstevel@tonic-gate 			    field_separator);
11510Sstevel@tonic-gate 
11520Sstevel@tonic-gate 		distance = 0;
11530Sstevel@tonic-gate 		if (end - start > 0 ||
11540Sstevel@tonic-gate 		    end - start == 0 && F->f_species == NUMERIC) {
11550Sstevel@tonic-gate 			distance = cur_fieldp->f_convert(cur_fieldp, L,
11560Sstevel@tonic-gate 			    field_separator, start, end - start, coll_offset);
11570Sstevel@tonic-gate 
11580Sstevel@tonic-gate 			if (distance == -1) {
11590Sstevel@tonic-gate 				if (flags & FCV_REALLOC) {
11600Sstevel@tonic-gate 					ASSERT(L->l_collate_bufsize > 0);
11610Sstevel@tonic-gate 					L->l_collate_bufsize *= 2;
11620Sstevel@tonic-gate 					L->l_collate.wp = safe_realloc(
11630Sstevel@tonic-gate 					    L->l_collate.wp,
11640Sstevel@tonic-gate 					    L->l_collate_bufsize);
11650Sstevel@tonic-gate 
11660Sstevel@tonic-gate 					__S(stats_incr_convert_reallocs());
11670Sstevel@tonic-gate 					continue;
11680Sstevel@tonic-gate 				} else {
11690Sstevel@tonic-gate 					return (-1);
11700Sstevel@tonic-gate 				}
11710Sstevel@tonic-gate 			}
11720Sstevel@tonic-gate 		}
11730Sstevel@tonic-gate 
11740Sstevel@tonic-gate 		if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
11750Sstevel@tonic-gate 			xwcsninv(L->l_collate.wp, coll_offset, distance);
11760Sstevel@tonic-gate 			*(L->l_collate.wp + coll_offset + distance) =
11770Sstevel@tonic-gate 			    WCHAR_MAX - INTERFIELD_SEPARATOR;
11780Sstevel@tonic-gate 			distance++;
11790Sstevel@tonic-gate 		}
11800Sstevel@tonic-gate 
11810Sstevel@tonic-gate 		ASSERT(distance >= 0);
11820Sstevel@tonic-gate 		coll_offset += distance;
11830Sstevel@tonic-gate 		if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) {
11840Sstevel@tonic-gate 			if (flags & FCV_REALLOC) {
11850Sstevel@tonic-gate 				ASSERT(L->l_collate_bufsize > 0);
11860Sstevel@tonic-gate 				L->l_collate_bufsize *= 2;
11870Sstevel@tonic-gate 				L->l_collate.wp = safe_realloc(L->l_collate.wp,
11880Sstevel@tonic-gate 				    L->l_collate_bufsize);
11890Sstevel@tonic-gate 
11900Sstevel@tonic-gate 				__S(stats_incr_convert_reallocs());
11910Sstevel@tonic-gate 			} else {
11920Sstevel@tonic-gate 				return (-1);
11930Sstevel@tonic-gate 			}
11940Sstevel@tonic-gate 		}
11950Sstevel@tonic-gate 		*(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR;
11960Sstevel@tonic-gate 		coll_offset++;
11970Sstevel@tonic-gate 
11980Sstevel@tonic-gate 		cur_fieldp = cur_fieldp->f_next;
11990Sstevel@tonic-gate 	}
12000Sstevel@tonic-gate 
12010Sstevel@tonic-gate 	L->l_collate_length = coll_offset * sizeof (wchar_t);
12020Sstevel@tonic-gate #ifdef _LITTLE_ENDIAN
12030Sstevel@tonic-gate 	xwcsntomsb(L->l_collate.wp, coll_offset);
12040Sstevel@tonic-gate #endif /* _LITTLE_ENDIAN */
12050Sstevel@tonic-gate 
12060Sstevel@tonic-gate 	return (L->l_collate_length);
12070Sstevel@tonic-gate }
12080Sstevel@tonic-gate 
12090Sstevel@tonic-gate /*
12100Sstevel@tonic-gate  * line_convert() and line_convert_wide() are called when the collation vector
12110Sstevel@tonic-gate  * of a given line has been exhausted, and we are performing the final,
12120Sstevel@tonic-gate  * full-line comparison required by the sort specification.  Because we do not
12130Sstevel@tonic-gate  * have a guarantee that l_data is null-terminated, we create an explicitly
12140Sstevel@tonic-gate  * null-terminated copy suitable for transformation to a collatable form for the
12150Sstevel@tonic-gate  * current locale.
12160Sstevel@tonic-gate  */
12170Sstevel@tonic-gate static void
line_convert(line_rec_t * L)12180Sstevel@tonic-gate line_convert(line_rec_t *L)
12190Sstevel@tonic-gate {
12200Sstevel@tonic-gate 	static ssize_t bufsize;
12210Sstevel@tonic-gate 	static char *buffer;
12220Sstevel@tonic-gate 
12230Sstevel@tonic-gate 	if (L->l_raw_collate.sp != NULL)
12240Sstevel@tonic-gate 		return;
12250Sstevel@tonic-gate 
12260Sstevel@tonic-gate 	if (L->l_data_length + 1 > bufsize) {
12270Sstevel@tonic-gate 		buffer = safe_realloc(buffer, L->l_data_length + 1);
12280Sstevel@tonic-gate 		bufsize = L->l_data_length + 1;
12290Sstevel@tonic-gate 	}
12300Sstevel@tonic-gate 
12310Sstevel@tonic-gate 	(void) strncpy(buffer, L->l_data.sp, L->l_data_length);
12320Sstevel@tonic-gate 	buffer[L->l_data_length] = '\0';
12330Sstevel@tonic-gate 
12340Sstevel@tonic-gate 	L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp,
12350Sstevel@tonic-gate 	    xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
12360Sstevel@tonic-gate 	xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer,
12370Sstevel@tonic-gate 	    xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
12380Sstevel@tonic-gate 
12390Sstevel@tonic-gate 	__S(stats_incr_line_conversions());
12400Sstevel@tonic-gate }
12410Sstevel@tonic-gate 
12420Sstevel@tonic-gate static void
line_convert_wide(line_rec_t * L)12430Sstevel@tonic-gate line_convert_wide(line_rec_t *L)
12440Sstevel@tonic-gate {
12450Sstevel@tonic-gate 	static wchar_t *buffer;
12460Sstevel@tonic-gate 	static ssize_t bufsize;
12470Sstevel@tonic-gate 
12480Sstevel@tonic-gate 	ssize_t dlength;
12490Sstevel@tonic-gate 
12500Sstevel@tonic-gate 	if (L->l_raw_collate.wp != NULL)
12510Sstevel@tonic-gate 		return;
12520Sstevel@tonic-gate 
12530Sstevel@tonic-gate 	if (L->l_data_length + 1 > bufsize) {
12540Sstevel@tonic-gate 		buffer = safe_realloc(buffer, (L->l_data_length + 1) *
12550Sstevel@tonic-gate 		    sizeof (wchar_t));
12560Sstevel@tonic-gate 		bufsize = L->l_data_length + 1;
12570Sstevel@tonic-gate 	}
12580Sstevel@tonic-gate 
12590Sstevel@tonic-gate 	(void) wcsncpy(buffer, L->l_data.wp, L->l_data_length);
12600Sstevel@tonic-gate 	buffer[L->l_data_length] = L'\0';
12610Sstevel@tonic-gate 
12620Sstevel@tonic-gate 	dlength = wcsxfrm(NULL, buffer, 0) + 1;
12630Sstevel@tonic-gate 	L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp, dlength *
12640Sstevel@tonic-gate 	    sizeof (wchar_t));
12650Sstevel@tonic-gate 	(void) wcsxfrm(L->l_raw_collate.wp, buffer, dlength);
12660Sstevel@tonic-gate 
12670Sstevel@tonic-gate 	__S(stats_incr_line_conversions());
12680Sstevel@tonic-gate }
12690Sstevel@tonic-gate 
/*
 * Our convention for collation is
 *
 *	A > B  => r > 0,
 *	A == B => r = 0,
 *	A < B  => r < 0
 *
 * This convention is consistent with the definition of memcmp(), strcmp(), and
 * strncmp() in the C locale.  collated() and collated_wide() have three
 * optional behaviours, which can be activated by setting the appropriate values
 * in coll_flag:  COLL_UNIQUE, which returns 0 if the l_collate fields of the
 * line records being compared are identical; COLL_DATA_ONLY, which ignores the
 * l_collate field for the current comparison; and COLL_REVERSE, which flips the
 * result for comparisons that fall through to an actual data comparison (since
 * the collated vector should already reflect reverse ordering from field
 * conversion).
 */
12870Sstevel@tonic-gate int
/* compares the byte collation vectors, then falls back to the converted data */
collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
{
	int flip;
	int cmp;
	ssize_t raw_a, raw_b;

	/* Only results of the raw data comparison below are sign-adjusted. */
	if (coll_flag & COLL_REVERSE)
		flip = INT_SIGN_FLIP_MASK;
	else
		flip = INT_SIGN_PASS_MASK;

	if (!(coll_flag & COLL_DATA_ONLY)) {
		/*
		 * Compare the collation vectors from offset depth onward, over
		 * the span common to both records.
		 */
		ssize_t common = MIN(A->l_collate_length,
		    B->l_collate_length) - depth;

		if (common > 0) {
			cmp = memcmp(A->l_collate.sp + depth,
			    B->l_collate.sp + depth, common);

			if (cmp != 0)
				return (cmp);
		}

		/* Equal over the common span:  the shorter vector is first. */
		if (A->l_collate_length != B->l_collate_length) {
			return (A->l_collate_length < B->l_collate_length ?
			    -1 : 1);
		}
	}

	/*
	 * This is where we cut out, if we know that the current sort is over
	 * the entire line.
	 */
	if (coll_flag & COLL_UNIQUE)
		return (0);

	/* Fall back to comparing the converted form of the whole line. */
	line_convert(A);
	line_convert(B);

	raw_a = strlen(A->l_raw_collate.sp);
	raw_b = strlen(B->l_raw_collate.sp);

	cmp = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp,
	    MIN(raw_a, raw_b));

	if (cmp == 0) {
		if (raw_a == raw_b)
			return (0);

		/* One string is a prefix of the other:  shorter is first. */
		cmp = (raw_a < raw_b) ? -1 : 1;
	}

	return (cmp ^ flip);
}
13380Sstevel@tonic-gate 
13390Sstevel@tonic-gate int
collated_wide(line_rec_t * A,line_rec_t * B,ssize_t depth,flag_t coll_flag)13400Sstevel@tonic-gate collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
13410Sstevel@tonic-gate {
13420Sstevel@tonic-gate 	ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
13430Sstevel@tonic-gate 	int r;
13440Sstevel@tonic-gate 	int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
13450Sstevel@tonic-gate 	    INT_SIGN_PASS_MASK;
13460Sstevel@tonic-gate 	ssize_t la, lb;
13470Sstevel@tonic-gate 
13480Sstevel@tonic-gate 	if (!(coll_flag & COLL_DATA_ONLY)) {
13490Sstevel@tonic-gate 		if (ml > 0) {
13500Sstevel@tonic-gate 			r = memcmp(A->l_collate.sp + depth,
13510Sstevel@tonic-gate 			    B->l_collate.sp + depth, ml);
13520Sstevel@tonic-gate 
13530Sstevel@tonic-gate 			if (r)
13540Sstevel@tonic-gate 				return (r);
13550Sstevel@tonic-gate 		}
13560Sstevel@tonic-gate 		if (A->l_collate_length < B->l_collate_length)
13570Sstevel@tonic-gate 			return (-1);
13580Sstevel@tonic-gate 
13590Sstevel@tonic-gate 		if (A->l_collate_length > B->l_collate_length)
13600Sstevel@tonic-gate 			return (1);
13610Sstevel@tonic-gate 	}
13620Sstevel@tonic-gate 
13630Sstevel@tonic-gate 	if (coll_flag & COLL_UNIQUE)
13640Sstevel@tonic-gate 		return (0);
13650Sstevel@tonic-gate 
13660Sstevel@tonic-gate 	line_convert_wide(A);
13670Sstevel@tonic-gate 	line_convert_wide(B);
13680Sstevel@tonic-gate 
13690Sstevel@tonic-gate 	la = wcslen(A->l_raw_collate.wp);
13700Sstevel@tonic-gate 	lb = wcslen(B->l_raw_collate.wp);
13710Sstevel@tonic-gate 
13720Sstevel@tonic-gate 	r = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp,
13730Sstevel@tonic-gate 	    (size_t)MIN(la, lb));
13740Sstevel@tonic-gate 
13750Sstevel@tonic-gate 	if (r)
13760Sstevel@tonic-gate 		return (r ^ mask);
13770Sstevel@tonic-gate 
13780Sstevel@tonic-gate 	if (la < lb)
13790Sstevel@tonic-gate 		return (-1 ^ mask);
13800Sstevel@tonic-gate 
13810Sstevel@tonic-gate 	if (la > lb)
13820Sstevel@tonic-gate 		return (1 ^ mask);
13830Sstevel@tonic-gate 
13840Sstevel@tonic-gate 	return (0);
13850Sstevel@tonic-gate }
1386