xref: /onnv-gate/usr/src/cmd/sort/common/fields.c (revision 4623:1da836e15491)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include "fields.h"
29 
30 /*
31  * fields
32  *
33  * Overview
34  *   By a field, we mean the various delimited character sequences within each
35  *   line of the input files.  The sort key consists of an ordered sequence of
36  *   fields, which need not include all possible fields for the given line.
37  *   (Furthermore, not every line need contain sufficient fields for the fields
38  *   given within the sort key.  In fact, none of the lines in the input stream
39  *   need contain sufficient fields.)
40  *
41  *   There are two methods for specifying fields for sort(1); these are
42  *   discussed in options.c.  Here we discuss only the internal representation
43  *   of fields, as used for constructing the collation vector for each line as
44  *   defined by the sort key.
45  *
46  * Representation
47  *   The sort key is a singly-linked list of field specifiers.  At present,
48  *   fields may belong to one of three species:  alphabetical, numerical, or
49  *   monthly; the species (f_species) then indicates the conversion function
50  *   (f_convert) used to transform the raw characters of the character sequence
51  *   to a collatable form.  (In principle, this allows us to consider future
52  *   field species such as hexadecimal.)
53  *
54  *   Fields and offsets are numbered such that zero refers to the first field or
55  *   character, respectively.  Thus, the interpretation of a key specifier, m.n,
56  *   is that the field begins at the nth character beyond the mth occurence of
57  *   the key separator.  If the blanks flag has been specified, then the field
58  *   begins at the nth non-blank character past the mth key separator.  If the
59  *   key separator is unspecified, then the key separator is defined as one or
60  *   more blank characters.
61  *
62  *   In general, the various options afforded by sort may be broken into two
63  *   categories:  field species and field modifiers.  For each field species,
64  *   there is one or more conversion routines that take a delimited character
65  *   sequence and convert it to a character sequence collatable by strcmp() or
66  *   memcmp().  For field species that may be further modified, such as the
67  *   fold-to-uppercase option for alphabetic fields, the conversion routine may
68  *   be aware of how the modifier affects collation.  Finally, the no-modifiers
69  *   case may present an opportunity for a simplified, faster version.
70  *
71  * Code Structure
72  *   The code paths for single-byte and multi-byte locales diverge significantly
73  *   in fields.c.  Most routines have an *_wide() version, which produces an
74  *   equivalent effect for line records whose data field is composed of wide
75  *   characters (wchar_t).  However, the l_collated field of a line record is
76  *   always composed of characters, so that the radix sorts provided in
77  *   internal.c can work in both single- and multi-byte locales.  Thus, in the
78  *   various convert_*_wide() routines, the output is placed in l_collated, with
79  *   a length multiplier of 4.
80  */
81 
82 #define	BEFORE_NUMBER	0x0
83 #define	IN_NUMBER	0x1
84 
85 static char	numerical_separator;
86 static char	numerical_decimal;
87 static char	monetary_separator;
88 static char	monetary_decimal;
89 
90 static wchar_t	w_numerical_separator;
91 static wchar_t	w_numerical_decimal;
92 static wchar_t	w_monetary_separator;
93 static wchar_t	w_monetary_decimal;
94 
95 #define	MONTHS_IN_YEAR	12
96 #define	MAX_MON_LEN	20
97 
98 enum { MO_NONE = 1, MO_OFFSET = 2 };
99 
100 static char	*months[MONTHS_IN_YEAR];
101 static size_t	month_lengths[MONTHS_IN_YEAR];
102 static wchar_t	*w_months[MONTHS_IN_YEAR];
103 static size_t	w_month_lengths[MONTHS_IN_YEAR];
104 
105 #define	DECIMAL_CHAR		(numerical_decimal)
106 #define	IS_BLANK(x)		(isspace((uchar_t)(x)) && (x) != '\n')
107 #define	IS_SEPARATOR(x)		\
108 	((numerical_separator != '\0' && (x) == numerical_separator) || \
109 	(monetary_separator != '\0' && (x) == monetary_separator))
110 #define	IS_DECIMAL(x)		\
111 	((x) == numerical_decimal || \
112 	(monetary_decimal != '\0' && (x) == monetary_decimal))
113 #define	W_DECIMAL_CHAR		(w_numerical_decimal)
114 #define	W_IS_BLANK(x)		(iswspace(x) && (x) != L'\n')
115 #define	W_IS_SEPARATOR(x)	\
116 	((numerical_separator != '\0' && (x) == w_numerical_separator) || \
117 	(monetary_separator != '\0' && (x) == w_monetary_separator))
118 #define	W_IS_DECIMAL(x)		\
119 	(((x) == w_numerical_decimal) || \
120 	(monetary_decimal != '\0' && (x) == w_monetary_decimal))
121 
122 #define	INTERFIELD_SEPARATOR '\0'
123 #define	W_INTERFIELD_SEPARATOR L'\0'
124 
125 #define	INT_SIGN_FLIP_MASK 0x80000000
126 #define	INT_SIGN_PASS_MASK 0x00000000
127 
128 /*
129  * strx_ops_t, xfrm_len, and xfrm_cpy:  In the case where we are sorting in the
130  * C locale, we want to avoid the expense of transforming strings to collatable
131  * forms since, by definition, an arbitrary string in the C locale is already in
132  * its collatable form.  Therefore, we construct a small ops vector (the
133  * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into
134  * strlen()-like behaviour, and xfrm_cpy() to make strncpy() appear
135  * strxfrm()-like.
136  */
137 /*ARGSUSED*/
138 static size_t
139 xfrm_len(const char *s2, size_t len)
140 {
141 	return (strxfrm(NULL, s2, 0) + 1);
142 }
143 
144 /*
145  * The length represented by n includes a null character, so to return the
146  * correct length we subtract 1.  Note that this function is only used by
147  * field_convert_alpha, and isn't for general use, as it assumes that n is the
148  * length of s2 plus a null character.
149  */
150 static size_t
151 C_ncpy(char *s1, const char *s2, size_t n)
152 {
153 	(void) strncpy(s1, s2, n);
154 	return (n - 1);
155 }
156 
157 /*ARGSUSED*/
158 static size_t
159 C_len(const char *s, size_t len)
160 {
161 	ASSERT(s != NULL);
162 	return (len);
163 }
164 
165 typedef struct _strx_ops {
166 	size_t	(*sx_len)(const char *, size_t);
167 	size_t	(*sx_xfrm)(char *, const char *, size_t);
168 } strx_ops_t;
169 
170 static const strx_ops_t C_ops = { C_len, C_ncpy };
171 static const strx_ops_t SB_ops = { xfrm_len, strxfrm };
172 
173 static const strx_ops_t *xfrm_ops;
174 
175 static void
176 field_initialize_separator(void)
177 {
178 	/*
179 	 * A locale need not define all of the cases below:  only decimal_point
180 	 * must be defined.  Furthermore, sort(1) has traditionally not used the
181 	 * positive_sign and negative_sign, grouping, or currency_symbols (or
182 	 * their numeric counterparts, if any).
183 	 */
184 	struct lconv *conv = localeconv();
185 
186 	if (!xstreql(conv->thousands_sep, "")) {
187 		numerical_separator = *conv->thousands_sep;
188 		(void) mbtowc(&w_numerical_separator, conv->thousands_sep,
189 		    MB_CUR_MAX);
190 	} else
191 		numerical_separator = '\0';
192 
193 	if (!xstreql(conv->mon_thousands_sep, "")) {
194 		monetary_separator = *conv->mon_thousands_sep;
195 		(void) mbtowc(&w_monetary_separator, conv->mon_thousands_sep,
196 		    MB_CUR_MAX);
197 	} else
198 		monetary_separator = '\0';
199 
200 	if (!xstreql(conv->mon_decimal_point, "")) {
201 		monetary_decimal = *conv->mon_decimal_point;
202 		(void) mbtowc(&w_monetary_decimal, conv->mon_decimal_point,
203 		    MB_CUR_MAX);
204 	} else
205 		monetary_decimal = '\0';
206 
207 	numerical_decimal = *conv->decimal_point;
208 	(void) mbtowc(&w_numerical_decimal, conv->decimal_point, MB_CUR_MAX);
209 }
210 
211 static void
212 field_initialize_month(int is_c_locale)
213 {
214 	int i;
215 	int j;
216 	struct tm this_month;
217 	const char *c_months[MONTHS_IN_YEAR] = {
218 		"JAN", "FEB", "MAR", "APR", "MAY", "JUN",
219 		"JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
220 	};
221 
222 	char month_name[MAX_MON_LEN * MB_LEN_MAX];
223 	wchar_t	w_month_name[MAX_MON_LEN];
224 
225 	if (is_c_locale) {
226 		for (i = 0; i < MONTHS_IN_YEAR; i++) {
227 			months[i] = (char *)c_months[i];
228 			month_lengths[i] = strlen(c_months[i]);
229 		}
230 		/*
231 		 * We don't need to initialize the wide version of the month
232 		 * names.
233 		 */
234 		return;
235 	}
236 
237 	(void) memset(&this_month, 0, sizeof (this_month));
238 
239 	for (i = 0; i < MONTHS_IN_YEAR; i++) {
240 		this_month.tm_mon = i;
241 
242 		(void) strftime(month_name, sizeof (month_name),
243 		    "%b", &this_month);
244 
245 		for (j = 0; j < strlen(month_name); j++)
246 			month_name[j] = toupper(month_name[j]);
247 		(void) mbstowcs(w_month_name, month_name, MAX_MON_LEN);
248 
249 		months[i] = strdup(month_name);
250 		month_lengths[i] = strlen(month_name);
251 		w_months[i] = wsdup(w_month_name);
252 		w_month_lengths[i] = wslen(w_month_name);
253 	}
254 }
255 
256 void
257 field_initialize(sort_t *S)
258 {
259 	field_initialize_month(S->m_c_locale);
260 	field_initialize_separator();
261 
262 	if (S->m_c_locale)
263 		xfrm_ops = &C_ops;
264 	else
265 		xfrm_ops = &SB_ops;
266 }
267 
268 field_t *
269 field_new(sort_t *S)
270 {
271 	field_t	*F = safe_realloc(NULL, sizeof (field_t));
272 
273 	F->f_start_field = -1;
274 	F->f_start_offset = -1;
275 	F->f_end_field = -1;
276 	F->f_end_offset = -1;
277 	F->f_next = NULL;
278 
279 	if (S == NULL) {
280 		F->f_species = ALPHA;
281 		F->f_options = 0;
282 	} else {
283 		F->f_species = S->m_default_species;
284 		F->f_options = S->m_field_options;
285 	}
286 
287 	return (F);
288 }
289 
290 void
291 field_delete(field_t *F)
292 {
293 	free(F);
294 }
295 
296 /*
297  * The recursive implementation of field_add_to_chain() given below is
298  * inappropriate if function calls are expensive, or a truly large number of
299  * fields are anticipated.
300  */
301 void
302 field_add_to_chain(field_t **F, field_t *A)
303 {
304 	if (*F == NULL)
305 		*F = A;
306 	else
307 		field_add_to_chain(&((*F)->f_next), A);
308 }
309 
310 #ifdef DEBUG
311 #ifndef _LP64
312 #define	FIELD_FMT \
313 "\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n"
314 #else /* !_LP64 */
315 #define	FIELD_FMT \
316 "\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n"
317 #endif /* !_LP64 */
318 
319 /*
320  * field_print is used only for debugging purposes.
321  */
322 void
323 field_print(field_t *F)
324 {
325 	char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"};
326 	int status = 0;
327 
328 	(void) fprintf(stderr, "Type: %s", field_names[F->f_species]);
329 	(void) fprintf(stderr, "\tOptions: ");
330 
331 	if (F->f_options & FIELD_REVERSE_COMPARISONS) {
332 		(void) fprintf(stderr, "REVERSE");
333 		status++;
334 	}
335 	if (F->f_options & FIELD_DICTIONARY_ORDER) {
336 		(void) fprintf(stderr, "DICTIONARY ");
337 		status++;
338 	}
339 	if (F->f_options & FIELD_FOLD_UPPERCASE) {
340 		(void) fprintf(stderr, "UPPERCASE ");
341 		status++;
342 	}
343 	if (F->f_options & FIELD_IGNORE_NONPRINTABLES) {
344 		(void) fprintf(stderr, "PRINTABLES ");
345 		status++;
346 	}
347 	if (F->f_options & FIELD_IGNORE_BLANKS_START) {
348 		(void) fprintf(stderr, "BLANKS_START ");
349 		status++;
350 	}
351 	if (F->f_options & FIELD_IGNORE_BLANKS_END) {
352 		(void) fprintf(stderr, "BLANKS_END ");
353 		status++;
354 	}
355 
356 	if (status == 0)
357 		(void) fprintf(stderr, "NO_MODIFIERS");
358 
359 	(void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset,
360 	    F->f_end_field, F->f_end_offset);
361 }
362 #endif /* DEBUG */
363 
364 static ssize_t
365 field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks)
366 {
367 	char *S = L->l_data.sp;
368 	char *T = S;
369 	char *eol = S + L->l_data_length;
370 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
371 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
372 	ssize_t ret;
373 
374 	ASSERT(is_end || field > -1);
375 
376 	if (is_end && field == -1)
377 		return (L->l_data_length);
378 
379 	while (field-- > 0) {
380 		while (T < eol && IS_BLANK(*T))
381 			T++;
382 
383 		while (T < eol && !IS_BLANK(*T))
384 			T++;
385 	}
386 
387 	if ((!is_end || offset > 0) && is_blanks) {
388 		while (IS_BLANK(*T))
389 			T++;
390 	}
391 
392 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
393 		return (L->l_data_length);
394 
395 	return (ret);
396 }
397 
398 static void
399 field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
400 {
401 	ASSERT(F->f_start_field > -1);
402 
403 	*start = field_boundary(F, L, 0,
404 	    F->f_options & FIELD_IGNORE_BLANKS_START);
405 	*end = field_boundary(F, L, 1,
406 	    F->f_options & FIELD_IGNORE_BLANKS_END);
407 }
408 
409 static ssize_t
410 field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks)
411 {
412 	wchar_t *S = L->l_data.wp;
413 	wchar_t *T = S;
414 	wchar_t *eol = S + L->l_data_length;
415 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
416 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
417 	ssize_t ret;
418 
419 	ASSERT(is_end || field > -1);
420 
421 	if (is_end && field == -1)
422 		return (L->l_data_length);
423 
424 	while (field-- > 0) {
425 		while (T < eol && W_IS_BLANK(*T))
426 			T++;
427 
428 		while (T < eol && !W_IS_BLANK(*T))
429 			T++;
430 	}
431 
432 	if ((!is_end || offset > 0) && is_blanks) {
433 		while (W_IS_BLANK(*T))
434 			T++;
435 	}
436 
437 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
438 		return (L->l_data_length);
439 
440 	return (ret);
441 }
442 
443 static void
444 field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
445 {
446 	ASSERT(F->f_start_field > -1);
447 
448 	*start = field_boundary_wide(F, L, 0,
449 	    F->f_options & FIELD_IGNORE_BLANKS_START);
450 	*end = field_boundary_wide(F, L, 1,
451 	    F->f_options & FIELD_IGNORE_BLANKS_END);
452 }
453 
454 static ssize_t
455 field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks,
456     vchar_t delimiter)
457 {
458 	char *S = L->l_data.sp;
459 	char *T = S;
460 	char *eol = S + L->l_data_length;
461 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
462 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
463 	ssize_t ret;
464 
465 	ASSERT(is_end || field > -1);
466 
467 	if (is_end && field == -1)
468 		return (L->l_data_length);
469 
470 	while (field-- > 0) {
471 		T = xstrnchr(T, delimiter.sc, eol - T);
472 		if (T == NULL || T > eol)
473 			return (L->l_data_length);
474 
475 		T++;
476 	}
477 
478 	if ((!is_end || offset != 0) && is_blanks) {
479 		while (IS_BLANK(*T))
480 			T++;
481 	}
482 
483 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
484 		if (S[L->l_data_length - 1] == delimiter.sc) {
485 			return (L->l_data_length - 1);
486 		} else {
487 			return (L->l_data_length);
488 		}
489 	}
490 
491 	if (is_end && offset == 0)
492 		ret--;
493 
494 	return (ret);
495 }
496 
497 /*
498  * field_delimit_tabbed() is called when a field separator has been defined
499  * using the -t option.  The character at the offset, start, is either one or
500  * more character positions past the delimiter marking the start of the
501  * field, or at the end of the line.
502  */
503 static void
504 field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end,
505     vchar_t delimiter)
506 {
507 	ASSERT(F->f_start_field > -1);
508 
509 	*start = field_boundary_tabbed(F, L, 0, F->f_options &
510 	    FIELD_IGNORE_BLANKS_START, delimiter);
511 	*end = field_boundary_tabbed(F, L, 1, F->f_options &
512 	    FIELD_IGNORE_BLANKS_END, delimiter);
513 }
514 
515 static ssize_t
516 field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks,
517     vchar_t delimiter)
518 {
519 	wchar_t *S = L->l_data.wp;
520 	wchar_t *T = S;
521 	wchar_t *eol = S + L->l_data_length;
522 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
523 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
524 	ssize_t ret;
525 
526 	ASSERT(is_end || field > -1);
527 
528 	if (is_end && field == -1)
529 		return (L->l_data_length);
530 
531 	while (field-- > 0) {
532 		T = xwsnchr(T, delimiter.wc, eol - T);
533 		if (T == NULL || T > eol)
534 			return (L->l_data_length);
535 
536 		T++;
537 	}
538 
539 	if ((!is_end || offset != 0) && is_blanks) {
540 		while (W_IS_BLANK(*T))
541 			T++;
542 	}
543 
544 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
545 		if (S[L->l_data_length - 1] == delimiter.wc) {
546 			return (L->l_data_length - 1);
547 		} else {
548 			return (L->l_data_length);
549 		}
550 	}
551 
552 	if (is_end && offset == 0)
553 		ret--;
554 
555 	return (ret);
556 }
557 
558 static void
559 field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start,
560     ssize_t *end, vchar_t delimiter)
561 {
562 	ASSERT(F->f_start_field > -1);
563 
564 	*start = field_boundary_tabbed_wide(F, L, 0, F->f_options &
565 	    FIELD_IGNORE_BLANKS_START, delimiter);
566 	*end = field_boundary_tabbed_wide(F, L, 1, F->f_options &
567 	    FIELD_IGNORE_BLANKS_END, delimiter);
568 }
569 
570 /*ARGSUSED*/
571 ssize_t
572 field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter,
573     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
574 {
575 	int j;
576 	ssize_t	val;
577 	char month_candidate[MAX_MON_LEN * MB_LEN_MAX];
578 	ssize_t month_length = data_length;
579 	ssize_t month_offset = data_offset;
580 
581 	if (sizeof (char) > L->l_collate_bufsize - coll_offset)
582 		return (-1);
583 
584 	(void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX);
585 
586 
587 	/*
588 	 * The month field formally begins with the first non-blank character.
589 	 */
590 	while (IS_BLANK(*(L->l_data.sp + month_offset))) {
591 		month_offset++;
592 		month_length--;
593 	}
594 
595 	for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
596 		month_candidate[j] = toupper((L->l_data.sp + month_offset)[j]);
597 
598 	for (j = 0; j < MONTHS_IN_YEAR; j++) {
599 		if (xstrneql(month_candidate, months[j], month_lengths[j])) {
600 			*(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET;
601 			return (1);
602 		}
603 	}
604 
605 	/*
606 	 * no matching month; copy string into field.  required behaviour is
607 	 * that "month-free" keys sort before month-sortable keys, so insert
608 	 * a "will sort first" token.
609 	 */
610 	*(L->l_collate.sp + coll_offset) = '\0' + MO_NONE;
611 
612 	val = field_convert_alpha_simple(F, L, delimiter, data_offset,
613 	    data_length, coll_offset + 1);
614 
615 	if (val < 0)
616 		return (-1);
617 	else
618 		return (val + 1);
619 }
620 
621 /*ARGSUSED*/
622 ssize_t
623 field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
624     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
625 {
626 	ssize_t j;
627 	ssize_t val;
628 	wchar_t month_candidate[MAX_MON_LEN];
629 	wchar_t *month;
630 	wchar_t *buffer = L->l_collate.wp + coll_offset;
631 	ssize_t month_length = data_length;
632 	ssize_t month_offset = data_offset;
633 
634 	if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) <
635 	    sizeof (wchar_t))
636 		return (-1);
637 
638 	(void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t));
639 
640 
641 	while (W_IS_BLANK(*(L->l_data.wp + month_offset))) {
642 		month_offset++;
643 		month_length--;
644 	}
645 
646 	month = L->l_data.wp + month_offset;
647 
648 	for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
649 		month_candidate[j] = towupper(month[j]);
650 
651 	for (j = 0; j < MONTHS_IN_YEAR; j++)
652 		if (xwcsneql(month_candidate, w_months[j],
653 		    w_month_lengths[j])) {
654 			*buffer = L'\0' + j + MO_OFFSET;
655 			return (1);
656 		}
657 
658 	*buffer = L'\0' + MO_NONE;
659 
660 	val = field_convert_alpha_wide(F, L, delimiter, data_offset,
661 	    data_length, coll_offset + sizeof (wchar_t));
662 
663 	if (val < 0)
664 		return (-1);
665 	else
666 		return (val + 1);
667 }
668 
669 /*
670  * field_convert_alpha() always fails with return value -1 if the converted
671  * string would cause l_collate_length to exceed l_collate_bufsize
672  */
673 /*ARGSUSED*/
674 ssize_t
675 field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter,
676     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
677 {
678 	static char *compose;
679 	static ssize_t compose_length;
680 
681 	ssize_t	clength = 0;
682 	ssize_t	dlength;
683 	ssize_t	i;
684 
685 	if (compose_length < (data_length + 1)) {
686 		compose_length = data_length + 1;
687 		compose = safe_realloc(compose, compose_length * sizeof (char));
688 	}
689 
690 	for (i = data_offset; i < data_offset + data_length; i++) {
691 		char t = (L->l_data.sp)[i];
692 
693 		if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) &&
694 		    !isprint((uchar_t)t))
695 			continue;
696 
697 		if ((F->f_options & FIELD_DICTIONARY_ORDER) &&
698 		    !isalnum((uchar_t)t) && !isspace((uchar_t)t))
699 			continue;
700 
701 		if (F->f_options & FIELD_FOLD_UPPERCASE)
702 			t = toupper(t);
703 
704 		compose[clength++] = t;
705 	}
706 	compose[clength] = '\0';
707 
708 	if ((dlength = xfrm_ops->sx_len(compose, clength)) <
709 	    L->l_collate_bufsize - coll_offset)
710 		return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
711 		    compose, dlength + 1));
712 	else
713 		return ((ssize_t)-1);
714 }
715 
716 /*ARGSUSED*/
717 ssize_t
718 field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter,
719     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
720 {
721 	static char *compose;
722 	static ssize_t compose_length;
723 
724 	ssize_t	clength;
725 	ssize_t	dlength;
726 
727 	if (compose_length < (data_length + 1)) {
728 		compose_length = data_length + 1;
729 		compose = safe_realloc(compose, compose_length * sizeof (char));
730 	}
731 
732 	(void) memcpy(compose, L->l_data.sp + data_offset, data_length);
733 	clength = data_length;
734 	compose[clength] = '\0';
735 
736 	if ((dlength = xfrm_ops->sx_len(compose, clength)) <
737 	    L->l_collate_bufsize - coll_offset)
738 		return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
739 		    compose, dlength + 1));
740 	else
741 		return ((ssize_t)-1);
742 }
743 
744 /*ARGSUSED*/
745 ssize_t
746 field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
747     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
748 {
749 	wchar_t	*compose = safe_realloc(NULL, (data_length + 1) *
750 	    sizeof (wchar_t));
751 	ssize_t	clength = 0;
752 	ssize_t	dlength;
753 	ssize_t	i;
754 	ssize_t ret;
755 
756 	for (i = data_offset; i < data_offset + data_length; i++) {
757 		wchar_t	t = (L->l_data.wp)[i];
758 
759 		if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && !iswprint(t))
760 			continue;
761 
762 		if ((F->f_options & FIELD_DICTIONARY_ORDER) && !iswalnum(t) &&
763 		    !iswspace(t))
764 			continue;
765 
766 		if (F->f_options & FIELD_FOLD_UPPERCASE)
767 			t = towupper(t);
768 
769 		compose[clength++] = t;
770 	}
771 	compose[clength] = L'\0';
772 
773 	dlength = wcsxfrm(NULL, compose, (size_t)0);
774 	if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize -
775 	    coll_offset * sizeof (wchar_t)) {
776 		ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose,
777 		    (size_t)dlength + 1);
778 	} else {
779 		ret = (ssize_t)-1;
780 	}
781 
782 	safe_free(compose);
783 
784 	return (ret);
785 }
786 
787 /*
788  * field_convert_numeric() converts the given field into a collatable numerical
789  * sequence.  The sequence is ordered as { log, integer, separator, fraction },
790  * with an optional sentinel component at the sequence end.
791  */
792 /*ARGSUSED*/
793 ssize_t
794 field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter,
795     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
796 {
797 	char *number;
798 	char *buffer = L->l_collate.sp + coll_offset;
799 	ssize_t length;
800 
801 	char sign = '2';
802 	int log_ten;
803 	char *digits = buffer + 1 + sizeof (int) / sizeof (char);
804 	size_t j = 0;
805 	size_t i;
806 
807 	int state = BEFORE_NUMBER;
808 
809 	number = L->l_data.sp + data_offset;
810 	length = data_length;
811 
812 	/*
813 	 * Eat leading blanks, if any.
814 	 */
815 	for (i = 0; i < length; i++)
816 		if (!IS_BLANK(number[i]))
817 			break;
818 
819 	/*
820 	 * Test that there is sufficient size in the collation buffer for our
821 	 * number.  In addition to the possible remaining characters in the
822 	 * field, we also require space for the sign (char), logarithm (int),
823 	 * separator (char), and as many as two string terminators (for reverse
824 	 * sorts).
825 	 */
826 	if (((length - i) + 4 * sizeof (char) + sizeof (int)) >
827 	    (L->l_collate_bufsize - coll_offset))
828 		return ((ssize_t)-1);
829 
830 	/*
831 	 * If negative, set sign.
832 	 */
833 	if (number[i] == '-') {
834 		i++;
835 		sign = '0';
836 	}
837 
838 	/*
839 	 * Scan integer part; eat leading zeros.
840 	 */
841 	for (; i < length; i++) {
842 		if (IS_SEPARATOR(number[i]))
843 			continue;
844 
845 		if (number[i] == '0' && !(state & IN_NUMBER))
846 			continue;
847 
848 		if (!isdigit((uchar_t)number[i]))
849 			break;
850 
851 		state |= IN_NUMBER;
852 		if (sign == '0')
853 			digits[j++] = '0' + '9' - number[i];
854 		else
855 			digits[j++] = number[i];
856 	}
857 
858 	if (i < length && IS_DECIMAL(number[i])) {
859 		/*
860 		 * Integer part terminated by decimal.
861 		 */
862 		digits[j] = DECIMAL_CHAR;
863 		log_ten = j++;
864 
865 		/*
866 		 * Scan fractional part.
867 		 */
868 		for (++i; i < length; i++) {
869 			if (IS_SEPARATOR(number[i]))
870 				continue;
871 
872 			if (!isdigit((uchar_t)number[i]))
873 				break;
874 
875 			if (number[i] != '0')
876 				state |= IN_NUMBER;
877 
878 			if (sign == '0')
879 				digits[j++] = '0' + '9' - number[i];
880 			else
881 				digits[j++] = number[i];
882 		}
883 
884 		if (sign == '0')
885 			digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
886 	} else {
887 		/*
888 		 * Nondigit or end of string seen.
889 		 */
890 		log_ten = (int)j;
891 		if (sign == '0')
892 			digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
893 		else
894 			digits[j] = INTERFIELD_SEPARATOR;
895 	}
896 
897 	if ((state & IN_NUMBER) == 0) {
898 		/*
899 		 * A non-zero number was not detected; treat as defined zero.
900 		 */
901 		sign = '1';
902 		log_ten = 0;
903 		digits[0] = '0';
904 		j = 1;
905 	}
906 
907 	/*
908 	 * We subtract a constant from the log of negative values so that
909 	 * they will correctly precede positive values with a zero logarithm.
910 	 */
911 	if (sign == '0') {
912 		if (j != 0)
913 			log_ten = -log_ten - 2;
914 		else
915 			/*
916 			 * Special case for -0.
917 			 */
918 			log_ten = -1;
919 	}
920 
921 	buffer[0] = sign;
922 
923 	/*
924 	 * Place logarithm in big-endian form.
925 	 */
926 	for (i = 0; i < sizeof (int); i++)
927 		buffer[i + 1] = (log_ten << (i * NBBY))
928 		    >> ((sizeof (int) - 1) * NBBY);
929 
930 	if (j + sizeof (char) + sizeof (int) <
931 	    L->l_collate_bufsize - coll_offset)
932 		return (j + 1 + sizeof (int));
933 	else
934 		return ((ssize_t)-1);
935 }
936 
937 /*ARGSUSED*/
938 ssize_t
939 field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
940     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
941 {
942 	wchar_t *number;
943 	wchar_t *buffer = L->l_collate.wp + coll_offset;
944 	char *lbuffer;
945 	ssize_t length;
946 
947 	wchar_t	sign = L'2';
948 	int log_ten;
949 	wchar_t	*digits = buffer + 1 + sizeof (int)/sizeof (wchar_t);
950 	size_t j = 0;
951 	size_t i;
952 
953 	int state = BEFORE_NUMBER;
954 
955 	number = L->l_data.wp + data_offset;
956 	length = data_length;
957 
958 	for (i = 0; i < length; i++)
959 		if (!W_IS_BLANK(number[i]))
960 			break;
961 
962 	if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) +
963 	    sizeof (int)) > (L->l_collate_bufsize - coll_offset))
964 		return ((ssize_t)-1);
965 
966 	if (number[i] == L'-') {
967 		i++;
968 		sign = L'0';
969 	}
970 
971 	for (; i < length; i++) {
972 		if (W_IS_SEPARATOR(number[i]))
973 			continue;
974 
975 		if (number[i] == L'0' && !(state & IN_NUMBER))
976 			continue;
977 
978 		if (!iswdigit(number[i]))
979 			break;
980 
981 		state |= IN_NUMBER;
982 		if (sign == L'0')
983 			digits[j++] = L'0' + L'9' - number[i];
984 		else
985 			digits[j++] = number[i];
986 	}
987 
988 	if (i < length && W_IS_DECIMAL(number[i])) {
989 		digits[j] = W_DECIMAL_CHAR;
990 		log_ten = j++;
991 
992 		for (++i; i < length; i++) {
993 			if (W_IS_SEPARATOR(number[i]))
994 				continue;
995 
996 			if (!iswdigit(number[i]))
997 				break;
998 
999 			if (number[i] != L'0')
1000 				state |= IN_NUMBER;
1001 
1002 			if (sign == L'0')
1003 				digits[j++] = L'0' + L'9' - number[i];
1004 			else
1005 				digits[j++] = number[i];
1006 		}
1007 
1008 		if (sign == L'0')
1009 			digits[j++] = (wchar_t)(WCHAR_MAX -
1010 			    W_INTERFIELD_SEPARATOR);
1011 	} else {
1012 		log_ten = (int)j;
1013 		if (sign == L'0')
1014 			digits[j++] = (wchar_t)(WCHAR_MAX -
1015 			    W_INTERFIELD_SEPARATOR);
1016 		else
1017 			digits[j] = W_INTERFIELD_SEPARATOR;
1018 	}
1019 
1020 	if ((state & IN_NUMBER) == 0) {
1021 		sign = L'1';
1022 		log_ten = 0;
1023 		digits[0] = L'0';
1024 		j = 1;
1025 	}
1026 
1027 	if (sign == L'0') {
1028 		if (j != 0)
1029 			log_ten = -log_ten - 2;
1030 		else
1031 			log_ten = -1;
1032 	}
1033 
1034 	buffer[0] = sign;
1035 	/*
1036 	 * Place logarithm in big-endian form.
1037 	 */
1038 	lbuffer = (char *)(buffer + 1);
1039 	for (i = 0; i < sizeof (int); i++)
1040 		lbuffer[i] = (log_ten << (i * NBBY))
1041 		    >> ((sizeof (int) - 1) * NBBY);
1042 
1043 	if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) <
1044 	    L->l_collate_bufsize - coll_offset * sizeof (wchar_t))
1045 		return (j + 1 + sizeof (int) / sizeof (wchar_t));
1046 	else
1047 		return ((ssize_t)-1);
1048 }
1049 
1050 /*
1051  * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour
1052  * when coll_offset exceeds l_collate_bufsize.
1053  */
1054 ssize_t
1055 field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator)
1056 {
1057 	ssize_t coll_offset = 0;
1058 	ssize_t	start, end, distance;
1059 	field_t *cur_fieldp = F;
1060 
1061 	while (cur_fieldp != NULL) {
1062 		/*
1063 		 * delimit field
1064 		 */
1065 		if (!field_separator.sc)
1066 			field_delimit(cur_fieldp, L, &start, &end);
1067 		else
1068 			field_delimit_tabbed(cur_fieldp, L, &start, &end,
1069 			    field_separator);
1070 
1071 		distance = 0;
1072 		if (end - start > 0 ||
1073 		    (end - start == 0 && F->f_species == NUMERIC)) {
1074 			/*
1075 			 * Convert field, appending to collated field of line
1076 			 * record.
1077 			 */
1078 			distance = cur_fieldp->f_convert(cur_fieldp, L,
1079 			    field_separator, start, end - start, coll_offset);
1080 
1081 			/*
1082 			 * branch should execute comparatively rarely
1083 			 */
1084 			if (distance == -1) {
1085 				if (flags & FCV_REALLOC) {
1086 					ASSERT(L->l_collate_bufsize > 0);
1087 					L->l_collate_bufsize *= 2;
1088 					L->l_collate.sp =
1089 					    safe_realloc(L->l_collate.sp,
1090 					    L->l_collate_bufsize);
1091 
1092 					__S(stats_incr_convert_reallocs());
1093 					continue;
1094 				} else {
1095 					/*
1096 					 * FCV_FAIL has been set.
1097 					 */
1098 					return (-1);
1099 				}
1100 			}
1101 		}
1102 
1103 		if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
1104 			xstrninv(L->l_collate.sp, coll_offset, distance);
1105 			*(L->l_collate.sp + coll_offset + distance) =
1106 			    (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
1107 			distance++;
1108 		}
1109 
1110 		ASSERT(distance >= 0);
1111 		coll_offset += distance;
1112 		if (coll_offset >= L->l_collate_bufsize) {
1113 			if (flags & FCV_REALLOC) {
1114 				ASSERT(L->l_collate_bufsize > 0);
1115 				L->l_collate_bufsize *= 2;
1116 				L->l_collate.sp = safe_realloc(L->l_collate.sp,
1117 				    L->l_collate_bufsize);
1118 
1119 				__S(stats_incr_convert_reallocs());
1120 			} else {
1121 				return (-1);
1122 			}
1123 		}
1124 		*(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR;
1125 		coll_offset++;
1126 
1127 		cur_fieldp = cur_fieldp->f_next;
1128 	}
1129 
1130 	L->l_collate_length = coll_offset;
1131 
1132 	return (L->l_collate_length);
1133 }
1134 
1135 ssize_t
1136 field_convert_wide(field_t *F, line_rec_t *L, int flags,
1137     vchar_t field_separator)
1138 {
1139 	ssize_t coll_offset = 0;
1140 	ssize_t	start, end, distance;
1141 	field_t *cur_fieldp = F;
1142 
1143 	while (cur_fieldp != NULL) {
1144 		if (!field_separator.wc)
1145 			field_delimit_wide(cur_fieldp, L, &start, &end);
1146 		else
1147 			field_delimit_tabbed_wide(cur_fieldp, L, &start, &end,
1148 			    field_separator);
1149 
1150 		distance = 0;
1151 		if (end - start > 0 ||
1152 		    end - start == 0 && F->f_species == NUMERIC) {
1153 			distance = cur_fieldp->f_convert(cur_fieldp, L,
1154 			    field_separator, start, end - start, coll_offset);
1155 
1156 			if (distance == -1) {
1157 				if (flags & FCV_REALLOC) {
1158 					ASSERT(L->l_collate_bufsize > 0);
1159 					L->l_collate_bufsize *= 2;
1160 					L->l_collate.wp = safe_realloc(
1161 					    L->l_collate.wp,
1162 					    L->l_collate_bufsize);
1163 
1164 					__S(stats_incr_convert_reallocs());
1165 					continue;
1166 				} else {
1167 					return (-1);
1168 				}
1169 			}
1170 		}
1171 
1172 		if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
1173 			xwcsninv(L->l_collate.wp, coll_offset, distance);
1174 			*(L->l_collate.wp + coll_offset + distance) =
1175 			    WCHAR_MAX - INTERFIELD_SEPARATOR;
1176 			distance++;
1177 		}
1178 
1179 		ASSERT(distance >= 0);
1180 		coll_offset += distance;
1181 		if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) {
1182 			if (flags & FCV_REALLOC) {
1183 				ASSERT(L->l_collate_bufsize > 0);
1184 				L->l_collate_bufsize *= 2;
1185 				L->l_collate.wp = safe_realloc(L->l_collate.wp,
1186 				    L->l_collate_bufsize);
1187 
1188 				__S(stats_incr_convert_reallocs());
1189 			} else {
1190 				return (-1);
1191 			}
1192 		}
1193 		*(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR;
1194 		coll_offset++;
1195 
1196 		cur_fieldp = cur_fieldp->f_next;
1197 	}
1198 
1199 	L->l_collate_length = coll_offset * sizeof (wchar_t);
1200 #ifdef _LITTLE_ENDIAN
1201 	xwcsntomsb(L->l_collate.wp, coll_offset);
1202 #endif /* _LITTLE_ENDIAN */
1203 
1204 	return (L->l_collate_length);
1205 }
1206 
1207 /*
1208  * line_convert() and line_convert_wide() are called when the collation vector
1209  * of a given line has been exhausted, and we are performing the final,
1210  * full-line comparison required by the sort specification.  Because we do not
1211  * have a guarantee that l_data is null-terminated, we create an explicitly
1212  * null-terminated copy suitable for transformation to a collatable form for the
1213  * current locale.
1214  */
1215 static void
1216 line_convert(line_rec_t *L)
1217 {
1218 	static ssize_t bufsize;
1219 	static char *buffer;
1220 
1221 	if (L->l_raw_collate.sp != NULL)
1222 		return;
1223 
1224 	if (L->l_data_length + 1 > bufsize) {
1225 		buffer = safe_realloc(buffer, L->l_data_length + 1);
1226 		bufsize = L->l_data_length + 1;
1227 	}
1228 
1229 	(void) strncpy(buffer, L->l_data.sp, L->l_data_length);
1230 	buffer[L->l_data_length] = '\0';
1231 
1232 	L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp,
1233 	    xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
1234 	xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer,
1235 	    xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
1236 
1237 	__S(stats_incr_line_conversions());
1238 }
1239 
1240 static void
1241 line_convert_wide(line_rec_t *L)
1242 {
1243 	static wchar_t *buffer;
1244 	static ssize_t bufsize;
1245 
1246 	ssize_t dlength;
1247 
1248 	if (L->l_raw_collate.wp != NULL)
1249 		return;
1250 
1251 	if (L->l_data_length + 1 > bufsize) {
1252 		buffer = safe_realloc(buffer, (L->l_data_length + 1) *
1253 		    sizeof (wchar_t));
1254 		bufsize = L->l_data_length + 1;
1255 	}
1256 
1257 	(void) wcsncpy(buffer, L->l_data.wp, L->l_data_length);
1258 	buffer[L->l_data_length] = L'\0';
1259 
1260 	dlength = wcsxfrm(NULL, buffer, 0) + 1;
1261 	L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp, dlength *
1262 	    sizeof (wchar_t));
1263 	(void) wcsxfrm(L->l_raw_collate.wp, buffer, dlength);
1264 
1265 	__S(stats_incr_line_conversions());
1266 }
1267 
1268 /*
1269  * Our convention for collation is
1270  *
1271  *	A > B  => r > 0,
1272  *	A == B => r = 0,
1273  *	A < B  => r < 0
1274  *
1275  * This convention is consistent with the definition of memcmp(), strcmp(), and
1276  * strncmp() in the C locale.  collated() and collated_wide() have two optional
1277  * behaviours, which can be activated by setting the appropriate values in
1278  * coll_flag:  COLL_UNIQUE, which returns 0 if the l_collate fields of the line
1279  * records being compared are identical; COLL_DATA_ONLY, which ignores the
1280  * l_collate field for the current comparison; and COLL_REVERSE, which flips the
1281  * result for comparisons that fall through to an actual data comparison (since
1282  * the collated vector should already reflect reverse ordering from field
1283  * conversion).
1284  */
1285 int
1286 collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
1287 {
1288 	ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
1289 	int r;
1290 	int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
1291 	    INT_SIGN_PASS_MASK;
1292 	ssize_t la, lb;
1293 
1294 	if (!(coll_flag & COLL_DATA_ONLY)) {
1295 		if (ml > 0) {
1296 			r = memcmp(A->l_collate.sp + depth,
1297 			    B->l_collate.sp + depth, ml);
1298 
1299 			if (r)
1300 				return (r);
1301 		}
1302 
1303 		if (A->l_collate_length < B->l_collate_length)
1304 			return (-1);
1305 
1306 		if (A->l_collate_length > B->l_collate_length)
1307 			return (1);
1308 	}
1309 
1310 	/*
1311 	 * This is where we cut out, if we know that the current sort is over
1312 	 * the entire line.
1313 	 */
1314 	if (coll_flag & COLL_UNIQUE)
1315 		return (0);
1316 
1317 	line_convert(A);
1318 	line_convert(B);
1319 
1320 	la = strlen(A->l_raw_collate.sp);
1321 	lb = strlen(B->l_raw_collate.sp);
1322 
1323 	r = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp, MIN(la, lb));
1324 
1325 	if (r)
1326 		return (r ^ mask);
1327 
1328 	if (la < lb)
1329 		return (-1 ^ mask);
1330 
1331 	if (la > lb)
1332 		return (1 ^ mask);
1333 
1334 	return (0);
1335 }
1336 
1337 int
1338 collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
1339 {
1340 	ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
1341 	int r;
1342 	int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
1343 	    INT_SIGN_PASS_MASK;
1344 	ssize_t la, lb;
1345 
1346 	if (!(coll_flag & COLL_DATA_ONLY)) {
1347 		if (ml > 0) {
1348 			r = memcmp(A->l_collate.sp + depth,
1349 			    B->l_collate.sp + depth, ml);
1350 
1351 			if (r)
1352 				return (r);
1353 		}
1354 		if (A->l_collate_length < B->l_collate_length)
1355 			return (-1);
1356 
1357 		if (A->l_collate_length > B->l_collate_length)
1358 			return (1);
1359 	}
1360 
1361 	if (coll_flag & COLL_UNIQUE)
1362 		return (0);
1363 
1364 	line_convert_wide(A);
1365 	line_convert_wide(B);
1366 
1367 	la = wcslen(A->l_raw_collate.wp);
1368 	lb = wcslen(B->l_raw_collate.wp);
1369 
1370 	r = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp,
1371 	    (size_t)MIN(la, lb));
1372 
1373 	if (r)
1374 		return (r ^ mask);
1375 
1376 	if (la < lb)
1377 		return (-1 ^ mask);
1378 
1379 	if (la > lb)
1380 		return (1 ^ mask);
1381 
1382 	return (0);
1383 }
1384