xref: /dflybsd-src/usr.bin/sort/vsort.c (revision 50fc853e3b7db00a98d1340044b5206d291d1925)
1*50fc853eSJohn Marino /*-
2*50fc853eSJohn Marino  * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
3*50fc853eSJohn Marino  * Copyright (C) 2012 Gabor Kovesdan <gabor@FreeBSD.org>
4*50fc853eSJohn Marino  * All rights reserved.
5*50fc853eSJohn Marino  *
6*50fc853eSJohn Marino  * Redistribution and use in source and binary forms, with or without
7*50fc853eSJohn Marino  * modification, are permitted provided that the following conditions
8*50fc853eSJohn Marino  * are met:
9*50fc853eSJohn Marino  * 1. Redistributions of source code must retain the above copyright
10*50fc853eSJohn Marino  *    notice, this list of conditions and the following disclaimer.
11*50fc853eSJohn Marino  * 2. Redistributions in binary form must reproduce the above copyright
12*50fc853eSJohn Marino  *    notice, this list of conditions and the following disclaimer in the
13*50fc853eSJohn Marino  *    documentation and/or other materials provided with the distribution.
14*50fc853eSJohn Marino  *
15*50fc853eSJohn Marino  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16*50fc853eSJohn Marino  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17*50fc853eSJohn Marino  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18*50fc853eSJohn Marino  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19*50fc853eSJohn Marino  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20*50fc853eSJohn Marino  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21*50fc853eSJohn Marino  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22*50fc853eSJohn Marino  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23*50fc853eSJohn Marino  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24*50fc853eSJohn Marino  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25*50fc853eSJohn Marino  * SUCH DAMAGE.
26*50fc853eSJohn Marino  *
27*50fc853eSJohn Marino  * $FreeBSD: head/usr.bin/sort/vsort.c 281132 2015-04-06 02:35:55Z pfg $
28*50fc853eSJohn Marino  */
29*50fc853eSJohn Marino 
30*50fc853eSJohn Marino 
31*50fc853eSJohn Marino #include <sys/types.h>
32*50fc853eSJohn Marino 
33*50fc853eSJohn Marino #include <ctype.h>
34*50fc853eSJohn Marino #include <stdlib.h>
35*50fc853eSJohn Marino #include <string.h>
36*50fc853eSJohn Marino 
37*50fc853eSJohn Marino #include "sort.h"
38*50fc853eSJohn Marino #include "vsort.h"
39*50fc853eSJohn Marino 
40*50fc853eSJohn Marino static inline bool
isdigit_clocale(wchar_t c)41*50fc853eSJohn Marino isdigit_clocale(wchar_t c)
42*50fc853eSJohn Marino {
43*50fc853eSJohn Marino 
44*50fc853eSJohn Marino 	return (c >= L'0' && c <= L'9');
45*50fc853eSJohn Marino }
46*50fc853eSJohn Marino 
47*50fc853eSJohn Marino static inline bool
isalpha_clocale(wchar_t c)48*50fc853eSJohn Marino isalpha_clocale(wchar_t c)
49*50fc853eSJohn Marino {
50*50fc853eSJohn Marino 
51*50fc853eSJohn Marino 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'));
52*50fc853eSJohn Marino }
53*50fc853eSJohn Marino 
54*50fc853eSJohn Marino static inline bool
isalnum_clocale(wchar_t c)55*50fc853eSJohn Marino isalnum_clocale(wchar_t c)
56*50fc853eSJohn Marino {
57*50fc853eSJohn Marino 
58*50fc853eSJohn Marino 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') ||
59*50fc853eSJohn Marino 	    (c >= L'0' && c <= L'9'));
60*50fc853eSJohn Marino }
61*50fc853eSJohn Marino 
62*50fc853eSJohn Marino /*
63*50fc853eSJohn Marino  * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$
64*50fc853eSJohn Marino  * Set length of string before suffix.
65*50fc853eSJohn Marino  */
66*50fc853eSJohn Marino static void
find_suffix(bwstring_iterator si,bwstring_iterator se,size_t * len)67*50fc853eSJohn Marino find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len)
68*50fc853eSJohn Marino {
69*50fc853eSJohn Marino 	wchar_t c;
70*50fc853eSJohn Marino 	size_t clen;
71*50fc853eSJohn Marino 	bool expect_alpha, sfx;
72*50fc853eSJohn Marino 
73*50fc853eSJohn Marino 	sfx = false;
74*50fc853eSJohn Marino 	expect_alpha = false;
75*50fc853eSJohn Marino 	*len = 0;
76*50fc853eSJohn Marino 	clen = 0;
77*50fc853eSJohn Marino 
78*50fc853eSJohn Marino 	while ((si < se) && (c = bws_get_iter_value(si))) {
79*50fc853eSJohn Marino 		if (expect_alpha) {
80*50fc853eSJohn Marino 			expect_alpha = false;
81*50fc853eSJohn Marino 			if (!isalpha_clocale(c) && (c != L'~'))
82*50fc853eSJohn Marino 				sfx = false;
83*50fc853eSJohn Marino 		} else if (c == L'.') {
84*50fc853eSJohn Marino 			expect_alpha = true;
85*50fc853eSJohn Marino 			if (!sfx) {
86*50fc853eSJohn Marino 				sfx = true;
87*50fc853eSJohn Marino 				*len = clen;
88*50fc853eSJohn Marino 			}
89*50fc853eSJohn Marino 		} else if (!isalnum_clocale(c) && (c != L'~'))
90*50fc853eSJohn Marino 			sfx = false;
91*50fc853eSJohn Marino 
92*50fc853eSJohn Marino 		si = bws_iterator_inc(si, 1);
93*50fc853eSJohn Marino 		++clen;
94*50fc853eSJohn Marino 	}
95*50fc853eSJohn Marino 
96*50fc853eSJohn Marino 	/* This code must be here to make the implementation compatible
97*50fc853eSJohn Marino 	 * with WORDING of GNU sort documentation.
98*50fc853eSJohn Marino 	 * But the GNU sort implementation is not following its own
99*50fc853eSJohn Marino 	 * documentation.  GNU sort allows empty file extensions
100*50fc853eSJohn Marino 	 * (just dot with nothing after); but the regular expression in
101*50fc853eSJohn Marino 	 * their documentation does not allow empty file extensions.
102*50fc853eSJohn Marino 	 * We chose to make our implementation compatible with GNU sort
103*50fc853eSJohn Marino 	 * implementation.  If they will ever fix their bug, this code
104*50fc853eSJohn Marino 	 * must be uncommented. Or they may choose to fix the info page,
105*50fc853eSJohn Marino 	 * then the code stays commented.
106*50fc853eSJohn Marino 	 *
107*50fc853eSJohn Marino 	 if (expect_alpha)
108*50fc853eSJohn Marino 		sfx = false;
109*50fc853eSJohn Marino 	 */
110*50fc853eSJohn Marino 
111*50fc853eSJohn Marino 	if (!sfx)
112*50fc853eSJohn Marino 		*len = clen;
113*50fc853eSJohn Marino }
114*50fc853eSJohn Marino 
115*50fc853eSJohn Marino static inline int
cmp_chars(wchar_t c1,wchar_t c2)116*50fc853eSJohn Marino cmp_chars(wchar_t c1, wchar_t c2)
117*50fc853eSJohn Marino {
118*50fc853eSJohn Marino 
119*50fc853eSJohn Marino 	if (c1 == c2)
120*50fc853eSJohn Marino 		return (0);
121*50fc853eSJohn Marino 
122*50fc853eSJohn Marino 	if (c1 == L'~')
123*50fc853eSJohn Marino 		return (-1);
124*50fc853eSJohn Marino 	if (c2 == L'~')
125*50fc853eSJohn Marino 		return (+1);
126*50fc853eSJohn Marino 
127*50fc853eSJohn Marino 	if (isdigit_clocale(c1) || !c1)
128*50fc853eSJohn Marino 		return ((isdigit_clocale(c2) || !c2) ? 0 : -1);
129*50fc853eSJohn Marino 
130*50fc853eSJohn Marino 	if (isdigit_clocale(c2) || !c2)
131*50fc853eSJohn Marino 		return (+1);
132*50fc853eSJohn Marino 
133*50fc853eSJohn Marino 	if (isalpha_clocale(c1))
134*50fc853eSJohn Marino 		return ((isalpha_clocale(c2)) ? ((int) c1 - (int) c2) : -1);
135*50fc853eSJohn Marino 
136*50fc853eSJohn Marino 	if (isalpha_clocale(c2))
137*50fc853eSJohn Marino 		return (+1);
138*50fc853eSJohn Marino 
139*50fc853eSJohn Marino 	return ((int) c1 - (int) c2);
140*50fc853eSJohn Marino }
141*50fc853eSJohn Marino 
142*50fc853eSJohn Marino static int
cmpversions(bwstring_iterator si1,bwstring_iterator se1,bwstring_iterator si2,bwstring_iterator se2)143*50fc853eSJohn Marino cmpversions(bwstring_iterator si1, bwstring_iterator se1,
144*50fc853eSJohn Marino     bwstring_iterator si2, bwstring_iterator se2)
145*50fc853eSJohn Marino {
146*50fc853eSJohn Marino 	int cmp, diff;
147*50fc853eSJohn Marino 
148*50fc853eSJohn Marino 	while ((si1 < se1) || (si2 < se2)) {
149*50fc853eSJohn Marino 		diff = 0;
150*50fc853eSJohn Marino 
151*50fc853eSJohn Marino 		while (((si1 < se1) &&
152*50fc853eSJohn Marino 		    !isdigit_clocale(bws_get_iter_value(si1))) ||
153*50fc853eSJohn Marino 		    ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) {
154*50fc853eSJohn Marino 			wchar_t c1, c2;
155*50fc853eSJohn Marino 
156*50fc853eSJohn Marino 			c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0;
157*50fc853eSJohn Marino 			c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0;
158*50fc853eSJohn Marino 
159*50fc853eSJohn Marino 			cmp = cmp_chars(c1, c2);
160*50fc853eSJohn Marino 			if (cmp)
161*50fc853eSJohn Marino 				return (cmp);
162*50fc853eSJohn Marino 
163*50fc853eSJohn Marino 			if (si1 < se1)
164*50fc853eSJohn Marino 				si1 = bws_iterator_inc(si1, 1);
165*50fc853eSJohn Marino 			if (si2 < se2)
166*50fc853eSJohn Marino 				si2 = bws_iterator_inc(si2, 1);
167*50fc853eSJohn Marino 		}
168*50fc853eSJohn Marino 
169*50fc853eSJohn Marino 		while (bws_get_iter_value(si1) == L'0')
170*50fc853eSJohn Marino 			si1 = bws_iterator_inc(si1, 1);
171*50fc853eSJohn Marino 
172*50fc853eSJohn Marino 		while (bws_get_iter_value(si2) == L'0')
173*50fc853eSJohn Marino 			si2 = bws_iterator_inc(si2, 1);
174*50fc853eSJohn Marino 
175*50fc853eSJohn Marino 		while (isdigit_clocale(bws_get_iter_value(si1)) &&
176*50fc853eSJohn Marino 		    isdigit_clocale(bws_get_iter_value(si2))) {
177*50fc853eSJohn Marino 			if (!diff)
178*50fc853eSJohn Marino 				diff = ((int)bws_get_iter_value(si1) -
179*50fc853eSJohn Marino 				    (int)bws_get_iter_value(si2));
180*50fc853eSJohn Marino 			si1 = bws_iterator_inc(si1, 1);
181*50fc853eSJohn Marino 			si2 = bws_iterator_inc(si2, 1);
182*50fc853eSJohn Marino 		}
183*50fc853eSJohn Marino 
184*50fc853eSJohn Marino 		if (isdigit_clocale(bws_get_iter_value(si1)))
185*50fc853eSJohn Marino 			return (1);
186*50fc853eSJohn Marino 
187*50fc853eSJohn Marino 		if (isdigit_clocale(bws_get_iter_value(si2)))
188*50fc853eSJohn Marino 			return (-1);
189*50fc853eSJohn Marino 
190*50fc853eSJohn Marino 		if (diff)
191*50fc853eSJohn Marino 			return (diff);
192*50fc853eSJohn Marino 	}
193*50fc853eSJohn Marino 
194*50fc853eSJohn Marino 	return (0);
195*50fc853eSJohn Marino }
196*50fc853eSJohn Marino 
197*50fc853eSJohn Marino /*
198*50fc853eSJohn Marino  * Compare two version strings
199*50fc853eSJohn Marino  */
200*50fc853eSJohn Marino int
vcmp(struct bwstring * s1,struct bwstring * s2)201*50fc853eSJohn Marino vcmp(struct bwstring *s1, struct bwstring *s2)
202*50fc853eSJohn Marino {
203*50fc853eSJohn Marino 	bwstring_iterator si1, si2;
204*50fc853eSJohn Marino 	wchar_t c1, c2;
205*50fc853eSJohn Marino 	size_t len1, len2, slen1, slen2;
206*50fc853eSJohn Marino 	int cmp_bytes, cmp_res;
207*50fc853eSJohn Marino 
208*50fc853eSJohn Marino 	if (s1 == s2)
209*50fc853eSJohn Marino 		return (0);
210*50fc853eSJohn Marino 
211*50fc853eSJohn Marino 	cmp_bytes = bwscmp(s1, s2, 0);
212*50fc853eSJohn Marino 	if (cmp_bytes == 0)
213*50fc853eSJohn Marino 		return (0);
214*50fc853eSJohn Marino 
215*50fc853eSJohn Marino 	len1 = slen1 = BWSLEN(s1);
216*50fc853eSJohn Marino 	len2 = slen2 = BWSLEN(s2);
217*50fc853eSJohn Marino 
218*50fc853eSJohn Marino 	if (slen1 < 1)
219*50fc853eSJohn Marino 		return (-1);
220*50fc853eSJohn Marino 	if (slen2 < 1)
221*50fc853eSJohn Marino 		return (+1);
222*50fc853eSJohn Marino 
223*50fc853eSJohn Marino 	si1 = bws_begin(s1);
224*50fc853eSJohn Marino 	si2 = bws_begin(s2);
225*50fc853eSJohn Marino 
226*50fc853eSJohn Marino 	c1 = bws_get_iter_value(si1);
227*50fc853eSJohn Marino 	c2 = bws_get_iter_value(si2);
228*50fc853eSJohn Marino 
229*50fc853eSJohn Marino 	if (c1 == L'.' && (slen1 == 1))
230*50fc853eSJohn Marino 		return (-1);
231*50fc853eSJohn Marino 
232*50fc853eSJohn Marino 	if (c2 == L'.' && (slen2 == 1))
233*50fc853eSJohn Marino 		return (+1);
234*50fc853eSJohn Marino 
235*50fc853eSJohn Marino 	if (slen1 == 2 && c1 == L'.' &&
236*50fc853eSJohn Marino 	    bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.')
237*50fc853eSJohn Marino 		return (-1);
238*50fc853eSJohn Marino 	if (slen2 == 2 && c2 == L'.' &&
239*50fc853eSJohn Marino 	    bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.')
240*50fc853eSJohn Marino 		return (+1);
241*50fc853eSJohn Marino 
242*50fc853eSJohn Marino 	if (c1 == L'.' && c2 != L'.')
243*50fc853eSJohn Marino 		return (-1);
244*50fc853eSJohn Marino 	if (c1 != L'.' && c2 == L'.')
245*50fc853eSJohn Marino 		return (+1);
246*50fc853eSJohn Marino 
247*50fc853eSJohn Marino 	if (c1 == L'.' && c2 == L'.') {
248*50fc853eSJohn Marino 		si1 = bws_iterator_inc(si1, 1);
249*50fc853eSJohn Marino 		si2 = bws_iterator_inc(si2, 1);
250*50fc853eSJohn Marino 	}
251*50fc853eSJohn Marino 
252*50fc853eSJohn Marino 	find_suffix(si1, bws_end(s1), &len1);
253*50fc853eSJohn Marino 	find_suffix(si2, bws_end(s2), &len2);
254*50fc853eSJohn Marino 
255*50fc853eSJohn Marino 	if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0))
256*50fc853eSJohn Marino 		return (cmp_bytes);
257*50fc853eSJohn Marino 
258*50fc853eSJohn Marino 	cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2,
259*50fc853eSJohn Marino 	    bws_iterator_inc(si2, len2));
260*50fc853eSJohn Marino 
261*50fc853eSJohn Marino 	if (cmp_res == 0)
262*50fc853eSJohn Marino 	  cmp_res = cmp_bytes;
263*50fc853eSJohn Marino 
264*50fc853eSJohn Marino 	return (cmp_res);
265*50fc853eSJohn Marino }
266