xref: /openbsd-src/usr.bin/sort/vsort.c (revision 5bf6c2a65528befefa057782d6a181e120cbd208)
1*5bf6c2a6Smillert /*	$OpenBSD: vsort.c,v 1.2 2015/04/01 21:46:38 millert Exp $	*/
279428148Smillert 
379428148Smillert /*-
479428148Smillert  * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
579428148Smillert  * Copyright (C) 2012 Gabor Kovesdan <gabor@FreeBSD.org>
679428148Smillert  * All rights reserved.
779428148Smillert  *
879428148Smillert  * Redistribution and use in source and binary forms, with or without
979428148Smillert  * modification, are permitted provided that the following conditions
1079428148Smillert  * are met:
1179428148Smillert  * 1. Redistributions of source code must retain the above copyright
1279428148Smillert  *    notice, this list of conditions and the following disclaimer.
1379428148Smillert  * 2. Redistributions in binary form must reproduce the above copyright
1479428148Smillert  *    notice, this list of conditions and the following disclaimer in the
1579428148Smillert  *    documentation and/or other materials provided with the distribution.
1679428148Smillert  *
1779428148Smillert  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1879428148Smillert  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1979428148Smillert  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2079428148Smillert  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
2179428148Smillert  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2279428148Smillert  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2379428148Smillert  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2479428148Smillert  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2579428148Smillert  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2679428148Smillert  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2779428148Smillert  * SUCH DAMAGE.
2879428148Smillert  */
2979428148Smillert 
3079428148Smillert #include <sys/types.h>
3179428148Smillert 
3279428148Smillert #include <ctype.h>
3379428148Smillert #include <stdlib.h>
3479428148Smillert #include <string.h>
3579428148Smillert 
3679428148Smillert #include "sort.h"
3779428148Smillert #include "vsort.h"
3879428148Smillert 
3979428148Smillert static inline bool
isdigit_clocale(wchar_t c)4079428148Smillert isdigit_clocale(wchar_t c)
4179428148Smillert {
4279428148Smillert 	return (c >= L'0' && c <= L'9');
4379428148Smillert }
4479428148Smillert 
4579428148Smillert static inline bool
isalpha_clocale(wchar_t c)4679428148Smillert isalpha_clocale(wchar_t c)
4779428148Smillert {
4879428148Smillert 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'));
4979428148Smillert }
5079428148Smillert 
5179428148Smillert static inline bool
isalnum_clocale(wchar_t c)5279428148Smillert isalnum_clocale(wchar_t c)
5379428148Smillert {
5479428148Smillert 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') ||
5579428148Smillert 	    (c >= L'0' && c <= L'9'));
5679428148Smillert }
5779428148Smillert 
5879428148Smillert /*
5979428148Smillert  * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$
6079428148Smillert  * Set length of string before suffix.
6179428148Smillert  */
6279428148Smillert static void
find_suffix(bwstring_iterator si,bwstring_iterator se,size_t * len)6379428148Smillert find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len)
6479428148Smillert {
6579428148Smillert 	wchar_t c;
6679428148Smillert 	size_t clen;
6779428148Smillert 	bool expect_alpha, sfx;
6879428148Smillert 
6979428148Smillert 	sfx = false;
7079428148Smillert 	expect_alpha = false;
7179428148Smillert 	*len = 0;
7279428148Smillert 	clen = 0;
7379428148Smillert 
7479428148Smillert 	while ((si < se) && (c = bws_get_iter_value(si))) {
7579428148Smillert 		if (expect_alpha) {
7679428148Smillert 			expect_alpha = false;
7779428148Smillert 			if (!isalpha_clocale(c) && (c != L'~'))
7879428148Smillert 				sfx = false;
7979428148Smillert 		} else if (c == L'.') {
8079428148Smillert 			expect_alpha = true;
8179428148Smillert 			if (!sfx) {
8279428148Smillert 				sfx = true;
8379428148Smillert 				*len = clen;
8479428148Smillert 			}
8579428148Smillert 		} else if (!isalnum_clocale(c) && (c != L'~'))
8679428148Smillert 			sfx = false;
8779428148Smillert 
8879428148Smillert 		si = bws_iterator_inc(si, 1);
8979428148Smillert 		++clen;
9079428148Smillert 	}
9179428148Smillert 
9279428148Smillert 	/* This code must be here to make the implementation compatible
9379428148Smillert 	 * with WORDING of GNU sort documentation.
9479428148Smillert 	 * But the GNU sort implementation is not following its own
9579428148Smillert 	 * documentation.  GNU sort allows empty file extensions
9679428148Smillert 	 * (just dot with nothing after); but the regular expression in
9779428148Smillert 	 * their documentation does not allow empty file extensions.
9879428148Smillert 	 * We chose to make our implementation compatible with GNU sort
9979428148Smillert 	 * implementation.  If they will ever fix their bug, this code
10079428148Smillert 	 * must be uncommented. Or they may choose to fix the info page,
10179428148Smillert 	 * then the code stays commented.
10279428148Smillert 	 *
10379428148Smillert 	 if (expect_alpha)
10479428148Smillert 	 	sfx = false;
10579428148Smillert 	 */
10679428148Smillert 
10779428148Smillert 	if (!sfx)
10879428148Smillert 		*len = clen;
10979428148Smillert }
11079428148Smillert 
11179428148Smillert static inline int
cmp_chars(wchar_t c1,wchar_t c2)11279428148Smillert cmp_chars(wchar_t c1, wchar_t c2)
11379428148Smillert {
11479428148Smillert 	if (c1 == c2)
11579428148Smillert 		return 0;
11679428148Smillert 
11779428148Smillert 	if (c1 == L'~')
11879428148Smillert 		return -1;
11979428148Smillert 	if (c2 == L'~')
12079428148Smillert 		return 1;
12179428148Smillert 
12279428148Smillert 	if (isdigit_clocale(c1) || !c1)
12379428148Smillert 		return (isdigit_clocale(c2) || !c2) ? 0 : -1;
12479428148Smillert 
12579428148Smillert 	if (isdigit_clocale(c2) || !c2)
12679428148Smillert 		return 1;
12779428148Smillert 
12879428148Smillert 	if (isalpha_clocale(c1))
12979428148Smillert 		return isalpha_clocale(c2) ? (int)c1 - (int)c2 : -1;
13079428148Smillert 
13179428148Smillert 	if (isalpha_clocale(c2))
13279428148Smillert 		return 1;
13379428148Smillert 
13479428148Smillert 	return (int)c1 - (int)c2;
13579428148Smillert }
13679428148Smillert 
13779428148Smillert static int
cmpversions(bwstring_iterator si1,bwstring_iterator se1,bwstring_iterator si2,bwstring_iterator se2)13879428148Smillert cmpversions(bwstring_iterator si1, bwstring_iterator se1,
13979428148Smillert     bwstring_iterator si2, bwstring_iterator se2)
14079428148Smillert {
14179428148Smillert 	int cmp, diff;
14279428148Smillert 
14379428148Smillert 	while ((si1 < se1) || (si2 < se2)) {
14479428148Smillert 		diff = 0;
14579428148Smillert 
14679428148Smillert 		while (((si1 < se1) &&
14779428148Smillert 		    !isdigit_clocale(bws_get_iter_value(si1))) ||
14879428148Smillert 		    ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) {
14979428148Smillert 			wchar_t c1, c2;
15079428148Smillert 
15179428148Smillert 			c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0;
15279428148Smillert 			c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0;
15379428148Smillert 
15479428148Smillert 			cmp = cmp_chars(c1, c2);
15579428148Smillert 			if (cmp)
15679428148Smillert 				return cmp;
15779428148Smillert 
15879428148Smillert 			if (si1 < se1)
15979428148Smillert 				si1 = bws_iterator_inc(si1, 1);
16079428148Smillert 			if (si2 < se2)
16179428148Smillert 				si2 = bws_iterator_inc(si2, 1);
16279428148Smillert 		}
16379428148Smillert 
16479428148Smillert 		while (bws_get_iter_value(si1) == L'0')
16579428148Smillert 			si1 = bws_iterator_inc(si1, 1);
16679428148Smillert 
16779428148Smillert 		while (bws_get_iter_value(si2) == L'0')
16879428148Smillert 			si2 = bws_iterator_inc(si2, 1);
16979428148Smillert 
17079428148Smillert 		while (isdigit_clocale(bws_get_iter_value(si1)) &&
17179428148Smillert 		    isdigit_clocale(bws_get_iter_value(si2))) {
17279428148Smillert 			if (!diff)
17379428148Smillert 				diff = ((int)bws_get_iter_value(si1) -
17479428148Smillert 				    (int)bws_get_iter_value(si2));
17579428148Smillert 			si1 = bws_iterator_inc(si1, 1);
17679428148Smillert 			si2 = bws_iterator_inc(si2, 1);
17779428148Smillert 		}
17879428148Smillert 
17979428148Smillert 		if (isdigit_clocale(bws_get_iter_value(si1)))
18079428148Smillert 			return 1;
18179428148Smillert 
18279428148Smillert 		if (isdigit_clocale(bws_get_iter_value(si2)))
18379428148Smillert 			return -1;
18479428148Smillert 
18579428148Smillert 		if (diff)
18679428148Smillert 			return diff;
18779428148Smillert 	}
18879428148Smillert 
18979428148Smillert 	return 0;
19079428148Smillert }
19179428148Smillert 
19279428148Smillert /*
19379428148Smillert  * Compare two version strings
19479428148Smillert  */
19579428148Smillert int
vcmp(struct bwstring * s1,struct bwstring * s2)19679428148Smillert vcmp(struct bwstring *s1, struct bwstring *s2)
19779428148Smillert {
19879428148Smillert 	bwstring_iterator si1, si2;
19979428148Smillert 	wchar_t c1, c2;
20079428148Smillert 	size_t len1, len2, slen1, slen2;
20179428148Smillert 	int cmp_bytes, cmp_res;
20279428148Smillert 
20379428148Smillert 	if (s1 == s2)
20479428148Smillert 		return 0;
20579428148Smillert 
20679428148Smillert 	cmp_bytes = bwscmp(s1, s2, 0);
20779428148Smillert 	if (cmp_bytes == 0)
20879428148Smillert 		return 0;
20979428148Smillert 
21079428148Smillert 	len1 = slen1 = BWSLEN(s1);
21179428148Smillert 	len2 = slen2 = BWSLEN(s2);
21279428148Smillert 
21379428148Smillert 	if (slen1 < 1)
21479428148Smillert 		return -1;
21579428148Smillert 	if (slen2 < 1)
21679428148Smillert 		return 1;
21779428148Smillert 
21879428148Smillert 	si1 = bws_begin(s1);
21979428148Smillert 	si2 = bws_begin(s2);
22079428148Smillert 
22179428148Smillert 	c1 = bws_get_iter_value(si1);
22279428148Smillert 	c2 = bws_get_iter_value(si2);
22379428148Smillert 
22479428148Smillert 	if (c1 == L'.' && (slen1 == 1))
22579428148Smillert 		return -1;
22679428148Smillert 
22779428148Smillert 	if (c2 == L'.' && (slen2 == 1))
22879428148Smillert 		return 1;
22979428148Smillert 
23079428148Smillert 	if (slen1 == 2 && c1 == L'.' &&
23179428148Smillert 	    bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.')
23279428148Smillert 		return -1;
23379428148Smillert 	if (slen2 == 2 && c2 == L'.' &&
23479428148Smillert 	    bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.')
23579428148Smillert 		return 1;
23679428148Smillert 
23779428148Smillert 	if (c1 == L'.' && c2 != L'.')
23879428148Smillert 		return -1;
23979428148Smillert 	if (c1 != L'.' && c2 == L'.')
24079428148Smillert 		return 1;
24179428148Smillert 
24279428148Smillert 	if (c1 == L'.' && c2 == L'.') {
24379428148Smillert 		si1 = bws_iterator_inc(si1, 1);
24479428148Smillert 		si2 = bws_iterator_inc(si2, 1);
24579428148Smillert 	}
24679428148Smillert 
24779428148Smillert 	find_suffix(si1, bws_end(s1), &len1);
24879428148Smillert 	find_suffix(si2, bws_end(s2), &len2);
24979428148Smillert 
25079428148Smillert 	if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0))
25179428148Smillert 		return cmp_bytes;
25279428148Smillert 
25379428148Smillert 	cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2,
25479428148Smillert 	    bws_iterator_inc(si2, len2));
25579428148Smillert 
25679428148Smillert 	if (cmp_res == 0)
25779428148Smillert 	  cmp_res = cmp_bytes;
25879428148Smillert 
25979428148Smillert 	return cmp_res;
26079428148Smillert }
261