xref: /netbsd-src/external/ibm-public/postfix/dist/src/util/strcasecmp_utf8.c (revision e89934bbf778a6d6d6894877c4da59d0c7835b0f)
1 /*	$NetBSD: strcasecmp_utf8.c,v 1.2 2017/02/14 01:16:49 christos Exp $	*/
2 
3 /*++
4 /* NAME
5 /*	strcasecmp_utf8 3
6 /* SUMMARY
7 /*	caseless string comparison
8 /* SYNOPSIS
9 /*	#include <stringops.h>
10 /*
11 /*	int	strcasecmp_utf8(
12 /*	const char *s1,
13 /*	const char *s2)
14 /*
15 /*	int	strncasecmp_utf8(
16 /*	const char *s1,
17 /*	const char *s2,
18 /*	ssize_t	len)
19 /* AUXILIARY FUNCTIONS
20 /*	int	strcasecmp_utf8x(
21 /*	int	flags,
22 /*	const char *s1,
23 /*	const char *s2)
24 /*
25 /*	int	strncasecmp_utf8x(
26 /*	int	flags,
27 /*	const char *s1,
28 /*	const char *s2,
29 /*	ssize_t	len)
30 /* DESCRIPTION
31 /*	strcasecmp_utf8() implements caseless string comparison for
32 /*	UTF-8 text, with an API similar to strcasecmp(). Only ASCII
33 /*	characters are casefolded when the code is compiled without
34 /*	EAI support or when util_utf8_enable is zero.
35 /*
36 /*	strncasecmp_utf8() implements caseless string comparison
37 /*	for UTF-8 text, with an API similar to strncasecmp(). Only
38 /*	ASCII characters are casefolded when the code is compiled
39 /*	without EAI support or when util_utf8_enable is zero.
40 /*
41 /*	strcasecmp_utf8x() and strncasecmp_utf8x() implement a more
42 /*	complex API that provides the above functionality and more.
43 /*
44 /*	Arguments:
45 /* .IP "s1, s2"
46 /*	Null-terminated strings to be compared.
47 /* .IP len
48 /*	String length before casefolding.
49 /* .IP flags
50 /*	Zero or CASEF_FLAG_UTF8. The latter flag enables UTF-8 case
51 /*	folding instead of folding only ASCII characters. This flag
52 /*	is ignored when compiled without EAI support.
53 /* SEE ALSO
54 /*	casefold(), casefold text for caseless comparison.
55 /* LICENSE
56 /* .ad
57 /* .fi
58 /*	The Secure Mailer license must be distributed with this software.
59 /* AUTHOR(S)
60 /*	Wietse Venema
61 /*	IBM T.J. Watson Research
62 /*	P.O. Box 704
63 /*	Yorktown Heights, NY 10598, USA
64 /*
65 /*	Wietse Venema
66 /*	Google, Inc.
67 /*	111 8th Avenue
68 /*	New York, NY 10011, USA
69 /*--*/
70 
71  /*
72   * System library.
73   */
74 #include <sys_defs.h>
75 #include <string.h>
76 
77 #ifdef STRCASECMP_IN_STRINGS_H
78 #include <strings.h>
79 #endif
80 
81  /*
82   * Utility library.
83   */
84 #include <stringops.h>
85 
86 #define STR(x)	vstring_str(x)
87 
88 static VSTRING *f1;			/* casefold result for s1 */
89 static VSTRING *f2;			/* casefold result for s2 */
90 
91 /* strcasecmp_utf8_init - initialize */
92 
strcasecmp_utf8_init(void)93 static void strcasecmp_utf8_init(void)
94 {
95     f1 = vstring_alloc(100);
96     f2 = vstring_alloc(100);
97 }
98 
99 /* strcasecmp_utf8x - caseless string comparison */
100 
strcasecmp_utf8x(int flags,const char * s1,const char * s2)101 int     strcasecmp_utf8x(int flags, const char *s1, const char *s2)
102 {
103 
104     /*
105      * Short-circuit optimization for ASCII-only text. This may be slower
106      * than using a cache for all results. We must not expose strcasecmp(3)
107      * to non-ASCII text.
108      */
109     if (allascii(s1) && allascii(s2))
110 	return (strcasecmp(s1, s2));
111 
112     if (f1 == 0)
113 	strcasecmp_utf8_init();
114 
115     /*
116      * Cross our fingers and hope that strcmp() remains agnostic of
117      * charactersets and locales.
118      */
119     flags &= CASEF_FLAG_UTF8;
120     casefoldx(flags, f1, s1, -1);
121     casefoldx(flags, f2, s2, -1);
122     return (strcmp(STR(f1), STR(f2)));
123 }
124 
125 /* strncasecmp_utf8x - caseless string comparison */
126 
strncasecmp_utf8x(int flags,const char * s1,const char * s2,ssize_t len)127 int     strncasecmp_utf8x(int flags, const char *s1, const char *s2,
128 			          ssize_t len)
129 {
130 
131     /*
132      * Consider using a cache for all results.
133      */
134     if (f1 == 0)
135 	strcasecmp_utf8_init();
136 
137     /*
138      * Short-circuit optimization for ASCII-only text. This may be slower
139      * than using a cache for all results. See comments above for limitations
140      * of strcasecmp().
141      */
142     if (allascii_len(s1, len) && allascii_len(s2, len))
143 	return (strncasecmp(s1, s2, len));
144 
145     /*
146      * Caution: casefolding may change the number of bytes. See comments
147      * above for concerns about strcmp().
148      */
149     flags &= CASEF_FLAG_UTF8;
150     casefoldx(flags, f1, s1, len);
151     casefoldx(flags, f2, s2, len);
152     return (strcmp(STR(f1), STR(f2)));
153 }
154 
155 #ifdef TEST
156 #include <stdio.h>
157 #include <stdlib.h>
158 #include <vstream.h>
159 #include <vstring_vstream.h>
160 #include <msg_vstream.h>
161 #include <argv.h>
162 
main(int argc,char ** argv)163 int     main(int argc, char **argv)
164 {
165     VSTRING *buffer = vstring_alloc(1);
166     ARGV   *cmd;
167     char  **args;
168     int     len;
169     int     flags;
170     int     res;
171 
172     msg_vstream_init(argv[0], VSTREAM_ERR);
173     flags = CASEF_FLAG_UTF8;
174     util_utf8_enable = 1;
175     while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
176 	vstream_printf("> %s\n", STR(buffer));
177 	cmd = argv_split(STR(buffer), CHARS_SPACE);
178 	if (cmd->argc == 0 || cmd->argv[0][0] == '#')
179 	    continue;
180 	args = cmd->argv;
181 
182 	/*
183 	 * Compare two strings.
184 	 */
185 	if (strcmp(args[0], "compare") == 0 && cmd->argc == 3) {
186 	    res = strcasecmp_utf8x(flags, args[1], args[2]);
187 	    vstream_printf("\"%s\" %s \"%s\"\n",
188 			   args[1],
189 			   res < 0 ? "<" : res == 0 ? "==" : ">",
190 			   args[2]);
191 	}
192 
193 	/*
194 	 * Compare two substrings.
195 	 */
196 	else if (strcmp(args[0], "compare-len") == 0 && cmd->argc == 4
197 		 && sscanf(args[3], "%d", &len) == 1 && len >= 0) {
198 	    res = strncasecmp_utf8x(flags, args[1], args[2], len);
199 	    vstream_printf("\"%.*s\" %s \"%.*s\"\n",
200 			   len, args[1],
201 			   res < 0 ? "<" : res == 0 ? "==" : ">",
202 			   len, args[2]);
203 	}
204 
205 	/*
206 	 * Usage.
207 	 */
208 	else {
209 	    vstream_printf("Usage: %s compare <s1> <s2> | compare-len <s1> <s2> <len>\n",
210 			   argv[0]);
211 	}
212 	vstream_fflush(VSTREAM_OUT);
213 	argv_free(cmd);
214     }
215     exit(0);
216 }
217 
218 #endif					/* TEST */
219