xref: /openbsd-src/regress/lib/libcrypto/utf8/utf8test.c (revision 388ed389a70366cb77ade81aec3b1b8d79e1f8db)
1 /*
2  * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org>
3  *
4  * Permission to use, copy, modify, and distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*
18  * A mostly exhaustive test of UTF-8 decoder and encoder
19  */
20 
21 #include <stdio.h>
22 #include <string.h>
23 #include <err.h>
24 
25 #include <openssl/asn1.h>
26 
/*
 * Sentinel stored in the output variable before a call that is expected
 * to fail, so the test can verify the variable was left untouched.
 */
#define	UNCHANGED	0xfedcba98

/*
 * Abort the test run via errx(3) with exit status 1 when the condition
 * is false, reporting the source line and the text of the failed
 * expression.  Wrapped in do/while (0) so it acts as a single statement.
 */
#define ASSERT(x)						\
	do {							\
		if (!(x))					\
			errx(1, "test failed at line %d: %s",	\
			    __LINE__, #x);			\
	} while (0)
35 
36 int
37 main(void)
38 {
39 	unsigned char testbuf[] = "012345";
40 	const unsigned char zerobuf[sizeof testbuf] = { 0 };
41 	unsigned long value;
42 	int i, j, k, l, ret;
43 
44 	/*
45 	 * First, verify UTF8_getc()
46 	 */
47 	value = UNCHANGED;
48 	ret = UTF8_getc(testbuf, 0, &value);
49 	ASSERT(ret == 0);
50 	ASSERT(value == UNCHANGED);
51 
52 	/* check all valid single-byte chars */
53 	for (i = 0; i < 0x80; i++) {
54 		testbuf[0] = i;
55 		ret = UTF8_getc(testbuf, 1, &value);
56 		ASSERT(ret == 1);
57 		ASSERT(value == i);
58 
59 		ret = UTF8_getc(testbuf, 2, &value);
60 		ASSERT(ret == 1);
61 		ASSERT(value == i);
62 	}
63 
64 	/*
65 	 * Verify failure on all invalid initial bytes:
66 	 *	0x80 - 0xBF	following bytes only
67 	 *	0xC0 - 0xC1	used to be in non-shortest forms
68 	 *	0xF5 - 0xFD	used to be initial for 5 and 6 byte sequences
69 	 *	0xFE - 0xFF	have never been valid in utf-8
70 	 */
71 	for (i = 0x80; i < 0xC2; i++) {
72 		value = UNCHANGED;
73 		testbuf[0] = i;
74 		ret = UTF8_getc(testbuf, 1, &value);
75 		ASSERT(ret == -2);
76 		ASSERT(value == UNCHANGED);
77 	}
78 	for (i = 0xF5; i < 0x100; i++) {
79 		value = UNCHANGED;
80 		testbuf[0] = i;
81 		ret = UTF8_getc(testbuf, 1, &value);
82 		ASSERT(ret == -2);
83 		ASSERT(value == UNCHANGED);
84 	}
85 
86 	/*
87 	 * Verify handling of all two-byte sequences
88 	 */
89 	for (i = 0xC2; i < 0xE0; i++) {
90 		testbuf[0] = i;
91 
92 		for (j = 0; j < 0x100; j++) {
93 			testbuf[1] = j;
94 
95 			value = UNCHANGED;
96 			ret = UTF8_getc(testbuf, 1, &value);
97 			ASSERT(ret == -1);
98 			ASSERT(value == UNCHANGED);
99 
100 			ret = UTF8_getc(testbuf, 2, &value);
101 
102 			/* outside range of trailing bytes */
103 			if (j < 0x80 || j > 0xBF) {
104 				ASSERT(ret == -3);
105 				ASSERT(value == UNCHANGED);
106 				continue;
107 			}
108 
109 			/* valid */
110 			ASSERT(ret == 2);
111 			ASSERT((value & 0x3F) == (j & 0x3F));
112 			ASSERT(value >> 6 == (i & 0x1F));
113 		}
114 	}
115 
116 	/*
117 	 * Verify handling of all three-byte sequences
118 	 */
119 	for (i = 0xE0; i < 0xF0; i++) {
120 		testbuf[0] = i;
121 
122 		for (j = 0; j < 0x100; j++) {
123 			testbuf[1] = j;
124 
125 			for (k = 0; k < 0x100; k++) {
126 				testbuf[2] = k;
127 
128 				value = UNCHANGED;
129 				ret = UTF8_getc(testbuf, 2, &value);
130 				ASSERT(ret == -1);
131 				ASSERT(value == UNCHANGED);
132 
133 				ret = UTF8_getc(testbuf, 3, &value);
134 
135 				/* outside range of trailing bytes */
136 				if (j < 0x80 || j > 0xBF ||
137 				    k < 0x80 || k > 0xBF) {
138 					ASSERT(ret == -3);
139 					ASSERT(value == UNCHANGED);
140 					continue;
141 				}
142 
143 				/* non-shortest form */
144 				if (i == 0xE0 && j < 0xA0) {
145 					ASSERT(ret == -4);
146 					ASSERT(value == UNCHANGED);
147 					continue;
148 				}
149 
150 				/* surrogate pair code point */
151 				if (i == 0xED && j > 0x9F) {
152 					ASSERT(ret == -2);
153 					ASSERT(value == UNCHANGED);
154 					continue;
155 				}
156 
157 				ASSERT(ret == 3);
158 				ASSERT((value & 0x3F) == (k & 0x3F));
159 				ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
160 				ASSERT(value >> 12 == (i & 0x0F));
161 			}
162 		}
163 	}
164 
165 	/*
166 	 * Verify handling of all four-byte sequences
167 	 */
168 	for (i = 0xF0; i < 0xF5; i++) {
169 		testbuf[0] = i;
170 
171 		for (j = 0; j < 0x100; j++) {
172 			testbuf[1] = j;
173 
174 			for (k = 0; k < 0x100; k++) {
175 				testbuf[2] = k;
176 
177 				for (l = 0; l < 0x100; l++) {
178 					testbuf[3] = l;
179 
180 					value = UNCHANGED;
181 					ret = UTF8_getc(testbuf, 3, &value);
182 					ASSERT(ret == -1);
183 					ASSERT(value == UNCHANGED);
184 
185 					ret = UTF8_getc(testbuf, 4, &value);
186 
187 					/* outside range of trailing bytes */
188 					if (j < 0x80 || j > 0xBF ||
189 					    k < 0x80 || k > 0xBF ||
190 					    l < 0x80 || l > 0xBF) {
191 						ASSERT(ret == -3);
192 						ASSERT(value == UNCHANGED);
193 						continue;
194 					}
195 
196 					/* non-shortest form */
197 					if (i == 0xF0 && j < 0x90) {
198 						ASSERT(ret == -4);
199 						ASSERT(value == UNCHANGED);
200 						continue;
201 					}
202 
203 					/* beyond end of UCS range */
204 					if (i == 0xF4 && j > 0x8F) {
205 						ASSERT(ret == -2);
206 						ASSERT(value == UNCHANGED);
207 						continue;
208 					}
209 
210 					ASSERT(ret == 4);
211 					ASSERT((value & 0x3F) == (l & 0x3F));
212 					ASSERT(((value >> 6) & 0x3F) ==
213 							  (k & 0x3F));
214 					ASSERT(((value >> 12) & 0x3F) ==
215 							   (j & 0x3F));
216 					ASSERT(value >> 18 == (i & 0x07));
217 				}
218 			}
219 		}
220 	}
221 
222 
223 	/*
224 	 * Next, verify UTF8_putc()
225 	 */
226 	memset(testbuf, 0, sizeof testbuf);
227 
228 	/* single-byte sequences */
229 	for (i = 0; i < 0x80; i++) {
230 		ret = UTF8_putc(NULL, 0, i);
231 		ASSERT(ret == 1);
232 
233 		testbuf[0] = 0;
234 		ret = UTF8_putc(testbuf, 0, i);
235 		ASSERT(ret == -1);
236 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
237 
238 		ret = UTF8_putc(testbuf, 1, i);
239 		ASSERT(ret == 1);
240 		ASSERT(testbuf[0] == i);
241 		ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0);
242 	}
243 
244 	/* two-byte sequences */
245 	for (i = 0x80; i < 0x800; i++) {
246 		ret = UTF8_putc(NULL, 0, i);
247 		ASSERT(ret == 2);
248 
249 		testbuf[0] = testbuf[1] = 0;
250 		ret = UTF8_putc(testbuf, 1, i);
251 		ASSERT(ret == -1);
252 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
253 
254 		ret = UTF8_putc(testbuf, 2, i);
255 		ASSERT(ret == 2);
256 		ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0);
257 		ret = UTF8_getc(testbuf, 2, &value);
258 		ASSERT(ret == 2);
259 		ASSERT(value == i);
260 	}
261 
262 	/* three-byte sequences */
263 	for (i = 0x800; i < 0x10000; i++) {
264 		if (i >= 0xD800 && i < 0xE000) {
265 			/* surrogates aren't valid */
266 			ret = UTF8_putc(NULL, 0, i);
267 			ASSERT(ret == -2);
268 			continue;
269 		}
270 
271 		ret = UTF8_putc(NULL, 0, i);
272 		ASSERT(ret == 3);
273 
274 		testbuf[0] = testbuf[1] = testbuf[2] = 0;
275 		ret = UTF8_putc(testbuf, 2, i);
276 		ASSERT(ret == -1);
277 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
278 
279 		ret = UTF8_putc(testbuf, 3, i);
280 		ASSERT(ret == 3);
281 		ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0);
282 		ret = UTF8_getc(testbuf, 3, &value);
283 		ASSERT(ret == 3);
284 		ASSERT(value == i);
285 	}
286 
287 	/* four-byte sequences */
288 	for (i = 0x10000; i < 0x110000; i++) {
289 		ret = UTF8_putc(NULL, 0, i);
290 		ASSERT(ret == 4);
291 
292 		testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
293 		ret = UTF8_putc(testbuf, 3, i);
294 		ASSERT(ret == -1);
295 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
296 
297 		ret = UTF8_putc(testbuf, 4, i);
298 		ASSERT(ret == 4);
299 		ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0);
300 		ret = UTF8_getc(testbuf, 4, &value);
301 		ASSERT(ret == 4);
302 		ASSERT(value == i);
303 	}
304 
305 	/* spot check some larger values to confirm error return */
306 	for (i = 0x110000; i < 0x110100; i++) {
307 		ret = UTF8_putc(NULL, 0, i);
308 		ASSERT(ret == -2);
309 	}
310 	for (value = (unsigned long)-1; value > (unsigned long)-256; value--) {
311 		ret = UTF8_putc(NULL, 0, value);
312 		ASSERT(ret == -2);
313 	}
314 
315 	return 0;
316 }
317