xref: /openbsd-src/regress/lib/libcrypto/utf8/utf8test.c (revision c9675a23de50ec5aa20be3956f170f2eccffb293)
1 /*	$OpenBSD: utf8test.c,v 1.5 2022/11/26 16:08:56 tb Exp $	*/
2 /*
3  * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 /*
19  * A mostly exhaustive test of UTF-8 decoder and encoder
20  */
21 
22 #include <stdio.h>
23 #include <string.h>
24 #include <err.h>
25 
26 #include <openssl/asn1.h>
27 #include "asn1_local.h"		/* peek into the internals */
28 
/*
 * Sentinel loaded into the output variable ahead of a call, so that the
 * test can afterwards tell whether the callee stored anything into it.
 */
#define	UNCHANGED	0xfedcba98

/*
 * Check that condition x holds.  When it does not, terminate the program
 * with exit status 1 via errx(3), reporting the source line of the check
 * and the literal text of the condition.
 */
#define ASSERT(x)							\
	do {								\
		if (x)							\
			break;						\
		errx(1, "test failed at line %d: %s", __LINE__, #x);	\
	} while (0)
37 
38 int
main(void)39 main(void)
40 {
41 	unsigned char testbuf[] = "012345";
42 	const unsigned char zerobuf[sizeof testbuf] = { 0 };
43 	unsigned long value;
44 	unsigned int i, j, k, l;
45 	int ret;
46 
47 	/*
48 	 * First, verify UTF8_getc()
49 	 */
50 	value = UNCHANGED;
51 	ret = UTF8_getc(testbuf, 0, &value);
52 	ASSERT(ret == 0);
53 	ASSERT(value == UNCHANGED);
54 
55 	/* check all valid single-byte chars */
56 	for (i = 0; i < 0x80; i++) {
57 		testbuf[0] = i;
58 		ret = UTF8_getc(testbuf, 1, &value);
59 		ASSERT(ret == 1);
60 		ASSERT(value == i);
61 
62 		ret = UTF8_getc(testbuf, 2, &value);
63 		ASSERT(ret == 1);
64 		ASSERT(value == i);
65 	}
66 
67 	/*
68 	 * Verify failure on all invalid initial bytes:
69 	 *	0x80 - 0xBF	following bytes only
70 	 *	0xC0 - 0xC1	used to be in non-shortest forms
71 	 *	0xF5 - 0xFD	used to be initial for 5 and 6 byte sequences
72 	 *	0xFE - 0xFF	have never been valid in utf-8
73 	 */
74 	for (i = 0x80; i < 0xC2; i++) {
75 		value = UNCHANGED;
76 		testbuf[0] = i;
77 		ret = UTF8_getc(testbuf, 1, &value);
78 		ASSERT(ret == -2);
79 		ASSERT(value == UNCHANGED);
80 	}
81 	for (i = 0xF5; i < 0x100; i++) {
82 		value = UNCHANGED;
83 		testbuf[0] = i;
84 		ret = UTF8_getc(testbuf, 1, &value);
85 		ASSERT(ret == -2);
86 		ASSERT(value == UNCHANGED);
87 	}
88 
89 	/*
90 	 * Verify handling of all two-byte sequences
91 	 */
92 	for (i = 0xC2; i < 0xE0; i++) {
93 		testbuf[0] = i;
94 
95 		for (j = 0; j < 0x100; j++) {
96 			testbuf[1] = j;
97 
98 			value = UNCHANGED;
99 			ret = UTF8_getc(testbuf, 1, &value);
100 			ASSERT(ret == -1);
101 			ASSERT(value == UNCHANGED);
102 
103 			ret = UTF8_getc(testbuf, 2, &value);
104 
105 			/* outside range of trailing bytes */
106 			if (j < 0x80 || j > 0xBF) {
107 				ASSERT(ret == -3);
108 				ASSERT(value == UNCHANGED);
109 				continue;
110 			}
111 
112 			/* valid */
113 			ASSERT(ret == 2);
114 			ASSERT((value & 0x3F) == (j & 0x3F));
115 			ASSERT(value >> 6 == (i & 0x1F));
116 		}
117 	}
118 
119 	/*
120 	 * Verify handling of all three-byte sequences
121 	 */
122 	for (i = 0xE0; i < 0xF0; i++) {
123 		testbuf[0] = i;
124 
125 		for (j = 0; j < 0x100; j++) {
126 			testbuf[1] = j;
127 
128 			for (k = 0; k < 0x100; k++) {
129 				testbuf[2] = k;
130 
131 				value = UNCHANGED;
132 				ret = UTF8_getc(testbuf, 2, &value);
133 				ASSERT(ret == -1);
134 				ASSERT(value == UNCHANGED);
135 
136 				ret = UTF8_getc(testbuf, 3, &value);
137 
138 				/* outside range of trailing bytes */
139 				if (j < 0x80 || j > 0xBF ||
140 				    k < 0x80 || k > 0xBF) {
141 					ASSERT(ret == -3);
142 					ASSERT(value == UNCHANGED);
143 					continue;
144 				}
145 
146 				/* non-shortest form */
147 				if (i == 0xE0 && j < 0xA0) {
148 					ASSERT(ret == -4);
149 					ASSERT(value == UNCHANGED);
150 					continue;
151 				}
152 
153 				/* surrogate pair code point */
154 				if (i == 0xED && j > 0x9F) {
155 					ASSERT(ret == -2);
156 					ASSERT(value == UNCHANGED);
157 					continue;
158 				}
159 
160 				ASSERT(ret == 3);
161 				ASSERT((value & 0x3F) == (k & 0x3F));
162 				ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
163 				ASSERT(value >> 12 == (i & 0x0F));
164 			}
165 		}
166 	}
167 
168 	/*
169 	 * Verify handling of all four-byte sequences
170 	 */
171 	for (i = 0xF0; i < 0xF5; i++) {
172 		testbuf[0] = i;
173 
174 		for (j = 0; j < 0x100; j++) {
175 			testbuf[1] = j;
176 
177 			for (k = 0; k < 0x100; k++) {
178 				testbuf[2] = k;
179 
180 				for (l = 0; l < 0x100; l++) {
181 					testbuf[3] = l;
182 
183 					value = UNCHANGED;
184 					ret = UTF8_getc(testbuf, 3, &value);
185 					ASSERT(ret == -1);
186 					ASSERT(value == UNCHANGED);
187 
188 					ret = UTF8_getc(testbuf, 4, &value);
189 
190 					/* outside range of trailing bytes */
191 					if (j < 0x80 || j > 0xBF ||
192 					    k < 0x80 || k > 0xBF ||
193 					    l < 0x80 || l > 0xBF) {
194 						ASSERT(ret == -3);
195 						ASSERT(value == UNCHANGED);
196 						continue;
197 					}
198 
199 					/* non-shortest form */
200 					if (i == 0xF0 && j < 0x90) {
201 						ASSERT(ret == -4);
202 						ASSERT(value == UNCHANGED);
203 						continue;
204 					}
205 
206 					/* beyond end of UCS range */
207 					if (i == 0xF4 && j > 0x8F) {
208 						ASSERT(ret == -2);
209 						ASSERT(value == UNCHANGED);
210 						continue;
211 					}
212 
213 					ASSERT(ret == 4);
214 					ASSERT((value & 0x3F) == (l & 0x3F));
215 					ASSERT(((value >> 6) & 0x3F) ==
216 							  (k & 0x3F));
217 					ASSERT(((value >> 12) & 0x3F) ==
218 							   (j & 0x3F));
219 					ASSERT(value >> 18 == (i & 0x07));
220 				}
221 			}
222 		}
223 	}
224 
225 
226 	/*
227 	 * Next, verify UTF8_putc()
228 	 */
229 	memset(testbuf, 0, sizeof testbuf);
230 
231 	/* single-byte sequences */
232 	for (i = 0; i < 0x80; i++) {
233 		ret = UTF8_putc(NULL, 0, i);
234 		ASSERT(ret == 1);
235 
236 		testbuf[0] = 0;
237 		ret = UTF8_putc(testbuf, 0, i);
238 		ASSERT(ret == -1);
239 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
240 
241 		ret = UTF8_putc(testbuf, 1, i);
242 		ASSERT(ret == 1);
243 		ASSERT(testbuf[0] == i);
244 		ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0);
245 	}
246 
247 	/* two-byte sequences */
248 	for (i = 0x80; i < 0x800; i++) {
249 		ret = UTF8_putc(NULL, 0, i);
250 		ASSERT(ret == 2);
251 
252 		testbuf[0] = testbuf[1] = 0;
253 		ret = UTF8_putc(testbuf, 1, i);
254 		ASSERT(ret == -1);
255 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
256 
257 		ret = UTF8_putc(testbuf, 2, i);
258 		ASSERT(ret == 2);
259 		ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0);
260 		ret = UTF8_getc(testbuf, 2, &value);
261 		ASSERT(ret == 2);
262 		ASSERT(value == i);
263 	}
264 
265 	/* three-byte sequences */
266 	for (i = 0x800; i < 0x10000; i++) {
267 		if (i >= 0xD800 && i < 0xE000) {
268 			/* surrogates aren't valid */
269 			ret = UTF8_putc(NULL, 0, i);
270 			ASSERT(ret == -2);
271 			continue;
272 		}
273 
274 		ret = UTF8_putc(NULL, 0, i);
275 		ASSERT(ret == 3);
276 
277 		testbuf[0] = testbuf[1] = testbuf[2] = 0;
278 		ret = UTF8_putc(testbuf, 2, i);
279 		ASSERT(ret == -1);
280 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
281 
282 		ret = UTF8_putc(testbuf, 3, i);
283 		ASSERT(ret == 3);
284 		ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0);
285 		ret = UTF8_getc(testbuf, 3, &value);
286 		ASSERT(ret == 3);
287 		ASSERT(value == i);
288 	}
289 
290 	/* four-byte sequences */
291 	for (i = 0x10000; i < 0x110000; i++) {
292 		ret = UTF8_putc(NULL, 0, i);
293 		ASSERT(ret == 4);
294 
295 		testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
296 		ret = UTF8_putc(testbuf, 3, i);
297 		ASSERT(ret == -1);
298 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
299 
300 		ret = UTF8_putc(testbuf, 4, i);
301 		ASSERT(ret == 4);
302 		ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0);
303 		ret = UTF8_getc(testbuf, 4, &value);
304 		ASSERT(ret == 4);
305 		ASSERT(value == i);
306 	}
307 
308 	/* spot check some larger values to confirm error return */
309 	for (i = 0x110000; i < 0x110100; i++) {
310 		ret = UTF8_putc(NULL, 0, i);
311 		ASSERT(ret == -2);
312 	}
313 	for (value = (unsigned long)-1; value > (unsigned long)-256; value--) {
314 		ret = UTF8_putc(NULL, 0, value);
315 		ASSERT(ret == -2);
316 	}
317 
318 	return 0;
319 }
320