1 /* $OpenBSD: utf8test.c,v 1.5 2022/11/26 16:08:56 tb Exp $ */
2 /*
3 * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
/*
 * A mostly exhaustive test of the UTF-8 decoder and encoder,
 * UTF8_getc() and UTF8_putc().
 */
21
22 #include <stdio.h>
23 #include <string.h>
24 #include <err.h>
25
26 #include <openssl/asn1.h>
27 #include "asn1_local.h" /* peek into the internals */
28
/*
 * Sentinel stored in the output variable before a call, so that the test
 * can tell afterwards whether the callee wrote to it.
 */
#define UNCHANGED	0xfedcba98

/*
 * Check a test condition.  When x is false, report the source line and
 * the text of the failed expression through errx(3), which exits with
 * status 1; when x is true, do nothing.
 */
#define ASSERT(x)							\
	((x) ? (void)0 :						\
	    errx(1, "test failed at line %d: %s", __LINE__, #x))
37
38 int
main(void)39 main(void)
40 {
41 unsigned char testbuf[] = "012345";
42 const unsigned char zerobuf[sizeof testbuf] = { 0 };
43 unsigned long value;
44 unsigned int i, j, k, l;
45 int ret;
46
47 /*
48 * First, verify UTF8_getc()
49 */
50 value = UNCHANGED;
51 ret = UTF8_getc(testbuf, 0, &value);
52 ASSERT(ret == 0);
53 ASSERT(value == UNCHANGED);
54
55 /* check all valid single-byte chars */
56 for (i = 0; i < 0x80; i++) {
57 testbuf[0] = i;
58 ret = UTF8_getc(testbuf, 1, &value);
59 ASSERT(ret == 1);
60 ASSERT(value == i);
61
62 ret = UTF8_getc(testbuf, 2, &value);
63 ASSERT(ret == 1);
64 ASSERT(value == i);
65 }
66
67 /*
68 * Verify failure on all invalid initial bytes:
69 * 0x80 - 0xBF following bytes only
70 * 0xC0 - 0xC1 used to be in non-shortest forms
71 * 0xF5 - 0xFD used to be initial for 5 and 6 byte sequences
72 * 0xFE - 0xFF have never been valid in utf-8
73 */
74 for (i = 0x80; i < 0xC2; i++) {
75 value = UNCHANGED;
76 testbuf[0] = i;
77 ret = UTF8_getc(testbuf, 1, &value);
78 ASSERT(ret == -2);
79 ASSERT(value == UNCHANGED);
80 }
81 for (i = 0xF5; i < 0x100; i++) {
82 value = UNCHANGED;
83 testbuf[0] = i;
84 ret = UTF8_getc(testbuf, 1, &value);
85 ASSERT(ret == -2);
86 ASSERT(value == UNCHANGED);
87 }
88
89 /*
90 * Verify handling of all two-byte sequences
91 */
92 for (i = 0xC2; i < 0xE0; i++) {
93 testbuf[0] = i;
94
95 for (j = 0; j < 0x100; j++) {
96 testbuf[1] = j;
97
98 value = UNCHANGED;
99 ret = UTF8_getc(testbuf, 1, &value);
100 ASSERT(ret == -1);
101 ASSERT(value == UNCHANGED);
102
103 ret = UTF8_getc(testbuf, 2, &value);
104
105 /* outside range of trailing bytes */
106 if (j < 0x80 || j > 0xBF) {
107 ASSERT(ret == -3);
108 ASSERT(value == UNCHANGED);
109 continue;
110 }
111
112 /* valid */
113 ASSERT(ret == 2);
114 ASSERT((value & 0x3F) == (j & 0x3F));
115 ASSERT(value >> 6 == (i & 0x1F));
116 }
117 }
118
119 /*
120 * Verify handling of all three-byte sequences
121 */
122 for (i = 0xE0; i < 0xF0; i++) {
123 testbuf[0] = i;
124
125 for (j = 0; j < 0x100; j++) {
126 testbuf[1] = j;
127
128 for (k = 0; k < 0x100; k++) {
129 testbuf[2] = k;
130
131 value = UNCHANGED;
132 ret = UTF8_getc(testbuf, 2, &value);
133 ASSERT(ret == -1);
134 ASSERT(value == UNCHANGED);
135
136 ret = UTF8_getc(testbuf, 3, &value);
137
138 /* outside range of trailing bytes */
139 if (j < 0x80 || j > 0xBF ||
140 k < 0x80 || k > 0xBF) {
141 ASSERT(ret == -3);
142 ASSERT(value == UNCHANGED);
143 continue;
144 }
145
146 /* non-shortest form */
147 if (i == 0xE0 && j < 0xA0) {
148 ASSERT(ret == -4);
149 ASSERT(value == UNCHANGED);
150 continue;
151 }
152
153 /* surrogate pair code point */
154 if (i == 0xED && j > 0x9F) {
155 ASSERT(ret == -2);
156 ASSERT(value == UNCHANGED);
157 continue;
158 }
159
160 ASSERT(ret == 3);
161 ASSERT((value & 0x3F) == (k & 0x3F));
162 ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
163 ASSERT(value >> 12 == (i & 0x0F));
164 }
165 }
166 }
167
168 /*
169 * Verify handling of all four-byte sequences
170 */
171 for (i = 0xF0; i < 0xF5; i++) {
172 testbuf[0] = i;
173
174 for (j = 0; j < 0x100; j++) {
175 testbuf[1] = j;
176
177 for (k = 0; k < 0x100; k++) {
178 testbuf[2] = k;
179
180 for (l = 0; l < 0x100; l++) {
181 testbuf[3] = l;
182
183 value = UNCHANGED;
184 ret = UTF8_getc(testbuf, 3, &value);
185 ASSERT(ret == -1);
186 ASSERT(value == UNCHANGED);
187
188 ret = UTF8_getc(testbuf, 4, &value);
189
190 /* outside range of trailing bytes */
191 if (j < 0x80 || j > 0xBF ||
192 k < 0x80 || k > 0xBF ||
193 l < 0x80 || l > 0xBF) {
194 ASSERT(ret == -3);
195 ASSERT(value == UNCHANGED);
196 continue;
197 }
198
199 /* non-shortest form */
200 if (i == 0xF0 && j < 0x90) {
201 ASSERT(ret == -4);
202 ASSERT(value == UNCHANGED);
203 continue;
204 }
205
206 /* beyond end of UCS range */
207 if (i == 0xF4 && j > 0x8F) {
208 ASSERT(ret == -2);
209 ASSERT(value == UNCHANGED);
210 continue;
211 }
212
213 ASSERT(ret == 4);
214 ASSERT((value & 0x3F) == (l & 0x3F));
215 ASSERT(((value >> 6) & 0x3F) ==
216 (k & 0x3F));
217 ASSERT(((value >> 12) & 0x3F) ==
218 (j & 0x3F));
219 ASSERT(value >> 18 == (i & 0x07));
220 }
221 }
222 }
223 }
224
225
226 /*
227 * Next, verify UTF8_putc()
228 */
229 memset(testbuf, 0, sizeof testbuf);
230
231 /* single-byte sequences */
232 for (i = 0; i < 0x80; i++) {
233 ret = UTF8_putc(NULL, 0, i);
234 ASSERT(ret == 1);
235
236 testbuf[0] = 0;
237 ret = UTF8_putc(testbuf, 0, i);
238 ASSERT(ret == -1);
239 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
240
241 ret = UTF8_putc(testbuf, 1, i);
242 ASSERT(ret == 1);
243 ASSERT(testbuf[0] == i);
244 ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0);
245 }
246
247 /* two-byte sequences */
248 for (i = 0x80; i < 0x800; i++) {
249 ret = UTF8_putc(NULL, 0, i);
250 ASSERT(ret == 2);
251
252 testbuf[0] = testbuf[1] = 0;
253 ret = UTF8_putc(testbuf, 1, i);
254 ASSERT(ret == -1);
255 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
256
257 ret = UTF8_putc(testbuf, 2, i);
258 ASSERT(ret == 2);
259 ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0);
260 ret = UTF8_getc(testbuf, 2, &value);
261 ASSERT(ret == 2);
262 ASSERT(value == i);
263 }
264
265 /* three-byte sequences */
266 for (i = 0x800; i < 0x10000; i++) {
267 if (i >= 0xD800 && i < 0xE000) {
268 /* surrogates aren't valid */
269 ret = UTF8_putc(NULL, 0, i);
270 ASSERT(ret == -2);
271 continue;
272 }
273
274 ret = UTF8_putc(NULL, 0, i);
275 ASSERT(ret == 3);
276
277 testbuf[0] = testbuf[1] = testbuf[2] = 0;
278 ret = UTF8_putc(testbuf, 2, i);
279 ASSERT(ret == -1);
280 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
281
282 ret = UTF8_putc(testbuf, 3, i);
283 ASSERT(ret == 3);
284 ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0);
285 ret = UTF8_getc(testbuf, 3, &value);
286 ASSERT(ret == 3);
287 ASSERT(value == i);
288 }
289
290 /* four-byte sequences */
291 for (i = 0x10000; i < 0x110000; i++) {
292 ret = UTF8_putc(NULL, 0, i);
293 ASSERT(ret == 4);
294
295 testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
296 ret = UTF8_putc(testbuf, 3, i);
297 ASSERT(ret == -1);
298 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
299
300 ret = UTF8_putc(testbuf, 4, i);
301 ASSERT(ret == 4);
302 ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0);
303 ret = UTF8_getc(testbuf, 4, &value);
304 ASSERT(ret == 4);
305 ASSERT(value == i);
306 }
307
308 /* spot check some larger values to confirm error return */
309 for (i = 0x110000; i < 0x110100; i++) {
310 ret = UTF8_putc(NULL, 0, i);
311 ASSERT(ret == -2);
312 }
313 for (value = (unsigned long)-1; value > (unsigned long)-256; value--) {
314 ret = UTF8_putc(NULL, 0, value);
315 ASSERT(ret == -2);
316 }
317
318 return 0;
319 }
320