/*	$OpenBSD: utf8test.c,v 1.5 2022/11/26 16:08:56 tb Exp $	*/
/*
 * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * A mostly exhaustive test of the UTF-8 decoder and encoder,
 * UTF8_getc() and UTF8_putc().
 */

#include <err.h>
#include <stdio.h>
#include <string.h>

#include <openssl/asn1.h>
#include "asn1_local.h"		/* peek into the internals */

/*
 * Sentinel stored in the output variable before a decoder call that is
 * expected to fail; finding it still there afterwards shows that the call
 * left the variable untouched.  It is larger than any value the decoding
 * checks in this file expect to get back.
 */
#define	UNCHANGED	0xfedcba98

/*
 * Check a test expectation.  If x evaluates to false, terminate the
 * program via errx() with exit status 1 and a message giving the source
 * line and the text of the failed expression.  x is evaluated exactly once.
 */
#define ASSERT(x)						\
	do {							\
		if (!(x))					\
			errx(1, "test failed at line %d: %s",	\
			    __LINE__, #x);			\
	} while (0)

3839187941Sguenther int
main(void)3939187941Sguenther main(void)
4039187941Sguenther {
4139187941Sguenther 	unsigned char testbuf[] = "012345";
4239187941Sguenther 	const unsigned char zerobuf[sizeof testbuf] = { 0 };
4339187941Sguenther 	unsigned long value;
44ecd8c27dSguenther 	unsigned int i, j, k, l;
45ecd8c27dSguenther 	int ret;
4639187941Sguenther 
4739187941Sguenther 	/*
4839187941Sguenther 	 * First, verify UTF8_getc()
4939187941Sguenther 	 */
5039187941Sguenther 	value = UNCHANGED;
5139187941Sguenther 	ret = UTF8_getc(testbuf, 0, &value);
5239187941Sguenther 	ASSERT(ret == 0);
5339187941Sguenther 	ASSERT(value == UNCHANGED);
5439187941Sguenther 
5539187941Sguenther 	/* check all valid single-byte chars */
5639187941Sguenther 	for (i = 0; i < 0x80; i++) {
5739187941Sguenther 		testbuf[0] = i;
5839187941Sguenther 		ret = UTF8_getc(testbuf, 1, &value);
5939187941Sguenther 		ASSERT(ret == 1);
6039187941Sguenther 		ASSERT(value == i);
6139187941Sguenther 
6239187941Sguenther 		ret = UTF8_getc(testbuf, 2, &value);
6339187941Sguenther 		ASSERT(ret == 1);
6439187941Sguenther 		ASSERT(value == i);
6539187941Sguenther 	}
6639187941Sguenther 
6739187941Sguenther 	/*
6839187941Sguenther 	 * Verify failure on all invalid initial bytes:
6939187941Sguenther 	 *	0x80 - 0xBF	following bytes only
7039187941Sguenther 	 *	0xC0 - 0xC1	used to be in non-shortest forms
7139187941Sguenther 	 *	0xF5 - 0xFD	used to be initial for 5 and 6 byte sequences
7239187941Sguenther 	 *	0xFE - 0xFF	have never been valid in utf-8
7339187941Sguenther 	 */
7439187941Sguenther 	for (i = 0x80; i < 0xC2; i++) {
7539187941Sguenther 		value = UNCHANGED;
7639187941Sguenther 		testbuf[0] = i;
7739187941Sguenther 		ret = UTF8_getc(testbuf, 1, &value);
7839187941Sguenther 		ASSERT(ret == -2);
7939187941Sguenther 		ASSERT(value == UNCHANGED);
8039187941Sguenther 	}
8139187941Sguenther 	for (i = 0xF5; i < 0x100; i++) {
8239187941Sguenther 		value = UNCHANGED;
8339187941Sguenther 		testbuf[0] = i;
8439187941Sguenther 		ret = UTF8_getc(testbuf, 1, &value);
8539187941Sguenther 		ASSERT(ret == -2);
8639187941Sguenther 		ASSERT(value == UNCHANGED);
8739187941Sguenther 	}
8839187941Sguenther 
8939187941Sguenther 	/*
9039187941Sguenther 	 * Verify handling of all two-byte sequences
9139187941Sguenther 	 */
9239187941Sguenther 	for (i = 0xC2; i < 0xE0; i++) {
9339187941Sguenther 		testbuf[0] = i;
9439187941Sguenther 
9539187941Sguenther 		for (j = 0; j < 0x100; j++) {
9639187941Sguenther 			testbuf[1] = j;
9739187941Sguenther 
9839187941Sguenther 			value = UNCHANGED;
9939187941Sguenther 			ret = UTF8_getc(testbuf, 1, &value);
10039187941Sguenther 			ASSERT(ret == -1);
10139187941Sguenther 			ASSERT(value == UNCHANGED);
10239187941Sguenther 
10339187941Sguenther 			ret = UTF8_getc(testbuf, 2, &value);
10439187941Sguenther 
10539187941Sguenther 			/* outside range of trailing bytes */
10639187941Sguenther 			if (j < 0x80 || j > 0xBF) {
10739187941Sguenther 				ASSERT(ret == -3);
10839187941Sguenther 				ASSERT(value == UNCHANGED);
10939187941Sguenther 				continue;
11039187941Sguenther 			}
11139187941Sguenther 
11239187941Sguenther 			/* valid */
11339187941Sguenther 			ASSERT(ret == 2);
11439187941Sguenther 			ASSERT((value & 0x3F) == (j & 0x3F));
11539187941Sguenther 			ASSERT(value >> 6 == (i & 0x1F));
11639187941Sguenther 		}
11739187941Sguenther 	}
11839187941Sguenther 
11939187941Sguenther 	/*
12039187941Sguenther 	 * Verify handling of all three-byte sequences
12139187941Sguenther 	 */
12239187941Sguenther 	for (i = 0xE0; i < 0xF0; i++) {
12339187941Sguenther 		testbuf[0] = i;
12439187941Sguenther 
12539187941Sguenther 		for (j = 0; j < 0x100; j++) {
12639187941Sguenther 			testbuf[1] = j;
12739187941Sguenther 
12839187941Sguenther 			for (k = 0; k < 0x100; k++) {
12939187941Sguenther 				testbuf[2] = k;
13039187941Sguenther 
13139187941Sguenther 				value = UNCHANGED;
13239187941Sguenther 				ret = UTF8_getc(testbuf, 2, &value);
13339187941Sguenther 				ASSERT(ret == -1);
13439187941Sguenther 				ASSERT(value == UNCHANGED);
13539187941Sguenther 
13639187941Sguenther 				ret = UTF8_getc(testbuf, 3, &value);
13739187941Sguenther 
13839187941Sguenther 				/* outside range of trailing bytes */
13939187941Sguenther 				if (j < 0x80 || j > 0xBF ||
14039187941Sguenther 				    k < 0x80 || k > 0xBF) {
14139187941Sguenther 					ASSERT(ret == -3);
14239187941Sguenther 					ASSERT(value == UNCHANGED);
14339187941Sguenther 					continue;
14439187941Sguenther 				}
14539187941Sguenther 
14639187941Sguenther 				/* non-shortest form */
14739187941Sguenther 				if (i == 0xE0 && j < 0xA0) {
14839187941Sguenther 					ASSERT(ret == -4);
14939187941Sguenther 					ASSERT(value == UNCHANGED);
15039187941Sguenther 					continue;
15139187941Sguenther 				}
15239187941Sguenther 
15339187941Sguenther 				/* surrogate pair code point */
15439187941Sguenther 				if (i == 0xED && j > 0x9F) {
15539187941Sguenther 					ASSERT(ret == -2);
15639187941Sguenther 					ASSERT(value == UNCHANGED);
15739187941Sguenther 					continue;
15839187941Sguenther 				}
15939187941Sguenther 
16039187941Sguenther 				ASSERT(ret == 3);
16139187941Sguenther 				ASSERT((value & 0x3F) == (k & 0x3F));
16239187941Sguenther 				ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
16339187941Sguenther 				ASSERT(value >> 12 == (i & 0x0F));
16439187941Sguenther 			}
16539187941Sguenther 		}
16639187941Sguenther 	}
16739187941Sguenther 
16839187941Sguenther 	/*
16939187941Sguenther 	 * Verify handling of all four-byte sequences
17039187941Sguenther 	 */
17139187941Sguenther 	for (i = 0xF0; i < 0xF5; i++) {
17239187941Sguenther 		testbuf[0] = i;
17339187941Sguenther 
17439187941Sguenther 		for (j = 0; j < 0x100; j++) {
17539187941Sguenther 			testbuf[1] = j;
17639187941Sguenther 
17739187941Sguenther 			for (k = 0; k < 0x100; k++) {
17839187941Sguenther 				testbuf[2] = k;
17939187941Sguenther 
18039187941Sguenther 				for (l = 0; l < 0x100; l++) {
18139187941Sguenther 					testbuf[3] = l;
18239187941Sguenther 
18339187941Sguenther 					value = UNCHANGED;
18439187941Sguenther 					ret = UTF8_getc(testbuf, 3, &value);
18539187941Sguenther 					ASSERT(ret == -1);
18639187941Sguenther 					ASSERT(value == UNCHANGED);
18739187941Sguenther 
18839187941Sguenther 					ret = UTF8_getc(testbuf, 4, &value);
18939187941Sguenther 
19039187941Sguenther 					/* outside range of trailing bytes */
19139187941Sguenther 					if (j < 0x80 || j > 0xBF ||
19239187941Sguenther 					    k < 0x80 || k > 0xBF ||
19339187941Sguenther 					    l < 0x80 || l > 0xBF) {
19439187941Sguenther 						ASSERT(ret == -3);
19539187941Sguenther 						ASSERT(value == UNCHANGED);
19639187941Sguenther 						continue;
19739187941Sguenther 					}
19839187941Sguenther 
19939187941Sguenther 					/* non-shortest form */
20039187941Sguenther 					if (i == 0xF0 && j < 0x90) {
20139187941Sguenther 						ASSERT(ret == -4);
20239187941Sguenther 						ASSERT(value == UNCHANGED);
20339187941Sguenther 						continue;
20439187941Sguenther 					}
20539187941Sguenther 
20639187941Sguenther 					/* beyond end of UCS range */
20739187941Sguenther 					if (i == 0xF4 && j > 0x8F) {
20839187941Sguenther 						ASSERT(ret == -2);
20939187941Sguenther 						ASSERT(value == UNCHANGED);
21039187941Sguenther 						continue;
21139187941Sguenther 					}
21239187941Sguenther 
21339187941Sguenther 					ASSERT(ret == 4);
21439187941Sguenther 					ASSERT((value & 0x3F) == (l & 0x3F));
21539187941Sguenther 					ASSERT(((value >> 6) & 0x3F) ==
21639187941Sguenther 							  (k & 0x3F));
21739187941Sguenther 					ASSERT(((value >> 12) & 0x3F) ==
21839187941Sguenther 							   (j & 0x3F));
21939187941Sguenther 					ASSERT(value >> 18 == (i & 0x07));
22039187941Sguenther 				}
22139187941Sguenther 			}
22239187941Sguenther 		}
22339187941Sguenther 	}
22439187941Sguenther 
22539187941Sguenther 
22639187941Sguenther 	/*
22739187941Sguenther 	 * Next, verify UTF8_putc()
22839187941Sguenther 	 */
22939187941Sguenther 	memset(testbuf, 0, sizeof testbuf);
23039187941Sguenther 
23139187941Sguenther 	/* single-byte sequences */
23239187941Sguenther 	for (i = 0; i < 0x80; i++) {
23339187941Sguenther 		ret = UTF8_putc(NULL, 0, i);
23439187941Sguenther 		ASSERT(ret == 1);
23539187941Sguenther 
23639187941Sguenther 		testbuf[0] = 0;
23739187941Sguenther 		ret = UTF8_putc(testbuf, 0, i);
23839187941Sguenther 		ASSERT(ret == -1);
23939187941Sguenther 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
24039187941Sguenther 
24139187941Sguenther 		ret = UTF8_putc(testbuf, 1, i);
24239187941Sguenther 		ASSERT(ret == 1);
24339187941Sguenther 		ASSERT(testbuf[0] == i);
24439187941Sguenther 		ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0);
24539187941Sguenther 	}
24639187941Sguenther 
24739187941Sguenther 	/* two-byte sequences */
24839187941Sguenther 	for (i = 0x80; i < 0x800; i++) {
24939187941Sguenther 		ret = UTF8_putc(NULL, 0, i);
25039187941Sguenther 		ASSERT(ret == 2);
25139187941Sguenther 
25239187941Sguenther 		testbuf[0] = testbuf[1] = 0;
25339187941Sguenther 		ret = UTF8_putc(testbuf, 1, i);
25439187941Sguenther 		ASSERT(ret == -1);
25539187941Sguenther 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
25639187941Sguenther 
25739187941Sguenther 		ret = UTF8_putc(testbuf, 2, i);
25839187941Sguenther 		ASSERT(ret == 2);
25939187941Sguenther 		ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0);
26039187941Sguenther 		ret = UTF8_getc(testbuf, 2, &value);
26139187941Sguenther 		ASSERT(ret == 2);
26239187941Sguenther 		ASSERT(value == i);
26339187941Sguenther 	}
26439187941Sguenther 
26539187941Sguenther 	/* three-byte sequences */
26639187941Sguenther 	for (i = 0x800; i < 0x10000; i++) {
267388ed389Sguenther 		if (i >= 0xD800 && i < 0xE000) {
268388ed389Sguenther 			/* surrogates aren't valid */
269388ed389Sguenther 			ret = UTF8_putc(NULL, 0, i);
270388ed389Sguenther 			ASSERT(ret == -2);
27139187941Sguenther 			continue;
272388ed389Sguenther 		}
27339187941Sguenther 
27439187941Sguenther 		ret = UTF8_putc(NULL, 0, i);
27539187941Sguenther 		ASSERT(ret == 3);
27639187941Sguenther 
27739187941Sguenther 		testbuf[0] = testbuf[1] = testbuf[2] = 0;
27839187941Sguenther 		ret = UTF8_putc(testbuf, 2, i);
27939187941Sguenther 		ASSERT(ret == -1);
28039187941Sguenther 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
28139187941Sguenther 
28239187941Sguenther 		ret = UTF8_putc(testbuf, 3, i);
28339187941Sguenther 		ASSERT(ret == 3);
28439187941Sguenther 		ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0);
28539187941Sguenther 		ret = UTF8_getc(testbuf, 3, &value);
28639187941Sguenther 		ASSERT(ret == 3);
28739187941Sguenther 		ASSERT(value == i);
28839187941Sguenther 	}
28939187941Sguenther 
29039187941Sguenther 	/* four-byte sequences */
29139187941Sguenther 	for (i = 0x10000; i < 0x110000; i++) {
29239187941Sguenther 		ret = UTF8_putc(NULL, 0, i);
29339187941Sguenther 		ASSERT(ret == 4);
29439187941Sguenther 
29539187941Sguenther 		testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
29639187941Sguenther 		ret = UTF8_putc(testbuf, 3, i);
29739187941Sguenther 		ASSERT(ret == -1);
29839187941Sguenther 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
29939187941Sguenther 
30039187941Sguenther 		ret = UTF8_putc(testbuf, 4, i);
30139187941Sguenther 		ASSERT(ret == 4);
30239187941Sguenther 		ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0);
30339187941Sguenther 		ret = UTF8_getc(testbuf, 4, &value);
30439187941Sguenther 		ASSERT(ret == 4);
30539187941Sguenther 		ASSERT(value == i);
30639187941Sguenther 	}
30739187941Sguenther 
308388ed389Sguenther 	/* spot check some larger values to confirm error return */
309388ed389Sguenther 	for (i = 0x110000; i < 0x110100; i++) {
310388ed389Sguenther 		ret = UTF8_putc(NULL, 0, i);
311388ed389Sguenther 		ASSERT(ret == -2);
312388ed389Sguenther 	}
313388ed389Sguenther 	for (value = (unsigned long)-1; value > (unsigned long)-256; value--) {
314388ed389Sguenther 		ret = UTF8_putc(NULL, 0, value);
315388ed389Sguenther 		ASSERT(ret == -2);
316388ed389Sguenther 	}
31739187941Sguenther 
31839187941Sguenther 	return 0;
31939187941Sguenther }
320