/* $OpenBSD: utf8test.c,v 1.5 2022/11/26 16:08:56 tb Exp $ */
/*
 * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
1739187941Sguenther
/*
 * A mostly exhaustive test of the UTF-8 decoder and encoder
 */
2139187941Sguenther
2239187941Sguenther #include <stdio.h>
2339187941Sguenther #include <string.h>
2439187941Sguenther #include <err.h>
2539187941Sguenther
2639187941Sguenther #include <openssl/asn1.h>
27*c9675a23Stb #include "asn1_local.h" /* peek into the internals */
2839187941Sguenther
/*
 * Sentinel stored in the output variable before a decode call that is
 * expected to fail; finding it still there afterwards shows that the
 * decoder did not write to its result on error.
 */
#define UNCHANGED	0xfedcba98
3039187941Sguenther
/*
 * Terminate the test program with exit status 1 if condition x is false,
 * printing the source line and the text of the failed expression via
 * errx(3).  The do/while wrapper makes the macro usable as a single
 * statement, e.g. as the body of an unbraced if/else.
 */
#define ASSERT(x)						\
	do {							\
		if (!(x))					\
			errx(1, "test failed at line %d: %s",	\
			    __LINE__, #x);			\
	} while (0)
3739187941Sguenther
3839187941Sguenther int
main(void)3939187941Sguenther main(void)
4039187941Sguenther {
4139187941Sguenther unsigned char testbuf[] = "012345";
4239187941Sguenther const unsigned char zerobuf[sizeof testbuf] = { 0 };
4339187941Sguenther unsigned long value;
44ecd8c27dSguenther unsigned int i, j, k, l;
45ecd8c27dSguenther int ret;
4639187941Sguenther
4739187941Sguenther /*
4839187941Sguenther * First, verify UTF8_getc()
4939187941Sguenther */
5039187941Sguenther value = UNCHANGED;
5139187941Sguenther ret = UTF8_getc(testbuf, 0, &value);
5239187941Sguenther ASSERT(ret == 0);
5339187941Sguenther ASSERT(value == UNCHANGED);
5439187941Sguenther
5539187941Sguenther /* check all valid single-byte chars */
5639187941Sguenther for (i = 0; i < 0x80; i++) {
5739187941Sguenther testbuf[0] = i;
5839187941Sguenther ret = UTF8_getc(testbuf, 1, &value);
5939187941Sguenther ASSERT(ret == 1);
6039187941Sguenther ASSERT(value == i);
6139187941Sguenther
6239187941Sguenther ret = UTF8_getc(testbuf, 2, &value);
6339187941Sguenther ASSERT(ret == 1);
6439187941Sguenther ASSERT(value == i);
6539187941Sguenther }
6639187941Sguenther
6739187941Sguenther /*
6839187941Sguenther * Verify failure on all invalid initial bytes:
6939187941Sguenther * 0x80 - 0xBF following bytes only
7039187941Sguenther * 0xC0 - 0xC1 used to be in non-shortest forms
7139187941Sguenther * 0xF5 - 0xFD used to be initial for 5 and 6 byte sequences
7239187941Sguenther * 0xFE - 0xFF have never been valid in utf-8
7339187941Sguenther */
7439187941Sguenther for (i = 0x80; i < 0xC2; i++) {
7539187941Sguenther value = UNCHANGED;
7639187941Sguenther testbuf[0] = i;
7739187941Sguenther ret = UTF8_getc(testbuf, 1, &value);
7839187941Sguenther ASSERT(ret == -2);
7939187941Sguenther ASSERT(value == UNCHANGED);
8039187941Sguenther }
8139187941Sguenther for (i = 0xF5; i < 0x100; i++) {
8239187941Sguenther value = UNCHANGED;
8339187941Sguenther testbuf[0] = i;
8439187941Sguenther ret = UTF8_getc(testbuf, 1, &value);
8539187941Sguenther ASSERT(ret == -2);
8639187941Sguenther ASSERT(value == UNCHANGED);
8739187941Sguenther }
8839187941Sguenther
8939187941Sguenther /*
9039187941Sguenther * Verify handling of all two-byte sequences
9139187941Sguenther */
9239187941Sguenther for (i = 0xC2; i < 0xE0; i++) {
9339187941Sguenther testbuf[0] = i;
9439187941Sguenther
9539187941Sguenther for (j = 0; j < 0x100; j++) {
9639187941Sguenther testbuf[1] = j;
9739187941Sguenther
9839187941Sguenther value = UNCHANGED;
9939187941Sguenther ret = UTF8_getc(testbuf, 1, &value);
10039187941Sguenther ASSERT(ret == -1);
10139187941Sguenther ASSERT(value == UNCHANGED);
10239187941Sguenther
10339187941Sguenther ret = UTF8_getc(testbuf, 2, &value);
10439187941Sguenther
10539187941Sguenther /* outside range of trailing bytes */
10639187941Sguenther if (j < 0x80 || j > 0xBF) {
10739187941Sguenther ASSERT(ret == -3);
10839187941Sguenther ASSERT(value == UNCHANGED);
10939187941Sguenther continue;
11039187941Sguenther }
11139187941Sguenther
11239187941Sguenther /* valid */
11339187941Sguenther ASSERT(ret == 2);
11439187941Sguenther ASSERT((value & 0x3F) == (j & 0x3F));
11539187941Sguenther ASSERT(value >> 6 == (i & 0x1F));
11639187941Sguenther }
11739187941Sguenther }
11839187941Sguenther
11939187941Sguenther /*
12039187941Sguenther * Verify handling of all three-byte sequences
12139187941Sguenther */
12239187941Sguenther for (i = 0xE0; i < 0xF0; i++) {
12339187941Sguenther testbuf[0] = i;
12439187941Sguenther
12539187941Sguenther for (j = 0; j < 0x100; j++) {
12639187941Sguenther testbuf[1] = j;
12739187941Sguenther
12839187941Sguenther for (k = 0; k < 0x100; k++) {
12939187941Sguenther testbuf[2] = k;
13039187941Sguenther
13139187941Sguenther value = UNCHANGED;
13239187941Sguenther ret = UTF8_getc(testbuf, 2, &value);
13339187941Sguenther ASSERT(ret == -1);
13439187941Sguenther ASSERT(value == UNCHANGED);
13539187941Sguenther
13639187941Sguenther ret = UTF8_getc(testbuf, 3, &value);
13739187941Sguenther
13839187941Sguenther /* outside range of trailing bytes */
13939187941Sguenther if (j < 0x80 || j > 0xBF ||
14039187941Sguenther k < 0x80 || k > 0xBF) {
14139187941Sguenther ASSERT(ret == -3);
14239187941Sguenther ASSERT(value == UNCHANGED);
14339187941Sguenther continue;
14439187941Sguenther }
14539187941Sguenther
14639187941Sguenther /* non-shortest form */
14739187941Sguenther if (i == 0xE0 && j < 0xA0) {
14839187941Sguenther ASSERT(ret == -4);
14939187941Sguenther ASSERT(value == UNCHANGED);
15039187941Sguenther continue;
15139187941Sguenther }
15239187941Sguenther
15339187941Sguenther /* surrogate pair code point */
15439187941Sguenther if (i == 0xED && j > 0x9F) {
15539187941Sguenther ASSERT(ret == -2);
15639187941Sguenther ASSERT(value == UNCHANGED);
15739187941Sguenther continue;
15839187941Sguenther }
15939187941Sguenther
16039187941Sguenther ASSERT(ret == 3);
16139187941Sguenther ASSERT((value & 0x3F) == (k & 0x3F));
16239187941Sguenther ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
16339187941Sguenther ASSERT(value >> 12 == (i & 0x0F));
16439187941Sguenther }
16539187941Sguenther }
16639187941Sguenther }
16739187941Sguenther
16839187941Sguenther /*
16939187941Sguenther * Verify handling of all four-byte sequences
17039187941Sguenther */
17139187941Sguenther for (i = 0xF0; i < 0xF5; i++) {
17239187941Sguenther testbuf[0] = i;
17339187941Sguenther
17439187941Sguenther for (j = 0; j < 0x100; j++) {
17539187941Sguenther testbuf[1] = j;
17639187941Sguenther
17739187941Sguenther for (k = 0; k < 0x100; k++) {
17839187941Sguenther testbuf[2] = k;
17939187941Sguenther
18039187941Sguenther for (l = 0; l < 0x100; l++) {
18139187941Sguenther testbuf[3] = l;
18239187941Sguenther
18339187941Sguenther value = UNCHANGED;
18439187941Sguenther ret = UTF8_getc(testbuf, 3, &value);
18539187941Sguenther ASSERT(ret == -1);
18639187941Sguenther ASSERT(value == UNCHANGED);
18739187941Sguenther
18839187941Sguenther ret = UTF8_getc(testbuf, 4, &value);
18939187941Sguenther
19039187941Sguenther /* outside range of trailing bytes */
19139187941Sguenther if (j < 0x80 || j > 0xBF ||
19239187941Sguenther k < 0x80 || k > 0xBF ||
19339187941Sguenther l < 0x80 || l > 0xBF) {
19439187941Sguenther ASSERT(ret == -3);
19539187941Sguenther ASSERT(value == UNCHANGED);
19639187941Sguenther continue;
19739187941Sguenther }
19839187941Sguenther
19939187941Sguenther /* non-shortest form */
20039187941Sguenther if (i == 0xF0 && j < 0x90) {
20139187941Sguenther ASSERT(ret == -4);
20239187941Sguenther ASSERT(value == UNCHANGED);
20339187941Sguenther continue;
20439187941Sguenther }
20539187941Sguenther
20639187941Sguenther /* beyond end of UCS range */
20739187941Sguenther if (i == 0xF4 && j > 0x8F) {
20839187941Sguenther ASSERT(ret == -2);
20939187941Sguenther ASSERT(value == UNCHANGED);
21039187941Sguenther continue;
21139187941Sguenther }
21239187941Sguenther
21339187941Sguenther ASSERT(ret == 4);
21439187941Sguenther ASSERT((value & 0x3F) == (l & 0x3F));
21539187941Sguenther ASSERT(((value >> 6) & 0x3F) ==
21639187941Sguenther (k & 0x3F));
21739187941Sguenther ASSERT(((value >> 12) & 0x3F) ==
21839187941Sguenther (j & 0x3F));
21939187941Sguenther ASSERT(value >> 18 == (i & 0x07));
22039187941Sguenther }
22139187941Sguenther }
22239187941Sguenther }
22339187941Sguenther }
22439187941Sguenther
22539187941Sguenther
22639187941Sguenther /*
22739187941Sguenther * Next, verify UTF8_putc()
22839187941Sguenther */
22939187941Sguenther memset(testbuf, 0, sizeof testbuf);
23039187941Sguenther
23139187941Sguenther /* single-byte sequences */
23239187941Sguenther for (i = 0; i < 0x80; i++) {
23339187941Sguenther ret = UTF8_putc(NULL, 0, i);
23439187941Sguenther ASSERT(ret == 1);
23539187941Sguenther
23639187941Sguenther testbuf[0] = 0;
23739187941Sguenther ret = UTF8_putc(testbuf, 0, i);
23839187941Sguenther ASSERT(ret == -1);
23939187941Sguenther ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
24039187941Sguenther
24139187941Sguenther ret = UTF8_putc(testbuf, 1, i);
24239187941Sguenther ASSERT(ret == 1);
24339187941Sguenther ASSERT(testbuf[0] == i);
24439187941Sguenther ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0);
24539187941Sguenther }
24639187941Sguenther
24739187941Sguenther /* two-byte sequences */
24839187941Sguenther for (i = 0x80; i < 0x800; i++) {
24939187941Sguenther ret = UTF8_putc(NULL, 0, i);
25039187941Sguenther ASSERT(ret == 2);
25139187941Sguenther
25239187941Sguenther testbuf[0] = testbuf[1] = 0;
25339187941Sguenther ret = UTF8_putc(testbuf, 1, i);
25439187941Sguenther ASSERT(ret == -1);
25539187941Sguenther ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
25639187941Sguenther
25739187941Sguenther ret = UTF8_putc(testbuf, 2, i);
25839187941Sguenther ASSERT(ret == 2);
25939187941Sguenther ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0);
26039187941Sguenther ret = UTF8_getc(testbuf, 2, &value);
26139187941Sguenther ASSERT(ret == 2);
26239187941Sguenther ASSERT(value == i);
26339187941Sguenther }
26439187941Sguenther
26539187941Sguenther /* three-byte sequences */
26639187941Sguenther for (i = 0x800; i < 0x10000; i++) {
267388ed389Sguenther if (i >= 0xD800 && i < 0xE000) {
268388ed389Sguenther /* surrogates aren't valid */
269388ed389Sguenther ret = UTF8_putc(NULL, 0, i);
270388ed389Sguenther ASSERT(ret == -2);
27139187941Sguenther continue;
272388ed389Sguenther }
27339187941Sguenther
27439187941Sguenther ret = UTF8_putc(NULL, 0, i);
27539187941Sguenther ASSERT(ret == 3);
27639187941Sguenther
27739187941Sguenther testbuf[0] = testbuf[1] = testbuf[2] = 0;
27839187941Sguenther ret = UTF8_putc(testbuf, 2, i);
27939187941Sguenther ASSERT(ret == -1);
28039187941Sguenther ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
28139187941Sguenther
28239187941Sguenther ret = UTF8_putc(testbuf, 3, i);
28339187941Sguenther ASSERT(ret == 3);
28439187941Sguenther ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0);
28539187941Sguenther ret = UTF8_getc(testbuf, 3, &value);
28639187941Sguenther ASSERT(ret == 3);
28739187941Sguenther ASSERT(value == i);
28839187941Sguenther }
28939187941Sguenther
29039187941Sguenther /* four-byte sequences */
29139187941Sguenther for (i = 0x10000; i < 0x110000; i++) {
29239187941Sguenther ret = UTF8_putc(NULL, 0, i);
29339187941Sguenther ASSERT(ret == 4);
29439187941Sguenther
29539187941Sguenther testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
29639187941Sguenther ret = UTF8_putc(testbuf, 3, i);
29739187941Sguenther ASSERT(ret == -1);
29839187941Sguenther ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
29939187941Sguenther
30039187941Sguenther ret = UTF8_putc(testbuf, 4, i);
30139187941Sguenther ASSERT(ret == 4);
30239187941Sguenther ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0);
30339187941Sguenther ret = UTF8_getc(testbuf, 4, &value);
30439187941Sguenther ASSERT(ret == 4);
30539187941Sguenther ASSERT(value == i);
30639187941Sguenther }
30739187941Sguenther
308388ed389Sguenther /* spot check some larger values to confirm error return */
309388ed389Sguenther for (i = 0x110000; i < 0x110100; i++) {
310388ed389Sguenther ret = UTF8_putc(NULL, 0, i);
311388ed389Sguenther ASSERT(ret == -2);
312388ed389Sguenther }
313388ed389Sguenther for (value = (unsigned long)-1; value > (unsigned long)-256; value--) {
314388ed389Sguenther ret = UTF8_putc(NULL, 0, value);
315388ed389Sguenther ASSERT(ret == -2);
316388ed389Sguenther }
31739187941Sguenther
31839187941Sguenther return 0;
31939187941Sguenther }
320