1 /* $OpenBSD: utf8test.c,v 1.4 2018/07/17 17:06:50 tb Exp $ */ 2 /* 3 * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 /* 19 * A mostly exhaustive test of UTF-8 decoder and encoder 20 */ 21 22 #include <stdio.h> 23 #include <string.h> 24 #include <err.h> 25 26 #include <openssl/asn1.h> 27 #include "asn1_locl.h" /* peek into the internals */ 28 29 #define UNCHANGED 0xfedcba98 30 31 #define ASSERT(x) \ 32 do { \ 33 if (!(x)) \ 34 errx(1, "test failed at line %d: %s", \ 35 __LINE__, #x); \ 36 } while (0) 37 38 int 39 main(void) 40 { 41 unsigned char testbuf[] = "012345"; 42 const unsigned char zerobuf[sizeof testbuf] = { 0 }; 43 unsigned long value; 44 unsigned int i, j, k, l; 45 int ret; 46 47 /* 48 * First, verify UTF8_getc() 49 */ 50 value = UNCHANGED; 51 ret = UTF8_getc(testbuf, 0, &value); 52 ASSERT(ret == 0); 53 ASSERT(value == UNCHANGED); 54 55 /* check all valid single-byte chars */ 56 for (i = 0; i < 0x80; i++) { 57 testbuf[0] = i; 58 ret = UTF8_getc(testbuf, 1, &value); 59 ASSERT(ret == 1); 60 ASSERT(value == i); 61 62 ret = UTF8_getc(testbuf, 2, &value); 63 ASSERT(ret == 1); 64 ASSERT(value == i); 65 } 66 67 /* 68 * Verify failure on all invalid initial bytes: 69 * 0x80 - 0xBF following bytes only 70 * 0xC0 - 0xC1 used to be in non-shortest forms 71 * 0xF5 - 0xFD used to be initial for 5 and 6 byte sequences 72 * 0xFE - 0xFF have never been valid in utf-8 73 */ 74 for (i = 0x80; i < 0xC2; i++) { 75 value = UNCHANGED; 76 testbuf[0] = i; 77 ret = UTF8_getc(testbuf, 1, &value); 78 ASSERT(ret == -2); 79 ASSERT(value == UNCHANGED); 80 } 81 for (i = 0xF5; i < 0x100; i++) { 82 value = UNCHANGED; 83 testbuf[0] = i; 84 ret = UTF8_getc(testbuf, 1, &value); 85 ASSERT(ret == -2); 86 ASSERT(value == UNCHANGED); 87 } 88 89 /* 90 * Verify handling of all two-byte sequences 91 */ 92 for (i = 0xC2; i < 0xE0; i++) { 93 testbuf[0] = i; 94 95 for (j = 0; j < 0x100; j++) { 96 testbuf[1] = j; 97 98 value = UNCHANGED; 99 ret = UTF8_getc(testbuf, 1, &value); 100 ASSERT(ret == -1); 101 ASSERT(value == UNCHANGED); 102 103 ret = UTF8_getc(testbuf, 2, &value); 104 105 /* outside range of trailing bytes */ 106 if (j < 0x80 || j > 0xBF) { 107 ASSERT(ret == -3); 108 ASSERT(value == UNCHANGED); 109 continue; 110 } 111 112 /* valid */ 113 ASSERT(ret == 2); 114 ASSERT((value & 0x3F) == (j & 0x3F)); 115 ASSERT(value >> 6 == (i & 0x1F)); 116 } 117 } 118 119 /* 120 * Verify handling of all three-byte sequences 121 */ 122 for (i = 0xE0; i < 0xF0; i++) { 123 testbuf[0] = i; 124 125 for (j = 0; j < 0x100; j++) { 126 testbuf[1] = j; 127 128 for (k = 0; k < 0x100; k++) { 129 testbuf[2] = k; 130 131 value = UNCHANGED; 132 ret = UTF8_getc(testbuf, 2, &value); 133 ASSERT(ret == -1); 134 ASSERT(value == UNCHANGED); 135 136 ret = UTF8_getc(testbuf, 3, &value); 137 138 /* outside range of trailing bytes */ 139 if (j < 0x80 || j > 0xBF || 140 k < 0x80 || k > 0xBF) { 141 ASSERT(ret == -3); 142 ASSERT(value == UNCHANGED); 143 continue; 144 } 145 146 /* non-shortest form */ 147 if (i == 0xE0 && j < 0xA0) { 148 ASSERT(ret == -4); 149 ASSERT(value == UNCHANGED); 150 continue; 151 } 152 153 /* surrogate pair code point */ 154 if (i == 0xED && j > 0x9F) { 155 ASSERT(ret == -2); 156 ASSERT(value == UNCHANGED); 157 continue; 158 } 159 160 ASSERT(ret == 3); 161 ASSERT((value & 0x3F) == (k & 0x3F)); 162 ASSERT(((value >> 6) & 0x3F) == (j & 0x3F)); 163 ASSERT(value >> 12 == (i & 0x0F)); 164 } 165 } 166 } 167 168 /* 169 * Verify handling of all four-byte sequences 170 */ 171 for (i = 0xF0; i < 0xF5; i++) { 172 testbuf[0] = i; 173 174 for (j = 0; j < 0x100; j++) { 175 testbuf[1] = j; 176 177 for (k = 0; k < 0x100; k++) { 178 testbuf[2] = k; 179 180 for (l = 0; l < 0x100; l++) { 181 testbuf[3] = l; 182 183 value = UNCHANGED; 184 ret = UTF8_getc(testbuf, 3, &value); 185 ASSERT(ret == -1); 186 ASSERT(value == UNCHANGED); 187 188 ret = UTF8_getc(testbuf, 4, &value); 189 190 /* outside range of trailing bytes */ 191 if (j < 0x80 || j > 0xBF || 192 k < 0x80 || k > 0xBF || 193 l < 0x80 || l > 0xBF) { 194 ASSERT(ret == -3); 195 ASSERT(value == UNCHANGED); 196 continue; 197 } 198 199 /* non-shortest form */ 200 if (i == 0xF0 && j < 0x90) { 201 ASSERT(ret == -4); 202 ASSERT(value == UNCHANGED); 203 continue; 204 } 205 206 /* beyond end of UCS range */ 207 if (i == 0xF4 && j > 0x8F) { 208 ASSERT(ret == -2); 209 ASSERT(value == UNCHANGED); 210 continue; 211 } 212 213 ASSERT(ret == 4); 214 ASSERT((value & 0x3F) == (l & 0x3F)); 215 ASSERT(((value >> 6) & 0x3F) == 216 (k & 0x3F)); 217 ASSERT(((value >> 12) & 0x3F) == 218 (j & 0x3F)); 219 ASSERT(value >> 18 == (i & 0x07)); 220 } 221 } 222 } 223 } 224 225 226 /* 227 * Next, verify UTF8_putc() 228 */ 229 memset(testbuf, 0, sizeof testbuf); 230 231 /* single-byte sequences */ 232 for (i = 0; i < 0x80; i++) { 233 ret = UTF8_putc(NULL, 0, i); 234 ASSERT(ret == 1); 235 236 testbuf[0] = 0; 237 ret = UTF8_putc(testbuf, 0, i); 238 ASSERT(ret == -1); 239 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0); 240 241 ret = UTF8_putc(testbuf, 1, i); 242 ASSERT(ret == 1); 243 ASSERT(testbuf[0] == i); 244 ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0); 245 } 246 247 /* two-byte sequences */ 248 for (i = 0x80; i < 0x800; i++) { 249 ret = UTF8_putc(NULL, 0, i); 250 ASSERT(ret == 2); 251 252 testbuf[0] = testbuf[1] = 0; 253 ret = UTF8_putc(testbuf, 1, i); 254 ASSERT(ret == -1); 255 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0); 256 257 ret = UTF8_putc(testbuf, 2, i); 258 ASSERT(ret == 2); 259 ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0); 260 ret = UTF8_getc(testbuf, 2, &value); 261 ASSERT(ret == 2); 262 ASSERT(value == i); 263 } 264 265 /* three-byte sequences */ 266 for (i = 0x800; i < 0x10000; i++) { 267 if (i >= 0xD800 && i < 0xE000) { 268 /* surrogates aren't valid */ 269 ret = UTF8_putc(NULL, 0, i); 270 ASSERT(ret == -2); 271 continue; 272 } 273 274 ret = UTF8_putc(NULL, 0, i); 275 ASSERT(ret == 3); 276 277 testbuf[0] = testbuf[1] = testbuf[2] = 0; 278 ret = UTF8_putc(testbuf, 2, i); 279 ASSERT(ret == -1); 280 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0); 281 282 ret = UTF8_putc(testbuf, 3, i); 283 ASSERT(ret == 3); 284 ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0); 285 ret = UTF8_getc(testbuf, 3, &value); 286 ASSERT(ret == 3); 287 ASSERT(value == i); 288 } 289 290 /* four-byte sequences */ 291 for (i = 0x10000; i < 0x110000; i++) { 292 ret = UTF8_putc(NULL, 0, i); 293 ASSERT(ret == 4); 294 295 testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0; 296 ret = UTF8_putc(testbuf, 3, i); 297 ASSERT(ret == -1); 298 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0); 299 300 ret = UTF8_putc(testbuf, 4, i); 301 ASSERT(ret == 4); 302 ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0); 303 ret = UTF8_getc(testbuf, 4, &value); 304 ASSERT(ret == 4); 305 ASSERT(value == i); 306 } 307 308 /* spot check some larger values to confirm error return */ 309 for (i = 0x110000; i < 0x110100; i++) { 310 ret = UTF8_putc(NULL, 0, i); 311 ASSERT(ret == -2); 312 } 313 for (value = (unsigned long)-1; value > (unsigned long)-256; value--) { 314 ret = UTF8_putc(NULL, 0, value); 315 ASSERT(ret == -2); 316 } 317 318 return 0; 319 } 320