1 /* 2 * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin) 3 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 4 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 5 * Copyright (c) 2002-2004 Tim J. Robbins 6 * All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * WCSBIN_EOF - Indicate EOF on input buffer. 37 * 38 * WCSBIN_SURRO - Pass-through surrogate space (typically if the UTF-8 39 * has already been escaped), on bytes-to-wchars and 40 * wchars-to-bytes. Escaping of other illegal codes will 41 * still occur on input but de-escaping will not occur 42 * on output (they will remain in the surrogate space). 43 * 44 * WCSBIN_LONGCODES - Allow 4-byte >= 0x10FFFF, 5-byte and 6-byte sequences 45 * (normally illegal), otherwise escape it on input 46 * and fail on output. 47 * 48 * WCSBIN_STRICT - Allow byte-to-wide conversions to fail. 49 */ 50 51 #include <sys/param.h> 52 53 #include <errno.h> 54 #include <limits.h> 55 #include <runetype.h> 56 #include <stdlib.h> 57 #include <string.h> 58 #include <wchar.h> 59 #include "mblocal.h" 60 61 extern int __mb_sb_limit; 62 63 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 64 size_t, mbstate_t * __restrict); 65 static int _UTF8_mbsinit(const mbstate_t *); 66 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 67 const char ** __restrict, size_t, size_t, 68 mbstate_t * __restrict); 69 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 70 mbstate_t * __restrict); 71 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 72 size_t, size_t, mbstate_t * __restrict); 73 static size_t _UTF8_mbintowcr(wchar_t * __restrict dst, 74 const char * __restrict src, 75 size_t dlen, size_t *slen, int flags); 76 static size_t _UTF8_wcrtombin(char * __restrict dst, 77 const wchar_t * __restrict src, 78 size_t dlen, size_t *slen, int flags); 79 80 typedef struct { 81 wchar_t ch; 82 int want; 83 wchar_t lbound; 84 } _UTF8State; 85 86 int 87 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 88 { 89 90 l->__mbrtowc = _UTF8_mbrtowc; 91 l->__wcrtomb = _UTF8_wcrtomb; 92 l->__mbsinit = _UTF8_mbsinit; 93 l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 94 l->__wcsnrtombs = _UTF8_wcsnrtombs; 95 l->__mbintowcr = _UTF8_mbintowcr; 96 l->__wcrtombin = _UTF8_wcrtombin; 97 l->runes = rl; 98 l->__mb_cur_max = 4; 99 /* 100 * UCS-4 encoding used as the internal representation, so 101 * slots 0x0080-0x00FF are occuped and must be excluded 102 * from the single byte ctype by setting the limit. 103 */ 104 l->__mb_sb_limit = 128; 105 106 return (0); 107 } 108 109 static int 110 _UTF8_mbsinit(const mbstate_t *ps) 111 { 112 113 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 114 } 115 116 static size_t 117 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 118 mbstate_t * __restrict ps) 119 { 120 _UTF8State *us; 121 int ch, i, mask, want; 122 wchar_t lbound, wch; 123 124 us = (_UTF8State *)ps; 125 126 if (us->want < 0 || us->want > 4) { 127 errno = EINVAL; 128 return ((size_t)-1); 129 } 130 131 if (s == NULL) { 132 s = ""; 133 n = 1; 134 pwc = NULL; 135 } 136 137 if (n == 0) 138 /* Incomplete multibyte sequence */ 139 return ((size_t)-2); 140 141 if (us->want == 0) { 142 /* 143 * Determine the number of octets that make up this character 144 * from the first octet, and a mask that extracts the 145 * interesting bits of the first octet. We already know 146 * the character is at least two bytes long. 147 * 148 * We also specify a lower bound for the character code to 149 * detect redundant, non-"shortest form" encodings. For 150 * example, the sequence C0 80 is _not_ a legal representation 151 * of the null character. This enforces a 1-to-1 mapping 152 * between character codes and their multibyte representations. 153 */ 154 ch = (unsigned char)*s; 155 if ((ch & 0x80) == 0) { 156 /* Fast path for plain ASCII characters. */ 157 if (pwc != NULL) 158 *pwc = ch; 159 return (ch != '\0' ? 1 : 0); 160 } 161 if ((ch & 0xe0) == 0xc0) { 162 mask = 0x1f; 163 want = 2; 164 lbound = 0x80; 165 } else if ((ch & 0xf0) == 0xe0) { 166 mask = 0x0f; 167 want = 3; 168 lbound = 0x800; 169 } else if ((ch & 0xf8) == 0xf0) { 170 mask = 0x07; 171 want = 4; 172 lbound = 0x10000; 173 } else { 174 /* 175 * Malformed input; input is not UTF-8. 176 */ 177 errno = EILSEQ; 178 return ((size_t)-1); 179 } 180 } else { 181 want = us->want; 182 lbound = us->lbound; 183 } 184 185 /* 186 * Decode the octet sequence representing the character in chunks 187 * of 6 bits, most significant first. 188 */ 189 if (us->want == 0) 190 wch = (unsigned char)*s++ & mask; 191 else 192 wch = us->ch; 193 194 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 195 if ((*s & 0xc0) != 0x80) { 196 /* 197 * Malformed input; bad characters in the middle 198 * of a character. 199 */ 200 errno = EILSEQ; 201 return ((size_t)-1); 202 } 203 wch <<= 6; 204 wch |= *s++ & 0x3f; 205 } 206 if (i < want) { 207 /* Incomplete multibyte sequence. */ 208 us->want = want - i; 209 us->lbound = lbound; 210 us->ch = wch; 211 return ((size_t)-2); 212 } 213 if (wch < lbound || (wch & ~0x10ffff)) { 214 /* 215 * Malformed input; redundant encoding or illegal 216 * code sequence. 217 */ 218 errno = EILSEQ; 219 return ((size_t)-1); 220 } 221 if (pwc != NULL) 222 *pwc = wch; 223 us->want = 0; 224 return (wch == L'\0' ? 0 : want); 225 } 226 227 static size_t 228 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 229 size_t nms, size_t len, mbstate_t * __restrict ps) 230 { 231 _UTF8State *us; 232 const char *s; 233 size_t nchr; 234 wchar_t wc; 235 size_t nb; 236 237 us = (_UTF8State *)ps; 238 239 s = *src; 240 nchr = 0; 241 242 if (dst == NULL) { 243 /* 244 * The fast path in the loop below is not safe if an ASCII 245 * character appears as anything but the first byte of a 246 * multibyte sequence. Check now to avoid doing it in the loop. 247 */ 248 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 249 errno = EILSEQ; 250 return ((size_t)-1); 251 } 252 for (;;) { 253 if (nms > 0 && (signed char)*s > 0) 254 /* 255 * Fast path for plain ASCII characters 256 * excluding NUL. 257 */ 258 nb = 1; 259 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 260 (size_t)-1) 261 /* Invalid sequence - mbrtowc() sets errno. */ 262 return ((size_t)-1); 263 else if (nb == 0 || nb == (size_t)-2) 264 return (nchr); 265 s += nb; 266 nms -= nb; 267 nchr++; 268 } 269 /*NOTREACHED*/ 270 } 271 272 /* 273 * The fast path in the loop below is not safe if an ASCII 274 * character appears as anything but the first byte of a 275 * multibyte sequence. Check now to avoid doing it in the loop. 276 */ 277 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 278 errno = EILSEQ; 279 return ((size_t)-1); 280 } 281 while (len-- > 0) { 282 if (nms > 0 && (signed char)*s > 0) { 283 /* 284 * Fast path for plain ASCII characters 285 * excluding NUL. 286 */ 287 *dst = (wchar_t)*s; 288 nb = 1; 289 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 290 (size_t)-1) { 291 *src = s; 292 return ((size_t)-1); 293 } else if (nb == (size_t)-2) { 294 *src = s + nms; 295 return (nchr); 296 } else if (nb == 0) { 297 *src = NULL; 298 return (nchr); 299 } 300 s += nb; 301 nms -= nb; 302 nchr++; 303 dst++; 304 } 305 *src = s; 306 return (nchr); 307 } 308 309 static size_t 310 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 311 { 312 _UTF8State *us; 313 unsigned char lead; 314 int i, len; 315 316 us = (_UTF8State *)ps; 317 318 if (us->want != 0) { 319 errno = EINVAL; 320 return ((size_t)-1); 321 } 322 323 if (s == NULL) 324 /* Reset to initial shift state (no-op) */ 325 return (1); 326 327 /* 328 * Determine the number of octets needed to represent this character. 329 * We always output the shortest sequence possible. Also specify the 330 * first few bits of the first octet, which contains the information 331 * about the sequence length. 332 */ 333 if ((wc & ~0x7f) == 0) { 334 /* Fast path for plain ASCII characters. */ 335 *s = (char)wc; 336 return (1); 337 } else if ((wc & ~0x7ff) == 0) { 338 lead = 0xc0; 339 len = 2; 340 } else if ((wc & ~0xffff) == 0) { 341 lead = 0xe0; 342 len = 3; 343 } else if ((wc & ~0x10ffff) == 0) { 344 lead = 0xf0; 345 len = 4; 346 } else { 347 errno = EILSEQ; 348 return ((size_t)-1); 349 } 350 351 /* 352 * Output the octets representing the character in chunks 353 * of 6 bits, least significant last. The first octet is 354 * a special case because it contains the sequence length 355 * information. 356 */ 357 for (i = len - 1; i > 0; i--) { 358 s[i] = (wc & 0x3f) | 0x80; 359 wc >>= 6; 360 } 361 *s = (wc & 0xff) | lead; 362 363 return (len); 364 } 365 366 static size_t 367 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 368 size_t nwc, size_t len, mbstate_t * __restrict ps) 369 { 370 _UTF8State *us; 371 char buf[MB_LEN_MAX]; 372 const wchar_t *s; 373 size_t nbytes; 374 size_t nb; 375 376 us = (_UTF8State *)ps; 377 378 if (us->want != 0) { 379 errno = EINVAL; 380 return ((size_t)-1); 381 } 382 383 s = *src; 384 nbytes = 0; 385 386 if (dst == NULL) { 387 while (nwc-- > 0) { 388 if (0 <= *s && *s < 0x80) 389 /* Fast path for plain ASCII characters. */ 390 nb = 1; 391 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 392 (size_t)-1) 393 /* Invalid character - wcrtomb() sets errno. */ 394 return ((size_t)-1); 395 if (*s == L'\0') 396 return (nbytes + nb - 1); 397 s++; 398 nbytes += nb; 399 } 400 return (nbytes); 401 } 402 403 while (len > 0 && nwc-- > 0) { 404 if (0 <= *s && *s < 0x80) { 405 /* Fast path for plain ASCII characters. */ 406 nb = 1; 407 *dst = *s; 408 } else if (len > (size_t)MB_CUR_MAX) { 409 /* Enough space to translate in-place. */ 410 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 411 *src = s; 412 return ((size_t)-1); 413 } 414 } else { 415 /* 416 * May not be enough space; use temp. buffer. 417 */ 418 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 419 *src = s; 420 return ((size_t)-1); 421 } 422 if (nb > (int)len) 423 /* MB sequence for character won't fit. */ 424 break; 425 (void) memcpy(dst, buf, nb); 426 } 427 if (*s == L'\0') { 428 *src = NULL; 429 return (nbytes + nb - 1); 430 } 431 s++; 432 dst += nb; 433 len -= nb; 434 nbytes += nb; 435 } 436 *src = s; 437 return (nbytes); 438 } 439 440 /* 441 * Clean binary to wchar buffer conversions. This is basically like a normal 442 * buffer conversion but with a sane argument API and escaping. See none.c 443 * for a more complete description. 444 */ 445 static size_t 446 _UTF8_mbintowcr(wchar_t * __restrict dst, const char * __restrict src, 447 size_t dlen, size_t *slen, int flags) 448 { 449 size_t i; 450 size_t j; 451 size_t k; 452 size_t n = *slen; 453 int ch, mask, want; 454 wchar_t lbound, wch; 455 456 for (i = j = 0; i < n; ++i) { 457 if (j == dlen) 458 break; 459 ch = (unsigned char)src[i]; 460 461 if ((ch & 0x80) == 0) { 462 /* Fast path for plain ASCII characters. */ 463 if (dst) 464 dst[j] = ch; 465 ++j; 466 continue; 467 } 468 if ((ch & 0xe0) == 0xc0) { 469 mask = 0x1f; 470 want = 2; 471 lbound = 0x80; 472 } else if ((ch & 0xf0) == 0xe0) { 473 mask = 0x0f; 474 want = 3; 475 lbound = 0x800; 476 } else if ((ch & 0xf8) == 0xf0) { 477 mask = 0x07; 478 want = 4; 479 lbound = 0x10000; 480 } else if ((ch & 0xfc) == 0xf8) { 481 /* normally illegal, handled down below */ 482 mask = 0x03; 483 want = 5; 484 lbound = 0x200000; 485 } else if ((ch & 0xfe) == 0xfc) { 486 /* normally illegal, handled down below */ 487 mask = 0x01; 488 want = 6; 489 lbound = 0x4000000; 490 } else { 491 /* 492 * Malformed input; input is not UTF-8, escape 493 * with UTF-8B. 494 */ 495 if (flags & WCSBIN_STRICT) { 496 if (i == 0) { 497 errno = EILSEQ; 498 return ((size_t)-1); 499 } 500 break; 501 } 502 if (dst) 503 dst[j] = 0xDC00 | ch; 504 ++j; 505 continue; 506 } 507 508 /* 509 * Construct wchar_t from multibyte sequence. 510 */ 511 wch = ch & mask; 512 for (k = 1; k < want; ++k) { 513 /* 514 * Stop if not enough input (don't do this early 515 * so we can detect illegal characters as they occur 516 * in the stream). 517 * 518 * If termination is requested force-escape all chars. 519 */ 520 if (i + k >= n) { 521 if (flags & WCSBIN_EOF) { 522 want = n - i; 523 goto forceesc; 524 } 525 goto breakout; 526 } 527 528 ch = src[i+k]; 529 if ((ch & 0xc0) != 0x80) { 530 /* 531 * Malformed input, bad characters in the 532 * middle of a multibyte sequence. Escape 533 * with UTF-8B. 534 */ 535 if (flags & WCSBIN_STRICT) { 536 if (i == 0) { 537 errno = EILSEQ; 538 return ((size_t)-1); 539 } 540 goto breakout; 541 } 542 if (dst) 543 dst[j] = 0xDC00 | (unsigned char)src[i]; 544 ++j; 545 goto loopup; 546 } 547 wch <<= 6; 548 wch |= ch & 0x3f; 549 } 550 551 /* 552 * Check validity of the wchar. If invalid we could escape 553 * just the first character and loop up, but it ought to be 554 * more readable if we escape all the chars in the sequence 555 * (since they are all >= 0x80 and might represent a legacy 556 * 5-byte or 6-byte code). 557 */ 558 if (wch < lbound || 559 ((flags & WCSBIN_LONGCODES) == 0 && (wch & ~0x10ffff)) || 560 ((flags & WCSBIN_LONGCODES) == 0 && want >= 5)) { 561 goto forceesc; 562 } 563 564 /* 565 * Check if wch is a surrogate code (which also encloses our 566 * UTF-8B escaping range). This is normally illegal in UTF8. 567 * If it is, we need to escape each characer in the sequence. 568 * Breakout if there isn't enough output buffer space. 569 * 570 * If (flags & WCSBIN_SURRO) the caller wishes to accept 571 * surrogate codes, i.e. the input might potentially already 572 * be escaped UTF8-B or unchecked UTF-16 that was converted 573 * into UTF-8. 574 */ 575 if ((flags & WCSBIN_SURRO) == 0 && 576 wch >= 0xD800 && wch <= 0xDFFF) { 577 forceesc: 578 if (j + want > dlen) 579 break; 580 if (flags & WCSBIN_STRICT) { 581 if (i == 0) { 582 errno = EILSEQ; 583 return ((size_t)-1); 584 } 585 break; 586 } 587 for (k = 0; k < want; ++k) { 588 if (dst) { 589 dst[j] = 0xDC00 | 590 (unsigned char)src[i+k]; 591 } 592 ++j; 593 } 594 i += k - 1; 595 } else { 596 i += k - 1; 597 if (dst) 598 dst[j] = wch; 599 ++j; 600 } 601 loopup: 602 ; 603 } 604 breakout: 605 *slen = i; 606 607 return j; 608 } 609 610 static size_t 611 _UTF8_wcrtombin(char * __restrict dst, const wchar_t * __restrict src, 612 size_t dlen, size_t *slen, int flags) 613 { 614 size_t i; 615 size_t j; 616 size_t k; 617 size_t n = *slen; 618 size_t len; 619 unsigned char lead; 620 wchar_t wc; 621 622 for (i = j = 0; i < n; ++i) { 623 if (j == dlen) 624 break; 625 wc = src[i]; 626 627 if ((wc & ~0x7f) == 0) { 628 /* Fast path for plain ASCII characters. */ 629 if (dst) 630 dst[j] = (unsigned char)wc; 631 ++j; 632 continue; 633 } 634 if ((wc & ~0x7ff) == 0) { 635 lead = 0xc0; 636 len = 2; 637 } else if (wc >= 0xDC80 && wc <= 0xDCFF && 638 (flags & WCSBIN_SURRO) == 0) { 639 if (flags & WCSBIN_STRICT) { 640 /* 641 * STRICT without SURRO is an error for 642 * surrogates. 643 */ 644 if (i == 0) { 645 errno = EILSEQ; 646 return ((size_t)-1); 647 } 648 break; 649 } 650 if (dst) 651 dst[j] = (unsigned char)wc; 652 ++j; 653 continue; 654 } else if ((wc & ~0xffff) == 0) { 655 if (wc >= 0xD800 && wc <= 0xDFFF && 656 (flags & (WCSBIN_SURRO | WCSBIN_STRICT)) == 657 WCSBIN_STRICT) { 658 /* 659 * Surrogates in general are an error 660 * if STRICT is specified and SURRO is not 661 * specified. 662 */ 663 if (i == 0) { 664 errno = EILSEQ; 665 return ((size_t)-1); 666 } 667 break; 668 } 669 lead = 0xe0; 670 len = 3; 671 } else if ((wc & ~0x10ffff) == 0) { 672 lead = 0xf0; 673 len = 4; 674 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x200000) { 675 /* normally illegal */ 676 lead = 0xf0; 677 len = 4; 678 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x4000000) { 679 /* normally illegal */ 680 lead = 0xf8; 681 len = 5; 682 } else if ((flags & WCSBIN_LONGCODES) && 683 (uint32_t)wc < 0x80000000U) { 684 /* normally illegal */ 685 lead = 0xfc; 686 len = 6; 687 } else { 688 if (i == 0) { 689 errno = EILSEQ; 690 return ((size_t)-1); 691 } 692 /* stop here, process error on next loop */ 693 break; 694 } 695 696 /* 697 * Output the octets representing the character in chunks 698 * of 6 bits, least significant last. The first octet is 699 * a special case because it contains the sequence length 700 * information. 701 */ 702 if (j + len > dlen) 703 break; 704 k = j; 705 j += len; 706 if (dst) { 707 while (--len > 0) { 708 dst[k + len] = (wc & 0x3f) | 0x80; 709 wc >>= 6; 710 } 711 dst[k] = (wc & 0xff) | lead; 712 } 713 } 714 *slen = i; 715 716 return j; 717 } 718 719 size_t 720 utf8towcr(wchar_t * __restrict dst, const char * __restrict src, 721 size_t dlen, size_t *slen, int flags) 722 { 723 return _UTF8_mbintowcr(dst, src, dlen, slen, flags); 724 } 725 726 size_t 727 wcrtoutf8(char * __restrict dst, const wchar_t * __restrict src, 728 size_t dlen, size_t *slen, int flags) 729 { 730 return _UTF8_wcrtombin(dst, src, dlen, slen, flags); 731 } 732