1 /* $NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $ */ 2 3 /*- 4 * Copyright (c) 2007 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Dieter Baron. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $"); 34 35 #include <sys/null.h> 36 37 #include "unicode.h" 38 39 size_t 40 utf8_to_utf16(uint16_t *dst, size_t dst_len, 41 const char *src, size_t src_len, 42 int flags, int *errp) 43 { 44 const unsigned char *s; 45 size_t spos, dpos; 46 int error; 47 uint16_t c; 48 49 #define IS_CONT(c) (((c)&0xc0) == 0x80) 50 51 error = 0; 52 s = (const unsigned char *)src; 53 spos = dpos = 0; 54 while (spos < src_len) { 55 if (s[spos] < 0x80) { 56 c = s[spos++]; 57 } else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK) 58 && (spos >= src_len || !IS_CONT(s[spos+1])) 59 && s[spos]>=0xa0) { 60 /* not valid UTF-8, assume ISO 8859-1 */ 61 c = s[spos++]; 62 } else if (s[spos] < 0xc0 || s[spos] >= 0xf5) { 63 /* continuation byte without lead byte 64 * or lead byte for codepoint above 0x10ffff */ 65 error++; 66 spos++; 67 continue; 68 } else if (s[spos] < 0xe0) { 69 if (spos >= src_len || !IS_CONT(s[spos+1])) { 70 spos++; 71 error++; 72 continue; 73 } 74 c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f); 75 spos += 2; 76 if (c < 0x80) { 77 /* overlong encoding */ 78 error++; 79 continue; 80 } 81 } else if (s[spos] < 0xf0) { 82 if (spos >= src_len-2 || 83 !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) { 84 spos++; 85 error++; 86 continue; 87 } 88 c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6) 89 | (s[spos+2] & 0x3f); 90 spos += 3; 91 if (c < 0x800 || (c & 0xdf00) == 0xd800 ) { 92 /* overlong encoding or encoded surrogate */ 93 error++; 94 continue; 95 } 96 } else { 97 uint32_t cc; 98 /* UTF-16 surrogate pair */ 99 100 if (spos >= src_len-3 || !IS_CONT(s[spos+1]) 101 || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) { 102 spos++; 103 error++; 104 continue; 105 } 106 cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12) 107 | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f); 108 spos += 4; 109 if (cc < 0x10000) { 110 /* overlong encoding */ 111 error++; 112 continue; 113 } 114 if (dst && dpos < dst_len) 115 dst[dpos] = (0xd800 | ((cc-0x10000)>>10)); 116 dpos++; 117 c = 0xdc00 | ((cc-0x10000) & 0x3ffff); 118 } 119 120 if (dst && dpos < dst_len) 121 dst[dpos] = c; 122 dpos++; 123 } 124 125 if (errp) 126 *errp = error; 127 return dpos; 128 #undef IS_CONT 129 } 130 131 132 size_t 133 utf16_to_utf8(char *dst, size_t dst_len, 134 const uint16_t *src, size_t src_len, 135 int flags, int *errp) 136 { 137 uint8_t spos, dpos; 138 int error; 139 140 #define CHECK_LENGTH(l) (dpos > dst_len-(l) ? dst=NULL : NULL) 141 #define ADD_BYTE(b) (dst ? dst[dpos] = (b) : 0, dpos++) 142 143 error = 0; 144 dpos = 0; 145 for (spos = 0; spos < src_len; spos++) { 146 if (src[spos] < 0x80) { 147 CHECK_LENGTH(1); 148 ADD_BYTE(src[spos]); 149 } else if (src[spos] < 0x800) { 150 CHECK_LENGTH(2); 151 ADD_BYTE(0xc0 | (src[spos]>>6)); 152 ADD_BYTE(0x80 | (src[spos] & 0x3f)); 153 } else if ((src[spos] & 0xdc00) == 0xd800) { 154 uint32_t c; 155 /* first surrogate */ 156 if (spos == src_len - 1 || (src[spos] & 0xdc00) != 0xdc00) { 157 /* no second surrogate present */ 158 error++; 159 continue; 160 } 161 spos++; 162 CHECK_LENGTH(4); 163 c = (((src[spos]&0x3ff) << 10) | (src[spos+1]&0x3ff)) + 0x10000; 164 ADD_BYTE(0xf0 | (c>>18)); 165 ADD_BYTE(0x80 | ((c>>12) & 0x3f)); 166 ADD_BYTE(0x80 | ((c>>6) & 0x3f)); 167 ADD_BYTE(0x80 | (c & 0x3f)); 168 } else if ((src[spos] & 0xdc00) == 0xdc00) { 169 /* second surrogate without preceding first surrogate */ 170 error++; 171 } else { 172 CHECK_LENGTH(3); 173 ADD_BYTE(0xe0 | src[spos]>>12); 174 ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f)); 175 ADD_BYTE(0x80 | (src[spos] & 0x3f)); 176 } 177 } 178 179 if (errp) 180 *errp = error; 181 return dpos; 182 #undef ADD_BYTE 183 #undef CHECK_LENGTH 184 } 185