1 /* $NetBSD: unicode.c,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */ 2 3 /*- 4 * Copyright (c) 2007 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Dieter Baron. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/null.h> 33 34 #include "unicode.h" 35 36 size_t 37 utf8_to_utf16(uint16_t *dst, size_t dst_len, 38 const char *src, size_t src_len, 39 int flags, int *errp) 40 { 41 const unsigned char *s; 42 size_t spos, dpos; 43 int error; 44 uint16_t c; 45 46 #define IS_CONT(c) (((c)&0xc0) == 0x80) 47 48 error = 0; 49 s = (const unsigned char *)src; 50 spos = dpos = 0; 51 while (spos<src_len) { 52 if (s[spos] < 0x80) 53 c = s[spos++]; 54 else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK) 55 && (spos >= src_len || !IS_CONT(s[spos+1])) 56 && s[spos]>=0xa0) { 57 /* not valid UTF-8, assume ISO 8859-1 */ 58 c = s[spos++]; 59 } 60 else if (s[spos] < 0xc0 || s[spos] >= 0xf5) { 61 /* continuation byte without lead byte 62 or lead byte for codepoint above 0x10ffff */ 63 error++; 64 spos++; 65 continue; 66 } 67 else if (s[spos] < 0xe0) { 68 if (spos >= src_len || !IS_CONT(s[spos+1])) { 69 spos++; 70 error++; 71 continue; 72 } 73 c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f); 74 spos += 2; 75 if (c < 0x80) { 76 /* overlong encoding */ 77 error++; 78 continue; 79 } 80 } 81 else if (s[spos] < 0xf0) { 82 if (spos >= src_len-2 83 || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) { 84 spos++; 85 error++; 86 continue; 87 } 88 c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6) 89 | (s[spos+2] & 0x3f); 90 spos += 3; 91 if (c < 0x800 || (c & 0xdf00) == 0xd800 ) { 92 /* overlong encoding or encoded surrogate */ 93 error++; 94 continue; 95 } 96 } 97 else { 98 uint32_t cc; 99 /* UTF-16 surrogate pair */ 100 101 if (spos >= src_len-3 || !IS_CONT(s[spos+1]) 102 || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) { 103 spos++; 104 error++; 105 106 continue; 107 } 108 cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12) 109 | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f); 110 spos += 4; 111 if (cc < 0x10000) { 112 /* overlong encoding */ 113 error++; 114 continue; 115 } 116 if (dst && dpos < dst_len) 117 dst[dpos] = (0xd800 | ((cc-0x10000)>>10)); 118 dpos++; 119 c = 0xdc00 | ((cc-0x10000) & 0x3ffff); 120 } 121 122 if (dst && dpos < dst_len) 123 dst[dpos] = c; 124 dpos++; 125 } 126 127 if (errp) 128 *errp = error; 129 130 return dpos; 131 132 #undef IS_CONT 133 } 134 135 136 size_t 137 utf16_to_utf8(char *dst, size_t dst_len, 138 const uint16_t *src, size_t src_len, 139 int flags, int *errp) 140 { 141 uint8_t spos, dpos; 142 int error; 143 144 #define CHECK_LENGTH(l) (dpos > dst_len-(l) ? dst=NULL : NULL) 145 #define ADD_BYTE(b) (dst ? dst[dpos] = (b) : 0, dpos++) 146 147 error = 0; 148 dpos = 0; 149 for (spos=0; spos<src_len; spos++) { 150 if (src[spos] < 0x80) { 151 CHECK_LENGTH(1); 152 ADD_BYTE(src[spos]); 153 } 154 else if (src[spos] < 0x800) { 155 CHECK_LENGTH(2); 156 ADD_BYTE(0xc0 | (src[spos]>>6)); 157 ADD_BYTE(0x80 | (src[spos] & 0x3f)); 158 } 159 else if ((src[spos] & 0xdc00) == 0xd800) { 160 uint32_t c; 161 /* first surrogate */ 162 if (spos == src_len - 1 || (src[spos] & 0xdc00) != 0xdc00) { 163 /* no second surrogate present */ 164 error++; 165 continue; 166 } 167 spos++; 168 CHECK_LENGTH(4); 169 c = (((src[spos]&0x3ff) << 10) | (src[spos+1]&0x3ff)) + 0x10000; 170 ADD_BYTE(0xf0 | (c>>18)); 171 ADD_BYTE(0x80 | ((c>>12) & 0x3f)); 172 ADD_BYTE(0x80 | ((c>>6) & 0x3f)); 173 ADD_BYTE(0x80 | (c & 0x3f)); 174 } 175 else if ((src[spos] & 0xdc00) == 0xdc00) { 176 /* second surrogate without preceding first surrogate */ 177 error++; 178 } 179 else { 180 CHECK_LENGTH(3); 181 ADD_BYTE(0xe0 | src[spos]>>12); 182 ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f)); 183 ADD_BYTE(0x80 | (src[spos] & 0x3f)); 184 } 185 } 186 187 if (errp) 188 *errp = error; 189 190 return dpos; 191 192 #undef ADD_BYTE 193 #undef CHECK_LENGTH 194 } 195