1*f2ee5fedSmaxv /* $NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $ */
24f1d6e61Sdillo
34f1d6e61Sdillo /*-
44f1d6e61Sdillo * Copyright (c) 2007 The NetBSD Foundation, Inc.
54f1d6e61Sdillo * All rights reserved.
64f1d6e61Sdillo *
74f1d6e61Sdillo * This code is derived from software contributed to The NetBSD Foundation
84f1d6e61Sdillo * by Dieter Baron.
94f1d6e61Sdillo *
104f1d6e61Sdillo * Redistribution and use in source and binary forms, with or without
114f1d6e61Sdillo * modification, are permitted provided that the following conditions
124f1d6e61Sdillo * are met:
134f1d6e61Sdillo * 1. Redistributions of source code must retain the above copyright
144f1d6e61Sdillo * notice, this list of conditions and the following disclaimer.
154f1d6e61Sdillo * 2. Redistributions in binary form must reproduce the above copyright
164f1d6e61Sdillo * notice, this list of conditions and the following disclaimer in the
174f1d6e61Sdillo * documentation and/or other materials provided with the distribution.
184f1d6e61Sdillo *
194f1d6e61Sdillo * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
204f1d6e61Sdillo * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
214f1d6e61Sdillo * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
224f1d6e61Sdillo * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
234f1d6e61Sdillo * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
244f1d6e61Sdillo * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
254f1d6e61Sdillo * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
264f1d6e61Sdillo * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
274f1d6e61Sdillo * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
284f1d6e61Sdillo * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
294f1d6e61Sdillo * POSSIBILITY OF SUCH DAMAGE.
304f1d6e61Sdillo */
314f1d6e61Sdillo
32ceed9c72Slukem #include <sys/cdefs.h>
33*f2ee5fedSmaxv __KERNEL_RCSID(0, "$NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $");
34ceed9c72Slukem
354f1d6e61Sdillo #include <sys/null.h>
364f1d6e61Sdillo
374f1d6e61Sdillo #include "unicode.h"
384f1d6e61Sdillo
394f1d6e61Sdillo size_t
utf8_to_utf16(uint16_t * dst,size_t dst_len,const char * src,size_t src_len,int flags,int * errp)404f1d6e61Sdillo utf8_to_utf16(uint16_t *dst, size_t dst_len,
414f1d6e61Sdillo const char *src, size_t src_len,
424f1d6e61Sdillo int flags, int *errp)
434f1d6e61Sdillo {
444f1d6e61Sdillo const unsigned char *s;
454f1d6e61Sdillo size_t spos, dpos;
464f1d6e61Sdillo int error;
474f1d6e61Sdillo uint16_t c;
484f1d6e61Sdillo
494f1d6e61Sdillo #define IS_CONT(c) (((c)&0xc0) == 0x80)
504f1d6e61Sdillo
514f1d6e61Sdillo error = 0;
524f1d6e61Sdillo s = (const unsigned char *)src;
534f1d6e61Sdillo spos = dpos = 0;
544f1d6e61Sdillo while (spos < src_len) {
55*f2ee5fedSmaxv if (s[spos] < 0x80) {
564f1d6e61Sdillo c = s[spos++];
57*f2ee5fedSmaxv } else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
584f1d6e61Sdillo && (spos >= src_len || !IS_CONT(s[spos+1]))
594f1d6e61Sdillo && s[spos]>=0xa0) {
604f1d6e61Sdillo /* not valid UTF-8, assume ISO 8859-1 */
614f1d6e61Sdillo c = s[spos++];
62*f2ee5fedSmaxv } else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
634f1d6e61Sdillo /* continuation byte without lead byte
64*f2ee5fedSmaxv * or lead byte for codepoint above 0x10ffff */
654f1d6e61Sdillo error++;
664f1d6e61Sdillo spos++;
674f1d6e61Sdillo continue;
68*f2ee5fedSmaxv } else if (s[spos] < 0xe0) {
694f1d6e61Sdillo if (spos >= src_len || !IS_CONT(s[spos+1])) {
704f1d6e61Sdillo spos++;
714f1d6e61Sdillo error++;
724f1d6e61Sdillo continue;
734f1d6e61Sdillo }
744f1d6e61Sdillo c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f);
754f1d6e61Sdillo spos += 2;
764f1d6e61Sdillo if (c < 0x80) {
774f1d6e61Sdillo /* overlong encoding */
784f1d6e61Sdillo error++;
794f1d6e61Sdillo continue;
804f1d6e61Sdillo }
81*f2ee5fedSmaxv } else if (s[spos] < 0xf0) {
82*f2ee5fedSmaxv if (spos >= src_len-2 ||
83*f2ee5fedSmaxv !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
844f1d6e61Sdillo spos++;
854f1d6e61Sdillo error++;
864f1d6e61Sdillo continue;
874f1d6e61Sdillo }
884f1d6e61Sdillo c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
894f1d6e61Sdillo | (s[spos+2] & 0x3f);
904f1d6e61Sdillo spos += 3;
914f1d6e61Sdillo if (c < 0x800 || (c & 0xdf00) == 0xd800 ) {
924f1d6e61Sdillo /* overlong encoding or encoded surrogate */
934f1d6e61Sdillo error++;
944f1d6e61Sdillo continue;
954f1d6e61Sdillo }
96*f2ee5fedSmaxv } else {
974f1d6e61Sdillo uint32_t cc;
984f1d6e61Sdillo /* UTF-16 surrogate pair */
994f1d6e61Sdillo
1004f1d6e61Sdillo if (spos >= src_len-3 || !IS_CONT(s[spos+1])
1014f1d6e61Sdillo || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
1024f1d6e61Sdillo spos++;
1034f1d6e61Sdillo error++;
1044f1d6e61Sdillo continue;
1054f1d6e61Sdillo }
1064f1d6e61Sdillo cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12)
1074f1d6e61Sdillo | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
1084f1d6e61Sdillo spos += 4;
1094f1d6e61Sdillo if (cc < 0x10000) {
1104f1d6e61Sdillo /* overlong encoding */
1114f1d6e61Sdillo error++;
1124f1d6e61Sdillo continue;
1134f1d6e61Sdillo }
1144f1d6e61Sdillo if (dst && dpos < dst_len)
1154f1d6e61Sdillo dst[dpos] = (0xd800 | ((cc-0x10000)>>10));
1164f1d6e61Sdillo dpos++;
1174f1d6e61Sdillo c = 0xdc00 | ((cc-0x10000) & 0x3ffff);
1184f1d6e61Sdillo }
1194f1d6e61Sdillo
1204f1d6e61Sdillo if (dst && dpos < dst_len)
1214f1d6e61Sdillo dst[dpos] = c;
1224f1d6e61Sdillo dpos++;
1234f1d6e61Sdillo }
1244f1d6e61Sdillo
1254f1d6e61Sdillo if (errp)
1264f1d6e61Sdillo *errp = error;
1274f1d6e61Sdillo return dpos;
1284f1d6e61Sdillo #undef IS_CONT
1294f1d6e61Sdillo }
1304f1d6e61Sdillo
1314f1d6e61Sdillo
/*
 * Convert the UTF-16 string src (src_len 16-bit units) to UTF-8.
 *
 * Bytes are stored in dst as long as each complete character fits within
 * dst_len bytes; once a character does not fit, nothing further is
 * stored.  dst may be NULL, in which case nothing is stored.  The return
 * value is the number of bytes the complete conversion produces, which
 * may exceed dst_len.  No terminating NUL is added.
 *
 * Unpaired surrogates are skipped and counted.  If errp is not NULL, the
 * count is stored in *errp.  flags is currently unused.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	size_t spos, dpos;
	int error;

	/*
	 * Stop storing as soon as the next l bytes do not fit.  Written so
	 * that neither side of the comparison can wrap around, even when
	 * dst_len < l or when dpos has already run past dst_len.
	 */
#define CHECK_LENGTH(l)							\
	do {								\
		if (dst_len < (size_t)(l) ||				\
		    dpos > dst_len - (size_t)(l))			\
			dst = NULL;					\
	} while (0)
	/* Store one byte if still storing; always count it. */
#define ADD_BYTE(b)							\
	do {								\
		if (dst != NULL)					\
			dst[dpos] = (char)(b);				\
		dpos++;							\
	} while (0)

	(void)flags;

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		} else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos] >> 6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		} else if ((src[spos] & 0xfc00) == 0xd800) {
			uint32_t c;

			/* first (high) surrogate */
			if (spos + 1 >= src_len ||
			    (src[spos + 1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			/* combine both halves before consuming the second */
			c = ((((uint32_t)src[spos] & 0x3ff) << 10)
			    | ((uint32_t)src[spos + 1] & 0x3ff)) + 0x10000;
			spos++;
			CHECK_LENGTH(4);
			ADD_BYTE(0xf0 | (c >> 18));
			ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
			ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		} else if ((src[spos] & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		} else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos] >> 12));
			ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;
	return dpos;
#undef ADD_BYTE
#undef CHECK_LENGTH
}
185