/* Conversion UTF-8 to UCS-4.
   Copyright (C) 2001-2002 Free Software Foundation, Inc.
   Written by Bruno Haible <haible@clisp.cons.org>, 2001.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */


#include <stddef.h>

/* Slow path of u8_mbtouc: decode the character that starts at S, where N is
   the number of units available at S.  S[0] is read unconditionally, so N
   must be at least 1.

   Result:
     - Well-formed 2-, 3- or 4-byte sequence: the decoded value is stored in
       *PUC and the sequence length (2, 3 or 4) is returned.
     - Lead byte announces more units than N provides (incomplete character):
       0xfffd is stored in *PUC and N is returned.
     - Anything else (invalid character): 0xfffd is stored in *PUC and 1 is
       returned.  This includes every lead byte below 0xc2, so calling this
       function directly on an ASCII byte also yields 0xfffd.  */
static int
u8_mbtouc_aux (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char c = *s;

  /* (b ^ 0x80) < 0x40 holds exactly when b is a continuation byte
     0x80..0xbf; the xor also leaves the six payload bits in place.  */
  if (c >= 0xc2)
    {
      if (c < 0xe0)
        {
          /* Two-byte form.  Lead bytes 0xc0 and 0xc1 could only produce
             overlong encodings and were excluded by the test above.  */
          if (n >= 2)
            {
              if ((s[1] ^ 0x80) < 0x40)
                {
                  *puc = ((unsigned int) (c & 0x1f) << 6)
                         | (unsigned int) (s[1] ^ 0x80);
                  return 2;
                }
              /* invalid multibyte character */
            }
          else
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return (int) n;
            }
        }
      else if (c < 0xf0)
        {
          /* Three-byte form.  */
          if (n >= 3)
            {
              if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
                  /* Reject overlong encodings of values below 0x800.  */
                  && (c >= 0xe1 || s[1] >= 0xa0)
                  /* Reject the surrogate range 0xd800..0xdfff, which is
                     not allowed in well-formed UTF-8.  */
                  && (c != 0xed || s[1] < 0xa0))
                {
                  *puc = ((unsigned int) (c & 0x0f) << 12)
                         | ((unsigned int) (s[1] ^ 0x80) << 6)
                         | (unsigned int) (s[2] ^ 0x80);
                  return 3;
                }
              /* invalid multibyte character */
            }
          else
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return (int) n;
            }
        }
      else if (c < 0xf8)
        {
          /* Four-byte form.  */
          if (n >= 4)
            {
              if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
                  && (s[3] ^ 0x80) < 0x40
                  /* Reject overlong encodings of values below 0x10000.  */
                  && (c >= 0xf1 || s[1] >= 0x90)
                  /* Reject values above 0x10ffff.  */
                  && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
                {
                  *puc = ((unsigned int) (c & 0x07) << 18)
                         | ((unsigned int) (s[1] ^ 0x80) << 12)
                         | ((unsigned int) (s[2] ^ 0x80) << 6)
                         | (unsigned int) (s[3] ^ 0x80);
                  return 4;
                }
              /* invalid multibyte character */
            }
          else
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return (int) n;
            }
        }
      /* Lead bytes 0xf8..0xff (the historical 5- and 6-byte forms) are not
         decoded; they fall through to the invalid case below.  */
    }
  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}

/* Return the length (number of units) of the first character in S, putting
   its 'ucs4_t' representation in *PUC.  N is the number of units available
   at S and must be at least 1, because S[0] is read unconditionally.

   Bytes below 0x80 are handled inline and yield length 1; everything else
   is delegated to u8_mbtouc_aux, which stores 0xfffd in *PUC for invalid or
   incomplete input.  */
static inline int
u8_mbtouc (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char c = *s;

  if (c < 0x80)
    {
      *puc = c;
      return 1;
    }
  else
    return u8_mbtouc_aux (puc, s, n);
}