1 /* $NetBSD: citrus_utf1632.c,v 1.4 2005/10/29 18:02:04 tshiozak Exp $ */ 2 3 /*- 4 * Copyright (c)2003 Citrus Project, 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #if defined(LIBC_SCCS) && !defined(lint) 31 __RCSID("$NetBSD: citrus_utf1632.c,v 1.4 2005/10/29 18:02:04 tshiozak Exp $"); 32 #endif /* LIBC_SCCS and not lint */ 33 34 #include <assert.h> 35 #include <errno.h> 36 #include <string.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <stddef.h> 40 #include <locale.h> 41 #include <limits.h> 42 #include <wchar.h> 43 #include <sys/types.h> 44 #include <sys/endian.h> 45 46 #include "citrus_namespace.h" 47 #include "citrus_types.h" 48 #include "citrus_module.h" 49 #include "citrus_stdenc.h" 50 #include "citrus_bcs.h" 51 52 #include "citrus_utf1632.h" 53 54 55 /* ---------------------------------------------------------------------- 56 * private stuffs used by templates 57 */ 58 59 typedef struct { 60 u_int8_t ch[4]; 61 int chlen; 62 int current_endian; 63 } _UTF1632State; 64 65 typedef struct { 66 int preffered_endian; 67 unsigned int cur_max; 68 #define _ENDIAN_UNKNOWN 0 69 #define _ENDIAN_BIG 1 70 #define _ENDIAN_LITTLE 2 71 u_int32_t mode; 72 #define _MODE_UTF32 0x00000001U 73 #define _MODE_FORCE_ENDIAN 0x00000002U 74 } _UTF1632EncodingInfo; 75 76 #define _FUNCNAME(m) _citrus_UTF1632_##m 77 #define _ENCODING_INFO _UTF1632EncodingInfo 78 #define _ENCODING_STATE _UTF1632State 79 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 80 #define _ENCODING_IS_STATE_DEPENDENT 0 81 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 82 83 84 static __inline void 85 /*ARGSUSED*/ 86 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei, _UTF1632State *s) 87 { 88 memset(s, 0, sizeof(*s)); 89 } 90 91 static int 92 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 93 const char **s, size_t n, _UTF1632State *psenc, 94 size_t *nresult) 95 { 96 int chlenbak, endian, needlen; 97 wchar_t wc; 98 size_t result; 99 const char *s0; 100 101 _DIAGASSERT(nresult != 0); 102 _DIAGASSERT(ei != NULL); 103 _DIAGASSERT(s != NULL); 104 _DIAGASSERT(psenc != NULL); 105 106 s0 = *s; 107 108 if (s0 == NULL) { 109 _citrus_UTF1632_init_state(ei, psenc); 110 *nresult = 0; /* state independent */ 111 return (0); 112 } 113 114 result = 0; 115 chlenbak = psenc->chlen; 116 117 refetch: 118 if ((ei->mode & _MODE_UTF32) != 0 || chlenbak>=2) 119 needlen = 4; 120 else 121 needlen = 2; 122 123 while (chlenbak < needlen) { 124 if (n==0) 125 goto restart; 126 psenc->ch[chlenbak++] = *s0++; 127 n--; 128 result++; 129 } 130 131 /* judge endian marker */ 132 if ((ei->mode & _MODE_UTF32) == 0) { 133 /* UTF16 */ 134 if (psenc->ch[0]==0xFE && psenc->ch[1]==0xFF) { 135 psenc->current_endian = _ENDIAN_BIG; 136 chlenbak = 0; 137 goto refetch; 138 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE) { 139 psenc->current_endian = _ENDIAN_LITTLE; 140 chlenbak = 0; 141 goto refetch; 142 } 143 } else { 144 /* UTF32 */ 145 if (psenc->ch[0]==0x00 && psenc->ch[1]==0x00 && 146 psenc->ch[2]==0xFE && psenc->ch[3]==0xFF) { 147 psenc->current_endian = _ENDIAN_BIG; 148 chlenbak = 0; 149 goto refetch; 150 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE && 151 psenc->ch[2]==0x00 && psenc->ch[3]==0x00) { 152 psenc->current_endian = _ENDIAN_LITTLE; 153 chlenbak = 0; 154 goto refetch; 155 } 156 } 157 if ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 158 psenc->current_endian == _ENDIAN_UNKNOWN) 159 endian = ei->preffered_endian; 160 else 161 endian = psenc->current_endian; 162 163 /* get wc */ 164 if ((ei->mode & _MODE_UTF32) == 0) { 165 /* UTF16 */ 166 if (needlen==2) { 167 switch (endian) { 168 case _ENDIAN_LITTLE: 169 wc = (psenc->ch[0] | 170 ((wchar_t)psenc->ch[1] << 8)); 171 break; 172 case _ENDIAN_BIG: 173 wc = (psenc->ch[1] | 174 ((wchar_t)psenc->ch[0] << 8)); 175 break; 176 } 177 if (wc >= 0xD800 && wc <= 0xDBFF) { 178 /* surrogate high */ 179 needlen=4; 180 goto refetch; 181 } 182 } else { 183 /* surrogate low */ 184 wc -= 0xD800; /* wc : surrogate high (see above) */ 185 wc <<= 10; 186 switch (endian) { 187 case _ENDIAN_LITTLE: 188 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 189 goto ilseq; 190 wc |= psenc->ch[2]; 191 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 192 break; 193 case _ENDIAN_BIG: 194 if (psenc->ch[3]<0xDC || psenc->ch[3]>0xDF) 195 goto ilseq; 196 wc |= psenc->ch[3]; 197 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 198 break; 199 } 200 wc += 0x10000; 201 } 202 } else { 203 /* UTF32 */ 204 switch (endian) { 205 case _ENDIAN_LITTLE: 206 wc = (psenc->ch[0] | 207 ((wchar_t)psenc->ch[1] << 8) | 208 ((wchar_t)psenc->ch[2] << 16) | 209 ((wchar_t)psenc->ch[3] << 24)); 210 break; 211 case _ENDIAN_BIG: 212 wc = (psenc->ch[3] | 213 ((wchar_t)psenc->ch[2] << 8) | 214 ((wchar_t)psenc->ch[1] << 16) | 215 ((wchar_t)psenc->ch[0] << 24)); 216 break; 217 } 218 } 219 220 221 *pwc = wc; 222 psenc->chlen = 0; 223 *nresult = result; 224 *s = s0; 225 226 return (0); 227 228 ilseq: 229 *nresult = (size_t)-1; 230 psenc->chlen = 0; 231 return (EILSEQ); 232 233 restart: 234 *nresult = (size_t)-2; 235 psenc->chlen = chlenbak; 236 *s = s0; 237 return (0); 238 } 239 240 static int 241 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 242 wchar_t wc, _UTF1632State *psenc, 243 size_t *nresult) 244 { 245 int ret; 246 wchar_t wc2; 247 248 _DIAGASSERT(ei != NULL); 249 _DIAGASSERT(nresult != 0); 250 _DIAGASSERT(s != NULL); 251 252 wc2 = 0; 253 if ((ei->mode & _MODE_UTF32)==0) { 254 /* UTF16 */ 255 if (wc>0xFFFF) { 256 /* surrogate */ 257 if (wc>0x10FFFF) { 258 ret = EILSEQ; 259 goto err; 260 } 261 if (n < 4) { 262 ret = E2BIG; 263 goto err; 264 } 265 wc -= 0x10000; 266 wc2 = (wc & 0x3FF) | 0xDC00; 267 wc = (wc>>10) | 0xD800; 268 *nresult = (size_t)4; 269 } else { 270 if (n < 2) { 271 ret = E2BIG; 272 goto err; 273 } 274 *nresult = (size_t)2; 275 } 276 277 surrogate: 278 switch (ei->preffered_endian) { 279 case _ENDIAN_BIG: 280 s[1] = wc; 281 s[0] = (wc >>= 8); 282 break; 283 case _ENDIAN_LITTLE: 284 s[0] = wc; 285 s[1] = (wc >>= 8); 286 break; 287 } 288 if (wc2!=0) { 289 wc = wc2; 290 wc2 = 0; 291 s += 2; 292 goto surrogate; 293 } 294 } else { 295 /* UTF32 */ 296 if (n < 4) { 297 ret = E2BIG; 298 goto err; 299 } 300 switch (ei->preffered_endian) { 301 case _ENDIAN_BIG: 302 s[3] = wc; 303 s[2] = (wc >>= 8); 304 s[1] = (wc >>= 8); 305 s[0] = (wc >>= 8); 306 break; 307 case _ENDIAN_LITTLE: 308 s[0] = wc; 309 s[1] = (wc >>= 8); 310 s[2] = (wc >>= 8); 311 s[3] = (wc >>= 8); 312 break; 313 } 314 *nresult = (size_t)4; 315 } 316 317 return 0; 318 319 err: 320 *nresult = (size_t)-1; 321 return ret; 322 } 323 324 static void 325 parse_variable(_UTF1632EncodingInfo * __restrict ei, 326 const void * __restrict var, size_t lenvar) 327 { 328 #define MATCH(x, act) \ 329 do { \ 330 if (lenvar >= (sizeof(#x)-1) && \ 331 _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \ 332 act; \ 333 lenvar -= sizeof(#x)-1; \ 334 p += sizeof(#x)-1; \ 335 } \ 336 } while (/*CONSTCOND*/0) 337 const char *p; 338 p = var; 339 while (lenvar>0) { 340 switch (*p) { 341 case 'B': 342 case 'b': 343 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 344 break; 345 case 'L': 346 case 'l': 347 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 348 break; 349 case 'F': 350 case 'f': 351 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 352 break; 353 case 'U': 354 case 'u': 355 MATCH(utf32, ei->mode |= _MODE_UTF32); 356 break; 357 } 358 p++; 359 lenvar--; 360 } 361 } 362 363 static int 364 /*ARGSUSED*/ 365 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 366 const void * __restrict var, 367 size_t lenvar) 368 { 369 _DIAGASSERT(ei != NULL); 370 371 memset((void *)ei, 0, sizeof(*ei)); 372 373 parse_variable(ei, var, lenvar); 374 375 if ((ei->mode&_MODE_UTF32)==0) 376 ei->cur_max = 6; /* endian + surrogate */ 377 else 378 ei->cur_max = 8; /* endian + normal */ 379 380 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 381 #if BYTE_ORDER == BIG_ENDIAN 382 ei->preffered_endian = _ENDIAN_BIG; 383 #else 384 ei->preffered_endian = _ENDIAN_LITTLE; 385 #endif 386 } 387 388 return (0); 389 } 390 391 static void 392 /*ARGSUSED*/ 393 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei) 394 { 395 } 396 397 static __inline int 398 /*ARGSUSED*/ 399 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei, 400 _csid_t * __restrict csid, 401 _index_t * __restrict idx, 402 _wc_t wc) 403 { 404 405 _DIAGASSERT(csid != NULL && idx != NULL); 406 407 *csid = 0; 408 *idx = (_index_t)wc; 409 410 return (0); 411 } 412 413 static __inline int 414 /*ARGSUSED*/ 415 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei, 416 _wc_t * __restrict wc, 417 _csid_t csid, _index_t idx) 418 { 419 420 _DIAGASSERT(wc != NULL); 421 422 if (csid != 0) 423 return (EILSEQ); 424 425 *wc = (_wc_t)idx; 426 427 return (0); 428 } 429 430 static __inline int 431 /*ARGSUSED*/ 432 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei, 433 _UTF1632State * __restrict psenc, 434 int * __restrict rstate) 435 { 436 437 if (psenc->chlen == 0) 438 *rstate = _STDENC_SDGEN_INITIAL; 439 else 440 *rstate = _STDENC_SDGEN_INCOMPLETE_CHAR; 441 442 return 0; 443 } 444 445 /* ---------------------------------------------------------------------- 446 * public interface for stdenc 447 */ 448 449 _CITRUS_STDENC_DECLS(UTF1632); 450 _CITRUS_STDENC_DEF_OPS(UTF1632); 451 452 #include "citrus_stdenc_template.h" 453