1 /* $NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $ */ 2 3 /*- 4 * Copyright (c)2003 Citrus Project, 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #if defined(LIBC_SCCS) && !defined(lint) 31 __RCSID("$NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $"); 32 #endif /* LIBC_SCCS and not lint */ 33 34 #include <assert.h> 35 #include <errno.h> 36 #include <string.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <stddef.h> 40 #include <limits.h> 41 #include <wchar.h> 42 #include <sys/types.h> 43 #include <machine/endian.h> 44 45 #include "citrus_namespace.h" 46 #include "citrus_types.h" 47 #include "citrus_module.h" 48 #include "citrus_stdenc.h" 49 #include "citrus_bcs.h" 50 51 #include "citrus_utf1632.h" 52 53 54 /* ---------------------------------------------------------------------- 55 * private stuffs used by templates 56 */ 57 58 typedef struct { 59 u_int8_t ch[4]; 60 int chlen; 61 int current_endian; 62 } _UTF1632State; 63 64 typedef struct { 65 int preffered_endian; 66 unsigned int cur_max; 67 #define _ENDIAN_UNKNOWN 0 68 #define _ENDIAN_BIG 1 69 #define _ENDIAN_LITTLE 2 70 u_int32_t mode; 71 #define _MODE_UTF32 0x00000001U 72 #define _MODE_FORCE_ENDIAN 0x00000002U 73 } _UTF1632EncodingInfo; 74 75 #define _FUNCNAME(m) _citrus_UTF1632_##m 76 #define _ENCODING_INFO _UTF1632EncodingInfo 77 #define _ENCODING_STATE _UTF1632State 78 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 79 #define _ENCODING_IS_STATE_DEPENDENT 0 80 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 81 82 83 static __inline void 84 /*ARGSUSED*/ 85 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei, _UTF1632State *s) 86 { 87 memset(s, 0, sizeof(*s)); 88 } 89 90 static int 91 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 92 const char **s, size_t n, _UTF1632State *psenc, 93 size_t *nresult) 94 { 95 int chlenbak, endian, needlen; 96 wchar_t wc; 97 size_t result; 98 const char *s0; 99 100 _DIAGASSERT(nresult != 0); 101 _DIAGASSERT(ei != NULL); 102 _DIAGASSERT(s != NULL); 103 _DIAGASSERT(psenc != NULL); 104 105 s0 = *s; 106 107 if (s0 == NULL) { 108 _citrus_UTF1632_init_state(ei, psenc); 109 *nresult = 0; /* state independent */ 110 return (0); 111 } 112 113 result = 0; 114 chlenbak = psenc->chlen; 115 116 refetch: 117 if ((ei->mode & _MODE_UTF32) != 0 || chlenbak>=2) 118 needlen = 4; 119 else 120 needlen = 2; 121 122 while (chlenbak < needlen) { 123 if (n==0) 124 goto restart; 125 psenc->ch[chlenbak++] = *s0++; 126 n--; 127 result++; 128 } 129 130 /* judge endian marker */ 131 if ((ei->mode & _MODE_UTF32) == 0) { 132 /* UTF16 */ 133 if (psenc->ch[0]==0xFE && psenc->ch[1]==0xFF) { 134 psenc->current_endian = _ENDIAN_BIG; 135 chlenbak = 0; 136 goto refetch; 137 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE) { 138 psenc->current_endian = _ENDIAN_LITTLE; 139 chlenbak = 0; 140 goto refetch; 141 } 142 } else { 143 /* UTF32 */ 144 if (psenc->ch[0]==0x00 && psenc->ch[1]==0x00 && 145 psenc->ch[2]==0xFE && psenc->ch[3]==0xFF) { 146 psenc->current_endian = _ENDIAN_BIG; 147 chlenbak = 0; 148 goto refetch; 149 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE && 150 psenc->ch[2]==0x00 && psenc->ch[3]==0x00) { 151 psenc->current_endian = _ENDIAN_LITTLE; 152 chlenbak = 0; 153 goto refetch; 154 } 155 } 156 if ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 157 psenc->current_endian == _ENDIAN_UNKNOWN) 158 endian = ei->preffered_endian; 159 else 160 endian = psenc->current_endian; 161 162 /* get wc */ 163 if ((ei->mode & _MODE_UTF32) == 0) { 164 /* UTF16 */ 165 if (needlen==2) { 166 switch (endian) { 167 case _ENDIAN_LITTLE: 168 wc = (psenc->ch[0] | 169 ((wchar_t)psenc->ch[1] << 8)); 170 break; 171 case _ENDIAN_BIG: 172 wc = (psenc->ch[1] | 173 ((wchar_t)psenc->ch[0] << 8)); 174 break; 175 default: 176 goto ilseq; 177 } 178 if (wc >= 0xD800 && wc <= 0xDBFF) { 179 /* surrogate high */ 180 needlen=4; 181 goto refetch; 182 } 183 } else { 184 /* surrogate low */ 185 wc -= 0xD800; /* wc : surrogate high (see above) */ 186 wc <<= 10; 187 switch (endian) { 188 case _ENDIAN_LITTLE: 189 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 190 goto ilseq; 191 wc |= psenc->ch[2]; 192 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 193 break; 194 case _ENDIAN_BIG: 195 if (psenc->ch[3]<0xDC || psenc->ch[3]>0xDF) 196 goto ilseq; 197 wc |= psenc->ch[3]; 198 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 199 break; 200 default: 201 goto ilseq; 202 } 203 wc += 0x10000; 204 } 205 } else { 206 /* UTF32 */ 207 switch (endian) { 208 case _ENDIAN_LITTLE: 209 wc = (psenc->ch[0] | 210 ((wchar_t)psenc->ch[1] << 8) | 211 ((wchar_t)psenc->ch[2] << 16) | 212 ((wchar_t)psenc->ch[3] << 24)); 213 break; 214 case _ENDIAN_BIG: 215 wc = (psenc->ch[3] | 216 ((wchar_t)psenc->ch[2] << 8) | 217 ((wchar_t)psenc->ch[1] << 16) | 218 ((wchar_t)psenc->ch[0] << 24)); 219 break; 220 default: 221 goto ilseq; 222 } 223 if (wc >= 0xD800 && wc <= 0xDFFF) 224 goto ilseq; 225 } 226 227 228 *pwc = wc; 229 psenc->chlen = 0; 230 *nresult = result; 231 *s = s0; 232 233 return (0); 234 235 ilseq: 236 *nresult = (size_t)-1; 237 psenc->chlen = 0; 238 return (EILSEQ); 239 240 restart: 241 *nresult = (size_t)-2; 242 psenc->chlen = chlenbak; 243 *s = s0; 244 return (0); 245 } 246 247 static int 248 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 249 wchar_t wc, _UTF1632State *psenc, 250 size_t *nresult) 251 { 252 int ret; 253 wchar_t wc2; 254 static const char _bom[4] = { 255 #if BYTE_ORDER == BIG_ENDIAN 256 0x00, 0x00, 0xFE, 0xFF, 257 #else 258 0xFF, 0xFE, 0x00, 0x00, 259 #endif 260 }; 261 const char *bom = &_bom[0]; 262 size_t cnt; 263 264 _DIAGASSERT(ei != NULL); 265 _DIAGASSERT(nresult != 0); 266 _DIAGASSERT(s != NULL); 267 268 cnt = (size_t)0; 269 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 270 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 271 if (ei->mode & _MODE_UTF32) { 272 cnt = 4; 273 } else { 274 cnt = 2; 275 #if BYTE_ORDER == BIG_ENDIAN 276 bom += 2; 277 #endif 278 } 279 if (n < cnt) 280 goto e2big; 281 memcpy(s, bom, cnt); 282 s += cnt, n -= cnt; 283 } 284 psenc->current_endian = ei->preffered_endian; 285 } 286 287 wc2 = 0; 288 if ((ei->mode & _MODE_UTF32)==0) { 289 /* UTF16 */ 290 if (wc>0xFFFF) { 291 /* surrogate */ 292 if (wc>0x10FFFF) 293 goto ilseq; 294 if (n < 4) 295 goto e2big; 296 cnt += 4; 297 wc -= 0x10000; 298 wc2 = (wc & 0x3FF) | 0xDC00; 299 wc = (wc>>10) | 0xD800; 300 } else { 301 if (n < 2) 302 goto e2big; 303 cnt += 2; 304 } 305 306 surrogate: 307 switch (psenc->current_endian) { 308 case _ENDIAN_BIG: 309 s[1] = wc; 310 s[0] = (wc >>= 8); 311 break; 312 case _ENDIAN_LITTLE: 313 s[0] = wc; 314 s[1] = (wc >>= 8); 315 break; 316 } 317 if (wc2!=0) { 318 wc = wc2; 319 wc2 = 0; 320 s += 2; 321 goto surrogate; 322 } 323 } else { 324 /* UTF32 */ 325 if (wc >= 0xD800 && wc <= 0xDFFF) 326 goto ilseq; 327 if (n < 4) 328 goto e2big; 329 cnt += 4; 330 switch (psenc->current_endian) { 331 case _ENDIAN_BIG: 332 s[3] = wc; 333 s[2] = (wc >>= 8); 334 s[1] = (wc >>= 8); 335 s[0] = (wc >>= 8); 336 break; 337 case _ENDIAN_LITTLE: 338 s[0] = wc; 339 s[1] = (wc >>= 8); 340 s[2] = (wc >>= 8); 341 s[3] = (wc >>= 8); 342 break; 343 } 344 } 345 *nresult = cnt; 346 347 return 0; 348 349 ilseq: 350 *nresult = (size_t)-1; 351 return EILSEQ; 352 e2big: 353 *nresult = (size_t)-1; 354 return E2BIG; 355 } 356 357 static void 358 parse_variable(_UTF1632EncodingInfo * __restrict ei, 359 const void * __restrict var, size_t lenvar) 360 { 361 #define MATCH(x, act) \ 362 do { \ 363 if (lenvar >= (sizeof(#x)-1) && \ 364 _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \ 365 act; \ 366 lenvar -= sizeof(#x)-1; \ 367 p += sizeof(#x)-1; \ 368 } \ 369 } while (/*CONSTCOND*/0) 370 const char *p; 371 p = var; 372 while (lenvar>0) { 373 switch (*p) { 374 case 'B': 375 case 'b': 376 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 377 break; 378 case 'L': 379 case 'l': 380 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 381 break; 382 case 'F': 383 case 'f': 384 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 385 break; 386 case 'U': 387 case 'u': 388 MATCH(utf32, ei->mode |= _MODE_UTF32); 389 break; 390 } 391 p++; 392 lenvar--; 393 } 394 } 395 396 static int 397 /*ARGSUSED*/ 398 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 399 const void * __restrict var, 400 size_t lenvar) 401 { 402 _DIAGASSERT(ei != NULL); 403 404 memset((void *)ei, 0, sizeof(*ei)); 405 406 parse_variable(ei, var, lenvar); 407 408 if ((ei->mode&_MODE_UTF32)==0) 409 ei->cur_max = 6; /* endian + surrogate */ 410 else 411 ei->cur_max = 8; /* endian + normal */ 412 413 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 414 #if BYTE_ORDER == BIG_ENDIAN 415 ei->preffered_endian = _ENDIAN_BIG; 416 #else 417 ei->preffered_endian = _ENDIAN_LITTLE; 418 #endif 419 } 420 421 return (0); 422 } 423 424 static void 425 /*ARGSUSED*/ 426 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei) 427 { 428 } 429 430 static __inline int 431 /*ARGSUSED*/ 432 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei, 433 _csid_t * __restrict csid, 434 _index_t * __restrict idx, 435 _wc_t wc) 436 { 437 438 _DIAGASSERT(csid != NULL && idx != NULL); 439 440 *csid = 0; 441 *idx = (_index_t)wc; 442 443 return (0); 444 } 445 446 static __inline int 447 /*ARGSUSED*/ 448 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei, 449 _wc_t * __restrict wc, 450 _csid_t csid, _index_t idx) 451 { 452 453 _DIAGASSERT(wc != NULL); 454 455 if (csid != 0) 456 return (EILSEQ); 457 458 *wc = (_wc_t)idx; 459 460 return (0); 461 } 462 463 static __inline int 464 /*ARGSUSED*/ 465 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei, 466 _UTF1632State * __restrict psenc, 467 int * __restrict rstate) 468 { 469 470 if (psenc->chlen == 0) 471 *rstate = _STDENC_SDGEN_INITIAL; 472 else 473 *rstate = _STDENC_SDGEN_INCOMPLETE_CHAR; 474 475 return 0; 476 } 477 478 /* ---------------------------------------------------------------------- 479 * public interface for stdenc 480 */ 481 482 _CITRUS_STDENC_DECLS(UTF1632); 483 _CITRUS_STDENC_DEF_OPS(UTF1632); 484 485 #include "citrus_stdenc_template.h" 486