1 /* $NetBSD: citrus_utf1632.c,v 1.12 2012/02/12 13:51:29 wiz Exp $ */ 2 3 /*- 4 * Copyright (c)2003 Citrus Project, 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #if defined(LIBC_SCCS) && !defined(lint) 31 __RCSID("$NetBSD: citrus_utf1632.c,v 1.12 2012/02/12 13:51:29 wiz Exp $"); 32 #endif /* LIBC_SCCS and not lint */ 33 34 #include <assert.h> 35 #include <errno.h> 36 #include <string.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <stddef.h> 40 #include <limits.h> 41 #include <wchar.h> 42 #include <sys/types.h> 43 #include <machine/endian.h> 44 45 #include "citrus_namespace.h" 46 #include "citrus_types.h" 47 #include "citrus_module.h" 48 #include "citrus_stdenc.h" 49 #include "citrus_bcs.h" 50 51 #include "citrus_utf1632.h" 52 53 54 /* ---------------------------------------------------------------------- 55 * private stuffs used by templates 56 */ 57 58 typedef struct { 59 u_int8_t ch[4]; 60 int chlen; 61 int current_endian; 62 } _UTF1632State; 63 64 typedef struct { 65 int preffered_endian; 66 unsigned int cur_max; 67 #define _ENDIAN_UNKNOWN 0 68 #define _ENDIAN_BIG 1 69 #define _ENDIAN_LITTLE 2 70 u_int32_t mode; 71 #define _MODE_UTF32 0x00000001U 72 #define _MODE_FORCE_ENDIAN 0x00000002U 73 } _UTF1632EncodingInfo; 74 75 #define _FUNCNAME(m) _citrus_UTF1632_##m 76 #define _ENCODING_INFO _UTF1632EncodingInfo 77 #define _ENCODING_STATE _UTF1632State 78 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 79 #define _ENCODING_IS_STATE_DEPENDENT 0 80 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 81 82 83 static __inline void 84 /*ARGSUSED*/ 85 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei, _UTF1632State *s) 86 { 87 memset(s, 0, sizeof(*s)); 88 } 89 90 static int 91 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 92 const char **s, size_t n, _UTF1632State *psenc, 93 size_t *nresult) 94 { 95 int chlenbak, endian, needlen; 96 wchar_t wc; 97 size_t result; 98 const char *s0; 99 100 _DIAGASSERT(nresult != 0); 101 _DIAGASSERT(ei != NULL); 102 _DIAGASSERT(s != NULL); 103 _DIAGASSERT(psenc != NULL); 104 105 s0 = *s; 106 107 if (s0 == NULL) { 108 _citrus_UTF1632_init_state(ei, psenc); 109 *nresult = 0; /* state independent */ 110 return (0); 111 } 112 113 result = 0; 114 chlenbak = psenc->chlen; 115 116 refetch: 117 if ((ei->mode & _MODE_UTF32) != 0 || chlenbak>=2) 118 needlen = 4; 119 else 120 needlen = 2; 121 122 while (chlenbak < needlen) { 123 if (n==0) 124 goto restart; 125 psenc->ch[chlenbak++] = *s0++; 126 n--; 127 result++; 128 } 129 130 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 131 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 132 /* judge endian marker */ 133 if ((ei->mode & _MODE_UTF32) == 0) { 134 /* UTF16 */ 135 if (psenc->ch[0]==0xFE && psenc->ch[1]==0xFF) { 136 psenc->current_endian = _ENDIAN_BIG; 137 chlenbak = 0; 138 goto refetch; 139 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE) { 140 psenc->current_endian = _ENDIAN_LITTLE; 141 chlenbak = 0; 142 goto refetch; 143 } 144 } else { 145 /* UTF32 */ 146 if (psenc->ch[0]==0x00 && psenc->ch[1]==0x00 && 147 psenc->ch[2]==0xFE && psenc->ch[3]==0xFF) { 148 psenc->current_endian = _ENDIAN_BIG; 149 chlenbak = 0; 150 goto refetch; 151 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE && 152 psenc->ch[2]==0x00 && psenc->ch[3]==0x00) { 153 psenc->current_endian = _ENDIAN_LITTLE; 154 chlenbak = 0; 155 goto refetch; 156 } 157 } 158 } 159 psenc->current_endian = ei->preffered_endian; 160 } 161 endian = psenc->current_endian; 162 163 /* get wc */ 164 if ((ei->mode & _MODE_UTF32) == 0) { 165 /* UTF16 */ 166 if (needlen==2) { 167 switch (endian) { 168 case _ENDIAN_LITTLE: 169 wc = (psenc->ch[0] | 170 ((wchar_t)psenc->ch[1] << 8)); 171 break; 172 case _ENDIAN_BIG: 173 wc = (psenc->ch[1] | 174 ((wchar_t)psenc->ch[0] << 8)); 175 break; 176 default: 177 goto ilseq; 178 } 179 if (wc >= 0xD800 && wc <= 0xDBFF) { 180 /* surrogate high */ 181 needlen=4; 182 goto refetch; 183 } 184 } else { 185 /* surrogate low */ 186 wc -= 0xD800; /* wc : surrogate high (see above) */ 187 wc <<= 10; 188 switch (endian) { 189 case _ENDIAN_LITTLE: 190 if (psenc->ch[3]<0xDC || psenc->ch[3]>0xDF) 191 goto ilseq; 192 wc |= psenc->ch[2]; 193 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 194 break; 195 case _ENDIAN_BIG: 196 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 197 goto ilseq; 198 wc |= psenc->ch[3]; 199 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 200 break; 201 default: 202 goto ilseq; 203 } 204 wc += 0x10000; 205 } 206 } else { 207 /* UTF32 */ 208 switch (endian) { 209 case _ENDIAN_LITTLE: 210 wc = (psenc->ch[0] | 211 ((wchar_t)psenc->ch[1] << 8) | 212 ((wchar_t)psenc->ch[2] << 16) | 213 ((wchar_t)psenc->ch[3] << 24)); 214 break; 215 case _ENDIAN_BIG: 216 wc = (psenc->ch[3] | 217 ((wchar_t)psenc->ch[2] << 8) | 218 ((wchar_t)psenc->ch[1] << 16) | 219 ((wchar_t)psenc->ch[0] << 24)); 220 break; 221 default: 222 goto ilseq; 223 } 224 if (wc >= 0xD800 && wc <= 0xDFFF) 225 goto ilseq; 226 } 227 228 229 *pwc = wc; 230 psenc->chlen = 0; 231 *nresult = result; 232 *s = s0; 233 234 return (0); 235 236 ilseq: 237 *nresult = (size_t)-1; 238 psenc->chlen = 0; 239 return (EILSEQ); 240 241 restart: 242 *nresult = (size_t)-2; 243 psenc->chlen = chlenbak; 244 *s = s0; 245 return (0); 246 } 247 248 static int 249 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 250 wchar_t wc, _UTF1632State *psenc, 251 size_t *nresult) 252 { 253 wchar_t wc2; 254 static const char _bom[4] = { 255 #if BYTE_ORDER == BIG_ENDIAN 256 0x00, 0x00, 0xFE, 0xFF, 257 #else 258 0xFF, 0xFE, 0x00, 0x00, 259 #endif 260 }; 261 const char *bom = &_bom[0]; 262 size_t cnt; 263 264 _DIAGASSERT(ei != NULL); 265 _DIAGASSERT(nresult != 0); 266 _DIAGASSERT(s != NULL); 267 268 cnt = (size_t)0; 269 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 270 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 271 if (ei->mode & _MODE_UTF32) { 272 cnt = 4; 273 } else { 274 cnt = 2; 275 #if BYTE_ORDER == BIG_ENDIAN 276 bom += 2; 277 #endif 278 } 279 if (n < cnt) 280 goto e2big; 281 memcpy(s, bom, cnt); 282 s += cnt, n -= cnt; 283 } 284 psenc->current_endian = ei->preffered_endian; 285 } 286 287 wc2 = 0; 288 if ((ei->mode & _MODE_UTF32)==0) { 289 /* UTF16 */ 290 if (wc>0xFFFF) { 291 /* surrogate */ 292 if (wc>0x10FFFF) 293 goto ilseq; 294 if (n < 4) 295 goto e2big; 296 cnt += 4; 297 wc -= 0x10000; 298 wc2 = (wc & 0x3FF) | 0xDC00; 299 wc = (wc>>10) | 0xD800; 300 } else { 301 if (n < 2) 302 goto e2big; 303 cnt += 2; 304 } 305 306 surrogate: 307 switch (psenc->current_endian) { 308 case _ENDIAN_BIG: 309 s[1] = wc; 310 s[0] = (wc >>= 8); 311 break; 312 case _ENDIAN_LITTLE: 313 s[0] = wc; 314 s[1] = (wc >>= 8); 315 break; 316 } 317 if (wc2!=0) { 318 wc = wc2; 319 wc2 = 0; 320 s += 2; 321 goto surrogate; 322 } 323 } else { 324 /* UTF32 */ 325 if (wc >= 0xD800 && wc <= 0xDFFF) 326 goto ilseq; 327 if (n < 4) 328 goto e2big; 329 cnt += 4; 330 switch (psenc->current_endian) { 331 case _ENDIAN_BIG: 332 s[3] = wc; 333 s[2] = (wc >>= 8); 334 s[1] = (wc >>= 8); 335 s[0] = (wc >>= 8); 336 break; 337 case _ENDIAN_LITTLE: 338 s[0] = wc; 339 s[1] = (wc >>= 8); 340 s[2] = (wc >>= 8); 341 s[3] = (wc >>= 8); 342 break; 343 } 344 } 345 *nresult = cnt; 346 347 return 0; 348 349 ilseq: 350 *nresult = (size_t)-1; 351 return EILSEQ; 352 e2big: 353 *nresult = (size_t)-1; 354 return E2BIG; 355 } 356 357 static void 358 parse_variable(_UTF1632EncodingInfo * __restrict ei, 359 const void * __restrict var, size_t lenvar) 360 { 361 #define MATCH(x, act) \ 362 do { \ 363 if (lenvar >= (sizeof(#x)-1) && \ 364 _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \ 365 act; \ 366 lenvar -= sizeof(#x)-1; \ 367 p += sizeof(#x)-1; \ 368 } \ 369 } while (/*CONSTCOND*/0) 370 const char *p; 371 p = var; 372 while (lenvar>0) { 373 switch (*p) { 374 case 'B': 375 case 'b': 376 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 377 break; 378 case 'L': 379 case 'l': 380 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 381 break; 382 case 'F': 383 case 'f': 384 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 385 break; 386 case 'U': 387 case 'u': 388 MATCH(utf32, ei->mode |= _MODE_UTF32); 389 break; 390 } 391 p++; 392 lenvar--; 393 } 394 } 395 396 static int 397 /*ARGSUSED*/ 398 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 399 const void * __restrict var, 400 size_t lenvar) 401 { 402 _DIAGASSERT(ei != NULL); 403 404 memset((void *)ei, 0, sizeof(*ei)); 405 406 parse_variable(ei, var, lenvar); 407 408 if ((ei->mode&_MODE_UTF32)==0) 409 ei->cur_max = 6; /* endian + surrogate */ 410 else 411 ei->cur_max = 8; /* endian + normal */ 412 413 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 414 #if BYTE_ORDER == BIG_ENDIAN 415 ei->preffered_endian = _ENDIAN_BIG; 416 #else 417 ei->preffered_endian = _ENDIAN_LITTLE; 418 #endif 419 } 420 421 return (0); 422 } 423 424 static void 425 /*ARGSUSED*/ 426 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei) 427 { 428 } 429 430 static __inline int 431 /*ARGSUSED*/ 432 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei, 433 _csid_t * __restrict csid, 434 _index_t * __restrict idx, 435 _wc_t wc) 436 { 437 438 _DIAGASSERT(csid != NULL && idx != NULL); 439 440 *csid = 0; 441 *idx = (_index_t)wc; 442 443 return (0); 444 } 445 446 static __inline int 447 /*ARGSUSED*/ 448 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei, 449 _wc_t * __restrict wc, 450 _csid_t csid, _index_t idx) 451 { 452 453 _DIAGASSERT(wc != NULL); 454 455 if (csid != 0) 456 return (EILSEQ); 457 458 *wc = (_wc_t)idx; 459 460 return (0); 461 } 462 463 static __inline int 464 /*ARGSUSED*/ 465 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei, 466 _UTF1632State * __restrict psenc, 467 int * __restrict rstate) 468 { 469 470 if (psenc->chlen == 0) 471 *rstate = _STDENC_SDGEN_INITIAL; 472 else 473 *rstate = _STDENC_SDGEN_INCOMPLETE_CHAR; 474 475 return 0; 476 } 477 478 /* ---------------------------------------------------------------------- 479 * public interface for stdenc 480 */ 481 482 _CITRUS_STDENC_DECLS(UTF1632); 483 _CITRUS_STDENC_DEF_OPS(UTF1632); 484 485 #include "citrus_stdenc_template.h" 486