1 /* $NetBSD: citrus_utf1632.c,v 1.7 2006/10/27 14:13:55 tnozaki Exp $ */ 2 3 /*- 4 * Copyright (c)2003 Citrus Project, 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #if defined(LIBC_SCCS) && !defined(lint) 31 __RCSID("$NetBSD: citrus_utf1632.c,v 1.7 2006/10/27 14:13:55 tnozaki Exp $"); 32 #endif /* LIBC_SCCS and not lint */ 33 34 #include <assert.h> 35 #include <errno.h> 36 #include <string.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <stddef.h> 40 #include <locale.h> 41 #include <limits.h> 42 #include <wchar.h> 43 #include <sys/types.h> 44 #include <machine/endian.h> 45 46 #include "citrus_namespace.h" 47 #include "citrus_types.h" 48 #include "citrus_module.h" 49 #include "citrus_stdenc.h" 50 #include "citrus_bcs.h" 51 52 #include "citrus_utf1632.h" 53 54 55 /* ---------------------------------------------------------------------- 56 * private stuffs used by templates 57 */ 58 59 typedef struct { 60 u_int8_t ch[4]; 61 int chlen; 62 int current_endian; 63 } _UTF1632State; 64 65 typedef struct { 66 int preffered_endian; 67 unsigned int cur_max; 68 #define _ENDIAN_UNKNOWN 0 69 #define _ENDIAN_BIG 1 70 #define _ENDIAN_LITTLE 2 71 u_int32_t mode; 72 #define _MODE_UTF32 0x00000001U 73 #define _MODE_FORCE_ENDIAN 0x00000002U 74 } _UTF1632EncodingInfo; 75 76 #define _FUNCNAME(m) _citrus_UTF1632_##m 77 #define _ENCODING_INFO _UTF1632EncodingInfo 78 #define _ENCODING_STATE _UTF1632State 79 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 80 #define _ENCODING_IS_STATE_DEPENDENT 0 81 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 82 83 84 static __inline void 85 /*ARGSUSED*/ 86 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei, _UTF1632State *s) 87 { 88 memset(s, 0, sizeof(*s)); 89 } 90 91 static int 92 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 93 const char **s, size_t n, _UTF1632State *psenc, 94 size_t *nresult) 95 { 96 int chlenbak, endian, needlen; 97 wchar_t wc; 98 size_t result; 99 const char *s0; 100 101 _DIAGASSERT(nresult != 0); 102 _DIAGASSERT(ei != NULL); 103 _DIAGASSERT(s != NULL); 104 _DIAGASSERT(psenc != NULL); 105 106 s0 = *s; 107 108 if (s0 == NULL) { 109 _citrus_UTF1632_init_state(ei, psenc); 110 *nresult = 0; /* state independent */ 111 return (0); 112 } 113 114 result = 0; 115 chlenbak = psenc->chlen; 116 117 refetch: 118 if ((ei->mode & _MODE_UTF32) != 0 || chlenbak>=2) 119 needlen = 4; 120 else 121 needlen = 2; 122 123 while (chlenbak < needlen) { 124 if (n==0) 125 goto restart; 126 psenc->ch[chlenbak++] = *s0++; 127 n--; 128 result++; 129 } 130 131 /* judge endian marker */ 132 if ((ei->mode & _MODE_UTF32) == 0) { 133 /* UTF16 */ 134 if (psenc->ch[0]==0xFE && psenc->ch[1]==0xFF) { 135 psenc->current_endian = _ENDIAN_BIG; 136 chlenbak = 0; 137 goto refetch; 138 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE) { 139 psenc->current_endian = _ENDIAN_LITTLE; 140 chlenbak = 0; 141 goto refetch; 142 } 143 } else { 144 /* UTF32 */ 145 if (psenc->ch[0]==0x00 && psenc->ch[1]==0x00 && 146 psenc->ch[2]==0xFE && psenc->ch[3]==0xFF) { 147 psenc->current_endian = _ENDIAN_BIG; 148 chlenbak = 0; 149 goto refetch; 150 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE && 151 psenc->ch[2]==0x00 && psenc->ch[3]==0x00) { 152 psenc->current_endian = _ENDIAN_LITTLE; 153 chlenbak = 0; 154 goto refetch; 155 } 156 } 157 if ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 158 psenc->current_endian == _ENDIAN_UNKNOWN) 159 endian = ei->preffered_endian; 160 else 161 endian = psenc->current_endian; 162 163 /* get wc */ 164 if ((ei->mode & _MODE_UTF32) == 0) { 165 /* UTF16 */ 166 if (needlen==2) { 167 switch (endian) { 168 case _ENDIAN_LITTLE: 169 wc = (psenc->ch[0] | 170 ((wchar_t)psenc->ch[1] << 8)); 171 break; 172 case _ENDIAN_BIG: 173 wc = (psenc->ch[1] | 174 ((wchar_t)psenc->ch[0] << 8)); 175 break; 176 default: 177 goto ilseq; 178 } 179 if (wc >= 0xD800 && wc <= 0xDBFF) { 180 /* surrogate high */ 181 needlen=4; 182 goto refetch; 183 } 184 } else { 185 /* surrogate low */ 186 wc -= 0xD800; /* wc : surrogate high (see above) */ 187 wc <<= 10; 188 switch (endian) { 189 case _ENDIAN_LITTLE: 190 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 191 goto ilseq; 192 wc |= psenc->ch[2]; 193 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 194 break; 195 case _ENDIAN_BIG: 196 if (psenc->ch[3]<0xDC || psenc->ch[3]>0xDF) 197 goto ilseq; 198 wc |= psenc->ch[3]; 199 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 200 break; 201 default: 202 goto ilseq; 203 } 204 wc += 0x10000; 205 } 206 } else { 207 /* UTF32 */ 208 switch (endian) { 209 case _ENDIAN_LITTLE: 210 wc = (psenc->ch[0] | 211 ((wchar_t)psenc->ch[1] << 8) | 212 ((wchar_t)psenc->ch[2] << 16) | 213 ((wchar_t)psenc->ch[3] << 24)); 214 break; 215 case _ENDIAN_BIG: 216 wc = (psenc->ch[3] | 217 ((wchar_t)psenc->ch[2] << 8) | 218 ((wchar_t)psenc->ch[1] << 16) | 219 ((wchar_t)psenc->ch[0] << 24)); 220 break; 221 default: 222 goto ilseq; 223 } 224 if (wc >= 0xD800 && wc <= 0xDFFF) 225 goto ilseq; 226 } 227 228 229 *pwc = wc; 230 psenc->chlen = 0; 231 *nresult = result; 232 *s = s0; 233 234 return (0); 235 236 ilseq: 237 *nresult = (size_t)-1; 238 psenc->chlen = 0; 239 return (EILSEQ); 240 241 restart: 242 *nresult = (size_t)-2; 243 psenc->chlen = chlenbak; 244 *s = s0; 245 return (0); 246 } 247 248 static int 249 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 250 wchar_t wc, _UTF1632State *psenc, 251 size_t *nresult) 252 { 253 int ret; 254 wchar_t wc2; 255 256 _DIAGASSERT(ei != NULL); 257 _DIAGASSERT(nresult != 0); 258 _DIAGASSERT(s != NULL); 259 260 wc2 = 0; 261 if ((ei->mode & _MODE_UTF32)==0) { 262 /* UTF16 */ 263 if (wc>0xFFFF) { 264 /* surrogate */ 265 if (wc>0x10FFFF) { 266 ret = EILSEQ; 267 goto err; 268 } 269 if (n < 4) { 270 ret = E2BIG; 271 goto err; 272 } 273 wc -= 0x10000; 274 wc2 = (wc & 0x3FF) | 0xDC00; 275 wc = (wc>>10) | 0xD800; 276 *nresult = (size_t)4; 277 } else { 278 if (n < 2) { 279 ret = E2BIG; 280 goto err; 281 } 282 *nresult = (size_t)2; 283 } 284 285 surrogate: 286 switch (ei->preffered_endian) { 287 case _ENDIAN_BIG: 288 s[1] = wc; 289 s[0] = (wc >>= 8); 290 break; 291 case _ENDIAN_LITTLE: 292 s[0] = wc; 293 s[1] = (wc >>= 8); 294 break; 295 } 296 if (wc2!=0) { 297 wc = wc2; 298 wc2 = 0; 299 s += 2; 300 goto surrogate; 301 } 302 } else { 303 /* UTF32 */ 304 if (wc >= 0xD800 && wc <= 0xDFFF) 305 goto err; 306 if (n < 4) { 307 ret = E2BIG; 308 goto err; 309 } 310 switch (ei->preffered_endian) { 311 case _ENDIAN_BIG: 312 s[3] = wc; 313 s[2] = (wc >>= 8); 314 s[1] = (wc >>= 8); 315 s[0] = (wc >>= 8); 316 break; 317 case _ENDIAN_LITTLE: 318 s[0] = wc; 319 s[1] = (wc >>= 8); 320 s[2] = (wc >>= 8); 321 s[3] = (wc >>= 8); 322 break; 323 } 324 *nresult = (size_t)4; 325 } 326 327 return 0; 328 329 err: 330 *nresult = (size_t)-1; 331 return ret; 332 } 333 334 static void 335 parse_variable(_UTF1632EncodingInfo * __restrict ei, 336 const void * __restrict var, size_t lenvar) 337 { 338 #define MATCH(x, act) \ 339 do { \ 340 if (lenvar >= (sizeof(#x)-1) && \ 341 _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \ 342 act; \ 343 lenvar -= sizeof(#x)-1; \ 344 p += sizeof(#x)-1; \ 345 } \ 346 } while (/*CONSTCOND*/0) 347 const char *p; 348 p = var; 349 while (lenvar>0) { 350 switch (*p) { 351 case 'B': 352 case 'b': 353 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 354 break; 355 case 'L': 356 case 'l': 357 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 358 break; 359 case 'F': 360 case 'f': 361 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 362 break; 363 case 'U': 364 case 'u': 365 MATCH(utf32, ei->mode |= _MODE_UTF32); 366 break; 367 } 368 p++; 369 lenvar--; 370 } 371 } 372 373 static int 374 /*ARGSUSED*/ 375 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 376 const void * __restrict var, 377 size_t lenvar) 378 { 379 _DIAGASSERT(ei != NULL); 380 381 memset((void *)ei, 0, sizeof(*ei)); 382 383 parse_variable(ei, var, lenvar); 384 385 if ((ei->mode&_MODE_UTF32)==0) 386 ei->cur_max = 6; /* endian + surrogate */ 387 else 388 ei->cur_max = 8; /* endian + normal */ 389 390 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 391 #if BYTE_ORDER == BIG_ENDIAN 392 ei->preffered_endian = _ENDIAN_BIG; 393 #else 394 ei->preffered_endian = _ENDIAN_LITTLE; 395 #endif 396 } 397 398 return (0); 399 } 400 401 static void 402 /*ARGSUSED*/ 403 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei) 404 { 405 } 406 407 static __inline int 408 /*ARGSUSED*/ 409 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei, 410 _csid_t * __restrict csid, 411 _index_t * __restrict idx, 412 _wc_t wc) 413 { 414 415 _DIAGASSERT(csid != NULL && idx != NULL); 416 417 *csid = 0; 418 *idx = (_index_t)wc; 419 420 return (0); 421 } 422 423 static __inline int 424 /*ARGSUSED*/ 425 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei, 426 _wc_t * __restrict wc, 427 _csid_t csid, _index_t idx) 428 { 429 430 _DIAGASSERT(wc != NULL); 431 432 if (csid != 0) 433 return (EILSEQ); 434 435 *wc = (_wc_t)idx; 436 437 return (0); 438 } 439 440 static __inline int 441 /*ARGSUSED*/ 442 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei, 443 _UTF1632State * __restrict psenc, 444 int * __restrict rstate) 445 { 446 447 if (psenc->chlen == 0) 448 *rstate = _STDENC_SDGEN_INITIAL; 449 else 450 *rstate = _STDENC_SDGEN_INCOMPLETE_CHAR; 451 452 return 0; 453 } 454 455 /* ---------------------------------------------------------------------- 456 * public interface for stdenc 457 */ 458 459 _CITRUS_STDENC_DECLS(UTF1632); 460 _CITRUS_STDENC_DEF_OPS(UTF1632); 461 462 #include "citrus_stdenc_template.h" 463