1 /* $NetBSD: citrus_utf1632.c,v 1.6 2006/03/19 01:24:09 christos Exp $ */ 2 3 /*- 4 * Copyright (c)2003 Citrus Project, 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #if defined(LIBC_SCCS) && !defined(lint) 31 __RCSID("$NetBSD: citrus_utf1632.c,v 1.6 2006/03/19 01:24:09 christos Exp $"); 32 #endif /* LIBC_SCCS and not lint */ 33 34 #include <assert.h> 35 #include <errno.h> 36 #include <string.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <stddef.h> 40 #include <locale.h> 41 #include <limits.h> 42 #include <wchar.h> 43 #include <sys/types.h> 44 #include <machine/endian.h> 45 46 #include "citrus_namespace.h" 47 #include "citrus_types.h" 48 #include "citrus_module.h" 49 #include "citrus_stdenc.h" 50 #include "citrus_bcs.h" 51 52 #include "citrus_utf1632.h" 53 54 55 /* ---------------------------------------------------------------------- 56 * private stuffs used by templates 57 */ 58 59 typedef struct { 60 u_int8_t ch[4]; 61 int chlen; 62 int current_endian; 63 } _UTF1632State; 64 65 typedef struct { 66 int preffered_endian; 67 unsigned int cur_max; 68 #define _ENDIAN_UNKNOWN 0 69 #define _ENDIAN_BIG 1 70 #define _ENDIAN_LITTLE 2 71 u_int32_t mode; 72 #define _MODE_UTF32 0x00000001U 73 #define _MODE_FORCE_ENDIAN 0x00000002U 74 } _UTF1632EncodingInfo; 75 76 #define _FUNCNAME(m) _citrus_UTF1632_##m 77 #define _ENCODING_INFO _UTF1632EncodingInfo 78 #define _ENCODING_STATE _UTF1632State 79 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 80 #define _ENCODING_IS_STATE_DEPENDENT 0 81 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 82 83 84 static __inline void 85 /*ARGSUSED*/ 86 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei, _UTF1632State *s) 87 { 88 memset(s, 0, sizeof(*s)); 89 } 90 91 static int 92 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 93 const char **s, size_t n, _UTF1632State *psenc, 94 size_t *nresult) 95 { 96 int chlenbak, endian, needlen; 97 wchar_t wc; 98 size_t result; 99 const char *s0; 100 101 _DIAGASSERT(nresult != 0); 102 _DIAGASSERT(ei != NULL); 103 _DIAGASSERT(s != NULL); 104 _DIAGASSERT(psenc != NULL); 105 106 s0 = *s; 107 108 if (s0 == NULL) { 109 _citrus_UTF1632_init_state(ei, psenc); 110 *nresult = 0; /* state independent */ 111 return (0); 112 } 113 114 result = 0; 115 chlenbak = psenc->chlen; 116 117 refetch: 118 if ((ei->mode & _MODE_UTF32) != 0 || chlenbak>=2) 119 needlen = 4; 120 else 121 needlen = 2; 122 123 while (chlenbak < needlen) { 124 if (n==0) 125 goto restart; 126 psenc->ch[chlenbak++] = *s0++; 127 n--; 128 result++; 129 } 130 131 /* judge endian marker */ 132 if ((ei->mode & _MODE_UTF32) == 0) { 133 /* UTF16 */ 134 if (psenc->ch[0]==0xFE && psenc->ch[1]==0xFF) { 135 psenc->current_endian = _ENDIAN_BIG; 136 chlenbak = 0; 137 goto refetch; 138 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE) { 139 psenc->current_endian = _ENDIAN_LITTLE; 140 chlenbak = 0; 141 goto refetch; 142 } 143 } else { 144 /* UTF32 */ 145 if (psenc->ch[0]==0x00 && psenc->ch[1]==0x00 && 146 psenc->ch[2]==0xFE && psenc->ch[3]==0xFF) { 147 psenc->current_endian = _ENDIAN_BIG; 148 chlenbak = 0; 149 goto refetch; 150 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE && 151 psenc->ch[2]==0x00 && psenc->ch[3]==0x00) { 152 psenc->current_endian = _ENDIAN_LITTLE; 153 chlenbak = 0; 154 goto refetch; 155 } 156 } 157 if ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 158 psenc->current_endian == _ENDIAN_UNKNOWN) 159 endian = ei->preffered_endian; 160 else 161 endian = psenc->current_endian; 162 163 /* get wc */ 164 if ((ei->mode & _MODE_UTF32) == 0) { 165 /* UTF16 */ 166 if (needlen==2) { 167 switch (endian) { 168 case _ENDIAN_LITTLE: 169 wc = (psenc->ch[0] | 170 ((wchar_t)psenc->ch[1] << 8)); 171 break; 172 case _ENDIAN_BIG: 173 wc = (psenc->ch[1] | 174 ((wchar_t)psenc->ch[0] << 8)); 175 break; 176 default: 177 goto ilseq; 178 } 179 if (wc >= 0xD800 && wc <= 0xDBFF) { 180 /* surrogate high */ 181 needlen=4; 182 goto refetch; 183 } 184 } else { 185 /* surrogate low */ 186 wc -= 0xD800; /* wc : surrogate high (see above) */ 187 wc <<= 10; 188 switch (endian) { 189 case _ENDIAN_LITTLE: 190 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 191 goto ilseq; 192 wc |= psenc->ch[2]; 193 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 194 break; 195 case _ENDIAN_BIG: 196 if (psenc->ch[3]<0xDC || psenc->ch[3]>0xDF) 197 goto ilseq; 198 wc |= psenc->ch[3]; 199 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 200 break; 201 default: 202 goto ilseq; 203 } 204 wc += 0x10000; 205 } 206 } else { 207 /* UTF32 */ 208 switch (endian) { 209 case _ENDIAN_LITTLE: 210 wc = (psenc->ch[0] | 211 ((wchar_t)psenc->ch[1] << 8) | 212 ((wchar_t)psenc->ch[2] << 16) | 213 ((wchar_t)psenc->ch[3] << 24)); 214 break; 215 case _ENDIAN_BIG: 216 wc = (psenc->ch[3] | 217 ((wchar_t)psenc->ch[2] << 8) | 218 ((wchar_t)psenc->ch[1] << 16) | 219 ((wchar_t)psenc->ch[0] << 24)); 220 break; 221 default: 222 goto ilseq; 223 } 224 } 225 226 227 *pwc = wc; 228 psenc->chlen = 0; 229 *nresult = result; 230 *s = s0; 231 232 return (0); 233 234 ilseq: 235 *nresult = (size_t)-1; 236 psenc->chlen = 0; 237 return (EILSEQ); 238 239 restart: 240 *nresult = (size_t)-2; 241 psenc->chlen = chlenbak; 242 *s = s0; 243 return (0); 244 } 245 246 static int 247 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 248 wchar_t wc, _UTF1632State *psenc, 249 size_t *nresult) 250 { 251 int ret; 252 wchar_t wc2; 253 254 _DIAGASSERT(ei != NULL); 255 _DIAGASSERT(nresult != 0); 256 _DIAGASSERT(s != NULL); 257 258 wc2 = 0; 259 if ((ei->mode & _MODE_UTF32)==0) { 260 /* UTF16 */ 261 if (wc>0xFFFF) { 262 /* surrogate */ 263 if (wc>0x10FFFF) { 264 ret = EILSEQ; 265 goto err; 266 } 267 if (n < 4) { 268 ret = E2BIG; 269 goto err; 270 } 271 wc -= 0x10000; 272 wc2 = (wc & 0x3FF) | 0xDC00; 273 wc = (wc>>10) | 0xD800; 274 *nresult = (size_t)4; 275 } else { 276 if (n < 2) { 277 ret = E2BIG; 278 goto err; 279 } 280 *nresult = (size_t)2; 281 } 282 283 surrogate: 284 switch (ei->preffered_endian) { 285 case _ENDIAN_BIG: 286 s[1] = wc; 287 s[0] = (wc >>= 8); 288 break; 289 case _ENDIAN_LITTLE: 290 s[0] = wc; 291 s[1] = (wc >>= 8); 292 break; 293 } 294 if (wc2!=0) { 295 wc = wc2; 296 wc2 = 0; 297 s += 2; 298 goto surrogate; 299 } 300 } else { 301 /* UTF32 */ 302 if (n < 4) { 303 ret = E2BIG; 304 goto err; 305 } 306 switch (ei->preffered_endian) { 307 case _ENDIAN_BIG: 308 s[3] = wc; 309 s[2] = (wc >>= 8); 310 s[1] = (wc >>= 8); 311 s[0] = (wc >>= 8); 312 break; 313 case _ENDIAN_LITTLE: 314 s[0] = wc; 315 s[1] = (wc >>= 8); 316 s[2] = (wc >>= 8); 317 s[3] = (wc >>= 8); 318 break; 319 } 320 *nresult = (size_t)4; 321 } 322 323 return 0; 324 325 err: 326 *nresult = (size_t)-1; 327 return ret; 328 } 329 330 static void 331 parse_variable(_UTF1632EncodingInfo * __restrict ei, 332 const void * __restrict var, size_t lenvar) 333 { 334 #define MATCH(x, act) \ 335 do { \ 336 if (lenvar >= (sizeof(#x)-1) && \ 337 _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \ 338 act; \ 339 lenvar -= sizeof(#x)-1; \ 340 p += sizeof(#x)-1; \ 341 } \ 342 } while (/*CONSTCOND*/0) 343 const char *p; 344 p = var; 345 while (lenvar>0) { 346 switch (*p) { 347 case 'B': 348 case 'b': 349 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 350 break; 351 case 'L': 352 case 'l': 353 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 354 break; 355 case 'F': 356 case 'f': 357 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 358 break; 359 case 'U': 360 case 'u': 361 MATCH(utf32, ei->mode |= _MODE_UTF32); 362 break; 363 } 364 p++; 365 lenvar--; 366 } 367 } 368 369 static int 370 /*ARGSUSED*/ 371 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 372 const void * __restrict var, 373 size_t lenvar) 374 { 375 _DIAGASSERT(ei != NULL); 376 377 memset((void *)ei, 0, sizeof(*ei)); 378 379 parse_variable(ei, var, lenvar); 380 381 if ((ei->mode&_MODE_UTF32)==0) 382 ei->cur_max = 6; /* endian + surrogate */ 383 else 384 ei->cur_max = 8; /* endian + normal */ 385 386 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 387 #if BYTE_ORDER == BIG_ENDIAN 388 ei->preffered_endian = _ENDIAN_BIG; 389 #else 390 ei->preffered_endian = _ENDIAN_LITTLE; 391 #endif 392 } 393 394 return (0); 395 } 396 397 static void 398 /*ARGSUSED*/ 399 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei) 400 { 401 } 402 403 static __inline int 404 /*ARGSUSED*/ 405 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei, 406 _csid_t * __restrict csid, 407 _index_t * __restrict idx, 408 _wc_t wc) 409 { 410 411 _DIAGASSERT(csid != NULL && idx != NULL); 412 413 *csid = 0; 414 *idx = (_index_t)wc; 415 416 return (0); 417 } 418 419 static __inline int 420 /*ARGSUSED*/ 421 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei, 422 _wc_t * __restrict wc, 423 _csid_t csid, _index_t idx) 424 { 425 426 _DIAGASSERT(wc != NULL); 427 428 if (csid != 0) 429 return (EILSEQ); 430 431 *wc = (_wc_t)idx; 432 433 return (0); 434 } 435 436 static __inline int 437 /*ARGSUSED*/ 438 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei, 439 _UTF1632State * __restrict psenc, 440 int * __restrict rstate) 441 { 442 443 if (psenc->chlen == 0) 444 *rstate = _STDENC_SDGEN_INITIAL; 445 else 446 *rstate = _STDENC_SDGEN_INCOMPLETE_CHAR; 447 448 return 0; 449 } 450 451 /* ---------------------------------------------------------------------- 452 * public interface for stdenc 453 */ 454 455 _CITRUS_STDENC_DECLS(UTF1632); 456 _CITRUS_STDENC_DEF_OPS(UTF1632); 457 458 #include "citrus_stdenc_template.h" 459