1 /* $NetBSD $ */ 2 3 /*- 4 * Copyright (c)2003 Citrus Project, 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #if defined(LIBC_SCCS) && !defined(lint) 31 __RCSID("$NetBSD: citrus_utf1632.c,v 1.1 2003/06/25 09:51:49 tshiozak Exp $"); 32 #endif /* LIBC_SCCS and not lint */ 33 34 #include <assert.h> 35 #include <errno.h> 36 #include <string.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <stddef.h> 40 #include <locale.h> 41 #include <limits.h> 42 #include <wchar.h> 43 #include <sys/types.h> 44 #include <sys/endian.h> 45 46 #include "citrus_namespace.h" 47 #include "citrus_types.h" 48 #include "citrus_module.h" 49 #include "citrus_stdenc.h" 50 #include "citrus_bcs.h" 51 52 #include "citrus_utf1632.h" 53 54 55 /* ---------------------------------------------------------------------- 56 * private stuffs used by templates 57 */ 58 59 typedef struct { 60 u_int8_t ch[4]; 61 int chlen; 62 int current_endian; 63 } _UTF1632State; 64 65 typedef struct { 66 int preffered_endian; 67 unsigned int cur_max; 68 #define _ENDIAN_UNKNOWN 0 69 #define _ENDIAN_BIG 1 70 #define _ENDIAN_LITTLE 2 71 u_int32_t mode; 72 #define _MODE_UTF32 0x00000001U 73 #define _MODE_FORCE_ENDIAN 0x00000002U 74 } _UTF1632EncodingInfo; 75 76 #define _FUNCNAME(m) _citrus_UTF1632_##m 77 #define _ENCODING_INFO _UTF1632EncodingInfo 78 #define _ENCODING_STATE _UTF1632State 79 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 80 #define _ENCODING_IS_STATE_DEPENDENT 0 81 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 82 83 84 static __inline void 85 /*ARGSUSED*/ 86 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei, _UTF1632State *s) 87 { 88 memset(s, 0, sizeof(*s)); 89 } 90 91 static int 92 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 93 const char **s, size_t n, _UTF1632State *psenc, 94 size_t *nresult) 95 { 96 int chlenbak, endian, needlen; 97 wchar_t wc; 98 size_t result; 99 const char *s0; 100 101 _DIAGASSERT(nresult != 0); 102 _DIAGASSERT(ei != NULL); 103 _DIAGASSERT(s != NULL); 104 _DIAGASSERT(psenc != NULL); 105 106 s0 = *s; 107 108 if (s0 == NULL) { 109 _citrus_UTF1632_init_state(ei, psenc); 110 *nresult = 0; /* state independent */ 111 return (0); 112 } 113 114 result = 0; 115 chlenbak = psenc->chlen; 116 117 refetch: 118 if ((ei->mode & _MODE_UTF32) != 0 || chlenbak>=2) 119 needlen = 4; 120 else 121 needlen = 2; 122 123 while (chlenbak < needlen) { 124 if (n==0) 125 goto restart; 126 psenc->ch[chlenbak++] = *s0++; 127 n--; 128 result++; 129 } 130 131 /* judge endian marker */ 132 if ((ei->mode & _MODE_UTF32) == 0) { 133 /* UTF16 */ 134 if (psenc->ch[0]==0xFE && psenc->ch[1]==0xFF) { 135 psenc->current_endian = _ENDIAN_BIG; 136 chlenbak = 0; 137 goto refetch; 138 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE) { 139 psenc->current_endian = _ENDIAN_LITTLE; 140 chlenbak = 0; 141 goto refetch; 142 } 143 } else { 144 /* UTF32 */ 145 if (psenc->ch[0]==0x00 && psenc->ch[1]==0x00 && 146 psenc->ch[2]==0xFE && psenc->ch[3]==0xFF) { 147 psenc->current_endian = _ENDIAN_BIG; 148 chlenbak = 0; 149 goto refetch; 150 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE && 151 psenc->ch[2]==0x00 && psenc->ch[3]==0x00) { 152 psenc->current_endian = _ENDIAN_LITTLE; 153 chlenbak = 0; 154 goto refetch; 155 } 156 } 157 if ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 158 psenc->current_endian == _ENDIAN_UNKNOWN) 159 endian = ei->preffered_endian; 160 else 161 endian = psenc->current_endian; 162 163 /* get wc */ 164 if ((ei->mode & _MODE_UTF32) == 0) { 165 /* UTF16 */ 166 if (needlen==2) { 167 switch (endian) { 168 case _ENDIAN_LITTLE: 169 wc = (psenc->ch[0] | 170 ((wchar_t)psenc->ch[1] << 8)); 171 break; 172 case _ENDIAN_BIG: 173 wc = (psenc->ch[1] | 174 ((wchar_t)psenc->ch[0] << 8)); 175 break; 176 } 177 if (wc >= 0xD800 && wc <= 0xDBFF) { 178 /* surrogate high */ 179 needlen=4; 180 goto refetch; 181 } 182 } else { 183 /* surrogate low */ 184 wc -= 0xD800; /* wc : surrogate high (see above) */ 185 wc <<= 10; 186 switch (endian) { 187 case _ENDIAN_LITTLE: 188 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 189 goto ilseq; 190 wc |= psenc->ch[2]; 191 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 192 break; 193 case _ENDIAN_BIG: 194 if (psenc->ch[3]<0xDC || psenc->ch[3]>0xDF) 195 goto ilseq; 196 wc |= psenc->ch[3]; 197 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 198 break; 199 } 200 wc += 0x10000; 201 } 202 } else { 203 /* UTF32 */ 204 switch (endian) { 205 case _ENDIAN_LITTLE: 206 wc = (psenc->ch[0] | 207 ((wchar_t)psenc->ch[1] << 8) | 208 ((wchar_t)psenc->ch[2] << 16) | 209 ((wchar_t)psenc->ch[3] << 24)); 210 break; 211 case _ENDIAN_BIG: 212 wc = (psenc->ch[3] | 213 ((wchar_t)psenc->ch[2] << 8) | 214 ((wchar_t)psenc->ch[1] << 16) | 215 ((wchar_t)psenc->ch[0] << 24)); 216 break; 217 } 218 } 219 220 221 *pwc = wc; 222 psenc->chlen = 0; 223 *nresult = result; 224 *s = s0; 225 226 return (0); 227 228 ilseq: 229 *nresult = (size_t)-1; 230 psenc->chlen = 0; 231 return (EILSEQ); 232 233 restart: 234 *nresult = (size_t)-2; 235 psenc->chlen = chlenbak; 236 *s = s0; 237 return (0); 238 } 239 240 static int 241 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 242 wchar_t wc, _UTF1632State *psenc, 243 size_t *nresult) 244 { 245 int ret; 246 wchar_t wc2; 247 248 _DIAGASSERT(ei != NULL); 249 _DIAGASSERT(nresult != 0); 250 _DIAGASSERT(s != NULL); 251 252 /* reset state */ 253 if (wc == 0) { 254 *nresult = 0; /* stateless */ 255 return 0; 256 } 257 258 wc2 = 0; 259 if ((ei->mode & _MODE_UTF32)==0) { 260 /* UTF16 */ 261 if (wc>0xFFFF) { 262 /* surrogate */ 263 if (wc>0x10FFFF) { 264 ret = EILSEQ; 265 goto err; 266 } 267 if (n < 4) { 268 ret = E2BIG; 269 goto err; 270 } 271 wc -= 0x10000; 272 wc2 = (wc & 0x3FF) | 0xDC00; 273 wc = (wc>>10) | 0xD800; 274 *nresult = (size_t)4; 275 } else { 276 if (n < 2) { 277 ret = E2BIG; 278 goto err; 279 } 280 *nresult = (size_t)2; 281 } 282 283 surrogate: 284 switch (ei->preffered_endian) { 285 case _ENDIAN_BIG: 286 s[1] = wc; 287 s[0] = (wc >>= 8); 288 break; 289 case _ENDIAN_LITTLE: 290 s[0] = wc; 291 s[1] = (wc >>= 8); 292 break; 293 } 294 if (wc2!=0) { 295 wc = wc2; 296 wc2 = 0; 297 s += 2; 298 goto surrogate; 299 } 300 } else { 301 /* UTF32 */ 302 if (n < 4) { 303 ret = E2BIG; 304 goto err; 305 } 306 switch (ei->preffered_endian) { 307 case _ENDIAN_BIG: 308 s[3] = wc; 309 s[2] = (wc >>= 8); 310 s[1] = (wc >>= 8); 311 s[0] = (wc >>= 8); 312 break; 313 case _ENDIAN_LITTLE: 314 s[0] = wc; 315 s[1] = (wc >>= 8); 316 s[2] = (wc >>= 8); 317 s[3] = (wc >>= 8); 318 break; 319 } 320 *nresult = (size_t)4; 321 } 322 323 return 0; 324 325 err: 326 *nresult = (size_t)-1; 327 return ret; 328 } 329 330 static void 331 parse_variable(_UTF1632EncodingInfo * __restrict ei, 332 const void * __restrict var, size_t lenvar) 333 { 334 #define MATCH(x, act) \ 335 do { \ 336 if (lenvar >= (sizeof(#x)-1) && \ 337 _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \ 338 act; \ 339 lenvar -= sizeof(#x)-1; \ 340 p += sizeof(#x)-1; \ 341 } \ 342 } while (/*CONSTCOND*/0) 343 const char *p; 344 p = var; 345 while (lenvar>0) { 346 switch (*p) { 347 case 'B': 348 case 'b': 349 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 350 break; 351 case 'L': 352 case 'l': 353 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 354 break; 355 case 'F': 356 case 'f': 357 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 358 break; 359 case 'U': 360 case 'u': 361 MATCH(utf32, ei->mode |= _MODE_UTF32); 362 break; 363 } 364 p++; 365 lenvar--; 366 } 367 } 368 369 static int 370 /*ARGSUSED*/ 371 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 372 const void * __restrict var, 373 size_t lenvar) 374 { 375 _DIAGASSERT(ei != NULL); 376 377 memset((void *)ei, 0, sizeof(*ei)); 378 379 parse_variable(ei, var, lenvar); 380 381 if ((ei->mode&_MODE_UTF32)==0) 382 ei->cur_max = 6; /* endian + surrogate */ 383 else 384 ei->cur_max = 8; /* endian + normal */ 385 386 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 387 #if BYTE_ORDER == BIG_ENDIAN 388 ei->preffered_endian = _ENDIAN_BIG; 389 #else 390 ei->preffered_endian = _ENDIAN_LITTLE; 391 #endif 392 } 393 394 return (0); 395 } 396 397 static void 398 /*ARGSUSED*/ 399 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei) 400 { 401 } 402 403 static __inline int 404 /*ARGSUSED*/ 405 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei, 406 _csid_t * __restrict csid, 407 _index_t * __restrict idx, 408 _wc_t wc) 409 { 410 411 _DIAGASSERT(csid != NULL && idx != NULL); 412 413 *csid = 0; 414 *idx = (_index_t)wc; 415 416 return (0); 417 } 418 419 static __inline int 420 /*ARGSUSED*/ 421 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei, 422 _wc_t * __restrict wc, 423 _csid_t csid, _index_t idx) 424 { 425 426 _DIAGASSERT(wc != NULL); 427 428 if (csid != 0) 429 return (EILSEQ); 430 431 *wc = (_wc_t)idx; 432 433 return (0); 434 } 435 436 437 /* ---------------------------------------------------------------------- 438 * public interface for stdenc 439 */ 440 441 _CITRUS_STDENC_DECLS(UTF1632); 442 _CITRUS_STDENC_DEF_OPS(UTF1632); 443 444 #include "citrus_stdenc_template.h" 445