1 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== 2 * 3 * The LLVM Compiler Infrastructure 4 * 5 * This file is distributed under the University of Illinois Open Source 6 * License. See LICENSE.TXT for details. 7 * 8 *===------------------------------------------------------------------------=*/ 9 /* 10 * Copyright 2001-2004 Unicode, Inc. 11 * 12 * Disclaimer 13 * 14 * This source code is provided as is by Unicode, Inc. No claims are 15 * made as to fitness for any particular purpose. No warranties of any 16 * kind are expressed or implied. The recipient agrees to determine 17 * applicability of information provided. If this file has been 18 * purchased on magnetic or optical media from Unicode, Inc., the 19 * sole remedy for any claim will be exchange of defective media 20 * within 90 days of receipt. 21 * 22 * Limitations on Rights to Redistribute This Code 23 * 24 * Unicode, Inc. hereby grants the right to freely use the information 25 * supplied in this file in the creation of products supporting the 26 * Unicode Standard, and to make copies of this file in any form 27 * for internal or external distribution as long as this notice 28 * remains attached. 29 */ 30 31 /* --------------------------------------------------------------------- 32 33 Conversions between UTF32, UTF-16, and UTF-8. Source code file. 34 Author: Mark E. Davis, 1994. 35 Rev History: Rick McGowan, fixes & updates May 2001. 36 Sept 2001: fixed const & error conditions per 37 mods suggested by S. Parent & A. Lillich. 38 June 2002: Tim Dodd added detection and handling of incomplete 39 source sequences, enhanced error detection, added casts 40 to eliminate compiler warnings. 41 July 2003: slight mods to back out aggressive FFFE detection. 42 Jan 2004: updated switches in from-UTF8 conversions. 43 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 44 45 See the header file "ConvertUTF.h" for complete documentation. 46 47 ------------------------------------------------------------------------ */ 48 49 #include "llvm/Support/ConvertUTF.h" 50 #ifdef CVTUTF_DEBUG 51 #include <stdio.h> 52 #endif 53 #include <assert.h> 54 55 /* 56 * This code extensively uses fall-through switches. 57 * Keep the compiler from warning about that. 58 */ 59 #if defined(__clang__) && defined(__has_warning) 60 # if __has_warning("-Wimplicit-fallthrough") 61 # define ConvertUTF_DISABLE_WARNINGS \ 62 _Pragma("clang diagnostic push") \ 63 _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"") 64 # define ConvertUTF_RESTORE_WARNINGS \ 65 _Pragma("clang diagnostic pop") 66 # endif 67 #elif defined(__GNUC__) && __GNUC__ > 6 68 # define ConvertUTF_DISABLE_WARNINGS \ 69 _Pragma("GCC diagnostic push") \ 70 _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") 71 # define ConvertUTF_RESTORE_WARNINGS \ 72 _Pragma("GCC diagnostic pop") 73 #endif 74 #ifndef ConvertUTF_DISABLE_WARNINGS 75 # define ConvertUTF_DISABLE_WARNINGS 76 #endif 77 #ifndef ConvertUTF_RESTORE_WARNINGS 78 # define ConvertUTF_RESTORE_WARNINGS 79 #endif 80 81 ConvertUTF_DISABLE_WARNINGS 82 83 namespace llvm { 84 85 static const int halfShift = 10; /* used for shifting by 10 bits */ 86 87 static const UTF32 halfBase = 0x0010000UL; 88 static const UTF32 halfMask = 0x3FFUL; 89 90 #define UNI_SUR_HIGH_START (UTF32)0xD800 91 #define UNI_SUR_HIGH_END (UTF32)0xDBFF 92 #define UNI_SUR_LOW_START (UTF32)0xDC00 93 #define UNI_SUR_LOW_END (UTF32)0xDFFF 94 95 /* --------------------------------------------------------------------- */ 96 97 /* 98 * Index into the table below with the first byte of a UTF-8 sequence to 99 * get the number of trailing bytes that are supposed to follow it. 100 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 101 * left as-is for anyone who may want to do such conversion, which was 102 * allowed in earlier algorithms. 103 */ 104 static const char trailingBytesForUTF8[256] = { 105 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 106 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 107 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 108 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 109 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 110 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 111 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 112 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 113 }; 114 115 /* 116 * Magic values subtracted from a buffer value during UTF8 conversion. 117 * This table contains as many values as there might be trailing bytes 118 * in a UTF-8 sequence. 119 */ 120 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 121 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 122 123 /* 124 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 125 * into the first byte, depending on how many bytes follow. There are 126 * as many entries in this table as there are UTF-8 sequence types. 127 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 128 * for *legal* UTF-8 will be 4 or fewer bytes total. 129 */ 130 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 131 132 /* --------------------------------------------------------------------- */ 133 134 /* The interface converts a whole buffer to avoid function-call overhead. 135 * Constants have been gathered. Loops & conditionals have been removed as 136 * much as possible for efficiency, in favor of drop-through switches. 137 * (See "Note A" at the bottom of the file for equivalent code.) 138 * If your compiler supports it, the "isLegalUTF8" call can be turned 139 * into an inline function. 140 */ 141 142 143 /* --------------------------------------------------------------------- */ 144 145 ConversionResult ConvertUTF32toUTF16 ( 146 const UTF32** sourceStart, const UTF32* sourceEnd, 147 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 148 ConversionResult result = conversionOK; 149 const UTF32* source = *sourceStart; 150 UTF16* target = *targetStart; 151 while (source < sourceEnd) { 152 UTF32 ch; 153 if (target >= targetEnd) { 154 result = targetExhausted; break; 155 } 156 ch = *source++; 157 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 158 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 159 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 160 if (flags == strictConversion) { 161 --source; /* return to the illegal value itself */ 162 result = sourceIllegal; 163 break; 164 } else { 165 *target++ = UNI_REPLACEMENT_CHAR; 166 } 167 } else { 168 *target++ = (UTF16)ch; /* normal case */ 169 } 170 } else if (ch > UNI_MAX_LEGAL_UTF32) { 171 if (flags == strictConversion) { 172 result = sourceIllegal; 173 } else { 174 *target++ = UNI_REPLACEMENT_CHAR; 175 } 176 } else { 177 /* target is a character in range 0xFFFF - 0x10FFFF. */ 178 if (target + 1 >= targetEnd) { 179 --source; /* Back up source pointer! */ 180 result = targetExhausted; break; 181 } 182 ch -= halfBase; 183 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 184 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 185 } 186 } 187 *sourceStart = source; 188 *targetStart = target; 189 return result; 190 } 191 192 /* --------------------------------------------------------------------- */ 193 194 ConversionResult ConvertUTF16toUTF32 ( 195 const UTF16** sourceStart, const UTF16* sourceEnd, 196 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 197 ConversionResult result = conversionOK; 198 const UTF16* source = *sourceStart; 199 UTF32* target = *targetStart; 200 UTF32 ch, ch2; 201 while (source < sourceEnd) { 202 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 203 ch = *source++; 204 /* If we have a surrogate pair, convert to UTF32 first. */ 205 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 206 /* If the 16 bits following the high surrogate are in the source buffer... */ 207 if (source < sourceEnd) { 208 ch2 = *source; 209 /* If it's a low surrogate, convert to UTF32. */ 210 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 211 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 212 + (ch2 - UNI_SUR_LOW_START) + halfBase; 213 ++source; 214 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 215 --source; /* return to the illegal value itself */ 216 result = sourceIllegal; 217 break; 218 } 219 } else { /* We don't have the 16 bits following the high surrogate. */ 220 --source; /* return to the high surrogate */ 221 result = sourceExhausted; 222 break; 223 } 224 } else if (flags == strictConversion) { 225 /* UTF-16 surrogate values are illegal in UTF-32 */ 226 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 227 --source; /* return to the illegal value itself */ 228 result = sourceIllegal; 229 break; 230 } 231 } 232 if (target >= targetEnd) { 233 source = oldSource; /* Back up source pointer! */ 234 result = targetExhausted; break; 235 } 236 *target++ = ch; 237 } 238 *sourceStart = source; 239 *targetStart = target; 240 #ifdef CVTUTF_DEBUG 241 if (result == sourceIllegal) { 242 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 243 fflush(stderr); 244 } 245 #endif 246 return result; 247 } 248 ConversionResult ConvertUTF16toUTF8 ( 249 const UTF16** sourceStart, const UTF16* sourceEnd, 250 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 251 ConversionResult result = conversionOK; 252 const UTF16* source = *sourceStart; 253 UTF8* target = *targetStart; 254 while (source < sourceEnd) { 255 UTF32 ch; 256 unsigned short bytesToWrite = 0; 257 const UTF32 byteMask = 0xBF; 258 const UTF32 byteMark = 0x80; 259 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 260 ch = *source++; 261 /* If we have a surrogate pair, convert to UTF32 first. */ 262 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 263 /* If the 16 bits following the high surrogate are in the source buffer... */ 264 if (source < sourceEnd) { 265 UTF32 ch2 = *source; 266 /* If it's a low surrogate, convert to UTF32. */ 267 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 268 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 269 + (ch2 - UNI_SUR_LOW_START) + halfBase; 270 ++source; 271 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 272 --source; /* return to the illegal value itself */ 273 result = sourceIllegal; 274 break; 275 } 276 } else { /* We don't have the 16 bits following the high surrogate. */ 277 --source; /* return to the high surrogate */ 278 result = sourceExhausted; 279 break; 280 } 281 } else if (flags == strictConversion) { 282 /* UTF-16 surrogate values are illegal in UTF-32 */ 283 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 284 --source; /* return to the illegal value itself */ 285 result = sourceIllegal; 286 break; 287 } 288 } 289 /* Figure out how many bytes the result will require */ 290 if (ch < (UTF32)0x80) { bytesToWrite = 1; 291 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 292 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 293 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 294 } else { bytesToWrite = 3; 295 ch = UNI_REPLACEMENT_CHAR; 296 } 297 298 target += bytesToWrite; 299 if (target > targetEnd) { 300 source = oldSource; /* Back up source pointer! */ 301 target -= bytesToWrite; result = targetExhausted; break; 302 } 303 switch (bytesToWrite) { /* note: everything falls through. */ 304 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 305 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 306 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 307 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 308 } 309 target += bytesToWrite; 310 } 311 *sourceStart = source; 312 *targetStart = target; 313 return result; 314 } 315 316 /* --------------------------------------------------------------------- */ 317 318 ConversionResult ConvertUTF32toUTF8 ( 319 const UTF32** sourceStart, const UTF32* sourceEnd, 320 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 321 ConversionResult result = conversionOK; 322 const UTF32* source = *sourceStart; 323 UTF8* target = *targetStart; 324 while (source < sourceEnd) { 325 UTF32 ch; 326 unsigned short bytesToWrite = 0; 327 const UTF32 byteMask = 0xBF; 328 const UTF32 byteMark = 0x80; 329 ch = *source++; 330 if (flags == strictConversion ) { 331 /* UTF-16 surrogate values are illegal in UTF-32 */ 332 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 333 --source; /* return to the illegal value itself */ 334 result = sourceIllegal; 335 break; 336 } 337 } 338 /* 339 * Figure out how many bytes the result will require. Turn any 340 * illegally large UTF32 things (> Plane 17) into replacement chars. 341 */ 342 if (ch < (UTF32)0x80) { bytesToWrite = 1; 343 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 344 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 345 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 346 } else { bytesToWrite = 3; 347 ch = UNI_REPLACEMENT_CHAR; 348 result = sourceIllegal; 349 } 350 351 target += bytesToWrite; 352 if (target > targetEnd) { 353 --source; /* Back up source pointer! */ 354 target -= bytesToWrite; result = targetExhausted; break; 355 } 356 switch (bytesToWrite) { /* note: everything falls through. */ 357 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 358 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 359 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 360 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 361 } 362 target += bytesToWrite; 363 } 364 *sourceStart = source; 365 *targetStart = target; 366 return result; 367 } 368 369 /* --------------------------------------------------------------------- */ 370 371 /* 372 * Utility routine to tell whether a sequence of bytes is legal UTF-8. 373 * This must be called with the length pre-determined by the first byte. 374 * If not calling this from ConvertUTF8to*, then the length can be set by: 375 * length = trailingBytesForUTF8[*source]+1; 376 * and the sequence is illegal right away if there aren't that many bytes 377 * available. 378 * If presented with a length > 4, this returns false. The Unicode 379 * definition of UTF-8 goes up to 4-byte sequences. 380 */ 381 382 static Boolean isLegalUTF8(const UTF8 *source, int length) { 383 UTF8 a; 384 const UTF8 *srcptr = source+length; 385 switch (length) { 386 default: return false; 387 /* Everything else falls through when "true"... */ 388 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 389 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 390 case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 391 392 switch (*source) { 393 /* no fall-through in this inner switch */ 394 case 0xE0: if (a < 0xA0) return false; break; 395 case 0xED: if (a > 0x9F) return false; break; 396 case 0xF0: if (a < 0x90) return false; break; 397 case 0xF4: if (a > 0x8F) return false; break; 398 default: if (a < 0x80) return false; 399 } 400 401 case 1: if (*source >= 0x80 && *source < 0xC2) return false; 402 } 403 if (*source > 0xF4) return false; 404 return true; 405 } 406 407 /* --------------------------------------------------------------------- */ 408 409 /* 410 * Exported function to return whether a UTF-8 sequence is legal or not. 411 * This is not used here; it's just exported. 412 */ 413 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 414 int length = trailingBytesForUTF8[*source]+1; 415 if (length > sourceEnd - source) { 416 return false; 417 } 418 return isLegalUTF8(source, length); 419 } 420 421 /* --------------------------------------------------------------------- */ 422 423 static unsigned 424 findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, 425 const UTF8 *sourceEnd) { 426 UTF8 b1, b2, b3; 427 428 assert(!isLegalUTF8Sequence(source, sourceEnd)); 429 430 /* 431 * Unicode 6.3.0, D93b: 432 * 433 * Maximal subpart of an ill-formed subsequence: The longest code unit 434 * subsequence starting at an unconvertible offset that is either: 435 * a. the initial subsequence of a well-formed code unit sequence, or 436 * b. a subsequence of length one. 437 */ 438 439 if (source == sourceEnd) 440 return 0; 441 442 /* 443 * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 444 * Byte Sequences. 445 */ 446 447 b1 = *source; 448 ++source; 449 if (b1 >= 0xC2 && b1 <= 0xDF) { 450 /* 451 * First byte is valid, but we know that this code unit sequence is 452 * invalid, so the maximal subpart has to end after the first byte. 453 */ 454 return 1; 455 } 456 457 if (source == sourceEnd) 458 return 1; 459 460 b2 = *source; 461 ++source; 462 463 if (b1 == 0xE0) { 464 return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; 465 } 466 if (b1 >= 0xE1 && b1 <= 0xEC) { 467 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 468 } 469 if (b1 == 0xED) { 470 return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; 471 } 472 if (b1 >= 0xEE && b1 <= 0xEF) { 473 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 474 } 475 if (b1 == 0xF0) { 476 if (b2 >= 0x90 && b2 <= 0xBF) { 477 if (source == sourceEnd) 478 return 2; 479 480 b3 = *source; 481 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 482 } 483 return 1; 484 } 485 if (b1 >= 0xF1 && b1 <= 0xF3) { 486 if (b2 >= 0x80 && b2 <= 0xBF) { 487 if (source == sourceEnd) 488 return 2; 489 490 b3 = *source; 491 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 492 } 493 return 1; 494 } 495 if (b1 == 0xF4) { 496 if (b2 >= 0x80 && b2 <= 0x8F) { 497 if (source == sourceEnd) 498 return 2; 499 500 b3 = *source; 501 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 502 } 503 return 1; 504 } 505 506 assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); 507 /* 508 * There are no valid sequences that start with these bytes. Maximal subpart 509 * is defined to have length 1 in these cases. 510 */ 511 return 1; 512 } 513 514 /* --------------------------------------------------------------------- */ 515 516 /* 517 * Exported function to return the total number of bytes in a codepoint 518 * represented in UTF-8, given the value of the first byte. 519 */ 520 unsigned getNumBytesForUTF8(UTF8 first) { 521 return trailingBytesForUTF8[first] + 1; 522 } 523 524 /* --------------------------------------------------------------------- */ 525 526 /* 527 * Exported function to return whether a UTF-8 string is legal or not. 528 * This is not used here; it's just exported. 529 */ 530 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { 531 while (*source != sourceEnd) { 532 int length = trailingBytesForUTF8[**source] + 1; 533 if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) 534 return false; 535 *source += length; 536 } 537 return true; 538 } 539 540 /* --------------------------------------------------------------------- */ 541 542 ConversionResult ConvertUTF8toUTF16 ( 543 const UTF8** sourceStart, const UTF8* sourceEnd, 544 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 545 ConversionResult result = conversionOK; 546 const UTF8* source = *sourceStart; 547 UTF16* target = *targetStart; 548 while (source < sourceEnd) { 549 UTF32 ch = 0; 550 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 551 if (extraBytesToRead >= sourceEnd - source) { 552 result = sourceExhausted; break; 553 } 554 /* Do this check whether lenient or strict */ 555 if (!isLegalUTF8(source, extraBytesToRead+1)) { 556 result = sourceIllegal; 557 break; 558 } 559 /* 560 * The cases all fall through. See "Note A" below. 561 */ 562 switch (extraBytesToRead) { 563 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 564 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 565 case 3: ch += *source++; ch <<= 6; 566 case 2: ch += *source++; ch <<= 6; 567 case 1: ch += *source++; ch <<= 6; 568 case 0: ch += *source++; 569 } 570 ch -= offsetsFromUTF8[extraBytesToRead]; 571 572 if (target >= targetEnd) { 573 source -= (extraBytesToRead+1); /* Back up source pointer! */ 574 result = targetExhausted; break; 575 } 576 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 577 /* UTF-16 surrogate values are illegal in UTF-32 */ 578 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 579 if (flags == strictConversion) { 580 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 581 result = sourceIllegal; 582 break; 583 } else { 584 *target++ = UNI_REPLACEMENT_CHAR; 585 } 586 } else { 587 *target++ = (UTF16)ch; /* normal case */ 588 } 589 } else if (ch > UNI_MAX_UTF16) { 590 if (flags == strictConversion) { 591 result = sourceIllegal; 592 source -= (extraBytesToRead+1); /* return to the start */ 593 break; /* Bail out; shouldn't continue */ 594 } else { 595 *target++ = UNI_REPLACEMENT_CHAR; 596 } 597 } else { 598 /* target is a character in range 0xFFFF - 0x10FFFF. */ 599 if (target + 1 >= targetEnd) { 600 source -= (extraBytesToRead+1); /* Back up source pointer! */ 601 result = targetExhausted; break; 602 } 603 ch -= halfBase; 604 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 605 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 606 } 607 } 608 *sourceStart = source; 609 *targetStart = target; 610 return result; 611 } 612 613 /* --------------------------------------------------------------------- */ 614 615 static ConversionResult ConvertUTF8toUTF32Impl( 616 const UTF8** sourceStart, const UTF8* sourceEnd, 617 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, 618 Boolean InputIsPartial) { 619 ConversionResult result = conversionOK; 620 const UTF8* source = *sourceStart; 621 UTF32* target = *targetStart; 622 while (source < sourceEnd) { 623 UTF32 ch = 0; 624 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 625 if (extraBytesToRead >= sourceEnd - source) { 626 if (flags == strictConversion || InputIsPartial) { 627 result = sourceExhausted; 628 break; 629 } else { 630 result = sourceIllegal; 631 632 /* 633 * Replace the maximal subpart of ill-formed sequence with 634 * replacement character. 635 */ 636 source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 637 sourceEnd); 638 *target++ = UNI_REPLACEMENT_CHAR; 639 continue; 640 } 641 } 642 if (target >= targetEnd) { 643 result = targetExhausted; break; 644 } 645 646 /* Do this check whether lenient or strict */ 647 if (!isLegalUTF8(source, extraBytesToRead+1)) { 648 result = sourceIllegal; 649 if (flags == strictConversion) { 650 /* Abort conversion. */ 651 break; 652 } else { 653 /* 654 * Replace the maximal subpart of ill-formed sequence with 655 * replacement character. 656 */ 657 source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 658 sourceEnd); 659 *target++ = UNI_REPLACEMENT_CHAR; 660 continue; 661 } 662 } 663 /* 664 * The cases all fall through. See "Note A" below. 665 */ 666 switch (extraBytesToRead) { 667 case 5: ch += *source++; ch <<= 6; 668 case 4: ch += *source++; ch <<= 6; 669 case 3: ch += *source++; ch <<= 6; 670 case 2: ch += *source++; ch <<= 6; 671 case 1: ch += *source++; ch <<= 6; 672 case 0: ch += *source++; 673 } 674 ch -= offsetsFromUTF8[extraBytesToRead]; 675 676 if (ch <= UNI_MAX_LEGAL_UTF32) { 677 /* 678 * UTF-16 surrogate values are illegal in UTF-32, and anything 679 * over Plane 17 (> 0x10FFFF) is illegal. 680 */ 681 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 682 if (flags == strictConversion) { 683 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 684 result = sourceIllegal; 685 break; 686 } else { 687 *target++ = UNI_REPLACEMENT_CHAR; 688 } 689 } else { 690 *target++ = ch; 691 } 692 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 693 result = sourceIllegal; 694 *target++ = UNI_REPLACEMENT_CHAR; 695 } 696 } 697 *sourceStart = source; 698 *targetStart = target; 699 return result; 700 } 701 702 ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, 703 const UTF8 *sourceEnd, 704 UTF32 **targetStart, 705 UTF32 *targetEnd, 706 ConversionFlags flags) { 707 return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 708 flags, /*InputIsPartial=*/true); 709 } 710 711 ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, 712 const UTF8 *sourceEnd, UTF32 **targetStart, 713 UTF32 *targetEnd, ConversionFlags flags) { 714 return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 715 flags, /*InputIsPartial=*/false); 716 } 717 718 /* --------------------------------------------------------------------- 719 720 Note A. 721 The fall-through switches in UTF-8 reading code save a 722 temp variable, some decrements & conditionals. The switches 723 are equivalent to the following loop: 724 { 725 int tmpBytesToRead = extraBytesToRead+1; 726 do { 727 ch += *source++; 728 --tmpBytesToRead; 729 if (tmpBytesToRead) ch <<= 6; 730 } while (tmpBytesToRead > 0); 731 } 732 In UTF-8 writing code, the switches on "bytesToWrite" are 733 similarly unrolled loops. 734 735 --------------------------------------------------------------------- */ 736 737 } // namespace llvm 738 739 ConvertUTF_RESTORE_WARNINGS 740