// Written in the D programming language.

/++
    Encode and decode UTF-8, UTF-16 and UTF-32 strings.

    UTF character support is restricted to
    $(D '\u0000' <= character <= '\U0010FFFF').

$(SCRIPT inhibitQuickIndex = 1;)
$(BOOKTABLE,
$(TR $(TH Category) $(TH Functions))
$(TR $(TD Decode) $(TD
    $(LREF decode)
    $(LREF decodeFront)
))
$(TR $(TD Lazy decode) $(TD
    $(LREF byCodeUnit)
    $(LREF byChar)
    $(LREF byWchar)
    $(LREF byDchar)
    $(LREF byUTF)
))
$(TR $(TD Encode) $(TD
    $(LREF encode)
    $(LREF toUTF8)
    $(LREF toUTF16)
    $(LREF toUTF32)
    $(LREF toUTFz)
    $(LREF toUTF16z)
))
$(TR $(TD Length) $(TD
    $(LREF codeLength)
    $(LREF count)
    $(LREF stride)
    $(LREF strideBack)
))
$(TR $(TD Index) $(TD
    $(LREF toUCSindex)
    $(LREF toUTFindex)
))
$(TR $(TD Validation) $(TD
    $(LREF isValidDchar)
    $(LREF validate)
))
$(TR $(TD Miscellaneous) $(TD
    $(LREF replacementDchar)
    $(LREF UseReplacementDchar)
    $(LREF UTFException)
))
)
    See_Also:
        $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
        $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
        $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
    Copyright: Copyright Digital Mars 2000 - 2012.
    License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
    Authors:   $(HTTP digitalmars.com, Walter Bright) and Jonathan M Davis
    Source:    $(PHOBOSSRC std/_utf.d)
   +/
module std.utf;

import std.exception;  // basicExceptionCtors
import std.meta;       // AliasSeq
import std.range.primitives;
import std.traits;     // isSomeChar, isSomeString
import std.typecons;   // Flag, Yes, No


/++
    Exception thrown on errors in std.utf functions.
  +/
class UTFException : Exception
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    // Code units of the offending sequence; only the first `len` entries
    // are meaningful.
    uint[4] sequence;
    // Number of entries of `sequence` that have been filled in by
    // setSequence (0 when no sequence was recorded).
    size_t len;

    // Stores up to four code units of the invalid sequence in this
    // exception and returns `this` so the call can be chained onto `new`.
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...)
    {
        assert(data.length <= 4);

        // Clamp to the capacity of `sequence` in case asserts are compiled out.
        if (data.length < 4)
            len = data.length;
        else
            len = 4;
        sequence[0 .. len] = data[0 .. len];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once bug #11500 is fixed

    // Plain constructor: forwards everything to Exception unchanged.
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    // Constructor that appends " (at index N)" to the message, where N is
    // `index` rendered in decimal.
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        UnsignedStringBuf digits = void;
        msg ~= " (at index ";
        msg ~= unsignedToTempString(index, digits, 10);
        msg ~= ")";
        super(msg, file, line, next);
    }


    // Without a recorded sequence this defers to Exception.toString().
    // Otherwise it produces "Invalid UTF sequence:" followed by each recorded
    // unit in hexadecimal (zero-padded to at least two digits, each followed
    // by an 'x'), and finally " - <msg>" when the message is non-empty.
    override string toString() const
    {
        if (len == 0)
        {
            /* Exception.toString() is not marked as const, although
             * it is const-compatible.
             */
            //return super.toString();
            auto e = () @trusted { return cast(Exception) super; } ();
            return e.toString();
        }

        string text = "Invalid UTF sequence:";

        foreach (unit; sequence[0 .. len])
        {
            UnsignedStringBuf scratch = void;
            auto hex = unsignedToTempString(unit, scratch, 16);
            // separator, plus a padding zero for single-digit values
            text ~= (hex.length == 1) ? " 0" : " ";
            text ~= hex;
            text ~= 'x';
        }

        if (super.msg.length > 0)
        {
            text ~= " - ";
            text ~= super.msg;
        }

        return text;
    }
}

/*
   Provide array of invalidly encoded UTF strings. Useful for testing.
Params:
      Char = char, wchar, or dchar

   Returns:
      an array of invalidly encoded UTF strings
 */

package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    static if (is(Char == char))
    {
        enum surrogate = 0xDC00;     // invalid surrogate value
        enum beyondRange = 0x110000; // out of range

        static immutable string[8] result =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong
            // the surrogate value laid out as a three byte sequence
            [
              0xE0 | (surrogate >> 12),
              0x80 | ((surrogate >> 6) & 0x3F),
              0x80 | (surrogate & 0x3F)
            ],
            // the out-of-range value laid out as a four byte sequence
            [
              cast(char)(0xF0 | (beyondRange >> 18)),
              cast(char)(0x80 | ((beyondRange >> 12) & 0x3F)),
              cast(char)(0x80 | ((beyondRange >> 6) & 0x3F)),
              cast(char)(0x80 | (beyondRange & 0x3F))
            ],
            [
              cast(char)(0xF8 | 3),     // 5 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
            [
              cast(char)(0xFC | 3),     // 6 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
        ];

        return result[];
    }
    else static if (is(Char == wchar))
    {
        // 0xD800 .. 0xDBFF are high surrogates, 0xDC00 .. 0xDFFF low ones
        static immutable wstring[5] result =
        [
            [ cast(wchar) 0xDC00 ],                     // lone low surrogate
            [ cast(wchar) 0xDFFF ],                     // lone low surrogate
            [ cast(wchar) 0xDBFF, cast(wchar) 0xDBFF ], // high followed by high
            [ cast(wchar) 0xDBFF, cast(wchar) 0xE000 ], // high followed by non-surrogate
            [ cast(wchar) 0xD800 ],                     // lone high surrogate
        ];

        return result[];
    }
    else static if (is(Char == dchar))
    {
        static immutable dstring[3] result =
        [
            [ cast(dchar) 0x110000 ],   // above the last code point
            [ cast(dchar) 0x00D800 ],   // first surrogate value
            [ cast(dchar) 0x00DFFF ],   // last surrogate value
        ];

        return result;
    }
    else
        static assert(0);
}

/++
    Check whether the given Unicode code point is valid.
Params:
        c = code point to check

    Returns:
        $(D true) iff $(D c) is a valid Unicode code point

    Note:
    $(D '\uFFFE') and $(D '\uFFFF') are considered valid by $(D isValidDchar),
    as they are permitted for internal use by an application, but they are
    not allowed for interchange by the Unicode standard.
  +/
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Nothing above U+10FFFF is a code point.
    if (c > 0x10FFFF)
        return false;

    // Inside the code space only the surrogate block U+D800 .. U+DFFF
    // is rejected.
    return !(0xD800 <= c && c <= 0xDFFF);
}

pure nothrow @safe @nogc unittest
{
    import std.exception;

    assertCTFEable!(
    {
    assert( isValidDchar(cast(dchar)'a') == true);
    assert( isValidDchar(cast(dchar) 0x1FFFFF) == false);

    assert(!isValidDchar(cast(dchar) 0x00D800));
    assert(!isValidDchar(cast(dchar) 0x00DBFF));
    assert(!isValidDchar(cast(dchar) 0x00DC00));
    assert(!isValidDchar(cast(dchar) 0x00DFFF));
    assert( isValidDchar(cast(dchar) 0x00FFFE));
    assert( isValidDchar(cast(dchar) 0x00FFFF));
    assert( isValidDchar(cast(dchar) 0x01FFFF));
    assert( isValidDchar(cast(dchar) 0x10FFFF));
    assert(!isValidDchar(cast(dchar) 0x110000));
    });
}


/++
    Calculate the length of the UTF sequence starting at $(D index)
    in $(D str).

    Params:
        str = input range of UTF code units. Must be random access if
        $(D index) is passed
        index = starting index of UTF sequence (default: $(D 0))

    Returns:
        The number of code units in the UTF sequence. For UTF-8, this is a
        value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a $(D UTFException) if $(D str[index]) is not the start of a
        valid UTF sequence.

    Note:
        $(D stride) will only analyze the first $(D str[index]) element.
It
        will not fully verify the validity of the UTF sequence, nor even verify
        the presence of the sequence: it will not actually guarantee that
        $(D index + stride(str, index) <= str.length).
  +/
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    // The bounds assertion is only compiled in when `str.length` exists.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    immutable lead = str[index];

    // Code units below 0x80 are complete on their own; anything else is
    // classified by strideImpl.
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}

/// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(Unqual!(ElementType!S) == char)))
{
    // Arrays are indexed directly; other ranges are read through `front`.
    static if (is(S : const char[]))
        immutable lead = str[0];
    else
        immutable lead = str.front;

    return lead < 0x80 ? 1 : strideImpl(lead, 0);
}

// Derives the sequence length from a lead byte that has its top bit set.
// `index` is only used for the exception message.
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
body
{
    import core.bitop : bsr;

    // bsr on the inverted byte locates the highest zero bit of `c`, so this
    // is the number of consecutive one bits at the top of `c`.
    immutable leadingOnes = 7 - bsr((~uint(c)) & 0xFF);

    // 0xFF has no zero bit at all and is rejected explicitly; a count of 1
    // is a continuation byte, a count above 4 is not a valid sequence length.
    immutable acceptable = c != 0xFF && leadingOnes >= 2 && leadingOnes <= 4;
    if (!acceptable)
        throw new UTFException("Invalid UTF-8 sequence", index);
    return leadingOnes;
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!char(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(stride(RandomCU!char(s), i) == codeLength!char(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!char(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
enforce(refRandom.length == randLen, 361 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 362 363 if (i == 0) 364 { 365 enforce(stride(s) == codeLength!char(c), 366 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 367 368 enforce(stride(InputCU!char(s)) == codeLength!char(c), 369 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 370 371 auto refBidir = new RefBidirCU!char(s); 372 immutable bidirLen = refBidir.length; 373 enforce(stride(refBidir) == codeLength!char(c), 374 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 375 enforce(refBidir.length == bidirLen, 376 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 377 } 378 } 379 380 assertCTFEable!( 381 { 382 test("a", 'a'); 383 test(" ", ' '); 384 test("\u2029", '\u2029'); //paraSep 385 test("\u0100", '\u0100'); 386 test("\u0430", '\u0430'); 387 test("\U00010143", '\U00010143'); 388 test("abcdefcdef", 'a'); 389 test("hello\U00010143\u0100\U00010143", 'h', 0); 390 test("hello\U00010143\u0100\U00010143", 'e', 1); 391 test("hello\U00010143\u0100\U00010143", 'l', 2); 392 test("hello\U00010143\u0100\U00010143", 'l', 3); 393 test("hello\U00010143\u0100\U00010143", 'o', 4); 394 test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 395 test("hello\U00010143\u0100\U00010143", '\u0100', 9); 396 test("hello\U00010143\u0100\U00010143", '\U00010143', 11); 397 398 foreach (S; AliasSeq!(char[], const char[], string)) 399 { 400 enum str = to!S("hello world"); 401 static assert(isSafe!({ stride(str, 0); })); 402 static assert(isSafe!({ stride(str); })); 403 static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0); 404 static assert((functionAttributes!({ stride(str); }) & FunctionAttribute.pure_) != 0); 405 } 406 }); 407 } 408 409 @safe unittest // invalid start bytes 410 { 411 import std.exception : 
assertThrown;
    immutable char[] invalidStartBytes = [
        0b1111_1000, // indicating a sequence length of 5
        0b1111_1100, // 6
        0b1111_1110, // 7
        0b1111_1111, // 8
        0b1000_0000, // continuation byte
    ];
    foreach (c; invalidStartBytes)
        assertThrown!UTFException(stride([c]));
}

/// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
{
    // The bounds assertion is only compiled in when `str.length` exists.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    immutable uint unit = str[index];

    // A unit in 0xD800 .. 0xDBFF opens a two-unit sequence; every other
    // unit counts as one.
    immutable bool opensPair = 0xD800 <= unit && unit <= 0xDBFF;
    return opensPair ? 2 : 1;
}

/// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Arrays simply reuse the indexed overload at position 0.
    return stride(str, 0);
}

/// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(Unqual!(ElementType!S) == wchar))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    immutable uint unit = str.front;
    immutable bool opensPair = 0xD800 <= unit && unit <= 0xDBFF;
    return opensPair ? 2 : 1;
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (i == 0)
        {
enforce(stride(s) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));

            enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!wchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);

    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}

/// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar)))
{
    // Which precondition can be checked depends on whether `str.length`
    // is available for this type.
    enum bool lengthKnown = is(typeof(str.length) : ulong);

    static if (lengthKnown)
    {
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    }
    else
    {
        assert(!str.empty, "UTF-32 sequence is empty.");
    }

    // One UTF-32 code unit is always one sequence.
    return 1;
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (i == 0)
        {
            enforce(stride(s) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));

            enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!dchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
test("hello\U00010143\u0100\U00010143", 'h', 0); 577 test("hello\U00010143\u0100\U00010143", 'e', 1); 578 test("hello\U00010143\u0100\U00010143", 'l', 2); 579 test("hello\U00010143\u0100\U00010143", 'l', 3); 580 test("hello\U00010143\u0100\U00010143", 'o', 4); 581 test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 582 test("hello\U00010143\u0100\U00010143", '\u0100', 6); 583 test("hello\U00010143\u0100\U00010143", '\U00010143', 7); 584 585 foreach (S; AliasSeq!(dchar[], const dchar[], dstring)) 586 { 587 enum str = to!S("hello world"); 588 static assert(isSafe!(() => stride(str, 0))); 589 static assert(isSafe!(() => stride(str) )); 590 static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0); 591 static assert((functionAttributes!(() => stride(str) ) & FunctionAttribute.pure_) != 0); 592 } 593 }); 594 } 595 596 /++ 597 Calculate the length of the UTF sequence ending one code unit before 598 $(D index) in $(D str). 599 600 Params: 601 str = bidirectional range of UTF code units. Must be random access if 602 $(D index) is passed 603 index = index one past end of UTF sequence (default: $(D str.length)) 604 605 Returns: 606 The number of code units in the UTF sequence. For UTF-8, this is a 607 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)). 608 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1. 609 610 Throws: 611 May throw a $(D UTFException) if $(D str[index]) is not one past the 612 end of a valid UTF sequence. 613 614 Note: 615 $(D strideBack) will only analyze the element at $(D str[index - 1]) 616 element. It will not fully verify the validity of the UTF sequence, nor 617 even verify the presence of the sequence: it will not actually 618 guarantee that $(D strideBack(str, index) <= index). 
+/
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    // A code unit whose top two bits are `10` is a continuation byte;
    // the sequence starts at the nearest preceding unit that is not one.
    enum topTwoBits = 0b1100_0000;
    enum continuationTag = 0b1000_0000;

    if ((str[index-1] & topTwoBits) != continuationTag)
        return 1;

    if (index < 4)
    {
        // Close to the front: every look-back has to be bounds checked.
        foreach (i; AliasSeq!(2, 3))
        {
            if (index >= i && (str[index-i] & topTwoBits) != continuationTag)
                return i;
        }
    }
    else //single verification for most common case
    {
        foreach (i; AliasSeq!(2, 3, 4))
        {
            if ((str[index-i] & topTwoBits) != continuationTag)
                return i;
        }
    }
    throw new UTFException("Not the end of the UTF sequence", index);
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(Unqual!(ElementType!S) == char)))
{
    // Measure the sequence that ends at the very end of `str`.
    return strideBack(str, str.length);
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(Unqual!(ElementType!S) == char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Walk backwards on a saved copy so the range passed in is not consumed.
    auto cursor = str.save;
    foreach (i; AliasSeq!(1, 2, 3, 4))
    {
        if ((cursor.back & 0b1100_0000) != 0b1000_0000)
            return i;
        cursor.popBack();
        if (cursor.empty)
            break;
    }
    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        enforce(strideBack(s, i == size_t.max ?
s.length : i) == codeLength!char(c), 684 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 685 686 enforce(strideBack(RandomCU!char(s), i == size_t.max ? s.length : i) == codeLength!char(c), 687 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 688 689 auto refRandom = new RefRandomCU!char(s); 690 immutable randLen = refRandom.length; 691 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!char(c), 692 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 693 enforce(refRandom.length == randLen, 694 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 695 696 if (i == size_t.max) 697 { 698 enforce(strideBack(s) == codeLength!char(c), 699 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 700 701 enforce(strideBack(BidirCU!char(s)) == codeLength!char(c), 702 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 703 704 auto refBidir = new RefBidirCU!char(s); 705 immutable bidirLen = refBidir.length; 706 enforce(strideBack(refBidir) == codeLength!char(c), 707 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 708 enforce(refBidir.length == bidirLen, 709 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 710 } 711 } 712 713 assertCTFEable!( 714 { 715 test("a", 'a'); 716 test(" ", ' '); 717 test("\u2029", '\u2029'); //paraSep 718 test("\u0100", '\u0100'); 719 test("\u0430", '\u0430'); 720 test("\U00010143", '\U00010143'); 721 test("abcdefcdef", 'f'); 722 test("\U00010143\u0100\U00010143hello", 'o', 15); 723 test("\U00010143\u0100\U00010143hello", 'l', 14); 724 test("\U00010143\u0100\U00010143hello", 'l', 13); 725 test("\U00010143\u0100\U00010143hello", 'e', 12); 726 test("\U00010143\u0100\U00010143hello", 'h', 11); 727 test("\U00010143\u0100\U00010143hello", 
'\U00010143', 10);
    test("\U00010143\u0100\U00010143hello", '\u0100', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 4);

    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ strideBack(str, 0); }));
        static assert(isSafe!({ strideBack(str);    }));
        static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}

//UTF-16 is self synchronizing: The length of strideBack can be found from
//the value of a single wchar
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
{
    // The upper bound can only be asserted when `str.length` exists.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    // Low surrogates occupy 0xDC00 .. 0xDFFF (0xE000 itself is excluded);
    // a sequence ending in one is two code units long.
    immutable c2 = str[index-1];
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(Unqual!(ElementType!S) == wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly; other ranges are read through `back`.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Same half-open test as the indexed overload: 0xE000 is an ordinary
    // (private use) code unit, not a low surrogate, so it must yield 1.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!wchar(s), i == size_t.max ?
s.length : i) == codeLength!wchar(c), 784 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 785 786 auto refRandom = new RefRandomCU!wchar(s); 787 immutable randLen = refRandom.length; 788 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!wchar(c), 789 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 790 enforce(refRandom.length == randLen, 791 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 792 793 if (i == size_t.max) 794 { 795 enforce(strideBack(s) == codeLength!wchar(c), 796 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 797 798 enforce(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c), 799 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 800 801 auto refBidir = new RefBidirCU!wchar(s); 802 immutable bidirLen = refBidir.length; 803 enforce(strideBack(refBidir) == codeLength!wchar(c), 804 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 805 enforce(refBidir.length == bidirLen, 806 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 807 } 808 } 809 810 assertCTFEable!( 811 { 812 test("a", 'a'); 813 test(" ", ' '); 814 test("\u2029", '\u2029'); //paraSep 815 test("\u0100", '\u0100'); 816 test("\u0430", '\u0430'); 817 test("\U00010143", '\U00010143'); 818 test("abcdefcdef", 'f'); 819 test("\U00010143\u0100\U00010143hello", 'o', 10); 820 test("\U00010143\u0100\U00010143hello", 'l', 9); 821 test("\U00010143\u0100\U00010143hello", 'l', 8); 822 test("\U00010143\u0100\U00010143hello", 'e', 7); 823 test("\U00010143\u0100\U00010143hello", 'h', 6); 824 test("\U00010143\u0100\U00010143hello", '\U00010143', 5); 825 test("\U00010143\u0100\U00010143hello", '\u0100', 3); 826 test("\U00010143\u0100\U00010143hello", '\U00010143', 2); 827 828 foreach (S; AliasSeq!(wchar[], const 
wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}

/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(Unqual!(ElementEncodingType!S) == dchar))
{
    // The upper bound can only be asserted when `str.length` exists.
    enum bool lengthKnown = is(typeof(str.length) : ulong);

    static if (lengthKnown)
    {
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    }
    assert(index > 0, "Not the end of the UTF-32 sequence");

    // One UTF-32 code unit is always one sequence.
    return 1;
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(Unqual!(ElementEncodingType!S) == dchar))
{
    assert(!str.empty, "Empty UTF-32 sequence");

    // One UTF-32 code unit is always one sequence.
    return 1;
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!dchar(s), i == size_t.max ? s.length : i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        enforce(strideBack(refRandom, i == size_t.max ?
s.length : i) == codeLength!dchar(c), 874 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 875 enforce(refRandom.length == randLen, 876 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 877 878 if (i == size_t.max) 879 { 880 enforce(strideBack(s) == codeLength!dchar(c), 881 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 882 883 enforce(strideBack(BidirCU!dchar(s)) == codeLength!dchar(c), 884 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 885 886 auto refBidir = new RefBidirCU!dchar(s); 887 immutable bidirLen = refBidir.length; 888 enforce(strideBack(refBidir) == codeLength!dchar(c), 889 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 890 enforce(refBidir.length == bidirLen, 891 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 892 } 893 } 894 895 assertCTFEable!( 896 { 897 test("a", 'a'); 898 test(" ", ' '); 899 test("\u2029", '\u2029'); //paraSep 900 test("\u0100", '\u0100'); 901 test("\u0430", '\u0430'); 902 test("\U00010143", '\U00010143'); 903 test("abcdefcdef", 'f'); 904 test("\U00010143\u0100\U00010143hello", 'o', 8); 905 test("\U00010143\u0100\U00010143hello", 'l', 7); 906 test("\U00010143\u0100\U00010143hello", 'l', 6); 907 test("\U00010143\u0100\U00010143hello", 'e', 5); 908 test("\U00010143\u0100\U00010143hello", 'h', 4); 909 test("\U00010143\u0100\U00010143hello", '\U00010143', 3); 910 test("\U00010143\u0100\U00010143hello", '\u0100', 2); 911 test("\U00010143\u0100\U00010143hello", '\U00010143', 1); 912 913 foreach (S; AliasSeq!(dchar[], const dchar[], dstring)) 914 { 915 enum str = to!S("hello world"); 916 static assert(isSafe!(() => strideBack(str, 0))); 917 static assert(isSafe!(() => strideBack(str) )); 918 static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0); 
static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}


/++
    Given $(D index) into $(D str) and assuming that $(D index) is at the start
    of a UTF sequence, $(D toUCSindex) determines the number of UCS characters
    up to $(D index). So, $(D index) is the index of a code unit at the
    beginning of a code point, and the return value is how many code points into
    the string that that code point is.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(Unqual!C == dchar))
    {
        // UTF-32: code unit index and code point index coincide.
        return index;
    }
    else
    {
        // The message names the encoding selected by C.
        static if (is(Unqual!C == char))
            enum failure = "Invalid UTF-8 sequence";
        else
            enum failure = "Invalid UTF-16 sequence";

        size_t codePoints = 0;
        size_t pos = 0;

        // Hop from sequence start to sequence start, counting the hops.
        while (pos < index)
        {
            pos += stride(str, pos);
            ++codePoints;
        }

        // Overshooting means `index` was in the middle of a sequence.
        if (pos > index)
            throw new UTFException(failure, index);

        return codePoints;
    }
}

///
@safe unittest
{
    assert(toUCSindex(`hello world`, 7) == 7);
    assert(toUCSindex(`hello world`w, 7) == 7);
    assert(toUCSindex(`hello world`d, 7) == 7);

    assert(toUCSindex(`Ma Chérie`, 7) == 6);
    assert(toUCSindex(`Ma Chérie`w, 7) == 7);
    assert(toUCSindex(`Ma Chérie`d, 7) == 7);

    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}


/++
    Given a UCS index $(D n) into $(D str), returns the UTF index.
    So, $(D n) is how many code points into the string the code point is, and
    the array index of the code unit is returned.
978 +/ 979 size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure 980 if (isSomeChar!C) 981 { 982 static if (is(Unqual!C == dchar)) 983 { 984 return n; 985 } 986 else 987 { 988 size_t i; 989 while (n--) 990 { 991 i += stride(str, i); 992 } 993 return i; 994 } 995 } 996 997 /// 998 @safe unittest 999 { 1000 assert(toUTFindex(`hello world`, 7) == 7); 1001 assert(toUTFindex(`hello world`w, 7) == 7); 1002 assert(toUTFindex(`hello world`d, 7) == 7); 1003 1004 assert(toUTFindex(`Ma Chérie`, 6) == 7); 1005 assert(toUTFindex(`Ma Chérie`w, 7) == 7); 1006 assert(toUTFindex(`Ma Chérie`d, 7) == 7); 1007 1008 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9); 1009 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9); 1010 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9); 1011 } 1012 1013 1014 /* =================== Decode ======================= */ 1015 1016 /// Whether or not to replace invalid UTF with $(LREF replacementDchar) 1017 alias UseReplacementDchar = Flag!"useReplacementDchar"; 1018 1019 /++ 1020 Decodes and returns the code point starting at $(D str[index]). $(D index) 1021 is advanced to one past the decoded code point. If the code point is not 1022 well-formed, then a $(D UTFException) is thrown and $(D index) remains 1023 unchanged. 1024 1025 decode will only work with strings and random access ranges of code units 1026 with length and slicing, whereas $(LREF decodeFront) will work with any 1027 input range of code units. 
1028 1029 Params: 1030 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1031 str = input string or indexable Range 1032 index = starting index into s[]; incremented by number of code units processed 1033 1034 Returns: 1035 decoded character 1036 1037 Throws: 1038 $(LREF UTFException) if $(D str[index]) is not the start of a valid UTF 1039 sequence and useReplacementDchar is $(D No.useReplacementDchar) 1040 +/ 1041 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index) 1042 if (!isSomeString!S && 1043 isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S)) 1044 in 1045 { 1046 assert(index < str.length, "Attempted to decode past the end of a string"); 1047 } 1048 out (result) 1049 { 1050 assert(isValidDchar(result)); 1051 } 1052 body 1053 { 1054 if (str[index] < codeUnitLimit!S) 1055 return str[index++]; 1056 else 1057 return decodeImpl!(true, useReplacementDchar)(str, index); 1058 } 1059 1060 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1061 auto ref S str, ref size_t index) @trusted pure 1062 if (isSomeString!S) 1063 in 1064 { 1065 assert(index < str.length, "Attempted to decode past the end of a string"); 1066 } 1067 out (result) 1068 { 1069 assert(isValidDchar(result)); 1070 } 1071 body 1072 { 1073 if (str[index] < codeUnitLimit!S) 1074 return str[index++]; 1075 else 1076 return decodeImpl!(true, useReplacementDchar)(str, index); 1077 } 1078 1079 /++ 1080 $(D decodeFront) is a variant of $(LREF decode) which specifically decodes 1081 the first code point. Unlike $(LREF decode), $(D decodeFront) accepts any 1082 input range of code units (rather than just a string or random access 1083 range). It also takes the range by $(D ref) and pops off the elements as it 1084 decodes them. If $(D numCodeUnits) is passed in, it gets set to the number 1085 of code units which were in the code point which was decoded. 
1086 1087 Params: 1088 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1089 str = input string or indexable Range 1090 numCodeUnits = set to number of code units processed 1091 1092 Returns: 1093 decoded character 1094 1095 Throws: 1096 $(LREF UTFException) if $(D str.front) is not the start of a valid UTF 1097 sequence. If an exception is thrown, then there is no guarantee as to 1098 the number of code units which were popped off, as it depends on the 1099 type of range being used and how many code units had to be popped off 1100 before the code point was determined to be invalid. 1101 +/ 1102 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1103 ref S str, out size_t numCodeUnits) 1104 if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S)) 1105 in 1106 { 1107 assert(!str.empty); 1108 } 1109 out (result) 1110 { 1111 assert(isValidDchar(result)); 1112 } 1113 body 1114 { 1115 immutable fst = str.front; 1116 1117 if (fst < codeUnitLimit!S) 1118 { 1119 str.popFront(); 1120 numCodeUnits = 1; 1121 return fst; 1122 } 1123 else 1124 { 1125 //@@@BUG@@@ 14447 forces canIndex to be done outside of decodeImpl, which 1126 //is undesirable, since not all overloads of decodeImpl need it. So, it 1127 //should be moved back into decodeImpl once bug# 8521 has been fixed. 1128 enum canIndex = isRandomAccessRange!S && hasSlicing!S && hasLength!S; 1129 immutable retval = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits); 1130 1131 // The other range types were already popped by decodeImpl. 1132 static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1133 str = str[numCodeUnits .. 
str.length]; 1134 1135 return retval; 1136 } 1137 } 1138 1139 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1140 ref S str, out size_t numCodeUnits) @trusted pure 1141 if (isSomeString!S) 1142 in 1143 { 1144 assert(!str.empty); 1145 } 1146 out (result) 1147 { 1148 assert(isValidDchar(result)); 1149 } 1150 body 1151 { 1152 if (str[0] < codeUnitLimit!S) 1153 { 1154 numCodeUnits = 1; 1155 immutable retval = str[0]; 1156 str = str[1 .. $]; 1157 return retval; 1158 } 1159 else 1160 { 1161 immutable retval = decodeImpl!(true, useReplacementDchar)(str, numCodeUnits); 1162 str = str[numCodeUnits .. $]; 1163 return retval; 1164 } 1165 } 1166 1167 /++ Ditto +/ 1168 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str) 1169 if (isInputRange!S && isSomeChar!(ElementType!S)) 1170 { 1171 size_t numCodeUnits; 1172 return decodeFront!useReplacementDchar(str, numCodeUnits); 1173 } 1174 1175 /++ 1176 $(D decodeBack) is a variant of $(LREF decode) which specifically decodes 1177 the last code point. Unlike $(LREF decode), $(D decodeBack) accepts any 1178 bidirectional range of code units (rather than just a string or random access 1179 range). It also takes the range by $(D ref) and pops off the elements as it 1180 decodes them. If $(D numCodeUnits) is passed in, it gets set to the number 1181 of code units which were in the code point which was decoded. 1182 1183 Params: 1184 useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing 1185 str = input string or bidirectional Range 1186 numCodeUnits = gives the number of code units processed 1187 1188 Returns: 1189 A decoded UTF character. 1190 1191 Throws: 1192 $(LREF UTFException) if $(D str.back) is not the end of a valid UTF 1193 sequence. If an exception is thrown, the $(D str) itself remains unchanged, 1194 but there is no guarantee as to the value of $(D numCodeUnits) (when passed). 
1195 +/ 1196 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1197 ref S str, out size_t numCodeUnits) 1198 if (isSomeString!S) 1199 in 1200 { 1201 assert(!str.empty); 1202 } 1203 out (result) 1204 { 1205 assert(isValidDchar(result)); 1206 } 1207 body 1208 { 1209 if (str[$ - 1] < codeUnitLimit!S) 1210 { 1211 numCodeUnits = 1; 1212 immutable retval = str[$ - 1]; 1213 str = str[0 .. $ - 1]; 1214 return retval; 1215 } 1216 else 1217 { 1218 numCodeUnits = strideBack(str); 1219 immutable newLength = str.length - numCodeUnits; 1220 size_t index = newLength; 1221 immutable retval = decodeImpl!(true, useReplacementDchar)(str, index); 1222 str = str[0 .. newLength]; 1223 return retval; 1224 } 1225 } 1226 1227 /++ Ditto +/ 1228 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1229 ref S str, out size_t numCodeUnits) 1230 if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S 1231 && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S)) 1232 in 1233 { 1234 assert(!str.empty); 1235 } 1236 out (result) 1237 { 1238 assert(isValidDchar(result)); 1239 } 1240 body 1241 { 1242 if (str.back < codeUnitLimit!S) 1243 { 1244 numCodeUnits = 1; 1245 immutable retval = str.back; 1246 str.popBack(); 1247 return retval; 1248 } 1249 else 1250 { 1251 numCodeUnits = strideBack(str); 1252 static if (isRandomAccessRange!S) 1253 { 1254 size_t index = str.length - numCodeUnits; 1255 immutable retval = decodeImpl!(true, useReplacementDchar)(str, index); 1256 str.popBackExactly(numCodeUnits); 1257 return retval; 1258 } 1259 else 1260 { 1261 alias Char = Unqual!(ElementType!S); 1262 Char[4] codeUnits; 1263 S tmp = str.save; 1264 for (size_t i = numCodeUnits; i > 0; ) 1265 { 1266 codeUnits[--i] = tmp.back; 1267 tmp.popBack(); 1268 } 1269 const Char[] codePoint = codeUnits[0 .. 
numCodeUnits]; 1270 size_t index = 0; 1271 immutable retval = decodeImpl!(true, useReplacementDchar)(codePoint, index); 1272 str = tmp; 1273 return retval; 1274 } 1275 } 1276 } 1277 1278 /++ Ditto +/ 1279 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str) 1280 if (isSomeString!S 1281 || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S)) 1282 || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S))) 1283 in 1284 { 1285 assert(!str.empty); 1286 } 1287 out (result) 1288 { 1289 assert(isValidDchar(result)); 1290 } 1291 body 1292 { 1293 size_t numCodeUnits; 1294 return decodeBack!useReplacementDchar(str, numCodeUnits); 1295 } 1296 1297 // Gives the maximum value that a code unit for the given range type can hold. 1298 package template codeUnitLimit(S) 1299 if (isSomeChar!(ElementEncodingType!S)) 1300 { 1301 static if (is(Unqual!(ElementEncodingType!S) == char)) 1302 enum char codeUnitLimit = 0x80; 1303 else static if (is(Unqual!(ElementEncodingType!S) == wchar)) 1304 enum wchar codeUnitLimit = 0xD800; 1305 else 1306 enum dchar codeUnitLimit = 0xD800; 1307 } 1308 1309 /* 1310 * For strings, this function does its own bounds checking to give a 1311 * more useful error message when attempting to decode past the end of a string. 1312 * Subsequently it uses a pointer instead of an array to avoid 1313 * redundant bounds checking. 1314 * 1315 * The three overloads of this operate on chars, wchars, and dchars. 
1316 * 1317 * Params: 1318 * canIndex = if S is indexable 1319 * useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1320 * str = input string or Range 1321 * index = starting index into s[]; incremented by number of code units processed 1322 * 1323 * Returns: 1324 * decoded character 1325 */ 1326 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1327 auto ref S str, ref size_t index) 1328 if ( 1329 is(S : const char[]) || (isInputRange!S && is(Unqual!(ElementEncodingType!S) == char))) 1330 { 1331 /* The following encodings are valid, except for the 5 and 6 byte 1332 * combinations: 1333 * 0xxxxxxx 1334 * 110xxxxx 10xxxxxx 1335 * 1110xxxx 10xxxxxx 10xxxxxx 1336 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 1337 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 1338 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 1339 */ 1340 1341 /* Dchar bitmask for different numbers of UTF-8 code units. 1342 */ 1343 alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1); 1344 1345 static if (is(S : const char[])) 1346 auto pstr = str.ptr + index; // this is what makes decodeImpl() @system code 1347 else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1348 auto pstr = str[index .. 
str.length]; 1349 else 1350 alias pstr = str; 1351 1352 //@@@BUG@@@ 14447 forces this to be done outside of decodeImpl 1353 //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S); 1354 1355 static if (canIndex) 1356 { 1357 immutable length = str.length - index; 1358 ubyte fst = pstr[0]; 1359 } 1360 else 1361 { 1362 ubyte fst = pstr.front; 1363 pstr.popFront(); 1364 } 1365 1366 static if (!useReplacementDchar) 1367 { 1368 static if (canIndex) 1369 { 1370 static UTFException exception(S)(S str, string msg) 1371 { 1372 uint[4] sequence = void; 1373 size_t i; 1374 1375 do 1376 { 1377 sequence[i] = str[i]; 1378 } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80); 1379 1380 return new UTFException(msg, i).setSequence(sequence[0 .. i]); 1381 } 1382 } 1383 1384 UTFException invalidUTF() 1385 { 1386 static if (canIndex) 1387 return exception(pstr[0 .. length], "Invalid UTF-8 sequence"); 1388 else 1389 { 1390 //We can't include the invalid sequence with input strings without 1391 //saving each of the code units along the way, and we can't do it with 1392 //forward ranges without saving the entire range. Both would incur a 1393 //cost for the decoding of every character just to provide a better 1394 //error message for the (hopefully) rare case when an invalid UTF-8 1395 //sequence is encountered, so we don't bother trying to include the 1396 //invalid sequence here, unlike with strings and sliceable ranges. 1397 return new UTFException("Invalid UTF-8 sequence"); 1398 } 1399 } 1400 1401 UTFException outOfBounds() 1402 { 1403 static if (canIndex) 1404 return exception(pstr[0 .. 
length], "Attempted to decode past the end of a string"); 1405 else 1406 return new UTFException("Attempted to decode past the end of a string"); 1407 } 1408 } 1409 1410 if ((fst & 0b1100_0000) != 0b1100_0000) 1411 { 1412 static if (useReplacementDchar) 1413 { 1414 ++index; // always consume bad input to avoid infinite loops 1415 return replacementDchar; 1416 } 1417 else 1418 throw invalidUTF(); // starter must have at least 2 first bits set 1419 } 1420 ubyte tmp = void; 1421 dchar d = fst; // upper control bits are masked out later 1422 fst <<= 1; 1423 1424 foreach (i; AliasSeq!(1, 2, 3)) 1425 { 1426 1427 static if (canIndex) 1428 { 1429 if (i == length) 1430 { 1431 static if (useReplacementDchar) 1432 { 1433 index += i; 1434 return replacementDchar; 1435 } 1436 else 1437 throw outOfBounds(); 1438 } 1439 } 1440 else 1441 { 1442 if (pstr.empty) 1443 { 1444 static if (useReplacementDchar) 1445 { 1446 index += i; 1447 return replacementDchar; 1448 } 1449 else 1450 throw outOfBounds(); 1451 } 1452 } 1453 1454 static if (canIndex) 1455 tmp = pstr[i]; 1456 else 1457 { 1458 tmp = pstr.front; 1459 pstr.popFront(); 1460 } 1461 1462 if ((tmp & 0xC0) != 0x80) 1463 { 1464 static if (useReplacementDchar) 1465 { 1466 index += i + 1; 1467 return replacementDchar; 1468 } 1469 else 1470 throw invalidUTF(); 1471 } 1472 1473 d = (d << 6) | (tmp & 0x3F); 1474 fst <<= 1; 1475 1476 if (!(fst & 0x80)) // no more bytes 1477 { 1478 d &= bitMask[i]; // mask out control bits 1479 1480 // overlong, could have been encoded with i bytes 1481 if ((d & ~bitMask[i - 1]) == 0) 1482 { 1483 static if (useReplacementDchar) 1484 { 1485 index += i + 1; 1486 return replacementDchar; 1487 } 1488 else 1489 throw invalidUTF(); 1490 } 1491 1492 // check for surrogates only needed for 3 bytes 1493 static if (i == 2) 1494 { 1495 if (!isValidDchar(d)) 1496 { 1497 static if (useReplacementDchar) 1498 { 1499 index += i + 1; 1500 return replacementDchar; 1501 } 1502 else 1503 throw invalidUTF(); 1504 } 1505 } 
1506 1507 index += i + 1; 1508 static if (i == 3) 1509 { 1510 if (d > dchar.max) 1511 { 1512 static if (useReplacementDchar) 1513 d = replacementDchar; 1514 else 1515 throw invalidUTF(); 1516 } 1517 } 1518 return d; 1519 } 1520 } 1521 1522 static if (useReplacementDchar) 1523 { 1524 index += 4; // read 4 chars by now 1525 return replacementDchar; 1526 } 1527 else 1528 throw invalidUTF(); 1529 } 1530 1531 @safe pure @nogc nothrow 1532 unittest 1533 { 1534 // Add tests for useReplacemendDchar == yes path 1535 1536 static struct R 1537 { 1538 @safe pure @nogc nothrow: 1539 this(string s) { this.s = s; } 1540 @property bool empty() { return idx == s.length; } 1541 @property char front() { return s[idx]; } 1542 void popFront() { ++idx; } 1543 size_t idx; 1544 string s; 1545 } 1546 1547 foreach (s; invalidUTFstrings!char()) 1548 { 1549 auto r = R(s); 1550 size_t index; 1551 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index); 1552 assert(dc == replacementDchar); 1553 assert(1 <= index && index <= s.length); 1554 } 1555 } 1556 1557 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S) 1558 (auto ref S str, ref size_t index) 1559 if (is(S : const wchar[]) || (isInputRange!S && is(Unqual!(ElementEncodingType!S) == wchar))) 1560 { 1561 static if (is(S : const wchar[])) 1562 auto pstr = str.ptr + index; 1563 else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1564 auto pstr = str[index .. 
str.length]; 1565 else 1566 alias pstr = str; 1567 1568 //@@@BUG@@@ 14447 forces this to be done outside of decodeImpl 1569 //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S); 1570 1571 static if (canIndex) 1572 { 1573 immutable length = str.length - index; 1574 uint u = pstr[0]; 1575 } 1576 else 1577 { 1578 uint u = pstr.front; 1579 pstr.popFront(); 1580 } 1581 1582 static if (!useReplacementDchar) 1583 { 1584 UTFException exception(string msg) 1585 { 1586 static if (canIndex) 1587 return new UTFException(msg).setSequence(pstr[0]); 1588 else 1589 return new UTFException(msg); 1590 } 1591 } 1592 1593 // The < case must be taken care of before decodeImpl is called. 1594 assert(u >= 0xD800); 1595 1596 if (u <= 0xDBFF) 1597 { 1598 static if (canIndex) 1599 immutable onlyOneCodeUnit = length == 1; 1600 else 1601 immutable onlyOneCodeUnit = pstr.empty; 1602 1603 if (onlyOneCodeUnit) 1604 { 1605 static if (useReplacementDchar) 1606 { 1607 ++index; 1608 return replacementDchar; 1609 } 1610 else 1611 throw exception("surrogate UTF-16 high value past end of string"); 1612 } 1613 1614 static if (canIndex) 1615 immutable uint u2 = pstr[1]; 1616 else 1617 { 1618 immutable uint u2 = pstr.front; 1619 pstr.popFront(); 1620 } 1621 1622 if (u2 < 0xDC00 || u2 > 0xDFFF) 1623 { 1624 static if (useReplacementDchar) 1625 u = replacementDchar; 1626 else 1627 throw exception("surrogate UTF-16 low value out of range"); 1628 } 1629 else 1630 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); 1631 ++index; 1632 } 1633 else if (u >= 0xDC00 && u <= 0xDFFF) 1634 { 1635 static if (useReplacementDchar) 1636 u = replacementDchar; 1637 else 1638 throw exception("unpaired surrogate UTF-16 value"); 1639 } 1640 ++index; 1641 1642 // Note: u+FFFE and u+FFFF are specifically permitted by the 1643 // Unicode standard for application internal use (see isValidDchar) 1644 1645 return cast(dchar) u; 1646 } 1647 1648 @safe pure @nogc nothrow 1649 unittest 1650 { 1651 // 
Add tests for useReplacemendDchar == true path 1652 1653 static struct R 1654 { 1655 @safe pure @nogc nothrow: 1656 this(wstring s) { this.s = s; } 1657 @property bool empty() { return idx == s.length; } 1658 @property wchar front() { return s[idx]; } 1659 void popFront() { ++idx; } 1660 size_t idx; 1661 wstring s; 1662 } 1663 1664 foreach (s; invalidUTFstrings!wchar()) 1665 { 1666 auto r = R(s); 1667 size_t index; 1668 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index); 1669 assert(dc == replacementDchar); 1670 assert(1 <= index && index <= s.length); 1671 } 1672 } 1673 1674 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1675 auto ref S str, ref size_t index) 1676 if (is(S : const dchar[]) || (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar))) 1677 { 1678 static if (is(S : const dchar[])) 1679 auto pstr = str.ptr; 1680 else 1681 alias pstr = str; 1682 1683 static if (is(S : const dchar[]) || isRandomAccessRange!S) 1684 { 1685 dchar dc = pstr[index]; 1686 if (!isValidDchar(dc)) 1687 { 1688 static if (useReplacementDchar) 1689 dc = replacementDchar; 1690 else 1691 throw new UTFException("Invalid UTF-32 value").setSequence(dc); 1692 } 1693 ++index; 1694 return dc; 1695 } 1696 else 1697 { 1698 dchar dc = pstr.front; 1699 if (!isValidDchar(dc)) 1700 { 1701 static if (useReplacementDchar) 1702 dc = replacementDchar; 1703 else 1704 throw new UTFException("Invalid UTF-32 value").setSequence(dc); 1705 } 1706 ++index; 1707 pstr.popFront(); 1708 return dc; 1709 } 1710 } 1711 1712 @safe pure @nogc nothrow 1713 unittest 1714 { 1715 // Add tests for useReplacemendDchar == true path 1716 1717 static struct R 1718 { 1719 @safe pure @nogc nothrow: 1720 this(dstring s) { this.s = s; } 1721 @property bool empty() { return idx == s.length; } 1722 @property dchar front() { return s[idx]; } 1723 void popFront() { ++idx; } 1724 size_t idx; 1725 dstring s; 1726 } 1727 1728 foreach (s; 
invalidUTFstrings!dchar()) 1729 { 1730 auto r = R(s); 1731 size_t index; 1732 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index); 1733 assert(dc == replacementDchar); 1734 assert(1 <= index && index <= s.length); 1735 } 1736 } 1737 1738 1739 version (unittest) private void testDecode(R)(R range, 1740 size_t index, 1741 dchar expectedChar, 1742 size_t expectedIndex, 1743 size_t line = __LINE__) 1744 { 1745 import core.exception : AssertError; 1746 import std.string : format; 1747 1748 static if (hasLength!R) 1749 immutable lenBefore = range.length; 1750 1751 static if (isRandomAccessRange!R) 1752 { 1753 { 1754 immutable result = decode(range, index); 1755 enforce(result == expectedChar, 1756 new AssertError(format("decode: Wrong character: %s", result), __FILE__, line)); 1757 enforce(index == expectedIndex, 1758 new AssertError(format("decode: Wrong index: %s", index), __FILE__, line)); 1759 static if (hasLength!R) 1760 { 1761 enforce(range.length == lenBefore, 1762 new AssertError(format("decode: length changed: %s", range.length), __FILE__, line)); 1763 } 1764 } 1765 } 1766 } 1767 1768 version (unittest) private void testDecodeFront(R)(ref R range, 1769 dchar expectedChar, 1770 size_t expectedNumCodeUnits, 1771 size_t line = __LINE__) 1772 { 1773 import core.exception : AssertError; 1774 import std.string : format; 1775 1776 static if (hasLength!R) 1777 immutable lenBefore = range.length; 1778 1779 size_t numCodeUnits; 1780 immutable result = decodeFront(range, numCodeUnits); 1781 enforce(result == expectedChar, 1782 new AssertError(format("decodeFront: Wrong character: %s", result), __FILE__, line)); 1783 enforce(numCodeUnits == expectedNumCodeUnits, 1784 new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line)); 1785 1786 static if (hasLength!R) 1787 { 1788 enforce(range.length == lenBefore - numCodeUnits, 1789 new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line)); 1790 } 1791 } 

// Test helper: decodes the back of range and checks the result, the
// reported code unit count and (when available) the shortened length.
version (unittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        size_t numCodeUnits;
        immutable result = decodeBack(range, numCodeUnits);
        enforce(result == expectedChar,
                new AssertError(format("decodeBack: Wrong character: %s", result), __FILE__, line));
        enforce(numCodeUnits == expectedNumCodeUnits,
                new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line));

        static if (hasLength!R)
        {
            enforce(range.length == lenBefore - numCodeUnits,
                    new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line));
        }
    }
}

// Test helper: runs testDecode, testDecodeBack (on a saved copy) and
// testDecodeFront against a range that holds a single code point.
version (unittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    testDecode(range, 0, expectedChar, expectedIndex, line);
    static if (isBidirectionalRange!R)
    {
        auto rangeCopy = range.save;
        testDecodeBack(rangeCopy, expectedChar, expectedIndex, line);
    }
    testDecodeFront(range, expectedChar, expectedIndex, line);
}

// Test helper: checks that decode throws at index and leaves both the index
// and the range length untouched; when index is 0 it also checks that
// decodeFront throws.
version (unittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a specifier for range.length; without
            // one format itself throws and hides the real test failure.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}

// Test helper: checks that decodeBack throws and leaves the range length
// untouched. Only random access ranges are exercised.
version (unittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // See testBadDecode: the specifier is required for the
                // length argument.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}

@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        enum sHasLength = hasLength!(typeof(S("abcd")));

        {
            auto range = S("abcd");
            testDecode(range, 0, 'a', 1);
            testDecode(range, 1, 'b', 2);
            testDecodeFront(range, 'a', 1);
            testDecodeFront(range, 'b', 1);
            assert(decodeFront(range) == 'c');
            assert(decodeFront(range) == 'd');
        }

        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 3);
            testDecode(range, 3, 'ェ', 6);
            testDecodeFront(range, 'ウ', 3);
            testDecodeFront(range, 'ェ', 3);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("abcd");
            testDecodeBack(range, 'd', 1);
            testDecodeBack(range, 'c', 1);
            testDecodeBack(range, 'b', 1);
            testDecodeBack(range, 'a', 1);
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 3);
            testDecodeBack(range, 'イ', 3);
            testDecodeBack(range, 'サ', 3);
            testDecodeBack(range, 'ブ', 3);
        }

        testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

        foreach (str; ["\xE2\x89", // too short
                       "\xC0\x8A",
                       "\xE0\x80\x8A",
                       "\xF0\x80\x80\x8A",
                       "\xF8\x80\x80\x80\x8A",
                       "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(S(str), 0);
            testBadDecode(S(str), 1);
            testBadDecodeBack(S(str));
        }

        //U+FFFE and U+FFFF are expected to decode successfully.
        testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        //UTF-8 encoded surrogate code points (U+D800 .. U+DFFF) must be rejected.
        foreach (str; ["\xED\xA0\x80",
                       "\xED\xAD\xBF",
                       "\xED\xAE\x80",
                       "\xED\xAF\xBF",
                       "\xED\xB0\x80",
                       "\xED\xBE\x80",
                       "\xED\xBF\xBF"])
        {
            testBadDecode(S(str), 0);
            testBadDecodeBack(S(str));
        }
    }
    });
}

@system unittest
{
    import std.conv : to;
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!wstring, InputCU!wchar, RandomCU!wchar,
                          (wstring s) => new RefBidirCU!wchar(s),
                          (wstring s) => new RefRandomCU!wchar(s)))
    {
        testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    foreach (S; AliasSeq!(to!wstring, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                      cast(wchar) 0x1400,
                      cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(str, 0, cast(dchar) 0x10000, 2);
        testDecode(str, 2, cast(dchar) 0x1400, 3);
        testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 2);
    }
    });
}

@system unittest
{
    import std.conv : to;
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!dstring, RandomCU!dchar, InputCU!dchar,
                          (dstring s) => new RefBidirCU!dchar(s),
                          (dstring s) => new RefRandomCU!dchar(s)))
    {
        testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        testBadDecode(S([cast(dchar) 0xD800]), 0);
        testBadDecode(S([cast(dchar) 0xDFFE]), 0);
        testBadDecode(S([cast(dchar) 0x110000]), 0);

        testBadDecodeBack(S([cast(dchar) 0xD800]));
        testBadDecodeBack(S([cast(dchar) 0xDFFE]));
        testBadDecodeBack(S([cast(dchar) 0x110000]));

        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    foreach (S; AliasSeq!(to!dstring, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(str, 0, 0x10000, 1);
        testDecode(str, 1, 0x1400, 2);
        testDecode(str, 2, 0xB9DDE, 3);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 1);
    }
    });
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const( char)[],  string,
                          wchar[], const(wchar)[], wstring,
                          dchar[], const(dchar)[], dstring))
    {
        static assert(isSafe!({ S str; size_t i = 0; decode(str, i); }));
        static assert(isSafe!({ S str; size_t i = 0; decodeFront(str, i); }));
        static assert(isSafe!({ S str; decodeFront(str); }));
        static assert((functionAttributes!({ S str; size_t i = 0; decode(str, i); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeFront(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeFront(str); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeBack(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeBack(str); }) & FunctionAttribute.pure_) != 0);
    }
    });
}

@safe unittest
{
    import std.exception;
    char[4] val;
    val[0] = 0b1111_0111;
    val[1] = 0b1011_1111;
    val[2] = 0b1011_1111;
    val[3] = 0b1011_1111;
    size_t i = 0;
    assertThrown!UTFException((){ dchar ch = decode(val[], i); }());
}
/* =================== Encode ======================= */

// Shared failure path of the encode functions: returns replacementDchar when
// replacement is requested, otherwise throws a UTFException that carries the
// offending code point c.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (useReplacementDchar)
        return replacementDchar;
    else
        throw new UTFException(msg).setSequence(c);
}

/++
    Encodes $(D c) into the static array, $(D buf), and returns the actual
    length of the encoded character (a number between $(D 1) and $(D 4) for
    $(D char[4]) buffers and a number between $(D 1) and $(D 2) for
    $(D wchar[2]) buffers).

    Throws:
        $(D UTFException) if $(D c) is not a valid UTF code point.
+/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit: ASCII.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }
    // Two code units: 110xxxxx 10xxxxxx.
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }
    // Three code units: 1110xxxx 10xxxxxx 10xxxxxx.
    if (c <= 0xFFFF)
    {
        // Surrogates are not code points: throw, or continue with the
        // replacement character when the caller asked for it.
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    L3:
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }
    // Four code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    if (c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }

    // Beyond U+10FFFF: throw, or encode the replacement character through
    // the three-code-unit path above.
    assert(!isValidDchar(c));
    c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    goto L3;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
        char[4] buf;

        assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
        assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
        assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
        assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF");
        assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800");
        assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF");
        assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
        assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
        assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF");
        assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000");
        assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF");

        assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

        assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
        assert(buf.front == replacementDchar);
    });
}


/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // One code unit: anything in the BMP that is not a surrogate.
    if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    L1:
        buf[0] = cast(wchar) c;
        return 1;
    }
    // Two code units: a high/low surrogate pair carrying 10 bits each of
    // the value offset by 0x10000.
    if (c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return 2;
    }

    // Beyond U+10FFFF: same sanity check as the UTF-8 overload, then throw
    // or store the replacement character as a single code unit.
    assert(!isValidDchar(c));
    c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    goto L1;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
        wchar[2] buf;

        assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
        assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
        assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
        assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE);
        assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF);
        assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
        assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");

        assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

        assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
        assert(buf.front == replacementDchar);
    });
}


/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    // UTF-32 stores the code point as is; only surrogates and values above
    // U+10FFFF have to be rejected (or replaced).
    if ((0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    else
        assert(isValidDchar(c));
    buf[0] = c;
    return 1;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
        dchar[1] buf;

        encode(buf, '\u0000'); assert(buf[0] == '\u0000');
        encode(buf, '\uD7FF'); assert(buf[0] == '\uD7FF');
        encode(buf, '\uE000'); assert(buf[0] == '\uE000');
        encode(buf, 0xFFFE ); assert(buf[0] == 0xFFFE);
        encode(buf, 0xFFFF ); assert(buf[0] == 0xFFFF);
        encode(buf, '\U0010FFFF'); assert(buf[0] == '\U0010FFFF');

        assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

        assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
        assert(buf.front == replacementDchar);
    });
}


/++
    Encodes $(D c) in $(D str)'s encoding and appends it to $(D str).

    Throws:
        $(D UTFException) if $(D c) is not a valid UTF code point.
+/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref char[] str, dchar c) @safe pure
{
    // ASCII needs no scratch buffer: append the single code unit and leave.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    char[4] units;
    uint unitCount;

    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        unitCount = 2;
    }
    else if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    threeUnits:
        units[0] = cast(char)(0xE0 | (c >> 12));
        units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[2] = cast(char)(0x80 | (c & 0x3F));
        unitCount = 3;
    }
    else if (c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        units[0] = cast(char)(0xF0 | (c >> 18));
        units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[3] = cast(char)(0x80 | (c & 0x3F));
        unitCount = 4;
    }
    else
    {
        // Out of range: throw, or fall into the three-unit path with the
        // replacement character.
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        goto threeUnits;
    }

    str ~= units[0 .. unitCount];
}

@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
        char[] s = "abcd".dup;
        encode(s, cast(dchar)'a');
        assert(s.length == 5);
        assert(s == "abcda");

        encode(s, cast(dchar)'\u00A9');
        assert(s.length == 7);
        assert(s == "abcda\xC2\xA9");
        //assert(s == "abcda\u00A9"); // BUG: fix compiler

        encode(s, cast(dchar)'\u2260');
        assert(s.length == 10);
        assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
    });
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
        char[] buf;

        encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
        encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
        encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
        encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
        encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
        encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
        encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
        encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
        encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
        encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
        encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");

        assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

        assert(buf.back != replacementDchar);
        encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
        assert(buf.back == replacementDchar);
    });
}

/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref wchar[] str, dchar c) @safe pure
{
    if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    oneUnit:
        str ~= cast(wchar) c;
    }
    else if (c <= 0x10FFFF)
    {
        assert(isValidDchar(c));

        // Surrogate pair: 10 bits each of the value offset by 0x10000.
        immutable offset = c - 0x10000;
        wchar[2] pair;
        pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        str ~= pair;
    }
    else
    {
        // Out of range: throw, or append the replacement character as a
        // single code unit.
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
        goto oneUnit;
    }
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
        wchar[] buf;

        encode(buf, '\u0000'); assert(buf[0] == '\u0000');
        encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
        encode(buf, '\uE000'); assert(buf[2] == '\uE000');
        encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
        encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
        encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
        encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");

        assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

        assert(buf.back != replacementDchar);
        encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
        assert(buf.back == replacementDchar);
    });
}

/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref dchar[] str, dchar c) @safe pure
{
    immutable bool rejected = (0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c;

    if (!rejected)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    str ~= c;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
        dchar[] buf;

        encode(buf, '\u0000'); assert(buf[0] == '\u0000');
        encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
        encode(buf, '\uE000'); assert(buf[2] == '\uE000');
        encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
        encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
        encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');

        assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
        assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
        assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

        assert(buf.back != replacementDchar);
        encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
        assert(buf.back == replacementDchar);
    });
}


/++
    Returns the number of code units that are required to encode the code point
    $(D c) when $(D C) is the character type used to encode it.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // UTF-32: always a single code unit.
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: a surrogate pair is needed above the BMP.
        return c > 0xFFFF ? 2 : 1;
    }
    else
    {
        static assert(C.sizeof == 1);
        if (c <= 0x7F) return 1;
        if (c <= 0x7FF) return 2;
        if (c <= 0xFFFF) return 3;
        if (c <= 0x10FFFF) return 4;
        assert(false);
    }
}

///
@safe pure nothrow @nogc unittest
{
    assert(codeLength!char('a') == 1);
    assert(codeLength!wchar('a') == 1);
    assert(codeLength!dchar('a') == 1);

    assert(codeLength!char('\U0010FFFF') == 4);
    assert(codeLength!wchar('\U0010FFFF') == 2);
    assert(codeLength!dchar('\U0010FFFF') == 1);
}


/++
    Returns the number of code units that are required to encode $(D input)
    in a string whose character type is $(D C). This is particularly useful
    when slicing one string with the length of another and the two string
    types use different character types.
Params:
        C = the character type to get the encoding length for
        input = the input range to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isInputRange!InputRange && !isInfinite!InputRange && is(ElementType!InputRange : dchar))
{
    alias EncType = Unqual!(ElementEncodingType!InputRange);

    // A string that is already encoded as C can answer from its length;
    // everything else is summed code point by code point.
    static if (!(isSomeString!InputRange && is(EncType == C) && is(typeof(input.length))))
    {
        size_t units = 0;

        foreach (dchar c; input)
            units += codeLength!C(c);

        return units;
    }
    else
        return input.length;
}

///
@safe unittest
{
    import std.conv : to;
    assert(codeLength!char("hello world") ==
           to!string("hello world").length);
    assert(codeLength!wchar("hello world") ==
           to!wstring("hello world").length);
    assert(codeLength!dchar("hello world") ==
           to!dstring("hello world").length);

    assert(codeLength!char(`プログラミング`) ==
           to!string(`プログラミング`).length);
    assert(codeLength!wchar(`プログラミング`) ==
           to!wstring(`プログラミング`).length);
    assert(codeLength!dchar(`プログラミング`) ==
           to!dstring(`プログラミング`).length);

    string haystack = `Être sans la verité, ça, ce ne serait pas bien.`;
    wstring needle = `Être sans la verité`;
    assert(haystack[codeLength!char(needle) .. $] ==
           `, ça, ce ne serait pas bien.`);
}

@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
        foreach (S; AliasSeq!( char[], const char[], string,
                              wchar[], const wchar[], wstring,
                              dchar[], const dchar[], dstring))
        {
            foreach (C; AliasSeq!(char, wchar, dchar))
            {
                assert(codeLength!C(to!S("Walter Bright")) == to!(C[])("Walter Bright").length);
                assert(codeLength!C(to!S(`言語`)) == to!(C[])(`言語`).length);
                assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) ==
                       to!(C[])(`ウェブサイト@La_Verité.com`).length);
                assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) ==
                       to!(C[])(`ウェブサイト@La_Verité.com`).length);
            }
        }
    });
}

/+
Internal helper function:

Returns true if it is safe to search for the code point $(D c) inside
code units, without decoding.

This is a runtime check that is used as an optimization in various functions,
particularly in $(D std.string).
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
        return true;
    else static if (C.sizeof == 2)
        // Every BMP value except the surrogate range D800..DFFF.
        return c <= 0xD7FF || (0xE000 <= c && c <= 0xFFFF);
    else static if (C.sizeof == 1)
        return c <= 0x7F;
    else
        static assert(0);
}
@safe unittest
{
    assert( canSearchInCodeUnits! char('a'));
    assert( canSearchInCodeUnits!wchar('a'));
    assert( canSearchInCodeUnits!dchar('a'));
    assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF
    assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
    assert( canSearchInCodeUnits!wchar('ö'));
    assert( canSearchInCodeUnits!dchar('ö'));
    assert(!canSearchInCodeUnits! char('日'));
    assert( canSearchInCodeUnits!wchar('日'));
    assert( canSearchInCodeUnits!dchar('日'));
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
    assert(!canSearchInCodeUnits! char('\U00010001'));
    assert(!canSearchInCodeUnits!wchar('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}

/* =================== Validation ======================= */

/++
    Checks to see if $(D str) is well-formed unicode or not.

    Throws:
        $(D UTFException) if $(D str) is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    // decode advances the index past each sequence it reads, so the loop
    // itself has no increment.
    size_t idx = 0;
    immutable total = str.length;
    while (idx < total)
        decode(str, idx);
}


@safe unittest // bugzilla 12923
{
    import std.exception;
    assertThrown((){
        char[3]a=[167, 133, 175];
        validate(a[]);
    }());
}

/**
 * Encodes the elements of `s` to UTF-8 and returns a newly allocated
 * string of the elements.
 *
 * Params:
 *     s = the string to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
*/
string toUTF8(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    return toUTFImpl!string(s);
}

///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø is represented by two UTF-8 code units
    assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // U+10437 is four code units in UTF-8
    assert("\U00010437"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}

@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    auto r1 = new ReferenceInputRange!dchar("Hellø");
    auto r2 = new ReferenceInputRange!dchar("\U00010437");

    assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
    assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}

/**
 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
 * `wstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    return toUTFImpl!wstring(s);
}

///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // U+24B62 and U+10437 are two code units in UTF-16 and one in UTF-32
    assert("\U00024B62"d.length == 1);
    assert("\U00010437"d.length == 1);

    assert("\U00024B62"d.toUTF16.equal([0xD852, 0xDF62]));
    assert("\U00010437"d.toUTF16.equal([0xD801, 0xDC37]));
}

@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    auto r1 = new ReferenceInputRange!dchar("\U00024B62");
    auto r2 = new ReferenceInputRange!dchar("\U00010437");

    assert(r1.toUTF16.equal([0xD852, 0xDF62]));
    assert(r2.toUTF16.equal([0xD801, 0xDC37]));
}


/**
 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
 * `dstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    return toUTFImpl!dstring(s);
}

// Shared implementation of toUTF8, toUTF16 and toUTF32. When `S` already
// converts implicitly to the target string type `T` the input is duplicated
// with `idup`; otherwise it is transcoded code unit by code unit through
// `byUTF` into an appender.
private T toUTFImpl(T, S)(S s)
{
    static if (is(S : T))
    {
        return s.idup;
    }
    else
    {
        import std.array : appender;
        auto app = appender!T();

        // Reserve s.length code units up front when the length is known.
        static if (hasLength!S || isSomeString!S)
            app.reserve(s.length);

        foreach (c; s.byUTF!(Unqual!(ElementEncodingType!T)))
            app.put(c);

        return app.data;
    }
}

/* =================== toUTFz ======================= */

/++
    Returns a C-style zero-terminated string equivalent to $(D str).
$(D str)
    must not contain embedded $(D '\0')'s as any C function will treat the first
    $(D '\0') that it sees as the end of the string. If $(D str.empty) is
    $(D true), then a string containing only $(D '\0') is returned.

    $(D toUTFz) accepts any type of string and is templated on the type of
    character pointer that you wish to convert to. It will avoid allocating a
    new string if it can, but there's a decent chance that it will end up having
    to allocate a new string - particularly when dealing with character types
    other than $(D char).

    $(RED Warning 1:) If the result of $(D toUTFz) equals $(D str.ptr), then if
    anything alters the character one past the end of $(D str) (which is the
    $(D '\0') character terminating the string), then the string won't be
    zero-terminated anymore. The most likely scenarios for that are if you
    append to $(D str) and no reallocation takes place or when $(D str) is a
    slice of a larger array, and you alter the character in the larger array
    which is one character past the end of $(D str). Another case where it could
    occur would be if you had a mutable character array immediately after
    $(D str) in memory (for example, if they're member variables in a
    user-defined type with one declared right after the other) and that
    character array happened to start with $(D '\0'). Such scenarios will never
    occur if you immediately use the zero-terminated string after calling
    $(D toUTFz) and the C function using it doesn't keep a reference to it.
    Also, they are unlikely to occur even if you save the zero-terminated string
    (the cases above would be among the few examples of where it could happen).
    However, if you save the zero-terminated string and want to be absolutely
    certain that the string stays zero-terminated, then simply append a
    $(D '\0') to the string and use its $(D ptr) property rather than calling
    $(D toUTFz).

    $(RED Warning 2:) When passing a character pointer to a C function, and the
    C function keeps it around for any reason, make sure that you keep a
    reference to it in your D code. Otherwise, it may go away during a garbage
    collection cycle and cause a nasty bug when the C code tries to use it.
  +/
template toUTFz(P)
{
    P toUTFz(S)(S str) @safe pure
    {
        return toUTFzImpl!(P, S)(str);
    }
}

///
@safe pure unittest
{
    auto p1 = toUTFz!(char*)("hello world");
    auto p2 = toUTFz!(const(char)*)("hello world");
    auto p3 = toUTFz!(immutable(char)*)("hello world");
    auto p4 = toUTFz!(char*)("hello world"d);
    auto p5 = toUTFz!(const(wchar)*)("hello world");
    auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
}

private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
    is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    // An empty input yields a fresh array holding only the terminator.
    if (str.empty)
    {
        typeof(*P.init)[] terminatorOnly = ['\0'];

        return () @trusted { return terminatorOnly.ptr; } ();
    }

    alias C = Unqual!(ElementEncodingType!S);

    //A mutable target may not alias immutable data, so a copy is required.
    enum bool mutableTarget = is(Unqual!(typeof(*P.init)) == typeof(*P.init));

    static if (mutableTarget)
    {
        return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
    }
    else
    {
        if (!__ctfe)
        {
            auto pastEnd(S s) @trusted { return s.ptr + s.length; }
            immutable p = pastEnd(str);

            // Peek past end of str, if it's 0, no conversion necessary.
            // Note that the compiler will put a 0 past the end of static
            // strings, and the storage allocator will put a 0 past the end
            // of newly allocated char[]'s.
            // Is p dereferenceable? A simple test: if the p points to an
            // address multiple of 4, then conservatively assume the pointer
            // might be pointing to a new block of memory, which might be
            // unreadable. Otherwise, it's definitely pointing to valid
            // memory.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
    }
}

private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
    !is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias InChar = ElementEncodingType!S;
    alias OutChar = typeof(*P.init);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if (( is(const(Unqual!InChar) == InChar) && is(const(Unqual!OutChar) == OutChar)) ||
               (!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar)))
    {
        if (!__ctfe)
        {
            auto pastEnd(S s) @trusted { return s.ptr + s.length; }
            auto p = pastEnd(str);

            // Same peek-past-the-end test as the immutable overload.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        // No usable terminator behind the slice: append one.
        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        import std.array : uninitializedArray;

        // The qualifiers are incompatible, so build a terminated copy.
        auto buffer = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
        buffer[0 .. $ - 1] = str[];
        buffer[$ - 1] = '\0';

        return () @trusted { return cast(P) buffer.ptr; } ();
    }
}

private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    !is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;

    // Different character widths: transcode code point by code point, then
    // terminate.
    auto sink = appender!(typeof(*P.init)[])();

    foreach (dchar c; str)
        sink.put(c);
    sink.put('\0');

    return () @trusted { return cast(P) sink.data.ptr; } ();
}

@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    assertCTFEable!(
    {
        foreach (S; AliasSeq!(string, wstring, dstring))
        {
            alias C = Unqual!(ElementEncodingType!S);

            auto s1 = to!S("hello\U00010143\u0100\U00010143");
            auto temp = new C[](s1.length + 1);
            temp[0 .. $ - 1] = s1[0 .. $];
            temp[$ - 1] = '\n';
            --temp.length;
            auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
            auto s2 = trustedAssumeUnique(temp);
            assert(s1 == s2);

            void trustedCStringAssert(P, S)(S s) @trusted
            {
                auto p = toUTFz!P(s);
                assert(p[0 .. s.length] == s);
                assert(p[s.length] == '\0');
            }

            foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
            {
                trustedCStringAssert!P(s1);
                trustedCStringAssert!P(s2);
            }
        }
    });

    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t len = 0;
            while (*ptr != '\0') { ++ptr; ++len; }
            return len;
        }

        auto p = toUTFz!P(s);
        immutable len = zeroLen(p);
        enforce(cmp(s, p[0 .. len]) == 0,
                new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                __FILE__, line));
    }

    assertCTFEable!(
    {
        foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P("hello\U00010143\u0100\U00010143");
        }
        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P("hello\U00010143\u0100\U00010143"w);
        }
        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              wchar*, const(wchar)*, immutable(wchar)*))
        {
            test!P("hello\U00010143\u0100\U00010143"d);
        }
        foreach (S; AliasSeq!( char[], const( char)[],
                              wchar[], const(wchar)[],
                              dchar[], const(dchar)[]))
        {
            auto s = to!S("hello\U00010143\u0100\U00010143");

            foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                                  wchar*, const(wchar)*, immutable(wchar)*,
                                  dchar*, const(dchar)*, immutable(dchar)*))
            {
                test!P(s);
            }
        }
    });
}


/++
    $(D toUTF16z) is a convenience function for $(D toUTFz!(const(wchar)*)).

    Encodes string $(D s) into UTF-16 and returns the encoded string.
    $(D toUTF16z) is suitable for calling the 'W' functions in the Win32 API
    that take an $(D LPWSTR) or $(D LPCWSTR) argument.
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    return toUTFz!(const(wchar)*)(str);
}

@safe pure unittest
{
    import std.conv : to;
    //toUTFz is already thoroughly tested, so this will just verify that
    //toUTF16z compiles properly for the various string types.
3088 foreach (S; AliasSeq!(string, wstring, dstring)) 3089 assert(toUTF16z(to!S("hello world")) !is null); 3090 } 3091 3092 3093 /* ================================ tests ================================== */ 3094 3095 @safe pure unittest 3096 { 3097 import std.exception; 3098 3099 assertCTFEable!( 3100 { 3101 assert(toUTF16("hello"c) == "hello"); 3102 assert(toUTF32("hello"c) == "hello"); 3103 assert(toUTF8 ("hello"w) == "hello"); 3104 assert(toUTF32("hello"w) == "hello"); 3105 assert(toUTF8 ("hello"d) == "hello"); 3106 assert(toUTF16("hello"d) == "hello"); 3107 3108 assert(toUTF16("hel\u1234o"c) == "hel\u1234o"); 3109 assert(toUTF32("hel\u1234o"c) == "hel\u1234o"); 3110 assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o"); 3111 assert(toUTF32("hel\u1234o"w) == "hel\u1234o"); 3112 assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o"); 3113 assert(toUTF16("hel\u1234o"d) == "hel\u1234o"); 3114 3115 assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo"); 3116 assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo"); 3117 assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo"); 3118 assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo"); 3119 assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo"); 3120 assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo"); 3121 }); 3122 } 3123 3124 3125 /++ 3126 Returns the total number of code points encoded in $(D str). 3127 3128 Supercedes: This function supercedes $(LREF toUCSindex). 3129 3130 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 3131 3132 Throws: 3133 $(D UTFException) if $(D str) is not well-formed. 
+/
size_t count(C)(const(C)[] str) @trusted pure nothrow @nogc
if (isSomeChar!C)
{
    // Delegates to walkLength on the character array.
    return walkLength(str);
}

@safe pure nothrow @nogc unittest
{
    import std.exception;
    assertCTFEable!(
    {
    assert(count("") == 0);
    assert(count("a") == 1);
    assert(count("abc") == 3);
    // U+20AC counts once although it occupies several UTF-8 code units
    assert(count("\u20AC100") == 4);
    });
}


// Ranges of code units for testing.
// All of them index and slice the underlying array directly, so `front`,
// `back` and `opIndex` yield individual code units of type C.
version (unittest)
{
    // Input range only: empty / front / popFront.
    struct InputCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        // Accepts any qualifier on the input and stores it as a mutable C[].
        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Value-type range adding back / popBack / save / length.
    struct BidirCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return BidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Value-type range adding opIndex and opSlice on top of BidirCU's API.
    struct RandomCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return RandomCU(_str); }
        @property size_t length() { return _str.length; }
        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class (reference-type) counterpart of BidirCU; `save` allocates a new
    // object over the same slice.
    class RefBidirCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return new RefBidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class (reference-type) counterpart of RandomCU; `save` and `opSlice`
    // allocate new objects.
    class RefRandomCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return new RefRandomCU(_str); }
        @property size_t length() { return _str.length; }
        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }
}


/**
 * Inserted in place of invalid UTF sequences.
 *
 * References:
 *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
 */
enum dchar replacementDchar = '\uFFFD';

/********************************************
 * Iterate a range of char, wchar, or dchars by code unit.
 *
 * The purpose is to bypass the special case decoding that
 * $(REF front, std,range,primitives) does to character arrays. As a result,
 * using ranges with `byCodeUnit` can be `nothrow` while
 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
 * sequences.
*
 * A code unit is a building block of the UTF encodings. Generally, an
 * individual code unit does not represent what's perceived as a full
 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
 * are encoded with multiple code units. For example, the UTF-8 code units for
 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
 * often does not form a character on its own. Attempting to treat it as
 * one while iterating over the resulting range will give nonsensical results.
 *
 * Params:
 *      r = an input range of characters (including strings) or a type that
 *          implicitly converts to a string type.
 * Returns:
 *      If `r` is not an auto-decodable string (where an auto-decodable string
 *      is a narrow string or a user-defined type that implicitly converts to a
 *      string type), then `r` is returned.
 *
 *      Otherwise, `r` is converted to its corresponding string type (if it's
 *      not already a string) and wrapped in a random-access range where the
 *      element encoding type of the string (its code unit) is the element type
 *      of the range, and that range is returned. The range has slicing.
 *
 *      If `r` is quirky enough to be a struct or class which is an input range
 *      of characters on its own (i.e. it has the input range API as member
 *      functions), $(I and) it's implicitly convertible to a string type, then
 *      `r` is returned, and no implicit conversion takes place.
 * See_Also:
 *      Refer to the $(MREF std, uni) docs for a reference on Unicode
 *      terminology.
 *
 *      For a range that iterates by grapheme cluster (written character) see
 *      $(REF byGrapheme, std,uni).
*/
auto byCodeUnit(R)(R r)
if (isAutodecodableString!R ||
    isInputRange!R && isSomeChar!(ElementEncodingType!R) ||
    (is(R : const dchar[]) && !isStaticArray!R))
{
    // True when R itself declares any of the input-range primitives as
    // members. Such a type is treated as a range in its own right and is not
    // replaced by the string it may implicitly convert to.
    // This would be cleaner if we had a way to check whether a type
    // was a range without any implicit conversions.
    enum bool hasRangeMembers = __traits(hasMember, R, "empty") ||
                                __traits(hasMember, R, "front") ||
                                __traits(hasMember, R, "popFront");

    static if (isNarrowString!R || (isAutodecodableString!R && !hasRangeMembers))
    {
        // Random-access wrapper that exposes the code units of the string
        // directly, without any decoding.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            @property bool empty() const     { return str.length == 0; }
            @property auto ref front() inout { return str[0]; }
            void popFront()                  { str = str[1 .. $]; }

            @property auto save() { return ByCodeUnitImpl(str.save); }

            @property auto ref back() inout { return str[$ - 1]; }
            void popBack()                  { str = str[0 .. $-1]; }

            auto ref opIndex(size_t index) inout     { return str[index]; }
            auto opSlice(size_t lower, size_t upper) { return ByCodeUnitImpl(str[lower .. upper]); }

            @property size_t length() const { return str.length; }
            alias opDollar = length;

          private:
            StringTypeOf!R str;
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (is(R : const dchar[]) && !hasRangeMembers)
    {
        // Not a range by itself, but convertible to a dchar array:
        // hand back the underlying string type.
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}

///
@safe unittest
{
    import std.range.primitives;

    auto r = "Hello, World!".byCodeUnit();
    static assert(hasLength!(typeof(r)));
    static assert(hasSlicing!(typeof(r)));
    static assert(isRandomAccessRange!(typeof(r)));
    static assert(is(ElementType!(typeof(r)) == immutable char));

    // contrast with the range capabilities of standard strings
    auto s = "Hello, World!";
    static assert(isBidirectionalRange!(typeof(s)));
    static assert(is(ElementType!(typeof(s)) == dchar));

    static assert(!isRandomAccessRange!(typeof(s)));
    static assert(!hasSlicing!(typeof(s)));
    static assert(!hasLength!(typeof(s)));
}

/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    string noel1 = "noe\u0308l"; // noël using e + combining diaeresis
    assert(noel1.byCodeUnit[2] != 'ë');
    assert(noel1.byCodeUnit[2] == 'e');

    string noel2 = "no\u00EBl"; // noël using a precomposed ë character
    // Because string is UTF-8, the code unit at index 2 is just
    // the first of a sequence that encodes 'ë'
    assert(noel2.byCodeUnit[2] != 'ë');
}

@safe pure nothrow @nogc unittest
{
    import std.range;
    {
        enum testStr = " hello ディラン";
        char[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s ==
testStr);
    }
    // Same round trip for a wstring: applying byCodeUnit twice still yields
    // every code unit unchanged.
    {
        enum testStr = " hello ディラン"w;
        wchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // ... and for a dstring.
    {
        enum testStr = " hello ディラン"d;
        dchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // length, indexing and slicing
    {
        auto bcu = "hello".byCodeUnit();
        assert(bcu.length == 5);
        assert(bcu[3] == 'l');
        assert(bcu[2 .. 4][1] == 'l');
    }
    // front and opIndex are writable for a mutable char[]
    {
        char[5] orig = "hello";
        auto bcu = orig[].byCodeUnit();
        bcu.front = 'H';
        assert(bcu.front == 'H');
        bcu[1] = 'E';
        assert(bcu[1] == 'E');
    }
    // save yields an independent copy
    {
        auto bcu = "hello".byCodeUnit().byCodeUnit();
        static assert(isForwardRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct));
        auto s = bcu.save;
        bcu.popFront();
        assert(s.front == 'h');
    }
    // bidirectional iteration over a string; byCodeUnit is idempotent on its
    // own result type
    {
        auto bcu = "hello".byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'o');
        ret.popFront();
        assert(ret.front == 'l');
    }
    // the same for a wstring
    {
        auto bcu = "κόσμε"w.byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'ε');
        ret.popFront();
        assert(ret.front == 'μ');
    }
    // A struct that is not a range but has `alias this` to a string is
    // wrapped: the result is a struct distinct from the original type.
    {
        static struct Stringish
        {
            string s;
            alias s this;
        }

        auto orig = Stringish("\U0010fff8 foo ");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == Stringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == cast(char) 244); // 0xF4, first UTF-8 code unit of U+10FFF8
    }
    // `alias this` to a wstring
    {
        static struct WStringish
        {
            wstring s;
            alias s this;
        }

        auto orig = WStringish("\U0010fff8 foo "w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == WStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        assert(bcu.front == cast(wchar) 56319); // 0xDBFF, high surrogate of U+10FFF8
    }
    // `alias this` to a dstring: the result is the plain dstring
    {
        static struct DStringish
        {
            dstring s;
            alias s this;
        }

        auto orig = DStringish("\U0010fff8 foo "d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        assert(bcu.front == cast(dchar) 1114104); // 0x10FFF8
    }
    // `alias this` to a function returning a string
    {
        static struct FuncStringish
        {
            string str;
            string s() pure nothrow @nogc { return str; }
            alias s this;
        }

        auto orig = FuncStringish("\U0010fff8 foo ");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == FuncStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == cast(char) 244);
    }
    // A user-defined input range of char is returned as-is.
    {
        static struct Range
        {
            string data;
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = Range("\U0010fff8 foo ");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == Range));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == cast(char) 244);
    }
    // ... of wchar
    {
        static struct WRange
        {
            wstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = WRange("\U0010fff8 foo "w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 56319);
    }
    // ... of dchar
    {
        static struct DRange
        {
            dstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = DRange("\U0010fff8 foo "d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 1114104);
    }
    // A type that is both a range (member primitives) and convertible to a
    // string is returned as-is: the range API wins (front comes from `data`,
    // not from the aliased `s`).
    {
        static struct RangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            string data;
            string s;
            alias s this;
        }

        auto orig = RangeAndStringish("test.d", "other");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == RangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == 't');
    }
    {
        static struct WRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            wstring data;
            wstring s;
            alias s this;
        }

        auto orig = WRangeAndStringish("test.d"w, "other"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 't');
    }
    {
        static struct DRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            dstring data;
            dstring s;
            alias s this;
        }

        auto orig = DRangeAndStringish("test.d"d, "other"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 't');
    }
    // Enums with a string base type behave like the base string type.
    {
        enum Enum : string { a = "test.d" }

        auto orig = Enum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == Enum));
        static assert(is(typeof(bcu) == struct));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == 't');
    }
    {
        enum WEnum : wstring { a = "test.d"w }

        auto orig = WEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == WEnum));
        static assert(is(typeof(bcu) == struct));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        assert(bcu.front == 't');
    }
    {
        enum DEnum : dstring { a = "test.d"d }

        auto orig = DEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        assert(bcu.front == 't');
    }

    // narrow strings are wrapped, dstrings are passed through
    static assert(!is(typeof(byCodeUnit("hello")) == string));
    static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
    static assert(is(typeof(byCodeUnit("hello"d)) == dstring));

    // static arrays are rejected
    static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));

    enum SEnum : char[5] { a = "hello" }
    enum WSEnum : wchar[5] { a = "hello"w }
    enum DSEnum : dchar[5] { a = "hello"d }

    // ... as are enums whose base type is a static array
    static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
    static
assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
}

/****************************
 * Iterate an input range of characters by char, wchar, or dchar.
 * These aliases simply forward to $(LREF byUTF) with the
 * corresponding C argument.
 *
 * Params:
 *      r = input range of characters, or array of characters
 */
alias byChar = byUTF!char;

/// Ditto
alias byWchar = byUTF!wchar;

/// Ditto
alias byDchar = byUTF!dchar;

// byChar: transcoding to UTF-8 code units
@safe pure nothrow @nogc unittest
{
    // char -> char is the identity, also when applied twice
    {
        char[5] s;
        int i;
        foreach (c; "hello".byChar.byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello");
    }
    // dchar -> char: 1-, 2-, 3- and 4-unit sequences, plus two invalid code
    // points which each come out as U+FFFD
    {
        char[5+2+3+4+3+3] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
    }
    {
        auto r = "hello"w.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byChar();
        // a compile-time property, so check it at compile time
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}

// byWchar: transcoding to UTF-16 code units
@safe pure nothrow @nogc unittest
{
    // dchar -> wchar, including a surrogate pair and two invalid code points
    // which each come out as U+FFFD
    {
        wchar[11] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byWchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
    }

    {
        auto r = "hello".byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byWchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}

// byDchar: decoding to code points
@safe pure nothrow @nogc unittest
{
    {
        dchar[9] s;
        int i;
        string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
    }
    // every invalid UTF-8 sample decodes to the replacement character, and
    // front can be read repeatedly with the same result
    {
        foreach (s; invalidUTFstrings!char())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        auto r = "hello".byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    {
        dchar[8] s;
        int i;
        wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
    }
    // the same for invalid UTF-16 samples
    {
        foreach (s; invalidUTFstrings!wchar())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        wchar[2] ws;
        ws[0] = 0xD800;
        ws[1] = 0xDD00;             // correct surrogate pair
        auto r = ws[].byDchar();
        assert(!r.empty);
        assert(r.front == r.front);
        dchar c = r.front;
        assert(c == '\U00010100');
    }
    {
        auto r = "hello"w.byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    {
        dchar[5] s;
        int i;
        dstring a = "hello"d;
        foreach (c; a.byDchar.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello"d);
    }
    {
        auto r = "hello".byDchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
    {
        auto r = "hello"w.byDchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}

// test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
// which needs to support ranges with and without those attributes

pure @safe nothrow @nogc unittest
{
    dchar[5] s = "hello"d;
    foreach (c; s[].byChar())  { }
    foreach (c; s[].byWchar()) { }
    foreach (c; s[].byDchar()) { }
}

version (unittest)
int impureVariable;

@system unittest
{
    // A range whose popFront is impure, throwing and @system. It is always
    // empty, so this is purely a compilation test of attribute inference.
    static struct ImpureThrowingSystemRange(Char)
    {
        @property bool empty() const { return true; }
        @property Char front() const { return Char.init; }
        void popFront()
        {
            impureVariable++;
            throw new Exception("only for testing nothrow");
        }
    }

    foreach (Char; AliasSeq!(char, wchar, dchar))
    {
        ImpureThrowingSystemRange!Char range;
        foreach (c; range.byChar())  { }
        foreach (c; range.byWchar()) { }
        foreach (c; range.byDchar()) { }
    }
}

/****************************
 * Iterate an input range of characters by char type `C` by
 * encoding the elements of the range.
*
 * UTF sequences that cannot be converted to the specified encoding are
 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
 * of the Unicode Standard 6.2. Hence byUTF is not symmetric.
 * This algorithm is lazy, and does not allocate memory.
 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
 * `r` parameter.
 *
 * Params:
 *      C = `char`, `wchar`, or `dchar`
 *      r = an input range of characters, or an array of characters
 *
 * Returns:
 *      A forward range if `R` is a range and not auto-decodable, as defined by
 *      $(REF isAutodecodableString, std, traits), and if the base range is
 *      also a forward range.
 *
 *      Or, if `R` is a range and it is auto-decodable and
 *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
 *      to $(LREF byCodeUnit).
 *
 *      Otherwise, an input range of characters.
 */
template byUTF(C)
if (isSomeChar!C)
{
    // Qualified character types are handled by the unqualified instantiation.
    static if (!is(Unqual!C == C))
        alias byUTF = byUTF!(Unqual!C);
    else:

    // Auto-decodable strings are first reduced to a range of code units.
    auto ref byUTF(R)(R r)
        if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        return byUTF(r.byCodeUnit());
    }

    auto ref byUTF(R)(R r)
        if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        alias RC = Unqual!(ElementEncodingType!R);

        static if (is(RC == C))
        {
            // Source and target code unit types agree: nothing to transcode.
            return r.byCodeUnit();
        }
        else
        {
            // Lazily transcodes one code point at a time: `buf[0 .. fill]`
            // holds the code units of the current code point and `pos` is the
            // unit currently exposed through `front`.
            static struct Result
            {
                @property bool empty()
                {
                    return pos == fill && r.empty;
                }

                @property auto front() scope // 'scope' required by call to decodeFront() below
                {
                    if (pos == fill)
                    {
                        // Buffer exhausted: consume the next code point from `r`.
                        pos = 0;
                        auto c = r.front;

                        if (c <= 0x7F)
                        {
                            // ASCII is a single code unit in every encoding.
                            fill = 1;
                            r.popFront;
                            buf[pos] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                r.popFront;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeFront!(Yes.useReplacementDchar)(r); }();
                            fill = cast(ushort) encode!(Yes.useReplacementDchar)(buf, dc);
                        }
                    }
                    return buf[pos];
                }

                void popFront()
                {
                    // Make sure the buffer is populated before stepping.
                    if (pos == fill)
                        front;
                    ++pos;
                }

                static if (isForwardRange!R)
                {
                    @property auto save() return scope
                    /* `return scope` cannot be inferred because compiler does not
                     * track it backwards from assignment to local `ret`
                     */
                    {
                        auto ret = this;
                        ret.r = r.save;
                        return ret;
                    }
                }

            private:

                R r;
                C[4 / C.sizeof] buf = void;
                ushort pos, fill;
            }

            return Result(r);
        }
    }
}

///
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    // hellö as a range of `char`s, which are UTF-8
    assert("hell\u00F6".byUTF!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
    assert("hell\u00F6".byUTF!wchar().equal(['h', 'e', 'l', 'l', 'ö']));

    // U+10437 is four code units in UTF-8, two in UTF-16, and one in UTF-32
    assert("\U00010437".byUTF!char().equal([0xF0, 0x90, 0x90, 0xB7]));
    assert("\U00010437".byUTF!wchar().equal([0xD801, 0xDC37]));
    assert("\U00010437".byUTF!dchar().equal([0x00010437]));
}