1181254a7Smrg // Written in the D programming language. 2181254a7Smrg 3181254a7Smrg /++ 4181254a7Smrg Encode and decode UTF-8, UTF-16 and UTF-32 strings. 5181254a7Smrg 6181254a7Smrg UTF character support is restricted to 7181254a7Smrg $(D '\u0000' <= character <= '\U0010FFFF'). 8181254a7Smrg 9181254a7Smrg $(SCRIPT inhibitQuickIndex = 1;) 10*b1e83836Smrg $(DIVC quickindex, 11181254a7Smrg $(BOOKTABLE, 12181254a7Smrg $(TR $(TH Category) $(TH Functions)) 13181254a7Smrg $(TR $(TD Decode) $(TD 14181254a7Smrg $(LREF decode) 15181254a7Smrg $(LREF decodeFront) 16181254a7Smrg )) 17181254a7Smrg $(TR $(TD Lazy decode) $(TD 18181254a7Smrg $(LREF byCodeUnit) 19181254a7Smrg $(LREF byChar) 20181254a7Smrg $(LREF byWchar) 21181254a7Smrg $(LREF byDchar) 22181254a7Smrg $(LREF byUTF) 23181254a7Smrg )) 24181254a7Smrg $(TR $(TD Encode) $(TD 25181254a7Smrg $(LREF encode) 26181254a7Smrg $(LREF toUTF8) 27181254a7Smrg $(LREF toUTF16) 28181254a7Smrg $(LREF toUTF32) 29181254a7Smrg $(LREF toUTFz) 30181254a7Smrg $(LREF toUTF16z) 31181254a7Smrg )) 32181254a7Smrg $(TR $(TD Length) $(TD 33181254a7Smrg $(LREF codeLength) 34181254a7Smrg $(LREF count) 35181254a7Smrg $(LREF stride) 36181254a7Smrg $(LREF strideBack) 37181254a7Smrg )) 38181254a7Smrg $(TR $(TD Index) $(TD 39181254a7Smrg $(LREF toUCSindex) 40181254a7Smrg $(LREF toUTFindex) 41181254a7Smrg )) 42181254a7Smrg $(TR $(TD Validation) $(TD 43181254a7Smrg $(LREF isValidDchar) 44*b1e83836Smrg $(LREF isValidCodepoint) 45181254a7Smrg $(LREF validate) 46181254a7Smrg )) 47181254a7Smrg $(TR $(TD Miscellaneous) $(TD 48181254a7Smrg $(LREF replacementDchar) 49181254a7Smrg $(LREF UseReplacementDchar) 50181254a7Smrg $(LREF UTFException) 51181254a7Smrg )) 52*b1e83836Smrg )) 53181254a7Smrg See_Also: 54181254a7Smrg $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> 55181254a7Smrg $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> 56181254a7Smrg $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) 57*b1e83836Smrg Copyright: Copyright The D 
Language Foundation 2000 - 2012.
    License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
    Authors:   $(HTTP digitalmars.com, Walter Bright) and
               $(HTTP jmdavisprog.com, Jonathan M Davis)
    Source:    $(PHOBOSSRC std/utf.d)
   +/
module std.utf;

import std.exception : basicExceptionCtors;
import core.exception : UnicodeException;
import std.meta : AliasSeq;
import std.range;
import std.traits : isAutodecodableString, isConvertibleToString, isPointer,
    isSomeChar, isSomeString, isStaticArray, Unqual;
import std.typecons : Flag, Yes, No;


/++
    Exception thrown on errors in std.utf functions.

    An instance can additionally carry up to four offending code units
    (see `setSequence`); when it does, `toString` lists them in hexadecimal.
  +/
class UTFException : UnicodeException
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    uint[4] sequence; // recorded code units; only the first `len` entries are meaningful
    size_t len;       // number of entries of `sequence` that have been set

    /++
        Records the code units that make up the invalid sequence.

        Params:
            data = the offending code units (at most four are stored)

        Returns:
            `this`, so the call can be chained onto a `new UTFException(...)`.
      +/
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...) return
    {
        assert(data.length <= 4);

        // The assert states the contract; the clamp keeps the slice inside
        // `sequence` even in builds where asserts are compiled out.
        len = data.length;
        if (len > 4)
            len = 4;
        sequence[0 .. len] = data[0 .. len];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once
    // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed

    /**
       Standard exception constructors.
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        // No position is known for this form, so index 0 is forwarded.
        super(msg, 0, file, line, next);
    }
    /// ditto
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        // Append the position to the message before handing it to the base class.
        UnsignedStringBuf digits = void;
        msg ~= " (at index ";
        msg ~= unsignedToTempString(index, digits);
        msg ~= ")";
        super(msg, index, file, line, next);
    }

    /**
       Returns:
           A `string` detailing the invalid UTF sequence.
     */
    override string toString() const
    {
        if (len != 0)
        {
            string text = "Invalid UTF sequence:";

            foreach (idx; 0 .. len)
            {
                UnsignedStringBuf scratch = void;
                auto hex = unsignedToTempString!16(sequence[idx], scratch);

                text ~= ' ';
                if (hex.length == 1)
                    text ~= '0'; // pad single-digit values to two digits
                text ~= hex;
                text ~= 'x';
            }

            if (super.msg.length > 0)
            {
                text ~= " - ";
                text ~= super.msg;
            }

            return text;
        }

        /* No sequence recorded: fall back to the inherited representation.
         * Exception.toString() is not marked as const, although it is
         * const-compatible, hence the cast instead of `super.toString()`.
         */
        auto base = () @trusted { return cast(Exception) super; } ();
        return base.toString();
    }
}

///
@safe unittest
{
    import std.exception : assertThrown;

    char[4] buf;
    // Surrogate halves and values beyond U+10FFFF cannot be encoded.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                         cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                         cast(dchar) 0x110000])
    {
        assertThrown!UTFException(encode(buf, bad));
    }
}

/*
   Provide array of invalidly encoded UTF strings. Useful for testing.
169181254a7Smrg 170181254a7Smrg Params: 171181254a7Smrg Char = char, wchar, or dchar 172181254a7Smrg 173181254a7Smrg Returns: 174181254a7Smrg an array of invalidly encoded UTF strings 175181254a7Smrg */ 176181254a7Smrg 177181254a7Smrg package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow 178181254a7Smrg if (isSomeChar!Char) 179181254a7Smrg { 180181254a7Smrg static if (is(Char == char)) 181181254a7Smrg { 182181254a7Smrg enum x = 0xDC00; // invalid surrogate value 183181254a7Smrg enum y = 0x110000; // out of range 184181254a7Smrg 185181254a7Smrg static immutable string[8] result = 186181254a7Smrg [ 187181254a7Smrg "\x80", // not a start byte 188181254a7Smrg "\xC0", // truncated 189181254a7Smrg "\xC0\xC0", // invalid continuation 190181254a7Smrg "\xF0\x82\x82\xAC", // overlong 191181254a7Smrg [ 192181254a7Smrg 0xE0 | (x >> 12), 193181254a7Smrg 0x80 | ((x >> 6) & 0x3F), 194181254a7Smrg 0x80 | (x & 0x3F) 195181254a7Smrg ], 196181254a7Smrg [ 197181254a7Smrg cast(char)(0xF0 | (y >> 18)), 198181254a7Smrg cast(char)(0x80 | ((y >> 12) & 0x3F)), 199181254a7Smrg cast(char)(0x80 | ((y >> 6) & 0x3F)), 200181254a7Smrg cast(char)(0x80 | (y & 0x3F)) 201181254a7Smrg ], 202181254a7Smrg [ 203181254a7Smrg cast(char)(0xF8 | 3), // 5 byte encoding 204181254a7Smrg cast(char)(0x80 | 3), 205181254a7Smrg cast(char)(0x80 | 3), 206181254a7Smrg cast(char)(0x80 | 3), 207181254a7Smrg cast(char)(0x80 | 3), 208181254a7Smrg ], 209181254a7Smrg [ 210181254a7Smrg cast(char)(0xFC | 3), // 6 byte encoding 211181254a7Smrg cast(char)(0x80 | 3), 212181254a7Smrg cast(char)(0x80 | 3), 213181254a7Smrg cast(char)(0x80 | 3), 214181254a7Smrg cast(char)(0x80 | 3), 215181254a7Smrg cast(char)(0x80 | 3), 216181254a7Smrg ], 217181254a7Smrg ]; 218181254a7Smrg 219181254a7Smrg return result[]; 220181254a7Smrg } 221181254a7Smrg else static if (is(Char == wchar)) 222181254a7Smrg { 223181254a7Smrg static immutable wstring[5] result = 224181254a7Smrg [ 225181254a7Smrg [ 226181254a7Smrg cast(wchar) 0xDC00, 
227181254a7Smrg ], 228181254a7Smrg [ 229181254a7Smrg cast(wchar) 0xDFFF, 230181254a7Smrg ], 231181254a7Smrg [ 232181254a7Smrg cast(wchar) 0xDBFF, 233181254a7Smrg cast(wchar) 0xDBFF, 234181254a7Smrg ], 235181254a7Smrg [ 236181254a7Smrg cast(wchar) 0xDBFF, 237181254a7Smrg cast(wchar) 0xE000, 238181254a7Smrg ], 239181254a7Smrg [ 240181254a7Smrg cast(wchar) 0xD800, 241181254a7Smrg ], 242181254a7Smrg ]; 243181254a7Smrg 244181254a7Smrg return result[]; 245181254a7Smrg } 246181254a7Smrg else static if (is(Char == dchar)) 247181254a7Smrg { 248181254a7Smrg static immutable dstring[3] result = 249181254a7Smrg [ 250181254a7Smrg [ cast(dchar) 0x110000 ], 251181254a7Smrg [ cast(dchar) 0x00D800 ], 252181254a7Smrg [ cast(dchar) 0x00DFFF ], 253181254a7Smrg ]; 254181254a7Smrg 255181254a7Smrg return result; 256181254a7Smrg } 257181254a7Smrg else 258181254a7Smrg static assert(0); 259181254a7Smrg } 260181254a7Smrg 261181254a7Smrg /++ 262181254a7Smrg Check whether the given Unicode code point is valid. 263181254a7Smrg 264181254a7Smrg Params: 265181254a7Smrg c = code point to check 266181254a7Smrg 267181254a7Smrg Returns: 268*b1e83836Smrg `true` if and only if `c` is a valid Unicode code point 269181254a7Smrg 270181254a7Smrg Note: 271*b1e83836Smrg `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`, 272181254a7Smrg as they are permitted for internal use by an application, but they are 273181254a7Smrg not allowed for interchange by the Unicode standard. 
274181254a7Smrg +/ 275181254a7Smrg bool isValidDchar(dchar c) pure nothrow @safe @nogc 276181254a7Smrg { 277181254a7Smrg return c < 0xD800 || (c > 0xDFFF && c <= 0x10FFFF); 278181254a7Smrg } 279181254a7Smrg 280*b1e83836Smrg /// 281*b1e83836Smrg @safe @nogc pure nothrow unittest 282*b1e83836Smrg { 283*b1e83836Smrg assert( isValidDchar(cast(dchar) 0x41)); 284*b1e83836Smrg assert( isValidDchar(cast(dchar) 0x00)); 285*b1e83836Smrg assert(!isValidDchar(cast(dchar) 0xD800)); 286*b1e83836Smrg assert(!isValidDchar(cast(dchar) 0x11FFFF)); 287*b1e83836Smrg } 288*b1e83836Smrg 289181254a7Smrg pure nothrow @safe @nogc unittest 290181254a7Smrg { 291181254a7Smrg import std.exception; 292181254a7Smrg 293181254a7Smrg assertCTFEable!( 294181254a7Smrg { 295181254a7Smrg assert( isValidDchar(cast(dchar)'a') == true); 296181254a7Smrg assert( isValidDchar(cast(dchar) 0x1FFFFF) == false); 297181254a7Smrg 298181254a7Smrg assert(!isValidDchar(cast(dchar) 0x00D800)); 299181254a7Smrg assert(!isValidDchar(cast(dchar) 0x00DBFF)); 300181254a7Smrg assert(!isValidDchar(cast(dchar) 0x00DC00)); 301181254a7Smrg assert(!isValidDchar(cast(dchar) 0x00DFFF)); 302181254a7Smrg assert( isValidDchar(cast(dchar) 0x00FFFE)); 303181254a7Smrg assert( isValidDchar(cast(dchar) 0x00FFFF)); 304181254a7Smrg assert( isValidDchar(cast(dchar) 0x01FFFF)); 305181254a7Smrg assert( isValidDchar(cast(dchar) 0x10FFFF)); 306181254a7Smrg assert(!isValidDchar(cast(dchar) 0x110000)); 307181254a7Smrg }); 308181254a7Smrg } 309181254a7Smrg 310*b1e83836Smrg /** 311*b1e83836Smrg Checks if a single character forms a valid code point. 312181254a7Smrg 313*b1e83836Smrg When standing alone, some characters are invalid code points. For 314*b1e83836Smrg example the `wchar` `0xD800` is a so called high surrogate, which can 315*b1e83836Smrg only be interpreted together with a low surrogate following it. As a 316*b1e83836Smrg standalone character it is considered invalid. 

See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
Unicode Standard, D90, D91 and D92) for more details.

Params:
    c = character to test
    Char = character type of `c`

Returns:
    `true`, if `c` forms a valid code point.
 */
bool isValidCodepoint(Char)(Char c)
if (isSomeChar!Char)
{
    // Qualifiers are irrelevant; dispatch on the bare character type.
    alias Bare = Unqual!Char;

    static if (is(Bare == dchar))
        return isValidDchar(c);
    else static if (is(Bare == wchar))
        return !(c > 0xD7FF && c < 0xE000); // a lone surrogate is not a code point
    else static if (is(Bare == char))
        return c < 0x80;                    // only ASCII stands alone in UTF-8
    else
        static assert(false, "unknown character type: `" ~ Char.stringof ~ "`");
}

///
@safe pure nothrow unittest
{
    assert( isValidCodepoint(cast(char) 0x40));
    assert(!isValidCodepoint(cast(char) 0x80));
    assert( isValidCodepoint(cast(wchar) 0x1234));
    assert(!isValidCodepoint(cast(wchar) 0xD800));
    assert( isValidCodepoint(cast(dchar) 0x0010FFFF));
    assert(!isValidCodepoint(cast(dchar) 0x12345678));
}

/++
    Calculate the length of the UTF sequence starting at `index`
    in `str`.

    Params:
        str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        of UTF code units.
Must be random access if `index` is passed 366*b1e83836Smrg index = starting index of UTF sequence (default: `0`) 367181254a7Smrg 368181254a7Smrg Returns: 369181254a7Smrg The number of code units in the UTF sequence. For UTF-8, this is a 370181254a7Smrg value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)). 371181254a7Smrg For UTF-16, it is either 1 or 2. For UTF-32, it is always 1. 372181254a7Smrg 373181254a7Smrg Throws: 374*b1e83836Smrg May throw a `UTFException` if `str[index]` is not the start of a 375181254a7Smrg valid UTF sequence. 376181254a7Smrg 377181254a7Smrg Note: 378*b1e83836Smrg `stride` will only analyze the first `str[index]` element. It 379181254a7Smrg will not fully verify the validity of the UTF sequence, nor even verify 380181254a7Smrg the presence of the sequence: it will not actually guarantee that 381181254a7Smrg $(D index + stride(str, index) <= str.length). 382181254a7Smrg +/ 383181254a7Smrg uint stride(S)(auto ref S str, size_t index) 384181254a7Smrg if (is(S : const char[]) || 385*b1e83836Smrg (isRandomAccessRange!S && is(immutable ElementType!S == immutable char))) 386181254a7Smrg { 387181254a7Smrg static if (is(typeof(str.length) : ulong)) 388181254a7Smrg assert(index < str.length, "Past the end of the UTF-8 sequence"); 389181254a7Smrg immutable c = str[index]; 390181254a7Smrg 391181254a7Smrg if (c < 0x80) 392181254a7Smrg return 1; 393181254a7Smrg else 394181254a7Smrg return strideImpl(c, index); 395181254a7Smrg } 396181254a7Smrg 397181254a7Smrg /// Ditto 398181254a7Smrg uint stride(S)(auto ref S str) 399181254a7Smrg if (is(S : const char[]) || 400*b1e83836Smrg (isInputRange!S && is(immutable ElementType!S == immutable char))) 401181254a7Smrg { 402181254a7Smrg static if (is(S : const char[])) 403181254a7Smrg immutable c = str[0]; 404181254a7Smrg else 405181254a7Smrg immutable c = str.front; 406181254a7Smrg 407181254a7Smrg if (c < 0x80) 408181254a7Smrg return 1; 409181254a7Smrg else 
410181254a7Smrg return strideImpl(c, 0); 411181254a7Smrg } 412181254a7Smrg 413181254a7Smrg @system unittest 414181254a7Smrg { 415181254a7Smrg import core.exception : AssertError; 416181254a7Smrg import std.conv : to; 417181254a7Smrg import std.exception; 418181254a7Smrg import std.string : format; 419*b1e83836Smrg import std.traits : FunctionAttribute, functionAttributes, isSafe; 420181254a7Smrg static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__) 421181254a7Smrg { 422181254a7Smrg enforce(stride(s, i) == codeLength!char(c), 423181254a7Smrg new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 424181254a7Smrg 425181254a7Smrg enforce(stride(RandomCU!char(s), i) == codeLength!char(c), 426181254a7Smrg new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 427181254a7Smrg 428181254a7Smrg auto refRandom = new RefRandomCU!char(s); 429181254a7Smrg immutable randLen = refRandom.length; 430181254a7Smrg enforce(stride(refRandom, i) == codeLength!char(c), 431181254a7Smrg new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 432181254a7Smrg enforce(refRandom.length == randLen, 433181254a7Smrg new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 434181254a7Smrg 435181254a7Smrg if (i == 0) 436181254a7Smrg { 437181254a7Smrg enforce(stride(s) == codeLength!char(c), 438181254a7Smrg new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 439181254a7Smrg 440181254a7Smrg enforce(stride(InputCU!char(s)) == codeLength!char(c), 441181254a7Smrg new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 442181254a7Smrg 443181254a7Smrg auto refBidir = new RefBidirCU!char(s); 444181254a7Smrg immutable bidirLen = refBidir.length; 445181254a7Smrg enforce(stride(refBidir) == codeLength!char(c), 446181254a7Smrg new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 447181254a7Smrg 
enforce(refBidir.length == bidirLen, 448181254a7Smrg new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 449181254a7Smrg } 450181254a7Smrg } 451181254a7Smrg 452181254a7Smrg assertCTFEable!( 453181254a7Smrg { 454181254a7Smrg test("a", 'a'); 455181254a7Smrg test(" ", ' '); 456181254a7Smrg test("\u2029", '\u2029'); //paraSep 457181254a7Smrg test("\u0100", '\u0100'); 458181254a7Smrg test("\u0430", '\u0430'); 459181254a7Smrg test("\U00010143", '\U00010143'); 460181254a7Smrg test("abcdefcdef", 'a'); 461181254a7Smrg test("hello\U00010143\u0100\U00010143", 'h', 0); 462181254a7Smrg test("hello\U00010143\u0100\U00010143", 'e', 1); 463181254a7Smrg test("hello\U00010143\u0100\U00010143", 'l', 2); 464181254a7Smrg test("hello\U00010143\u0100\U00010143", 'l', 3); 465181254a7Smrg test("hello\U00010143\u0100\U00010143", 'o', 4); 466181254a7Smrg test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 467181254a7Smrg test("hello\U00010143\u0100\U00010143", '\u0100', 9); 468181254a7Smrg test("hello\U00010143\u0100\U00010143", '\U00010143', 11); 469181254a7Smrg 470181254a7Smrg foreach (S; AliasSeq!(char[], const char[], string)) 471181254a7Smrg { 472181254a7Smrg enum str = to!S("hello world"); 473181254a7Smrg static assert(isSafe!({ stride(str, 0); })); 474181254a7Smrg static assert(isSafe!({ stride(str); })); 475181254a7Smrg static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0); 476181254a7Smrg static assert((functionAttributes!({ stride(str); }) & FunctionAttribute.pure_) != 0); 477181254a7Smrg } 478181254a7Smrg }); 479181254a7Smrg } 480181254a7Smrg 481181254a7Smrg @safe unittest // invalid start bytes 482181254a7Smrg { 483181254a7Smrg import std.exception : assertThrown; 484181254a7Smrg immutable char[] invalidStartBytes = [ 485181254a7Smrg 0b1111_1000, // indicating a sequence length of 5 486181254a7Smrg 0b1111_1100, // 6 487181254a7Smrg 0b1111_1110, // 7 488181254a7Smrg 0b1111_1111, // 8 489181254a7Smrg 
0b1000_0000, // continuation byte 490181254a7Smrg ]; 491181254a7Smrg foreach (c; invalidStartBytes) 492181254a7Smrg assertThrown!UTFException(stride([c])); 493181254a7Smrg } 494181254a7Smrg 495181254a7Smrg /// Ditto 496181254a7Smrg uint stride(S)(auto ref S str, size_t index) 497181254a7Smrg if (is(S : const wchar[]) || 498*b1e83836Smrg (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar))) 499181254a7Smrg { 500181254a7Smrg static if (is(typeof(str.length) : ulong)) 501181254a7Smrg assert(index < str.length, "Past the end of the UTF-16 sequence"); 502181254a7Smrg immutable uint u = str[index]; 503181254a7Smrg return 1 + (u >= 0xD800 && u <= 0xDBFF); 504181254a7Smrg } 505181254a7Smrg 506181254a7Smrg /// Ditto 507181254a7Smrg uint stride(S)(auto ref S str) @safe pure 508181254a7Smrg if (is(S : const wchar[])) 509181254a7Smrg { 510181254a7Smrg return stride(str, 0); 511181254a7Smrg } 512181254a7Smrg 513181254a7Smrg /// Ditto 514181254a7Smrg uint stride(S)(auto ref S str) 515*b1e83836Smrg if (isInputRange!S && is(immutable ElementType!S == immutable wchar) && 516*b1e83836Smrg !is(S : const wchar[])) 517181254a7Smrg { 518181254a7Smrg assert(!str.empty, "UTF-16 sequence is empty"); 519181254a7Smrg immutable uint u = str.front; 520181254a7Smrg return 1 + (u >= 0xD800 && u <= 0xDBFF); 521181254a7Smrg } 522181254a7Smrg 523181254a7Smrg @system unittest 524181254a7Smrg { 525181254a7Smrg import core.exception : AssertError; 526181254a7Smrg import std.conv : to; 527181254a7Smrg import std.exception; 528181254a7Smrg import std.string : format; 529*b1e83836Smrg import std.traits : FunctionAttribute, functionAttributes, isSafe; 530181254a7Smrg static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__) 531181254a7Smrg { 532181254a7Smrg enforce(stride(s, i) == codeLength!wchar(c), 533181254a7Smrg new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 534181254a7Smrg 535181254a7Smrg enforce(stride(RandomCU!wchar(s), i) == 
codeLength!wchar(c), 536181254a7Smrg new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 537181254a7Smrg 538181254a7Smrg auto refRandom = new RefRandomCU!wchar(s); 539181254a7Smrg immutable randLen = refRandom.length; 540181254a7Smrg enforce(stride(refRandom, i) == codeLength!wchar(c), 541181254a7Smrg new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 542181254a7Smrg enforce(refRandom.length == randLen, 543181254a7Smrg new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 544181254a7Smrg 545181254a7Smrg if (i == 0) 546181254a7Smrg { 547181254a7Smrg enforce(stride(s) == codeLength!wchar(c), 548181254a7Smrg new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 549181254a7Smrg 550181254a7Smrg enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c), 551181254a7Smrg new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 552181254a7Smrg 553181254a7Smrg auto refBidir = new RefBidirCU!wchar(s); 554181254a7Smrg immutable bidirLen = refBidir.length; 555181254a7Smrg enforce(stride(refBidir) == codeLength!wchar(c), 556181254a7Smrg new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 557181254a7Smrg enforce(refBidir.length == bidirLen, 558181254a7Smrg new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 559181254a7Smrg } 560181254a7Smrg } 561181254a7Smrg 562181254a7Smrg assertCTFEable!( 563181254a7Smrg { 564181254a7Smrg test("a", 'a'); 565181254a7Smrg test(" ", ' '); 566181254a7Smrg test("\u2029", '\u2029'); //paraSep 567181254a7Smrg test("\u0100", '\u0100'); 568181254a7Smrg test("\u0430", '\u0430'); 569181254a7Smrg test("\U00010143", '\U00010143'); 570181254a7Smrg test("abcdefcdef", 'a'); 571181254a7Smrg test("hello\U00010143\u0100\U00010143", 'h', 0); 572181254a7Smrg test("hello\U00010143\u0100\U00010143", 'e', 1); 573181254a7Smrg 
test("hello\U00010143\u0100\U00010143", 'l', 2); 574181254a7Smrg test("hello\U00010143\u0100\U00010143", 'l', 3); 575181254a7Smrg test("hello\U00010143\u0100\U00010143", 'o', 4); 576181254a7Smrg test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 577181254a7Smrg test("hello\U00010143\u0100\U00010143", '\u0100', 7); 578181254a7Smrg test("hello\U00010143\u0100\U00010143", '\U00010143', 8); 579181254a7Smrg 580181254a7Smrg foreach (S; AliasSeq!(wchar[], const wchar[], wstring)) 581181254a7Smrg { 582181254a7Smrg enum str = to!S("hello world"); 583181254a7Smrg static assert(isSafe!(() => stride(str, 0))); 584181254a7Smrg static assert(isSafe!(() => stride(str) )); 585181254a7Smrg static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0); 586181254a7Smrg static assert((functionAttributes!(() => stride(str) ) & FunctionAttribute.pure_) != 0); 587181254a7Smrg } 588181254a7Smrg }); 589181254a7Smrg } 590181254a7Smrg 591181254a7Smrg /// Ditto 592181254a7Smrg uint stride(S)(auto ref S str, size_t index = 0) 593181254a7Smrg if (is(S : const dchar[]) || 594*b1e83836Smrg (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar))) 595181254a7Smrg { 596181254a7Smrg static if (is(typeof(str.length) : ulong)) 597181254a7Smrg assert(index < str.length, "Past the end of the UTF-32 sequence"); 598181254a7Smrg else 599181254a7Smrg assert(!str.empty, "UTF-32 sequence is empty."); 600181254a7Smrg return 1; 601181254a7Smrg } 602181254a7Smrg 603*b1e83836Smrg /// 604*b1e83836Smrg @safe unittest 605*b1e83836Smrg { 606*b1e83836Smrg assert("a".stride == 1); 607*b1e83836Smrg assert("λ".stride == 2); 608*b1e83836Smrg assert("aλ".stride == 1); 609*b1e83836Smrg assert("aλ".stride(1) == 2); 610*b1e83836Smrg assert("".stride == 4); 611*b1e83836Smrg } 612*b1e83836Smrg 613181254a7Smrg @system unittest 614181254a7Smrg { 615181254a7Smrg import core.exception : AssertError; 616181254a7Smrg import std.conv : to; 617181254a7Smrg import std.exception; 
618181254a7Smrg import std.string : format; 619*b1e83836Smrg import std.traits : FunctionAttribute, functionAttributes, isSafe; 620181254a7Smrg static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__) 621181254a7Smrg { 622181254a7Smrg enforce(stride(s, i) == codeLength!dchar(c), 623181254a7Smrg new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 624181254a7Smrg 625181254a7Smrg enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c), 626181254a7Smrg new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 627181254a7Smrg 628181254a7Smrg auto refRandom = new RefRandomCU!dchar(s); 629181254a7Smrg immutable randLen = refRandom.length; 630181254a7Smrg enforce(stride(refRandom, i) == codeLength!dchar(c), 631181254a7Smrg new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 632181254a7Smrg enforce(refRandom.length == randLen, 633181254a7Smrg new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 634181254a7Smrg 635181254a7Smrg if (i == 0) 636181254a7Smrg { 637181254a7Smrg enforce(stride(s) == codeLength!dchar(c), 638181254a7Smrg new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 639181254a7Smrg 640181254a7Smrg enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c), 641181254a7Smrg new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 642181254a7Smrg 643181254a7Smrg auto refBidir = new RefBidirCU!dchar(s); 644181254a7Smrg immutable bidirLen = refBidir.length; 645181254a7Smrg enforce(stride(refBidir) == codeLength!dchar(c), 646181254a7Smrg new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 647181254a7Smrg enforce(refBidir.length == bidirLen, 648181254a7Smrg new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 649181254a7Smrg } 650181254a7Smrg } 651181254a7Smrg 652181254a7Smrg assertCTFEable!( 
653181254a7Smrg { 654181254a7Smrg test("a", 'a'); 655181254a7Smrg test(" ", ' '); 656181254a7Smrg test("\u2029", '\u2029'); //paraSep 657181254a7Smrg test("\u0100", '\u0100'); 658181254a7Smrg test("\u0430", '\u0430'); 659181254a7Smrg test("\U00010143", '\U00010143'); 660181254a7Smrg test("abcdefcdef", 'a'); 661181254a7Smrg test("hello\U00010143\u0100\U00010143", 'h', 0); 662181254a7Smrg test("hello\U00010143\u0100\U00010143", 'e', 1); 663181254a7Smrg test("hello\U00010143\u0100\U00010143", 'l', 2); 664181254a7Smrg test("hello\U00010143\u0100\U00010143", 'l', 3); 665181254a7Smrg test("hello\U00010143\u0100\U00010143", 'o', 4); 666181254a7Smrg test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 667181254a7Smrg test("hello\U00010143\u0100\U00010143", '\u0100', 6); 668181254a7Smrg test("hello\U00010143\u0100\U00010143", '\U00010143', 7); 669181254a7Smrg 670181254a7Smrg foreach (S; AliasSeq!(dchar[], const dchar[], dstring)) 671181254a7Smrg { 672181254a7Smrg enum str = to!S("hello world"); 673181254a7Smrg static assert(isSafe!(() => stride(str, 0))); 674181254a7Smrg static assert(isSafe!(() => stride(str) )); 675181254a7Smrg static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0); 676181254a7Smrg static assert((functionAttributes!(() => stride(str) ) & FunctionAttribute.pure_) != 0); 677181254a7Smrg } 678181254a7Smrg }); 679181254a7Smrg } 680181254a7Smrg 681*b1e83836Smrg private uint strideImpl(char c, size_t index) @trusted pure 682*b1e83836Smrg in { assert(c & 0x80); } 683*b1e83836Smrg do 684*b1e83836Smrg { 685*b1e83836Smrg import core.bitop : bsr; 686*b1e83836Smrg immutable msbs = 7 - bsr((~uint(c)) & 0xFF); 687*b1e83836Smrg if (c == 0xFF || msbs < 2 || msbs > 4) 688*b1e83836Smrg throw new UTFException("Invalid UTF-8 sequence", index); 689*b1e83836Smrg return msbs; 690*b1e83836Smrg } 691*b1e83836Smrg 692181254a7Smrg /++ 693181254a7Smrg Calculate the length of the UTF sequence ending one code unit before 694*b1e83836Smrg `index` 
in `str`. 695181254a7Smrg 696181254a7Smrg Params: 697181254a7Smrg str = bidirectional range of UTF code units. Must be random access if 698*b1e83836Smrg `index` is passed 699*b1e83836Smrg index = index one past end of UTF sequence (default: `str.length`) 700181254a7Smrg 701181254a7Smrg Returns: 702181254a7Smrg The number of code units in the UTF sequence. For UTF-8, this is a 703181254a7Smrg value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)). 704181254a7Smrg For UTF-16, it is either 1 or 2. For UTF-32, it is always 1. 705181254a7Smrg 706181254a7Smrg Throws: 707*b1e83836Smrg May throw a `UTFException` if `str[index]` is not one past the 708181254a7Smrg end of a valid UTF sequence. 709181254a7Smrg 710181254a7Smrg Note: 711*b1e83836Smrg `strideBack` will only analyze the element at $(D str[index - 1]) 712181254a7Smrg element. It will not fully verify the validity of the UTF sequence, nor 713181254a7Smrg even verify the presence of the sequence: it will not actually 714181254a7Smrg guarantee that $(D strideBack(str, index) <= index). 715181254a7Smrg +/ 716181254a7Smrg uint strideBack(S)(auto ref S str, size_t index) 717181254a7Smrg if (is(S : const char[]) || 718*b1e83836Smrg (isRandomAccessRange!S && is(immutable ElementType!S == immutable char))) 719181254a7Smrg { 720181254a7Smrg static if (is(typeof(str.length) : ulong)) 721181254a7Smrg assert(index <= str.length, "Past the end of the UTF-8 sequence"); 722181254a7Smrg assert(index > 0, "Not the end of the UTF-8 sequence"); 723181254a7Smrg 724181254a7Smrg if ((str[index-1] & 0b1100_0000) != 0b1000_0000) 725181254a7Smrg return 1; 726181254a7Smrg 727181254a7Smrg if (index >= 4) //single verification for most common case 728181254a7Smrg { 729*b1e83836Smrg static foreach (i; 2 .. 
5) 730181254a7Smrg { 731181254a7Smrg if ((str[index-i] & 0b1100_0000) != 0b1000_0000) 732181254a7Smrg return i; 733181254a7Smrg } 734181254a7Smrg } 735181254a7Smrg else 736181254a7Smrg { 737*b1e83836Smrg static foreach (i; 2 .. 4) 738181254a7Smrg { 739181254a7Smrg if (index >= i && (str[index-i] & 0b1100_0000) != 0b1000_0000) 740181254a7Smrg return i; 741181254a7Smrg } 742181254a7Smrg } 743181254a7Smrg throw new UTFException("Not the end of the UTF sequence", index); 744181254a7Smrg } 745181254a7Smrg 746181254a7Smrg /// Ditto 747181254a7Smrg uint strideBack(S)(auto ref S str) 748181254a7Smrg if (is(S : const char[]) || 749*b1e83836Smrg (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char))) 750181254a7Smrg { 751181254a7Smrg return strideBack(str, str.length); 752181254a7Smrg } 753181254a7Smrg 754181254a7Smrg /// Ditto 755181254a7Smrg uint strideBack(S)(auto ref S str) 756*b1e83836Smrg if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S) 757181254a7Smrg { 758181254a7Smrg assert(!str.empty, "Past the end of the UTF-8 sequence"); 759181254a7Smrg auto temp = str.save; 760181254a7Smrg foreach (i; AliasSeq!(1, 2, 3, 4)) 761181254a7Smrg { 762181254a7Smrg if ((temp.back & 0b1100_0000) != 0b1000_0000) 763181254a7Smrg return i; 764181254a7Smrg temp.popBack(); 765181254a7Smrg if (temp.empty) 766181254a7Smrg break; 767181254a7Smrg } 768181254a7Smrg throw new UTFException("The last code unit is not the end of the UTF-8 sequence"); 769181254a7Smrg } 770181254a7Smrg 771181254a7Smrg @system unittest 772181254a7Smrg { 773181254a7Smrg import core.exception : AssertError; 774181254a7Smrg import std.conv : to; 775181254a7Smrg import std.exception; 776181254a7Smrg import std.string : format; 777*b1e83836Smrg import std.traits : FunctionAttribute, functionAttributes, isSafe; 778181254a7Smrg static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__) 779181254a7Smrg { 780181254a7Smrg 
enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c), 781181254a7Smrg new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 782181254a7Smrg 783181254a7Smrg enforce(strideBack(RandomCU!char(s), i == size_t.max ? s.length : i) == codeLength!char(c), 784181254a7Smrg new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 785181254a7Smrg 786181254a7Smrg auto refRandom = new RefRandomCU!char(s); 787181254a7Smrg immutable randLen = refRandom.length; 788181254a7Smrg enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!char(c), 789181254a7Smrg new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 790181254a7Smrg enforce(refRandom.length == randLen, 791181254a7Smrg new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 792181254a7Smrg 793181254a7Smrg if (i == size_t.max) 794181254a7Smrg { 795181254a7Smrg enforce(strideBack(s) == codeLength!char(c), 796181254a7Smrg new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 797181254a7Smrg 798181254a7Smrg enforce(strideBack(BidirCU!char(s)) == codeLength!char(c), 799181254a7Smrg new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 800181254a7Smrg 801181254a7Smrg auto refBidir = new RefBidirCU!char(s); 802181254a7Smrg immutable bidirLen = refBidir.length; 803181254a7Smrg enforce(strideBack(refBidir) == codeLength!char(c), 804181254a7Smrg new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 805181254a7Smrg enforce(refBidir.length == bidirLen, 806181254a7Smrg new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 807181254a7Smrg } 808181254a7Smrg } 809181254a7Smrg 810181254a7Smrg assertCTFEable!( 811181254a7Smrg { 812181254a7Smrg test("a", 'a'); 813181254a7Smrg test(" ", ' '); 814181254a7Smrg test("\u2029", '\u2029'); //paraSep 
test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'f');
        test("\U00010143\u0100\U00010143hello", 'o', 15);
        test("\U00010143\u0100\U00010143hello", 'l', 14);
        test("\U00010143\u0100\U00010143hello", 'l', 13);
        test("\U00010143\u0100\U00010143hello", 'e', 12);
        test("\U00010143\u0100\U00010143hello", 'h', 11);
        test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
        test("\U00010143\u0100\U00010143hello", '\u0100', 6);
        test("\U00010143\u0100\U00010143hello", '\U00010143', 4);

        foreach (S; AliasSeq!(char[], const char[], string))
        {
            enum str = to!S("hello world");
            static assert(isSafe!({ strideBack(str, 0); }));
            static assert(isSafe!({ strideBack(str);    }));
            static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
        }
    });
}

//UTF-16 is self synchronizing: The length of strideBack can be found from
//the value of a single wchar
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The upper-bound check is only possible when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    // The code unit just before `index` decides the stride: a low (trailing)
    // surrogate, 0xDC00 .. 0xDFFF, means the code point took two units.
    immutable c2 = str[index-1];
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly; other ranges are read through `back`.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Low (trailing) surrogates occupy 0xDC00 .. 0xDFFF. The upper bound is
    // exclusive, exactly as in the indexed overload: 0xE000 is an ordinary
    // single-unit code point and must yield a stride of 1.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}

// The indexed and the index-free UTF-16 overloads must agree at the upper
// edge of the surrogate block.
@safe pure unittest
{
    assert("\uDFFF"w.strideBack == 2);        // last low surrogate
    assert("\uE000"w.strideBack == 1);        // first code unit after the surrogates
    assert(strideBack("\uE000"w, 1) == 1);
    assert("\U00010143"w.strideBack == 2);
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!wchar(s), i == size_t.max ? s.length : i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        enforce(strideBack(refRandom, i == size_t.max ?
s.length : i) == codeLength!wchar(c), 887181254a7Smrg new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 888181254a7Smrg enforce(refRandom.length == randLen, 889181254a7Smrg new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 890181254a7Smrg 891181254a7Smrg if (i == size_t.max) 892181254a7Smrg { 893181254a7Smrg enforce(strideBack(s) == codeLength!wchar(c), 894181254a7Smrg new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 895181254a7Smrg 896181254a7Smrg enforce(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c), 897181254a7Smrg new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 898181254a7Smrg 899181254a7Smrg auto refBidir = new RefBidirCU!wchar(s); 900181254a7Smrg immutable bidirLen = refBidir.length; 901181254a7Smrg enforce(strideBack(refBidir) == codeLength!wchar(c), 902181254a7Smrg new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 903181254a7Smrg enforce(refBidir.length == bidirLen, 904181254a7Smrg new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 905181254a7Smrg } 906181254a7Smrg } 907181254a7Smrg 908181254a7Smrg assertCTFEable!( 909181254a7Smrg { 910181254a7Smrg test("a", 'a'); 911181254a7Smrg test(" ", ' '); 912181254a7Smrg test("\u2029", '\u2029'); //paraSep 913181254a7Smrg test("\u0100", '\u0100'); 914181254a7Smrg test("\u0430", '\u0430'); 915181254a7Smrg test("\U00010143", '\U00010143'); 916181254a7Smrg test("abcdefcdef", 'f'); 917181254a7Smrg test("\U00010143\u0100\U00010143hello", 'o', 10); 918181254a7Smrg test("\U00010143\u0100\U00010143hello", 'l', 9); 919181254a7Smrg test("\U00010143\u0100\U00010143hello", 'l', 8); 920181254a7Smrg test("\U00010143\u0100\U00010143hello", 'e', 7); 921181254a7Smrg test("\U00010143\u0100\U00010143hello", 'h', 6); 922181254a7Smrg test("\U00010143\u0100\U00010143hello", '\U00010143', 
5);
        test("\U00010143\u0100\U00010143hello", '\u0100', 3);
        test("\U00010143\u0100\U00010143hello", '\U00010143', 2);

        foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
        {
            enum str = to!S("hello world");
            static assert(isSafe!(() => strideBack(str, 0)));
            static assert(isSafe!(() => strideBack(str)   ));
            static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
        }
    });
}

/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // The upper-bound check is only possible when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    assert(index > 0, "Not the end of the UTF-32 sequence");

    // UTF-32: every code point is exactly one code unit.
    return 1;
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    assert(!str.empty, "Empty UTF-32 sequence");

    // UTF-32: every code point is exactly one code unit.
    return 1;
}

///
@safe unittest
{
    assert("a".strideBack == 1);
    assert("λ".strideBack == 2);
    assert("aλ".strideBack == 2);
    assert("aλ".strideBack(1) == 1);
    // A code point outside the BMP takes four UTF-8 code units. It is written
    // as an escape so the example does not depend on the file's encoding.
    assert("\U00010437".strideBack == 4);
}

@system unittest
{
    import core.exception : AssertError;
    import
std.conv : to; 969181254a7Smrg import std.exception; 970181254a7Smrg import std.string : format; 971*b1e83836Smrg import std.traits : FunctionAttribute, functionAttributes, isSafe; 972181254a7Smrg static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__) 973181254a7Smrg { 974181254a7Smrg enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c), 975181254a7Smrg new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 976181254a7Smrg 977181254a7Smrg enforce(strideBack(RandomCU!dchar(s), i == size_t.max ? s.length : i) == codeLength!dchar(c), 978181254a7Smrg new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 979181254a7Smrg 980181254a7Smrg auto refRandom = new RefRandomCU!dchar(s); 981181254a7Smrg immutable randLen = refRandom.length; 982181254a7Smrg enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!dchar(c), 983181254a7Smrg new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 984181254a7Smrg enforce(refRandom.length == randLen, 985181254a7Smrg new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 986181254a7Smrg 987181254a7Smrg if (i == size_t.max) 988181254a7Smrg { 989181254a7Smrg enforce(strideBack(s) == codeLength!dchar(c), 990181254a7Smrg new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 991181254a7Smrg 992181254a7Smrg enforce(strideBack(BidirCU!dchar(s)) == codeLength!dchar(c), 993181254a7Smrg new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 994181254a7Smrg 995181254a7Smrg auto refBidir = new RefBidirCU!dchar(s); 996181254a7Smrg immutable bidirLen = refBidir.length; 997181254a7Smrg enforce(strideBack(refBidir) == codeLength!dchar(c), 998181254a7Smrg new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 999181254a7Smrg enforce(refBidir.length == bidirLen, 
1000181254a7Smrg new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 1001181254a7Smrg } 1002181254a7Smrg } 1003181254a7Smrg 1004181254a7Smrg assertCTFEable!( 1005181254a7Smrg { 1006181254a7Smrg test("a", 'a'); 1007181254a7Smrg test(" ", ' '); 1008181254a7Smrg test("\u2029", '\u2029'); //paraSep 1009181254a7Smrg test("\u0100", '\u0100'); 1010181254a7Smrg test("\u0430", '\u0430'); 1011181254a7Smrg test("\U00010143", '\U00010143'); 1012181254a7Smrg test("abcdefcdef", 'f'); 1013181254a7Smrg test("\U00010143\u0100\U00010143hello", 'o', 8); 1014181254a7Smrg test("\U00010143\u0100\U00010143hello", 'l', 7); 1015181254a7Smrg test("\U00010143\u0100\U00010143hello", 'l', 6); 1016181254a7Smrg test("\U00010143\u0100\U00010143hello", 'e', 5); 1017181254a7Smrg test("\U00010143\u0100\U00010143hello", 'h', 4); 1018181254a7Smrg test("\U00010143\u0100\U00010143hello", '\U00010143', 3); 1019181254a7Smrg test("\U00010143\u0100\U00010143hello", '\u0100', 2); 1020181254a7Smrg test("\U00010143\u0100\U00010143hello", '\U00010143', 1); 1021181254a7Smrg 1022181254a7Smrg foreach (S; AliasSeq!(dchar[], const dchar[], dstring)) 1023181254a7Smrg { 1024181254a7Smrg enum str = to!S("hello world"); 1025181254a7Smrg static assert(isSafe!(() => strideBack(str, 0))); 1026181254a7Smrg static assert(isSafe!(() => strideBack(str) )); 1027181254a7Smrg static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0); 1028181254a7Smrg static assert((functionAttributes!(() => strideBack(str) ) & FunctionAttribute.pure_) != 0); 1029181254a7Smrg } 1030181254a7Smrg }); 1031181254a7Smrg } 1032181254a7Smrg 1033181254a7Smrg 1034181254a7Smrg /++ 1035*b1e83836Smrg Given `index` into `str` and assuming that `index` is at the start 1036*b1e83836Smrg of a UTF sequence, `toUCSindex` determines the number of UCS characters 1037*b1e83836Smrg up to `index`. 
So, `index` is the index of a code unit at the
    beginning of a code point, and the return value is how many code points into
    the string that that code point is.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // UTF-32: code unit index and code point index coincide.
        return index;
    }
    else
    {
        size_t codePoints;  // code points stepped over so far
        size_t unit;        // code unit offset of the next code point

        // Hop from sequence start to sequence start until `index` is reached.
        while (unit < index)
        {
            unit += stride(str, unit);
            ++codePoints;
        }

        // Stepping past `index` means it was not at the start of a sequence.
        if (unit > index)
        {
            enum msg = is(immutable C == immutable char)
                ? "Invalid UTF-8 sequence"
                : "Invalid UTF-16 sequence";
            throw new UTFException(msg, index);
        }

        return codePoints;
    }
}

///
@safe unittest
{
    assert(toUCSindex(`hello world`, 7) == 7);
    assert(toUCSindex(`hello world`w, 7) == 7);
    assert(toUCSindex(`hello world`d, 7) == 7);

    assert(toUCSindex(`Ma Chérie`, 7) == 6);
    assert(toUCSindex(`Ma Chérie`w, 7) == 7);
    assert(toUCSindex(`Ma Chérie`d, 7) == 7);

    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}


/++
    Given a UCS index `n` into `str`, returns the UTF index.
So, `n` is how many code points into the string the code point is, and
    the array index of the code unit is returned.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // UTF-32: code point index and code unit index coincide.
        return n;
    }
    else
    {
        // Skip `n` whole code points, adding up their widths in code units.
        size_t offset = 0;
        for (; n != 0; --n)
            offset += stride(str, offset);
        return offset;
    }
}

///
@safe unittest
{
    assert(toUTFindex(`hello world`, 7) == 7);
    assert(toUTFindex(`hello world`w, 7) == 7);
    assert(toUTFindex(`hello world`d, 7) == 7);

    assert(toUTFindex(`Ma Chérie`, 6) == 7);
    assert(toUTFindex(`Ma Chérie`w, 7) == 7);
    assert(toUTFindex(`Ma Chérie`d, 7) == 7);

    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9);
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}


/* =================== Decode ======================= */

/// Whether or not to replace invalid UTF with $(LREF replacementDchar)
alias UseReplacementDchar = Flag!"useReplacementDchar";

/++
    Decodes and returns the code point starting at `str[index]`. `index`
    is advanced to one past the decoded code point. If the code point is not
    well-formed, then a `UTFException` is thrown and `index` remains
    unchanged.
1133181254a7Smrg 1134181254a7Smrg decode will only work with strings and random access ranges of code units 1135181254a7Smrg with length and slicing, whereas $(LREF decodeFront) will work with any 1136181254a7Smrg input range of code units. 1137181254a7Smrg 1138181254a7Smrg Params: 1139181254a7Smrg useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1140181254a7Smrg str = input string or indexable Range 1141181254a7Smrg index = starting index into s[]; incremented by number of code units processed 1142181254a7Smrg 1143181254a7Smrg Returns: 1144181254a7Smrg decoded character 1145181254a7Smrg 1146181254a7Smrg Throws: 1147*b1e83836Smrg $(LREF UTFException) if `str[index]` is not the start of a valid UTF 1148*b1e83836Smrg sequence and useReplacementDchar is `No.useReplacementDchar` 1149181254a7Smrg +/ 1150181254a7Smrg dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index) 1151181254a7Smrg if (!isSomeString!S && 1152181254a7Smrg isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S)) 1153181254a7Smrg in 1154181254a7Smrg { 1155181254a7Smrg assert(index < str.length, "Attempted to decode past the end of a string"); 1156181254a7Smrg } 1157181254a7Smrg out (result) 1158181254a7Smrg { 1159181254a7Smrg assert(isValidDchar(result)); 1160181254a7Smrg } 1161*b1e83836Smrg do 1162181254a7Smrg { 1163181254a7Smrg if (str[index] < codeUnitLimit!S) 1164181254a7Smrg return str[index++]; 1165181254a7Smrg else 1166181254a7Smrg return decodeImpl!(true, useReplacementDchar)(str, index); 1167181254a7Smrg } 1168181254a7Smrg 1169*b1e83836Smrg /// ditto 1170181254a7Smrg dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1171*b1e83836Smrg auto ref scope S str, ref size_t index) @trusted pure 1172181254a7Smrg if (isSomeString!S) 1173181254a7Smrg in 1174181254a7Smrg { 1175181254a7Smrg assert(index < str.length, "Attempted to decode past the end of 
a string"); 1176181254a7Smrg } 1177181254a7Smrg out (result) 1178181254a7Smrg { 1179181254a7Smrg assert(isValidDchar(result)); 1180181254a7Smrg } 1181*b1e83836Smrg do 1182181254a7Smrg { 1183181254a7Smrg if (str[index] < codeUnitLimit!S) 1184181254a7Smrg return str[index++]; 1185*b1e83836Smrg else static if (is(immutable S == immutable C[], C)) 1186*b1e83836Smrg return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index); 1187*b1e83836Smrg } 1188*b1e83836Smrg 1189*b1e83836Smrg /// 1190*b1e83836Smrg @safe pure unittest 1191*b1e83836Smrg { 1192*b1e83836Smrg size_t i; 1193*b1e83836Smrg 1194*b1e83836Smrg assert("a".decode(i) == 'a' && i == 1); 1195*b1e83836Smrg i = 0; 1196*b1e83836Smrg assert("å".decode(i) == 'å' && i == 2); 1197*b1e83836Smrg i = 1; 1198*b1e83836Smrg assert("aå".decode(i) == 'å' && i == 3); 1199*b1e83836Smrg i = 0; 1200*b1e83836Smrg assert("å"w.decode(i) == 'å' && i == 1); 1201*b1e83836Smrg 1202*b1e83836Smrg // ë as a multi-code point grapheme 1203*b1e83836Smrg i = 0; 1204*b1e83836Smrg assert("e\u0308".decode(i) == 'e' && i == 1); 1205*b1e83836Smrg // ë as a single code point grapheme 1206*b1e83836Smrg i = 0; 1207*b1e83836Smrg assert("ë".decode(i) == 'ë' && i == 2); 1208*b1e83836Smrg i = 0; 1209*b1e83836Smrg assert("ë"w.decode(i) == 'ë' && i == 1); 1210*b1e83836Smrg } 1211*b1e83836Smrg 1212*b1e83836Smrg @safe pure unittest // https://issues.dlang.org/show_bug.cgi?id=22867 1213*b1e83836Smrg { 1214*b1e83836Smrg import std.conv : hexString; 1215*b1e83836Smrg string data = hexString!"f787a598"; 1216*b1e83836Smrg size_t offset = 0; 1217*b1e83836Smrg try data.decode(offset); 1218*b1e83836Smrg catch (UTFException ex) assert(offset == 0); 1219181254a7Smrg } 1220181254a7Smrg 1221181254a7Smrg /++ 1222*b1e83836Smrg `decodeFront` is a variant of $(LREF decode) which specifically decodes 1223*b1e83836Smrg the first code point. 
Unlike $(LREF decode), `decodeFront` accepts any
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
    of code units (rather than just a string or random access
    range). It also takes the range by `ref` and pops off the elements as it
    decodes them. If `numCodeUnits` is passed in, it gets set to the number
    of code units which were in the code point which was decoded.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or input range of code units
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str.front` is not the start of a valid UTF
        sequence. If an exception is thrown, then there is no guarantee as to
        the number of code units which were popped off, as it depends on the
        type of range being used and how many code units had to be popped off
        before the code point was determined to be invalid.
+/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable first = str.front;

    // Fast path: a code unit below the limit is a complete code point.
    if (first < codeUnitLimit!S)
    {
        str.popFront();
        numCodeUnits = 1;
        return first;
    }

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
    // done outside of decodeImpl, which is undesirable, since not all
    // overloads of decodeImpl need it. So, it should be moved back into
    // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
    // has been fixed.
    enum canIndex = is(S : const char[]) || isRandomAccessRange!S && hasSlicing!S && hasLength!S;
    immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);

    // The other range types were already popped by decodeImpl.
    static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        str = str[numCodeUnits .. str.length];

    return decoded;
}

/// ditto
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref scope S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Fast path: a code unit below the limit is a complete code point.
    if (str[0] < codeUnitLimit!S)
    {
        immutable unit = str[0];
        str = str[1 .. $];
        numCodeUnits = 1;
        return unit;
    }

    // Decode through a plain const slice of the string's character type `C`,
    // then drop the consumed code units from the front of `str`.
    static if (is(immutable S == immutable C[], C))
    {
        immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits);
        str = str[numCodeUnits .. $];
        return decoded;
    }
}

/++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // Convenience form for callers that do not need the code unit count.
    size_t unused;
    return decodeFront!useReplacementDchar(str, unused);
}

///
@safe pure unittest
{
    import std.range.primitives;
    string str = "Hello, World!";

    assert(str.decodeFront == 'H' && str == "ello, World!");
    str = "å";
    assert(str.decodeFront == 'å' && str.empty);
    str = "å";
    size_t i;
    assert(str.decodeFront(i) == 'å' && i == 2 && str.empty);
}

/++
    `decodeBack` is a variant of $(LREF decode) which specifically decodes
    the last code point. Unlike $(LREF decode), `decodeBack` accepts any
    bidirectional range of code units (rather than just a string or random access
    range). It also takes the range by `ref` and pops off the elements as it
    decodes them. If `numCodeUnits` is passed in, it gets set to the number
    of code units which were in the code point which was decoded.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.
1350181254a7Smrg 1351181254a7Smrg Throws: 1352*b1e83836Smrg $(LREF UTFException) if `str.back` is not the end of a valid UTF 1353*b1e83836Smrg sequence. If an exception is thrown, the `str` itself remains unchanged, 1354*b1e83836Smrg but there is no guarantee as to the value of `numCodeUnits` (when passed). 1355181254a7Smrg +/ 1356181254a7Smrg dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1357181254a7Smrg ref S str, out size_t numCodeUnits) 1358181254a7Smrg if (isSomeString!S) 1359181254a7Smrg in 1360181254a7Smrg { 1361181254a7Smrg assert(!str.empty); 1362181254a7Smrg } 1363181254a7Smrg out (result) 1364181254a7Smrg { 1365181254a7Smrg assert(isValidDchar(result)); 1366181254a7Smrg } 1367*b1e83836Smrg do 1368181254a7Smrg { 1369181254a7Smrg if (str[$ - 1] < codeUnitLimit!S) 1370181254a7Smrg { 1371181254a7Smrg numCodeUnits = 1; 1372181254a7Smrg immutable retval = str[$ - 1]; 1373181254a7Smrg str = str[0 .. $ - 1]; 1374181254a7Smrg return retval; 1375181254a7Smrg } 1376*b1e83836Smrg else static if (is(immutable S == immutable C[], C)) 1377181254a7Smrg { 1378181254a7Smrg numCodeUnits = strideBack(str); 1379181254a7Smrg immutable newLength = str.length - numCodeUnits; 1380181254a7Smrg size_t index = newLength; 1381*b1e83836Smrg immutable retval = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index); 1382181254a7Smrg str = str[0 .. 
newLength]; 1383181254a7Smrg return retval; 1384181254a7Smrg } 1385181254a7Smrg } 1386181254a7Smrg 1387181254a7Smrg /++ Ditto +/ 1388181254a7Smrg dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1389181254a7Smrg ref S str, out size_t numCodeUnits) 1390181254a7Smrg if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S 1391181254a7Smrg && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S)) 1392181254a7Smrg in 1393181254a7Smrg { 1394181254a7Smrg assert(!str.empty); 1395181254a7Smrg } 1396181254a7Smrg out (result) 1397181254a7Smrg { 1398181254a7Smrg assert(isValidDchar(result)); 1399181254a7Smrg } 1400*b1e83836Smrg do 1401181254a7Smrg { 1402181254a7Smrg if (str.back < codeUnitLimit!S) 1403181254a7Smrg { 1404181254a7Smrg numCodeUnits = 1; 1405181254a7Smrg immutable retval = str.back; 1406181254a7Smrg str.popBack(); 1407181254a7Smrg return retval; 1408181254a7Smrg } 1409181254a7Smrg else 1410181254a7Smrg { 1411181254a7Smrg numCodeUnits = strideBack(str); 1412181254a7Smrg static if (isRandomAccessRange!S) 1413181254a7Smrg { 1414181254a7Smrg size_t index = str.length - numCodeUnits; 1415181254a7Smrg immutable retval = decodeImpl!(true, useReplacementDchar)(str, index); 1416181254a7Smrg str.popBackExactly(numCodeUnits); 1417181254a7Smrg return retval; 1418181254a7Smrg } 1419181254a7Smrg else 1420181254a7Smrg { 1421181254a7Smrg alias Char = Unqual!(ElementType!S); 1422181254a7Smrg Char[4] codeUnits; 1423181254a7Smrg S tmp = str.save; 1424181254a7Smrg for (size_t i = numCodeUnits; i > 0; ) 1425181254a7Smrg { 1426181254a7Smrg codeUnits[--i] = tmp.back; 1427181254a7Smrg tmp.popBack(); 1428181254a7Smrg } 1429181254a7Smrg const Char[] codePoint = codeUnits[0 .. 
numCodeUnits]; 1430181254a7Smrg size_t index = 0; 1431181254a7Smrg immutable retval = decodeImpl!(true, useReplacementDchar)(codePoint, index); 1432181254a7Smrg str = tmp; 1433181254a7Smrg return retval; 1434181254a7Smrg } 1435181254a7Smrg } 1436181254a7Smrg } 1437181254a7Smrg 1438181254a7Smrg /++ Ditto +/ 1439181254a7Smrg dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str) 1440181254a7Smrg if (isSomeString!S 1441181254a7Smrg || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S)) 1442181254a7Smrg || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S))) 1443181254a7Smrg in 1444181254a7Smrg { 1445181254a7Smrg assert(!str.empty); 1446181254a7Smrg } 1447181254a7Smrg out (result) 1448181254a7Smrg { 1449181254a7Smrg assert(isValidDchar(result)); 1450181254a7Smrg } 1451*b1e83836Smrg do 1452181254a7Smrg { 1453181254a7Smrg size_t numCodeUnits; 1454181254a7Smrg return decodeBack!useReplacementDchar(str, numCodeUnits); 1455181254a7Smrg } 1456181254a7Smrg 1457*b1e83836Smrg /// 1458*b1e83836Smrg @system pure unittest 1459*b1e83836Smrg { 1460*b1e83836Smrg import std.range.primitives; 1461*b1e83836Smrg string str = "Hello, World!"; 1462*b1e83836Smrg 1463*b1e83836Smrg assert(str.decodeBack == '!' && str == "Hello, World"); 1464*b1e83836Smrg str = "å"; 1465*b1e83836Smrg assert(str.decodeBack == 'å' && str.empty); 1466*b1e83836Smrg str = "å"; 1467*b1e83836Smrg size_t i; 1468*b1e83836Smrg assert(str.decodeBack(i) == 'å' && i == 2 && str.empty); 1469*b1e83836Smrg } 1470*b1e83836Smrg 1471*b1e83836Smrg // For the given range, code unit values less than this 1472*b1e83836Smrg // are guaranteed to be valid single-codepoint encodings. 
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    // Compare with qualifiers normalised so that mutable, const and
    // immutable code units all pick the same branch.
    alias Unit = immutable(ElementEncodingType!S);

    static if (is(Unit == immutable char))
        enum char codeUnitLimit = 0x80;
    else static if (is(Unit == immutable wchar))
        enum wchar codeUnitLimit = 0xD800;
    else
        enum dchar codeUnitLimit = 0xD800;
}

/*
 * UTF-8 overload of the decoding work horse (the other two overloads handle
 * wchar and dchar input).
 *
 * When the input can be indexed the function performs its own bounds
 * checking, which lets it report a more useful error when decoding runs past
 * the end of the input. For strings it then works on a raw pointer so that
 * no redundant array bounds checks are made.
 *
 * Params:
 *      canIndex = if S is indexable
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into str; incremented by number of code units processed
 *
 * Returns:
 *      decoded character
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
{
    /* Accepted layouts are the 1 to 4 code unit forms; the 5 and 6 code unit
     * forms are recognised but rejected:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    // Payload masks for sequences of 1, 2, 3 and 4 code units
    // (7, 11, 16 and 21 significant bits respectively).
    alias payloadMask = AliasSeq!(0x7F, 0x7FF, 0xFFFF, 0x1F_FFFF);

    static if (is(S : const char[]))
        auto src = str.ptr + index;     // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto src = str[index .. str.length];
    else
        alias src = str;

    // `canIndex` is a template parameter because
    // https://issues.dlang.org/show_bug.cgi?id=14447 forces it to be computed
    // outside of decodeImpl. The intended value is:
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        immutable remaining = str.length - index;
        ubyte lead = src[0];
    }
    else
    {
        ubyte lead = src.front;
        src.popFront();
    }

    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds an exception that carries the offending code units: the
            // first unit plus up to three following continuation units.
            static UTFException withSequence(Seq)(Seq bad, string msg)
            {
                uint[4] units = void;
                size_t n = 0;

                units[n] = bad[n];
                ++n;
                while (n < bad.length && n < 4 && (bad[n] & 0xC0) == 0x80)
                {
                    units[n] = bad[n];
                    ++n;
                }

                return new UTFException(msg, n).setSequence(units[0 .. n]);
            }
        }

        UTFException malformed()
        {
            static if (canIndex)
                return withSequence(src[0 .. remaining], "Invalid UTF-8 sequence");
            else
            {
                // For plain input ranges the invalid sequence is not included:
                // doing so would require remembering every code unit read (or
                // saving the whole range), a cost paid on every decoded
                // character just to improve the message for the (hopefully)
                // rare invalid input. Strings and sliceable ranges get the
                // sequence because they can simply be re-read.
                return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException truncated()
        {
            static if (canIndex)
                return withSequence(src[0 .. remaining], "Attempted to decode past the end of a string");
            else
                return new UTFException("Attempted to decode past the end of a string");
        }
    }

    // A starter of a multi-unit sequence must have its two top bits set.
    if ((lead & 0xC0) != 0xC0)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw malformed();
    }

    dchar cp = lead;    // the control bits of the starter are masked out later
    lead <<= 1;         // bit 7 of `lead` now tells whether another unit follows

    // Unrolled at compile time: `i` is the position of the continuation unit.
    foreach (i; AliasSeq!(1, 2, 3))
    {
        static if (canIndex)
            immutable exhausted = (i == remaining);
        else
            immutable exhausted = src.empty;

        if (exhausted)
        {
            static if (useReplacementDchar)
            {
                index += i;
                return replacementDchar;
            }
            else
                throw truncated();
        }

        static if (canIndex)
            immutable ubyte trail = src[i];
        else
        {
            immutable ubyte trail = src.front;
            src.popFront();
        }

        // Continuation units look like 10xxxxxx.
        if ((trail & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw malformed();
        }

        cp = (cp << 6) | (trail & 0x3F);
        lead <<= 1;

        if (!(lead & 0x80))     // the starter announced no further units
        {
            cp &= payloadMask[i];   // drop the starter's control bits

            // Overlong: the value would have fitted into `i` code units.
            // Surrogates can only show up in the three unit form (i == 2).
            static if (i == 2)
                immutable rejected = (cp & ~payloadMask[i - 1]) == 0 || !isValidDchar(cp);
            else
                immutable rejected = (cp & ~payloadMask[i - 1]) == 0;

            if (rejected)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw malformed();
            }

            // The four unit form can encode values above dchar.max.
            static if (i == 3)
            {
                if (cp > dchar.max)
                {
                    static if (useReplacementDchar)
                        cp = replacementDchar;
                    else
                        throw malformed();
                }
            }

            index += i + 1;
            return cp;
        }
    }

    // The starter asked for more than four code units.
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw malformed();
}

@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-8 decodeImpl
    // through a minimal, non-indexable input range.

    static struct CharInput
    {
      @safe pure @nogc nothrow:
        this(string data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property char front() { return data[pos]; }
        void popFront() { ++pos; }
        size_t pos;
        string data;
    }

    foreach (bad; invalidUTFstrings!char())
    {
        auto input = CharInput(bad);
        size_t consumed;
        immutable dchar got = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);
        assert(got == replacementDchar);
        assert(1 <= consumed && consumed <= bad.length);
    }
}

// UTF-16 overload: decodes one code point starting at `index` and advances
// `index` by the number of code units consumed. Callers must already have
// handled code units below 0xD800 (see the assert in the body).
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
{
    static if (is(S : const wchar[]))
        auto src = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto src = str[index .. str.length];
    else
        alias src = str;

    // `canIndex` is a template parameter because
    // https://issues.dlang.org/show_bug.cgi?id=14447 forces it to be computed
    // outside of decodeImpl. The intended value is:
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        immutable remaining = str.length - index;
        uint unit = src[0];
    }
    else
    {
        uint unit = src.front;
        src.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Attaches the first code unit to the exception when it can be re-read.
        UTFException fail(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(src[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(unit >= 0xD800);

    if (unit <= 0xDBFF)
    {
        // High surrogate: a low surrogate has to follow.
        static if (canIndex)
            immutable noPartner = (remaining == 1);
        else
            immutable noPartner = src.empty;

        if (noPartner)
        {
            static if (useReplacementDchar)
            {
                ++index;
                return replacementDchar;
            }
            else
                throw fail("surrogate UTF-16 high value past end of string");
        }

        static if (canIndex)
            immutable uint low = src[1];
        else
        {
            immutable uint low = src.front;
            src.popFront();
        }

        if (0xDC00 <= low && low <= 0xDFFF)
        {
            // 0xD7C0 == 0xD800 - 0x40, so the subtraction also adds the
            // 0x10000 offset of the supplementary planes.
            unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
        }
        else
        {
            static if (useReplacementDchar)
                unit = replacementDchar;
            else
                throw fail("surrogate UTF-16 low value out of range");
        }
        ++index;    // accounts for the second code unit of the pair
    }
    else if (0xDC00 <= unit && unit <= 0xDFFF)
    {
        // Low surrogate without a preceding high surrogate.
        static if (useReplacementDchar)
            unit = replacementDchar;
        else
            throw fail("unpaired surrogate UTF-16 value");
    }
    ++index;

    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)

    return cast(dchar) unit;
}

@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-16 decodeImpl
    // through a minimal, non-indexable input range.

    static struct WcharInput
    {
      @safe pure @nogc nothrow:
        this(wstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property wchar front() { return data[pos]; }
        void popFront() { ++pos; }
        size_t pos;
        wstring data;
    }

    foreach (bad; invalidUTFstrings!wchar())
    {
        auto input = WcharInput(bad);
        size_t consumed;
        immutable dchar got = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);
        assert(got == replacementDchar);
        assert(1 <= consumed && consumed <= bad.length);
    }
}

// UTF-32 overload: reads a single code unit, validates it with isValidDchar
// and advances `index` by one. Nothing is consumed when it throws.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    static if (is(S : const dchar[]))
        auto src = str.ptr;
    else
        alias src = str;

    // Arrays and random-access ranges are read in place; any other range is
    // read from the front and popped once the value has been accepted.
    enum readInPlace = is(S : const dchar[]) || isRandomAccessRange!S;

    static if (readInPlace)
        dchar cp = src[index];
    else
        dchar cp = src.front;

    if (!isValidDchar(cp))
    {
        static if (useReplacementDchar)
            cp = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(cp);
    }

    ++index;
    static if (!readInPlace)
        src.popFront();
    return cp;
}

@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-32 decodeImpl
    // through a minimal, non-indexable input range.

    static struct DcharInput
    {
      @safe pure @nogc nothrow:
        this(dstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property dchar front() { return data[pos]; }
        void popFront() { ++pos; }
        size_t pos;
        dstring data;
    }

    foreach (bad; invalidUTFstrings!dchar())
    {
        auto input = DcharInput(bad);
        size_t consumed;
        immutable dchar got = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);
        assert(got == replacementDchar);
        assert(1 <= consumed && consumed <= bad.length);
    }
}


// Test helper: for random-access ranges that are not narrow strings, checks
// that `decode(range, index)` yields `expectedChar`, moves `index` to
// `expectedIndex` and leaves the length of the range unchanged. Failures are
// reported as AssertError at the caller's `line`.
version (StdUnittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : enforce;
    import std.string : format;
    import std.traits : isNarrowString;

    static if (hasLength!R)
        immutable originalLength = range.length;

    static if (isRandomAccessRange!R && !isNarrowString!R)
    {
        immutable decoded = decode(range, index);
        enforce(decoded == expectedChar,
                new AssertError(format("decode: Wrong character: %s", decoded), __FILE__, line));
        enforce(index == expectedIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            enforce(range.length == originalLength,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }
}

// Test helper: checks that `decodeFront` yields `expectedChar`, reports
// `expectedNumCodeUnits` and, for ranges with length, shortens the range by
// exactly that many code units.
version (StdUnittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : enforce;
    import std.string : format;

    static if (hasLength!R)
        immutable originalLength = range.length;

    size_t consumed;
    immutable decoded = decodeFront(range, consumed);
    enforce(decoded == expectedChar,
            new AssertError(format("decodeFront: Wrong character: %s", decoded), __FILE__, line));
    enforce(consumed == expectedNumCodeUnits,
            new AssertError(format("decodeFront: Wrong numCodeUnits: %s", consumed), __FILE__, line));

    static if (hasLength!R)
    {
        enforce(range.length == originalLength - consumed,
                new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line));
    }
}

// Test helper: the `decodeBack` counterpart of testDecodeFront.
version (StdUnittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    // Ranges that are not bidirectional are accepted and ignored, which
    // allows unit testing all `decode` functions together.
    static if (isBidirectionalRange!R)
    {
        import core.exception : AssertError;
        import std.exception : enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable originalLength = range.length;

        size_t consumed;
        immutable decoded = decodeBack(range, consumed);
        enforce(decoded == expectedChar,
                new AssertError(format("decodeBack: Wrong character: %s", decoded), __FILE__, line));
        enforce(consumed == expectedNumCodeUnits,
                new AssertError(format("decodeBack: Wrong numCodeUnits: %s", consumed), __FILE__, line));

        static if (hasLength!R)
        {
            enforce(range.length == originalLength - consumed,
                    new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line));
        }
    }
}

// Test helper: runs testDecode, testDecodeBack (on a saved copy) and
// testDecodeFront against a range holding a single encoded code point.
version (StdUnittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    testDecode(range, 0, expectedChar, expectedIndex, line);

    // The copy is taken before testDecodeFront consumes `range`.
    static if (isBidirectionalRange!R)
    {
        auto backCopy = range.save;
        testDecodeBack(backCopy, expectedChar, expectedIndex, line);
    }

    testDecodeFront(range, expectedChar, expectedIndex, line);
}

version
(StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    // Test helper: checks that decoding `range` at `index` is rejected with a
    // UTFException. For random-access ranges `decode` must leave `index`
    // (and the length, if any) unchanged; when `index` is 0 `decodeFront`
    // must throw as well. Failures are reported as AssertError at `line`.
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The "%s" is required: format() throws a FormatException for
            // arguments that have no format specifier, which would hide the
            // AssertError this check is meant to raise.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}

// Test helper: checks that `decodeBack` rejects `range` with a UTFException
// and, for ranges with length, does not shorten it. Only random-access
// ranges are actually checked.
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // See testBadDecode: the specifier keeps format() from
                // throwing FormatException instead of reporting the failure.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}

// UTF-8: decode, decodeFront and decodeBack over strings and test ranges,
// at run time and in CTFE.
@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        {
            auto range = S("abcd");
            testDecode(range, 0, 'a', 1);
            testDecode(range, 1, 'b', 2);
            testDecodeFront(range, 'a', 1);
            testDecodeFront(range, 'b', 1);
            assert(decodeFront(range) == 'c');
            assert(decodeFront(range) == 'd');
        }

        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 3);
            testDecode(range, 3, 'ェ', 6);
            testDecodeFront(range, 'ウ', 3);
            testDecodeFront(range, 'ェ', 3);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("abcd");
            testDecodeBack(range, 'd', 1);
            testDecodeBack(range, 'c', 1);
            testDecodeBack(range, 'b', 1);
            testDecodeBack(range, 'a', 1);
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 3);
            testDecodeBack(range, 'イ', 3);
            testDecodeBack(range, 'サ', 3);
            testDecodeBack(range, 'ブ', 3);
        }

        testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

        // Truncated, overlong, and 5/6 code unit sequences.
        foreach (str; ["\xE2\x89", // too short
                       "\xC0\x8A",
                       "\xE0\x80\x8A",
                       "\xF0\x80\x80\x8A",
                       "\xF8\x80\x80\x80\x8A",
                       "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(S(str), 0);
            testBadDecode(S(str), 1);
            testBadDecodeBack(S(str));
        }

        // U+FFFE and U+FFFF are accepted and must decode successfully.
        testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        // UTF-8 encodings of surrogate code points (U+D800 .. U+DFFF)
        // must be rejected.
        foreach (str; ["\xED\xA0\x80",
                       "\xED\xAD\xBF",
                       "\xED\xAE\x80",
                       "\xED\xAF\xBF",
                       "\xED\xB0\x80",
                       "\xED\xBE\x80",
                       "\xED\xBF\xBF"])
        {
            testBadDecode(S(str), 0);
            testBadDecodeBack(S(str));
        }
    }
    });
}

// UTF-16: single code units, surrogate pairs and unpaired surrogates.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
                          (wstring s) => new RefBidirCU!wchar(s),
                          (wstring s) => new RefRandomCU!wchar(s)))
    {
        testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    // Mixed pairs and single units, decoded by index and from the back.
    foreach (S; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                      cast(wchar) 0x1400,
                      cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(str, 0, cast(dchar) 0x10000, 2);
        testDecode(str, 2, cast(dchar) 0x1400, 3);
        testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 2);
    }
    });
}

// UTF-32: valid code points, surrogates and out-of-range values.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
                          (dstring s) => new RefBidirCU!dchar(s),
                          (dstring s) => new RefRandomCU!dchar(s)))
    {
        testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        testBadDecode(S([cast(dchar) 0xD800]), 0);
        testBadDecode(S([cast(dchar) 0xDFFE]), 0);
        testBadDecode(S([cast(dchar) 0x110000]), 0);

        testBadDecodeBack(S([cast(dchar) 0xD800]));
        testBadDecodeBack(S([cast(dchar) 0xDFFE]));
        testBadDecodeBack(S([cast(dchar) 0x110000]));

        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    foreach (S; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(str, 0, 0x10000, 1);
        testDecode(str, 1, 0x1400, 2);
        testDecode(str, 2, 0xB9DDE, 3);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 1);
    }
    });
}

// Attribute inference: decode and decodeFront must be @safe, and decode,
// decodeFront and decodeBack must be pure, for every built-in string type.
@safe unittest
{
    import std.exception;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const( char)[],  string,
                          wchar[], const(wchar)[], wstring,
                          dchar[], const(dchar)[], dstring))
    {
        static assert(isSafe!({ S str; size_t i = 0; decode(str, i); }));
        static assert(isSafe!({ S str; size_t i = 0; decodeFront(str, i); }));
        static assert(isSafe!({ S str; decodeFront(str); }));
        static assert((functionAttributes!({ S str; size_t i = 0; decode(str, i); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeFront(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeFront(str); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeBack(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeBack(str); }) & FunctionAttribute.pure_) != 0);
    }
    });
}

// A four code unit sequence whose value lies above U+10FFFF must be rejected.
@safe unittest
{
    import std.exception;
    char[4] val;
    val[0] = 0b1111_0111;
    val[1] = 0b1011_1111;
    val[2] = 0b1011_1111;
    val[3] = 0b1011_1111;
    size_t i = 0;
    assertThrown!UTFException((){ dchar ch = decode(val[], i); }());
}
/* =================== Encode ======================= */

// Shared error path of the encoders: either substitutes replacementDchar or
// throws a UTFException carrying `msg` and the offending code point `c`.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (useReplacementDchar)
        return replacementDchar;
    else
        throw new UTFException(msg).setSequence(c);
}

/++
    Encodes `c` into the static array, `buf`, and returns the actual
length of the encoded character (a number between `1` and `4` for
    `char[4]` buffers, a number between `1` and `2` for `wchar[2]` buffers,
    and always `1` for `dchar[1]` buffers).

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit: plain ASCII.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }
    // Two code units: 110xxxxx 10xxxxxx.
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }
    // Three code units: 1110xxxx 10xxxxxx 10xxxxxx.
    if (c <= 0xFFFF)
    {
        // Surrogates are not encodable: either throw or continue with the
        // replacement character, depending on `useReplacementDchar`.
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    L3:
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }
    // Four code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    if (c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }

    // Beyond U+10FFFF: throw, or fall back to the replacement character,
    // which is emitted through the three-code-unit path above.
    assert(!isValidDchar(c));
    c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    goto L3;
}

///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    char[4] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
    assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto slice = buf[];
    assert(slice.decodeFront == replacementDchar);
}

///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    wchar[2] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
    assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto slice = buf[];
    assert(slice.decodeFront == replacementDchar);
}

///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    dchar[1] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0] == '\u0000');
    assert(encode(buf, '\uD7FF') == 1 && buf[0] == '\uD7FF');
    assert(encode(buf, '\uE000') == 1 && buf[0] == '\uE000');
    assert(encode(buf, '\U0010FFFF') == 1 && buf[0] == '\U0010FFFF');
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[0] == replacementDchar);
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[4] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
    assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF");
    assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800");
    assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF");
    assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF");
    assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF");

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    enum replacementDcharString = "\uFFFD";
    assert(buf[0 .. replacementDcharString.length] == replacementDcharString);
    });
}


/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // One code unit: anything in the BMP except the surrogate range.
    if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    L1:
        buf[0] = cast(wchar) c;
        return 1;
    }
    // Two code units: a high/low surrogate pair.
    if (c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return 2;
    }

    // Beyond U+10FFFF: throw, or fall back to the replacement character,
    // which is emitted through the single-code-unit path above.
    assert(!isValidDchar(c));
    c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    goto L1;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[2] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
    assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE);
    assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF);
    assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}


/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    // UTF-32 stores the code point as-is; only validity needs checking.
    if ((0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    else
        assert(isValidDchar(c));
    buf[0] = c;
    return 1;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[1] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[0] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[0] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[0] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[0] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[0] == '\U0010FFFF');

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}


/++
    Encodes `c` in `str`'s encoding and appends it to `str`.

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
+/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope char[] str, dchar c) @safe pure
{
    if (c <= 0x7F)
    {
        // ASCII: append the single code unit directly.
        assert(isValidDchar(c));
        str ~= cast(char) c;
    }
    else
    {
        // Multi-unit sequences are assembled in `buf`; `L` records how many
        // code units were written so only that prefix is appended.
        char[4] buf;
        uint L;

        if (c <= 0x7FF)
        {
            assert(isValidDchar(c));
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            L = 2;
        }
        else if (c <= 0xFFFF)
        {
            if (0xD800 <= c && c <= 0xDFFF)
                c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

            assert(isValidDchar(c));
        L3:
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            L = 3;
        }
        else if (c <= 0x10FFFF)
        {
            assert(isValidDchar(c));
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            L = 4;
        }
        else
        {
            // Beyond U+10FFFF: throw, or encode the replacement character
            // through the three-code-unit path.
            assert(!isValidDchar(c));
            c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
            goto L3;
        }
        str ~= buf[0 .. L];
    }
}

///
@safe unittest
{
    char[] s = "abcd".dup;
    dchar d1 = 'a';
    dchar d2 = 'ø';

    encode(s, d1);
    assert(s.length == 5);
    assert(s == "abcda");
    encode(s, d2);
    assert(s.length == 7);
    assert(s == "abcdaø");
}

@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
    });
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[] buf;

    encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
    encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
    encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
    encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
    encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
    encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
    encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
    encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
    encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
    encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    enum replacementDcharString = "\uFFFD";
    enum rdcslen = replacementDcharString.length;
    assert(buf[$ - rdcslen .. $] != replacementDcharString);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[$ - rdcslen .. $] == replacementDcharString);
    });
}

/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope wchar[] str, dchar c) @safe pure
{
    if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    L1:
        str ~= cast(wchar) c;
    }
    else if (c <= 0x10FFFF)
    {
        // Supplementary plane: append a high/low surrogate pair.
        wchar[2] buf;

        assert(isValidDchar(c));
        buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        str ~= buf;
    }
    else
    {
        // Beyond U+10FFFF: throw, or append the replacement character
        // through the single-code-unit path.
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
        goto L1;
    }
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
    encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}

/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope dchar[] str, dchar c) @safe pure
{
    // UTF-32 stores the code point as-is; only validity needs checking.
    if ((0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    else
        assert(isValidDchar(c));
    str ~= c;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}


/++
    Returns the number of code units that are required to encode the code point
    `c` when `C` is the character type used to encode it.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 1)
    {
        // UTF-8: 1 to 4 code units depending on the magnitude of `c`.
        if (c <= 0x7F) return 1;
        if (c <= 0x7FF) return 2;
        if (c <= 0xFFFF) return 3;
        if (c <= 0x10FFFF) return 4;
        // Values above U+10FFFF are not handled by this overload.
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: one unit inside the BMP, a surrogate pair otherwise.
        return c <= 0xFFFF ? 1 : 2;
    }
    else
    {
        // UTF-32: always exactly one code unit.
        static assert(C.sizeof == 4);
        return 1;
    }
}

///
@safe pure nothrow @nogc unittest
{
    assert(codeLength!char('a') == 1);
    assert(codeLength!wchar('a') == 1);
    assert(codeLength!dchar('a') == 1);

    assert(codeLength!char('\U0010FFFF') == 4);
    assert(codeLength!wchar('\U0010FFFF') == 2);
    assert(codeLength!dchar('\U0010FFFF') == 1);
}


/++
    Returns the number of code units that are required to encode `input`
    in a string whose character type is `C`. This is particularly useful
    when slicing one string with the length of another and the two string
    types use different character types.
Params:
        C = the character type to get the encoding length for
        input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isSomeFiniteCharInputRange!InputRange)
{
    alias EncType = Unqual!(ElementEncodingType!InputRange);
    // A string that already uses `C` as its code unit type needs no decoding:
    // its length is the answer.
    static if (isSomeString!InputRange && is(EncType == C) && is(typeof(input.length)))
        return input.length;
    else
    {
        // Otherwise decode code point by code point and sum the per-code-point
        // lengths in the target encoding.
        size_t total = 0;

        foreach (c; input.byDchar)
            total += codeLength!C(c);

        return total;
    }
}

///
@safe unittest
{
    assert(codeLength!char("hello world") ==
           "hello world".length);
    assert(codeLength!wchar("hello world") ==
           "hello world"w.length);
    assert(codeLength!dchar("hello world") ==
           "hello world"d.length);

    assert(codeLength!char(`プログラミング`) ==
           `プログラミング`.length);
    assert(codeLength!wchar(`プログラミング`) ==
           `プログラミング`w.length);
    assert(codeLength!dchar(`プログラミング`) ==
           `プログラミング`d.length);

    string haystack = `Être sans la verité, ça, ce ne serait pas bien.`;
    wstring needle = `Être sans la verité`;
    assert(haystack[codeLength!char(needle) .. $] ==
           `, ça, ce ne serait pas bien.`);
}

@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const  char[],  string,
                          wchar[], const wchar[], wstring,
                          dchar[], const dchar[], dstring))
    {
        foreach (C; AliasSeq!(char, wchar, dchar))
        {
            assert(codeLength!C(to!S("Walter Bright")) == to!(C[])("Walter Bright").length);
            assert(codeLength!C(to!S(`言語`)) == to!(C[])(`言語`).length);
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) ==
                   to!(C[])(`ウェブサイト@La_Verité.com`).length);
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) ==
                   to!(C[])(`ウェブサイト@La_Verité.com`).length);
        }
    }
    });
}

/+
Internal helper function:

Returns true if it is safe to search for the Codepoint `c` inside
code units, without decoding.

This is a runtime check that is used as an optimization in various functions,
particularly, in `std.string`.
+/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 1)
        // UTF-8: only ASCII occupies a single code unit.
        return c <= 0x7F;
    else static if (C.sizeof == 2)
        // UTF-16: any BMP value outside the surrogate range 0xD800 .. 0xDFFF.
        return c <= 0xD7FF || (0xE000 <= c && c <= 0xFFFF);
    else static if (C.sizeof == 4)
        // UTF-32: every code point is a single code unit.
        return true;
    else
        static assert(0);
}
@safe unittest
{
    assert( canSearchInCodeUnits! char('a'));
    assert( canSearchInCodeUnits!wchar('a'));
    assert( canSearchInCodeUnits!dchar('a'));
    assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF
    assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
    assert( canSearchInCodeUnits!wchar('ö'));
    assert( canSearchInCodeUnits!dchar('ö'));
    assert(!canSearchInCodeUnits! char('日'));
    assert( canSearchInCodeUnits!wchar('日'));
    assert( canSearchInCodeUnits!dchar('日'));
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
    assert(!canSearchInCodeUnits! char('\U00010001'));
    assert(!canSearchInCodeUnits!wchar('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}

/* =================== Validation ======================= */

/++
    Checks to see if `str` is well-formed unicode or not.

    Throws:
        `UTFException` if `str` is not well-formed.
+/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    immutable len = str.length;
    // `decode` advances `i` past each decoded sequence, so the loop header
    // has no increment of its own.
    for (size_t i = 0; i < len; )
    {
        decode(str, i);
    }
}

///
@safe unittest
{
    import std.exception : assertThrown;
    char[] a = [167, 133, 175];
    assertThrown!UTFException(validate(a));
}

// https://issues.dlang.org/show_bug.cgi?id=12923
@safe unittest
{
    import std.exception;
    assertThrown((){
        char[3]a=[167, 133, 175];
        validate(a[]);
    }());
}

/**
 * Encodes the elements of `s` to UTF-8 and returns a newly allocated
 * string of the elements.
 *
 * Params:
 *     s = the string to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
*/
string toUTF8(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    return toUTFImpl!string(s);
}

///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø is represented by two UTF-8 code units
    assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // U+10437 is four code units in UTF-8
    assert("\U00010437"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}

@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    alias RT = ReferenceInputRange!(ElementType!(string));
    auto r1 = new RT("Hellø");
    auto r2 = new RT("\U00010437");

    assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
    assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}

/**
 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
 * `wstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
*/
wstring toUTF16(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    // All of the work is done by the shared implementation, instantiated
    // for a `wstring` (UTF-16) result.
    return toUTFImpl!wstring(s);
}

///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // these graphemes (U+24B62 and U+10437) are two code units in UTF-16
    // and one in UTF-32
    assert("\U00024B62"d.length == 1);
    assert("\U00010437"d.length == 1);

    assert("\U00024B62"d.toUTF16.equal([0xD852, 0xDF62]));
    assert("\U00010437"d.toUTF16.equal([0xD801, 0xDC37]));
}

@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    alias RT = ReferenceInputRange!(ElementType!(string));
    // Written as escapes so the supplementary-plane characters survive
    // tools that mangle non-BMP text.
    auto r1 = new RT("\U00024B62");
    auto r2 = new RT("\U00010437");

    assert(r1.toUTF16.equal([0xD852, 0xDF62]));
    assert(r2.toUTF16.equal([0xD801, 0xDC37]));
}


/**
 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
 * `dstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
*/
dstring toUTF32(S)(scope S s)
if (isSomeFiniteCharInputRange!S)
{
    // All of the work is done by the shared implementation, instantiated
    // for a `dstring` (UTF-32) result.
    return toUTFImpl!dstring(s);
}

///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // these graphemes (U+24B62 and U+10437) are two code units in UTF-16
    // and one in UTF-32
    assert("\U00024B62"w.length == 2);
    assert("\U00010437"w.length == 2);

    assert("\U00024B62"w.toUTF32.equal([0x00024B62]));
    assert("\U00010437"w.toUTF32.equal([0x00010437]));
}

// Shared implementation behind toUTF8, toUTF16 and toUTF32.
// T is the string type to produce, S the type of the input `s`.
private T toUTFImpl(T, S)(scope S s)
{
    static if (is(S : T))
    {
        // The input already converts implicitly to the target string type:
        // no transcoding is needed, only a fresh immutable copy.
        return s.idup;
    }
    else
    {
        import std.array : appender;
        auto app = appender!T();

        // Reserve room up front whenever the input can report a length.
        static if (is(S == C[], C) || hasLength!S)
            app.reserve(s.length);

        // Transcode lazily to the code unit type of T, collecting as we go.
        foreach (c; s.byUTF!(Unqual!(ElementEncodingType!T)))
            app.put(c);

        return app.data;
    }
}

/* =================== toUTFz ======================= */

/++
    Returns a C-style zero-terminated string equivalent to `str`. `str`
    must not contain embedded `'\0'`'s as any C function will treat the first
    `'\0'` that it sees as the end of the string. If `str.empty` is
    `true`, then a string containing only `'\0'` is returned.

    `toUTFz` accepts any type of string and is templated on the type of
    character pointer that you wish to convert to.
It will avoid allocating a
    new string if it can, but there's a decent chance that it will end up having
    to allocate a new string - particularly when dealing with character types
    other than `char`.

    $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then if
    anything alters the character one past the end of `str` (which is the
    `'\0'` character terminating the string), then the string won't be
    zero-terminated anymore. The most likely scenarios for that are if you
    append to `str` and no reallocation takes place or when `str` is a
    slice of a larger array, and you alter the character in the larger array
    which is one character past the end of `str`. Another case where it could
    occur would be if you had a mutable character array immediately after
    `str` in memory (for example, if they're member variables in a
    user-defined type with one declared right after the other) and that
    character array happened to start with `'\0'`. Such scenarios will never
    occur if you immediately use the zero-terminated string after calling
    `toUTFz` and the C function using it doesn't keep a reference to it.
    Also, they are unlikely to occur even if you save the zero-terminated string
    (the cases above would be among the few examples of where it could happen).
    However, if you save the zero-terminated string and want to be absolutely
    certain that the string stays zero-terminated, then simply append a
    `'\0'` to the string and use its `ptr` property rather than calling
    `toUTFz`.
3132181254a7Smrg 3133181254a7Smrg $(RED Warning 2:) When passing a character pointer to a C function, and the 3134181254a7Smrg C function keeps it around for any reason, make sure that you keep a 3135181254a7Smrg reference to it in your D code. Otherwise, it may go away during a garbage 3136181254a7Smrg collection cycle and cause a nasty bug when the C code tries to use it. 3137181254a7Smrg +/ 3138181254a7Smrg template toUTFz(P) 3139*b1e83836Smrg if (isPointer!P && isSomeChar!(typeof(*P.init))) 3140181254a7Smrg { 3141181254a7Smrg P toUTFz(S)(S str) @safe pure 3142*b1e83836Smrg if (isSomeString!S) 3143181254a7Smrg { 3144181254a7Smrg return toUTFzImpl!(P, S)(str); 3145181254a7Smrg } 3146181254a7Smrg } 3147181254a7Smrg 3148181254a7Smrg /// 3149181254a7Smrg @safe pure unittest 3150181254a7Smrg { 3151181254a7Smrg auto p1 = toUTFz!(char*)("hello world"); 3152181254a7Smrg auto p2 = toUTFz!(const(char)*)("hello world"); 3153181254a7Smrg auto p3 = toUTFz!(immutable(char)*)("hello world"); 3154181254a7Smrg auto p4 = toUTFz!(char*)("hello world"d); 3155181254a7Smrg auto p5 = toUTFz!(const(wchar)*)("hello world"); 3156181254a7Smrg auto p6 = toUTFz!(immutable(dchar)*)("hello world"w); 3157181254a7Smrg } 3158181254a7Smrg 3159*b1e83836Smrg private P toUTFzImpl(P, S)(return scope S str) @safe pure 3160*b1e83836Smrg if (is(immutable typeof(*P.init) == typeof(str[0]))) 3161181254a7Smrg //immutable(C)[] -> C*, const(C)*, or immutable(C)* 3162181254a7Smrg { 3163181254a7Smrg if (str.empty) 3164181254a7Smrg { 3165181254a7Smrg typeof(*P.init)[] retval = ['\0']; 3166181254a7Smrg 3167181254a7Smrg auto trustedPtr() @trusted { return retval.ptr; } 3168181254a7Smrg return trustedPtr(); 3169181254a7Smrg } 3170181254a7Smrg 3171181254a7Smrg alias C = Unqual!(ElementEncodingType!S); 3172181254a7Smrg 3173181254a7Smrg //If the P is mutable, then we have to make a copy. 
3174181254a7Smrg static if (is(Unqual!(typeof(*P.init)) == typeof(*P.init))) 3175181254a7Smrg { 3176181254a7Smrg return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str); 3177181254a7Smrg } 3178181254a7Smrg else 3179181254a7Smrg { 3180181254a7Smrg if (!__ctfe) 3181181254a7Smrg { 3182181254a7Smrg auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; } 3183181254a7Smrg immutable p = trustedPtrAdd(str); 3184181254a7Smrg 3185181254a7Smrg // Peek past end of str, if it's 0, no conversion necessary. 3186181254a7Smrg // Note that the compiler will put a 0 past the end of static 3187181254a7Smrg // strings, and the storage allocator will put a 0 past the end 3188181254a7Smrg // of newly allocated char[]'s. 3189181254a7Smrg // Is p dereferenceable? A simple test: if the p points to an 3190181254a7Smrg // address multiple of 4, then conservatively assume the pointer 3191181254a7Smrg // might be pointing to a new block of memory, which might be 3192181254a7Smrg // unreadable. Otherwise, it's definitely pointing to valid 3193181254a7Smrg // memory. 
3194181254a7Smrg if ((cast(size_t) p & 3) && *p == '\0') 3195181254a7Smrg return &str[0]; 3196181254a7Smrg } 3197181254a7Smrg 3198181254a7Smrg return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str); 3199181254a7Smrg } 3200181254a7Smrg } 3201181254a7Smrg 3202*b1e83836Smrg private P toUTFzImpl(P, S)(return scope S str) @safe pure 3203*b1e83836Smrg if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable)) 3204181254a7Smrg //C[] or const(C)[] -> C*, const(C)*, or immutable(C)* 3205181254a7Smrg { 3206*b1e83836Smrg alias InChar = typeof(str[0]); 3207181254a7Smrg alias OutChar = typeof(*P.init); 3208181254a7Smrg 3209181254a7Smrg //const(C)[] -> const(C)* or 3210181254a7Smrg //C[] -> C* or const(C)* 3211181254a7Smrg static if (( is(const(Unqual!InChar) == InChar) && is(const(Unqual!OutChar) == OutChar)) || 3212181254a7Smrg (!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar))) 3213181254a7Smrg { 3214181254a7Smrg if (!__ctfe) 3215181254a7Smrg { 3216181254a7Smrg auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; } 3217181254a7Smrg auto p = trustedPtrAdd(str); 3218181254a7Smrg 3219181254a7Smrg if ((cast(size_t) p & 3) && *p == '\0') 3220181254a7Smrg return &str[0]; 3221181254a7Smrg } 3222181254a7Smrg 3223181254a7Smrg str ~= '\0'; 3224181254a7Smrg return &str[0]; 3225181254a7Smrg } 3226181254a7Smrg //const(C)[] -> C* or immutable(C)* or 3227181254a7Smrg //C[] -> immutable(C)* 3228181254a7Smrg else 3229181254a7Smrg { 3230181254a7Smrg import std.array : uninitializedArray; 3231181254a7Smrg auto copy = uninitializedArray!(Unqual!OutChar[])(str.length + 1); 3232181254a7Smrg copy[0 .. 
$ - 1] = str[]; 3233181254a7Smrg copy[$ - 1] = '\0'; 3234181254a7Smrg 3235181254a7Smrg auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; } 3236181254a7Smrg return trustedCast(copy); 3237181254a7Smrg } 3238181254a7Smrg } 3239181254a7Smrg 3240181254a7Smrg private P toUTFzImpl(P, S)(S str) @safe pure 3241*b1e83836Smrg if (!is(immutable typeof(*P.init) == immutable typeof(str[0]))) 3242181254a7Smrg //C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)* 3243181254a7Smrg { 3244181254a7Smrg import std.array : appender; 3245181254a7Smrg auto retval = appender!(typeof(*P.init)[])(); 3246181254a7Smrg 3247181254a7Smrg foreach (dchar c; str) 3248181254a7Smrg retval.put(c); 3249181254a7Smrg retval.put('\0'); 3250181254a7Smrg 3251181254a7Smrg return () @trusted { return cast(P) retval.data.ptr; } (); 3252181254a7Smrg } 3253181254a7Smrg 3254181254a7Smrg @safe pure unittest 3255181254a7Smrg { 3256181254a7Smrg import core.exception : AssertError; 3257181254a7Smrg import std.algorithm; 3258181254a7Smrg import std.conv : to; 3259181254a7Smrg import std.exception; 3260181254a7Smrg import std.string : format; 3261181254a7Smrg 3262181254a7Smrg assertCTFEable!( 3263181254a7Smrg { 3264181254a7Smrg foreach (S; AliasSeq!(string, wstring, dstring)) 3265181254a7Smrg { 3266181254a7Smrg alias C = Unqual!(ElementEncodingType!S); 3267181254a7Smrg 3268181254a7Smrg auto s1 = to!S("hello\U00010143\u0100\U00010143"); 3269181254a7Smrg auto temp = new C[](s1.length + 1); 3270181254a7Smrg temp[0 .. $ - 1] = s1[0 .. $]; 3271181254a7Smrg temp[$ - 1] = '\n'; 3272181254a7Smrg --temp.length; 3273181254a7Smrg auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); } 3274181254a7Smrg auto s2 = trustedAssumeUnique(temp); 3275181254a7Smrg assert(s1 == s2); 3276181254a7Smrg 3277181254a7Smrg void trustedCStringAssert(P, S)(S s) @trusted 3278181254a7Smrg { 3279181254a7Smrg auto p = toUTFz!P(s); 3280181254a7Smrg assert(p[0 .. 
s.length] == s); 3281181254a7Smrg assert(p[s.length] == '\0'); 3282181254a7Smrg } 3283181254a7Smrg 3284181254a7Smrg foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*)) 3285181254a7Smrg { 3286181254a7Smrg trustedCStringAssert!P(s1); 3287181254a7Smrg trustedCStringAssert!P(s2); 3288181254a7Smrg } 3289181254a7Smrg } 3290181254a7Smrg }); 3291181254a7Smrg 3292181254a7Smrg static void test(P, S)(S s, size_t line = __LINE__) @trusted 3293181254a7Smrg { 3294181254a7Smrg static size_t zeroLen(C)(const(C)* ptr) @trusted 3295181254a7Smrg { 3296181254a7Smrg size_t len = 0; 3297181254a7Smrg while (*ptr != '\0') { ++ptr; ++len; } 3298181254a7Smrg return len; 3299181254a7Smrg } 3300181254a7Smrg 3301181254a7Smrg auto p = toUTFz!P(s); 3302181254a7Smrg immutable len = zeroLen(p); 3303181254a7Smrg enforce(cmp(s, p[0 .. len]) == 0, 3304181254a7Smrg new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof), 3305181254a7Smrg __FILE__, line)); 3306181254a7Smrg } 3307181254a7Smrg 3308181254a7Smrg assertCTFEable!( 3309181254a7Smrg { 3310181254a7Smrg foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*, 3311181254a7Smrg dchar*, const(dchar)*, immutable(dchar)*)) 3312181254a7Smrg { 3313181254a7Smrg test!P("hello\U00010143\u0100\U00010143"); 3314181254a7Smrg } 3315181254a7Smrg foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*, 3316181254a7Smrg dchar*, const(dchar)*, immutable(dchar)*)) 3317181254a7Smrg { 3318181254a7Smrg test!P("hello\U00010143\u0100\U00010143"w); 3319181254a7Smrg } 3320181254a7Smrg foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*, 3321181254a7Smrg wchar*, const(wchar)*, immutable(wchar)*)) 3322181254a7Smrg { 3323181254a7Smrg test!P("hello\U00010143\u0100\U00010143"d); 3324181254a7Smrg } 3325181254a7Smrg foreach (S; AliasSeq!( char[], const( char)[], 3326181254a7Smrg wchar[], const(wchar)[], 3327181254a7Smrg dchar[], const(dchar)[])) 3328181254a7Smrg { 3329181254a7Smrg auto s = to!S("hello\U00010143\u0100\U00010143"); 
3330181254a7Smrg 3331181254a7Smrg foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*, 3332181254a7Smrg wchar*, const(wchar)*, immutable(wchar)*, 3333181254a7Smrg dchar*, const(dchar)*, immutable(dchar)*)) 3334181254a7Smrg { 3335181254a7Smrg test!P(s); 3336181254a7Smrg } 3337181254a7Smrg } 3338181254a7Smrg }); 3339181254a7Smrg } 3340181254a7Smrg 3341181254a7Smrg 3342181254a7Smrg /++ 3343*b1e83836Smrg `toUTF16z` is a convenience function for `toUTFz!(const(wchar)*)`. 3344181254a7Smrg 3345*b1e83836Smrg Encodes string `s` into UTF-16 and returns the encoded string. 3346*b1e83836Smrg `toUTF16z` is suitable for calling the 'W' functions in the Win32 API 3347*b1e83836Smrg that take an `LPCWSTR` argument. 3348181254a7Smrg +/ 3349181254a7Smrg const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure 3350181254a7Smrg if (isSomeChar!C) 3351181254a7Smrg { 3352181254a7Smrg return toUTFz!(const(wchar)*)(str); 3353181254a7Smrg } 3354181254a7Smrg 3355*b1e83836Smrg /// 3356*b1e83836Smrg @system unittest 3357*b1e83836Smrg { 3358*b1e83836Smrg string str = "Hello, World!"; 3359*b1e83836Smrg const(wchar)* p = str.toUTF16z; 3360*b1e83836Smrg assert(p[str.length] == '\0'); 3361*b1e83836Smrg } 3362*b1e83836Smrg 3363181254a7Smrg @safe pure unittest 3364181254a7Smrg { 3365181254a7Smrg import std.conv : to; 3366181254a7Smrg //toUTFz is already thoroughly tested, so this will just verify that 3367181254a7Smrg //toUTF16z compiles properly for the various string types. 
3368181254a7Smrg foreach (S; AliasSeq!(string, wstring, dstring)) 3369181254a7Smrg assert(toUTF16z(to!S("hello world")) !is null); 3370181254a7Smrg } 3371181254a7Smrg 3372181254a7Smrg 3373181254a7Smrg /* ================================ tests ================================== */ 3374181254a7Smrg 3375181254a7Smrg @safe pure unittest 3376181254a7Smrg { 3377181254a7Smrg import std.exception; 3378181254a7Smrg 3379181254a7Smrg assertCTFEable!( 3380181254a7Smrg { 3381181254a7Smrg assert(toUTF16("hello"c) == "hello"); 3382181254a7Smrg assert(toUTF32("hello"c) == "hello"); 3383181254a7Smrg assert(toUTF8 ("hello"w) == "hello"); 3384181254a7Smrg assert(toUTF32("hello"w) == "hello"); 3385181254a7Smrg assert(toUTF8 ("hello"d) == "hello"); 3386181254a7Smrg assert(toUTF16("hello"d) == "hello"); 3387181254a7Smrg 3388181254a7Smrg assert(toUTF16("hel\u1234o"c) == "hel\u1234o"); 3389181254a7Smrg assert(toUTF32("hel\u1234o"c) == "hel\u1234o"); 3390181254a7Smrg assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o"); 3391181254a7Smrg assert(toUTF32("hel\u1234o"w) == "hel\u1234o"); 3392181254a7Smrg assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o"); 3393181254a7Smrg assert(toUTF16("hel\u1234o"d) == "hel\u1234o"); 3394181254a7Smrg 3395181254a7Smrg assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo"); 3396181254a7Smrg assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo"); 3397181254a7Smrg assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo"); 3398181254a7Smrg assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo"); 3399181254a7Smrg assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo"); 3400181254a7Smrg assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo"); 3401181254a7Smrg }); 3402181254a7Smrg } 3403181254a7Smrg 3404181254a7Smrg 3405181254a7Smrg /++ 3406*b1e83836Smrg Returns the total number of code points encoded in `str`. 3407181254a7Smrg 3408181254a7Smrg Supercedes: This function supercedes $(LREF toUCSindex). 
3409181254a7Smrg 3410181254a7Smrg Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 3411181254a7Smrg 3412181254a7Smrg Throws: 3413*b1e83836Smrg `UTFException` if `str` is not well-formed. 3414181254a7Smrg +/ 3415*b1e83836Smrg size_t count(C)(const(C)[] str) @safe pure nothrow @nogc 3416181254a7Smrg if (isSomeChar!C) 3417181254a7Smrg { 3418*b1e83836Smrg return walkLength(str.byDchar); 3419*b1e83836Smrg } 3420*b1e83836Smrg 3421*b1e83836Smrg /// 3422*b1e83836Smrg @safe pure nothrow @nogc unittest 3423*b1e83836Smrg { 3424*b1e83836Smrg assert(count("") == 0); 3425*b1e83836Smrg assert(count("a") == 1); 3426*b1e83836Smrg assert(count("abc") == 3); 3427*b1e83836Smrg assert(count("\u20AC100") == 4); 3428181254a7Smrg } 3429181254a7Smrg 3430181254a7Smrg @safe pure nothrow @nogc unittest 3431181254a7Smrg { 3432181254a7Smrg import std.exception; 3433181254a7Smrg assertCTFEable!( 3434181254a7Smrg { 3435181254a7Smrg assert(count("") == 0); 3436181254a7Smrg assert(count("a") == 1); 3437181254a7Smrg assert(count("abc") == 3); 3438181254a7Smrg assert(count("\u20AC100") == 4); 3439181254a7Smrg }); 3440181254a7Smrg } 3441181254a7Smrg 3442181254a7Smrg 3443181254a7Smrg // Ranges of code units for testing. 3444*b1e83836Smrg version (StdUnittest) 3445181254a7Smrg { 3446*b1e83836Smrg private: 3447181254a7Smrg struct InputCU(C) 3448181254a7Smrg { 3449181254a7Smrg import std.conv : to; 3450181254a7Smrg @property bool empty() { return _str.empty; } 3451181254a7Smrg @property C front() { return _str[0]; } 3452181254a7Smrg void popFront() { _str = _str[1 .. 
$]; } 3453181254a7Smrg 3454181254a7Smrg this(inout(C)[] str) 3455181254a7Smrg { 3456181254a7Smrg _str = to!(C[])(str); 3457181254a7Smrg } 3458181254a7Smrg 3459181254a7Smrg C[] _str; 3460181254a7Smrg } 3461181254a7Smrg 3462181254a7Smrg struct BidirCU(C) 3463181254a7Smrg { 3464181254a7Smrg import std.conv : to; 3465181254a7Smrg @property bool empty() { return _str.empty; } 3466181254a7Smrg @property C front() { return _str[0]; } 3467181254a7Smrg void popFront() { _str = _str[1 .. $]; } 3468181254a7Smrg @property C back() { return _str[$ - 1]; } 3469181254a7Smrg void popBack() { _str = _str[0 .. $ - 1]; } 3470181254a7Smrg @property auto save() { return BidirCU(_str); } 3471181254a7Smrg @property size_t length() { return _str.length; } 3472181254a7Smrg 3473181254a7Smrg this(inout(C)[] str) 3474181254a7Smrg { 3475181254a7Smrg _str = to!(C[])(str); 3476181254a7Smrg } 3477181254a7Smrg 3478181254a7Smrg C[] _str; 3479181254a7Smrg } 3480181254a7Smrg 3481181254a7Smrg struct RandomCU(C) 3482181254a7Smrg { 3483181254a7Smrg import std.conv : to; 3484181254a7Smrg @property bool empty() { return _str.empty; } 3485181254a7Smrg @property C front() { return _str[0]; } 3486181254a7Smrg void popFront() { _str = _str[1 .. $]; } 3487181254a7Smrg @property C back() { return _str[$ - 1]; } 3488181254a7Smrg void popBack() { _str = _str[0 .. $ - 1]; } 3489181254a7Smrg @property auto save() { return RandomCU(_str); } 3490181254a7Smrg @property size_t length() { return _str.length; } 3491181254a7Smrg C opIndex(size_t i) { return _str[i]; } 3492181254a7Smrg auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. 
j]); } 3493181254a7Smrg 3494181254a7Smrg this(inout(C)[] str) 3495181254a7Smrg { 3496181254a7Smrg _str = to!(C[])(str); 3497181254a7Smrg } 3498181254a7Smrg 3499181254a7Smrg C[] _str; 3500181254a7Smrg } 3501181254a7Smrg 3502181254a7Smrg class RefBidirCU(C) 3503181254a7Smrg { 3504181254a7Smrg import std.conv : to; 3505181254a7Smrg @property bool empty() { return _str.empty; } 3506181254a7Smrg @property C front() { return _str[0]; } 3507181254a7Smrg void popFront() { _str = _str[1 .. $]; } 3508181254a7Smrg @property C back() { return _str[$ - 1]; } 3509181254a7Smrg void popBack() { _str = _str[0 .. $ - 1]; } 3510181254a7Smrg @property auto save() { return new RefBidirCU(_str); } 3511181254a7Smrg @property size_t length() { return _str.length; } 3512181254a7Smrg 3513181254a7Smrg this(inout(C)[] str) 3514181254a7Smrg { 3515181254a7Smrg _str = to!(C[])(str); 3516181254a7Smrg } 3517181254a7Smrg 3518181254a7Smrg C[] _str; 3519181254a7Smrg } 3520181254a7Smrg 3521181254a7Smrg class RefRandomCU(C) 3522181254a7Smrg { 3523181254a7Smrg import std.conv : to; 3524181254a7Smrg @property bool empty() { return _str.empty; } 3525181254a7Smrg @property C front() { return _str[0]; } 3526181254a7Smrg void popFront() { _str = _str[1 .. $]; } 3527181254a7Smrg @property C back() { return _str[$ - 1]; } 3528181254a7Smrg void popBack() { _str = _str[0 .. $ - 1]; } 3529181254a7Smrg @property auto save() { return new RefRandomCU(_str); } 3530181254a7Smrg @property size_t length() { return _str.length; } 3531181254a7Smrg C opIndex(size_t i) { return _str[i]; } 3532181254a7Smrg auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); } 3533181254a7Smrg 3534181254a7Smrg this(inout(C)[] str) 3535181254a7Smrg { 3536181254a7Smrg _str = to!(C[])(str); 3537181254a7Smrg } 3538181254a7Smrg 3539181254a7Smrg C[] _str; 3540181254a7Smrg } 3541181254a7Smrg } 3542181254a7Smrg 3543181254a7Smrg 3544181254a7Smrg /** 3545181254a7Smrg * Inserted in place of invalid UTF sequences. 
*
 * References:
 *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
 */
enum dchar replacementDchar = '\uFFFD';

/********************************************
 * Iterate a range of char, wchar, or dchars by code unit.
 *
 * The purpose is to bypass the special case decoding that
 * $(REF front, std,range,primitives) does to character arrays. As a result,
 * using ranges with `byCodeUnit` can be `nothrow` while
 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
 * sequences.
 *
 * A code unit is a building block of the UTF encodings. Generally, an
 * individual code unit does not represent what's perceived as a full
 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
 * are encoded with multiple code units. For example, the UTF-8 code units for
 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
 * often does not form a character on its own. Attempting to treat it as
 * one while iterating over the resulting range will give nonsensical results.
 *
 * Params:
 *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 *      of characters (including strings) or a type that implicitly converts to a string type.
 * Returns:
 *      If `r` is not an auto-decodable string (i.e. a narrow string or a
 *      user-defined type that implicitly converts to a string type), then `r`
 *      is returned.
*
 *      Otherwise, `r` is converted to its corresponding string type (if it's
 *      not already a string) and wrapped in a random-access range where the
 *      element encoding type of the string (its code unit) is the element type
 *      of the range, and that range returned. The range has slicing.
 *
 *      If `r` is quirky enough to be a struct or class which is an input range
 *      of characters on its own (i.e. it has the input range API as member
 *      functions), $(I and) it's implicitly convertible to a string type, then
 *      `r` is returned, and no implicit conversion takes place.
 *
 *      If `r` is wrapped in a new range, then that range has a `source`
 *      property for returning the string that's currently contained within that
 *      range.
 *
 * See_Also:
 *      Refer to the $(MREF std, uni) docs for a reference on Unicode
 *      terminology.
 *
 *      For a range that iterates by grapheme cluster (written character) see
 *      $(REF byGrapheme, std,uni).
 */
auto byCodeUnit(R)(R r)
if ((isConvertibleToString!R && !isStaticArray!R) ||
    (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
{
    import std.traits : StringTypeOf;
    static if (// This would be cleaner if we had a way to check whether a type
               // was a range without any implicit conversions.
               (isAutodecodableString!R && !__traits(hasMember, R, "empty") &&
                !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront")))
    {
        // Auto-decodable string (or a type converting to one) that does not
        // define the range primitives itself: wrap it so that every range
        // primitive works on code units.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            @property bool empty() const     { return source.length == 0; }
            @property auto ref front() inout { return source[0]; }
            void popFront()                  { source = source[1 .. $]; }

            @property auto save() { return ByCodeUnitImpl(source.save); }

            @property auto ref back() inout  { return source[$ - 1]; }
            void popBack()                   { source = source[0 .. $-1]; }

            auto ref opIndex(size_t index) inout     { return source[index]; }
            auto opSlice(size_t lower, size_t upper) { return ByCodeUnitImpl(source[lower .. upper]); }

            @property size_t length() const { return source.length; }
            alias opDollar = length;

            // The string currently covered by this range.
            StringTypeOf!R source;
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (!isInputRange!R ||
                    (is(R : const dchar[]) && !__traits(hasMember, R, "empty") &&
                    !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront")))
    {
        // Not a range in its own right, or something converting to a dchar
        // array without range members of its own: hand back the string.
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}

///
@safe unittest
{
    import std.range.primitives;
    import std.traits : isAutodecodableString;

    auto r = "Hello, World!".byCodeUnit();
    static assert(hasLength!(typeof(r)));
    static assert(hasSlicing!(typeof(r)));
    static assert(isRandomAccessRange!(typeof(r)));
    static assert(is(ElementType!(typeof(r)) == immutable char));

    // contrast with the range capabilities of standard strings (with or
    // without autodecoding enabled).
    auto s = "Hello, World!";
    static assert(isBidirectionalRange!(typeof(s)));
    static if (isAutodecodableString!(typeof(s)))
    {
        // with autodecoding enabled, strings are non-random-access ranges of
        // dchar.
        static assert(is(ElementType!(typeof(s)) == dchar));
        static assert(!isRandomAccessRange!(typeof(s)));
        static assert(!hasSlicing!(typeof(s)));
        static assert(!hasLength!(typeof(s)));
    }
    else
    {
        // without autodecoding, strings are normal arrays.
        static assert(is(ElementType!(typeof(s)) == immutable char));
        static assert(isRandomAccessRange!(typeof(s)));
        static assert(hasSlicing!(typeof(s)));
        static assert(hasLength!(typeof(s)));
    }
}

/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    string noel1 = "noe\u0308l"; // noël using e + combining diaeresis
    assert(noel1.byCodeUnit[2] != 'ë');
    assert(noel1.byCodeUnit[2] == 'e');

    string noel2 = "no\u00EBl"; // noël using a precomposed ë character
    // Because string is UTF-8, the code unit at index 2 is just
    // the first of a sequence that encodes 'ë'
    assert(noel2.byCodeUnit[2] != 'ë');
}

/// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : popFrontN;
    import std.traits : isAutodecodableString;

    // a wrapped narrow string: `source` yields what is left of the string
    {
        auto units = byCodeUnit("hello world");
        units.popFrontN(3);
        assert(equal(units.save, "lo world"));
        static if (isAutodecodableString!string) // only enabled with autodecoding
        {
            string remaining = units.source;
            assert(remaining == "lo world");
        }
    }
    // source only exists if the range was wrapped
    {
        auto units = byCodeUnit("hello world"d);
        static assert(!__traits(compiles, units.source));
    }
}

@safe pure nothrow @nogc unittest
{
    import std.range;

    // Applying byCodeUnit twice still walks the code units one by one
    {
        enum text = " hello ディラン";
        char[text.length] copied;
        int n;
        foreach (unit; text.byCodeUnit().byCodeUnit())
            copied[n++] = unit;
        assert(copied == text);
    }
    {
        enum text = " hello ディラン"w;
        wchar[text.length] copied;
        int n;
        foreach (unit; text.byCodeUnit().byCodeUnit())
            copied[n++] = unit;
        assert(copied == text);
    }
    {
        enum text = " hello ディラン"d;
        dchar[text.length] copied;
        int n;
        foreach (unit; text.byCodeUnit().byCodeUnit())
            copied[n++] = unit;
        assert(copied == text);
    }

    // length, indexing and slicing
    {
        auto units = "hello".byCodeUnit();
        assert(units.length == 5);
        assert(units[3] == 'l');
        assert(units[2 .. 4][1] == 'l');
    }

    // elements of a mutable array can be assigned through the range
    {
        char[5] storage = "hello";
        auto units = storage[].byCodeUnit();
        units.front = 'H';
        assert(units.front == 'H');
        units[1] = 'E';
        assert(units[1] == 'E');
    }

    // save yields an independent position
    {
        auto units = "hello".byCodeUnit().byCodeUnit();
        static assert(isForwardRange!(typeof(units)));
        static assert(is(typeof(units) == struct) == isAutodecodableString!string);
        auto snapshot = units.save;
        units.popFront();
        assert(snapshot.front == 'h');
    }

    // bidirectional access over string and wstring
    {
        auto units = "hello".byCodeUnit();
        static assert(hasSlicing!(typeof(units)));
        static assert(isBidirectionalRange!(typeof(units)));
        static assert(is(typeof(units) == struct) == isAutodecodableString!string);
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        auto backwards = units.retro;
        assert(backwards.front == 'o');
        backwards.popFront();
        assert(backwards.front == 'l');
    }
    {
        auto units = "κόσμε"w.byCodeUnit();
        static assert(hasSlicing!(typeof(units)));
        static assert(isBidirectionalRange!(typeof(units)));
        static assert(is(typeof(units) == struct) == isAutodecodableString!wstring);
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        auto backwards = units.retro;
        assert(backwards.front == 'ε');
        backwards.popFront();
        assert(backwards.front == 'μ');
    }

    // structs that convert to a string through `alias this`
    {
        static struct Stringish
        {
            string s;
            alias s this;
        }

        auto wrapped = Stringish("\U0010fff8 foo ");
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == struct));
        static assert(!is(typeof(units) == Stringish) == isAutodecodableString!Stringish);
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == immutable char));
        assert(units.front == cast(char) 244);
    }
    {
        static struct WStringish
        {
            wstring s;
            alias s this;
        }

        auto wrapped = WStringish("\U0010fff8 foo "w);
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == struct));
        static assert(!is(typeof(units) == WStringish) == isAutodecodableString!WStringish);
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == immutable wchar));
        assert(units.front == cast(wchar) 56319);
    }
    {
        static struct DStringish
        {
            dstring s;
            alias s this;
        }

        auto wrapped = DStringish("\U0010fff8 foo "d);
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == dstring));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == immutable dchar));
        assert(units.front == cast(dchar) 1114104);
    }
    {
        // here `alias this` goes through a member function, not a field
        static struct FuncStringish
        {
            string str;
            string s() pure nothrow @nogc { return str; }
            alias s this;
        }

        auto wrapped = FuncStringish("\U0010fff8 foo ");
        auto units = wrapped.byCodeUnit();
        static if (isAutodecodableString!FuncStringish)
        {
            static assert(is(typeof(units) == struct));
        }
        else
        {
            static assert(is(typeof(units) == string));
        }
        static assert(!is(typeof(units) == FuncStringish));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == immutable char));
        assert(units.front == cast(char) 244);
    }

    // plain input ranges of characters are handed back with their own type
    {
        static struct Range
        {
            string data;
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto wrapped = Range("\U0010fff8 foo ");
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == Range));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == char));
        assert(units.front == cast(char) 244);
    }
    {
        static struct WRange
        {
            wstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto wrapped = WRange("\U0010fff8 foo "w);
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == WRange));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == wchar));
        assert(units.front == 56319);
    }
    {
        static struct DRange
        {
            dstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto wrapped = DRange("\U0010fff8 foo "d);
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == DRange));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == dchar));
        assert(units.front == 1114104);
    }

    // a type that is a range *and* converts to a string is treated as the range
    {
        static struct RangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            string data;
            string s;
            alias s this;
        }

        auto wrapped = RangeAndStringish("test.d", "other");
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == RangeAndStringish));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == char));
        assert(units.front == 't');
    }
    {
        static struct WRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            wstring data;
            wstring s;
            alias s this;
        }

        auto wrapped = WRangeAndStringish("test.d"w, "other"w);
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == WRangeAndStringish));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == wchar));
        assert(units.front == 't');
    }
    {
        static struct DRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            dstring data;
            dstring s;
            alias s this;
        }

        auto wrapped = DRangeAndStringish("test.d"d, "other"d);
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == DRangeAndStringish));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == dchar));
        assert(units.front == 't');
    }

    // enums whose base type is a string
    {
        enum Enum : string { a = "test.d" }

        auto wrapped = Enum.a;
        auto units = wrapped.byCodeUnit();
        static assert(!is(typeof(units) == Enum));
        static if (isAutodecodableString!Enum)
        {
            static assert(is(typeof(units) == struct));
        }
        else
        {
            static assert(is(typeof(units) == string));
        }
        static assert(is(ElementType!(typeof(units)) == immutable char));
        assert(units.front == 't');
    }
    {
        enum WEnum : wstring { a = "test.d"w }

        auto wrapped = WEnum.a;
        auto units = wrapped.byCodeUnit();
        static assert(!is(typeof(units) == WEnum));
        static if (isAutodecodableString!WEnum)
        {
            static assert(is(typeof(units) == struct));
        }
        else
        {
            static assert(is(typeof(units) == wstring));
        }
        static assert(is(ElementType!(typeof(units)) == immutable wchar));
        assert(units.front == 't');
    }
    {
        enum DEnum : dstring { a = "test.d"d }

        auto wrapped = DEnum.a;
        auto units = wrapped.byCodeUnit();
        static assert(is(typeof(units) == dstring));
        static assert(is(ElementType!(typeof(units)) == immutable dchar));
        assert(units.front == 't');
    }

    // narrow strings are wrapped only while autodecoding is enabled;
    // a dstring is always returned as is
    static if (autodecodeStrings)
    {
        static assert(!is(typeof(byCodeUnit("hello")) == string));
        static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    else
    {
        static assert(is(typeof(byCodeUnit("hello")) == string));
        static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    static assert(is(typeof(byCodeUnit("hello"d)) == dstring));

    // static arrays are rejected, directly or as the base type of an enum
    static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));

    enum SEnum : char[5] { a = "hello" }
    enum WSEnum : wchar[5] { a = "hello"w }
    enum DSEnum : dchar[5] { a = "hello"d }

    static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
}

/****************************
 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * of characters by char, wchar, or dchar.
 * These aliases simply forward to $(LREF byUTF) with the
 * corresponding C argument.
4029181254a7Smrg * 4030181254a7Smrg * Params: 4031181254a7Smrg * r = input range of characters, or array of characters 4032181254a7Smrg */ 4033181254a7Smrg alias byChar = byUTF!char; 4034181254a7Smrg 4035181254a7Smrg /// Ditto 4036181254a7Smrg alias byWchar = byUTF!wchar; 4037181254a7Smrg 4038181254a7Smrg /// Ditto 4039181254a7Smrg alias byDchar = byUTF!dchar; 4040181254a7Smrg 4041181254a7Smrg @safe pure nothrow @nogc unittest 4042181254a7Smrg { 4043181254a7Smrg { 4044181254a7Smrg char[5] s; 4045181254a7Smrg int i; 4046181254a7Smrg foreach (c; "hello".byChar.byChar()) 4047181254a7Smrg { 4048181254a7Smrg //writefln("[%d] '%c'", i, c); 4049181254a7Smrg s[i++] = c; 4050181254a7Smrg } 4051181254a7Smrg assert(s == "hello"); 4052181254a7Smrg } 4053181254a7Smrg { 4054181254a7Smrg char[5+2+3+4+3+3] s; 4055181254a7Smrg int i; 4056181254a7Smrg dchar[10] a; 4057181254a7Smrg a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d; 4058181254a7Smrg a[8] = 0xD800; // invalid 4059181254a7Smrg a[9] = cast(dchar) 0x110000; // invalid 4060181254a7Smrg foreach (c; a[].byChar()) 4061181254a7Smrg { 4062181254a7Smrg //writefln("[%d] '%c'", i, c); 4063181254a7Smrg s[i++] = c; 4064181254a7Smrg } 4065181254a7Smrg assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"); 4066181254a7Smrg } 4067181254a7Smrg { 4068181254a7Smrg auto r = "hello"w.byChar(); 4069181254a7Smrg r.popFront(); 4070181254a7Smrg r.popFront(); 4071181254a7Smrg assert(r.front == 'l'); 4072181254a7Smrg } 4073181254a7Smrg { 4074181254a7Smrg auto r = "hello"d.byChar(); 4075181254a7Smrg r.popFront(); 4076181254a7Smrg r.popFront(); 4077181254a7Smrg assert(r.front == 'l'); 4078181254a7Smrg } 4079181254a7Smrg { 4080181254a7Smrg auto r = "hello"d.byChar(); 4081181254a7Smrg assert(isForwardRange!(typeof(r))); 4082181254a7Smrg auto s = r.save; 4083181254a7Smrg r.popFront(); 4084181254a7Smrg assert(s.front == 'h'); 4085181254a7Smrg } 4086181254a7Smrg } 4087181254a7Smrg 4088181254a7Smrg @safe pure nothrow @nogc unittest 4089181254a7Smrg { 
4090181254a7Smrg { 4091181254a7Smrg wchar[11] s; 4092181254a7Smrg int i; 4093181254a7Smrg dchar[10] a; 4094181254a7Smrg a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d; 4095181254a7Smrg a[8] = 0xD800; // invalid 4096181254a7Smrg a[9] = cast(dchar) 0x110000; // invalid 4097181254a7Smrg foreach (c; a[].byWchar()) 4098181254a7Smrg { 4099181254a7Smrg //writefln("[%d] '%c' x%x", i, c, c); 4100181254a7Smrg s[i++] = c; 4101181254a7Smrg } 4102181254a7Smrg foreach (j, wchar c; "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w) 4103181254a7Smrg { 4104181254a7Smrg //writefln("[%d] '%c' x%x", j, c, c); 4105181254a7Smrg } 4106181254a7Smrg assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w); 4107181254a7Smrg } 4108181254a7Smrg 4109181254a7Smrg { 4110181254a7Smrg auto r = "hello".byWchar(); 4111181254a7Smrg r.popFront(); 4112181254a7Smrg r.popFront(); 4113181254a7Smrg assert(r.front == 'l'); 4114181254a7Smrg } 4115181254a7Smrg { 4116181254a7Smrg auto r = "hello"d.byWchar(); 4117181254a7Smrg r.popFront(); 4118181254a7Smrg r.popFront(); 4119181254a7Smrg assert(r.front == 'l'); 4120181254a7Smrg } 4121181254a7Smrg { 4122181254a7Smrg auto r = "hello"d.byWchar(); 4123181254a7Smrg assert(isForwardRange!(typeof(r))); 4124181254a7Smrg auto s = r.save; 4125181254a7Smrg r.popFront(); 4126181254a7Smrg assert(s.front == 'h'); 4127181254a7Smrg } 4128181254a7Smrg } 4129181254a7Smrg 4130181254a7Smrg @safe pure nothrow @nogc unittest 4131181254a7Smrg { 4132181254a7Smrg { 4133181254a7Smrg dchar[9] s; 4134181254a7Smrg int i; 4135181254a7Smrg string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences 4136181254a7Smrg foreach (c; a.byDchar()) 4137181254a7Smrg { 4138181254a7Smrg s[i++] = c; 4139181254a7Smrg } 4140181254a7Smrg assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d); 4141181254a7Smrg } 4142181254a7Smrg { 4143181254a7Smrg foreach (s; invalidUTFstrings!char()) 4144181254a7Smrg { 4145181254a7Smrg auto r = s.byDchar(); 4146181254a7Smrg assert(!r.empty); 4147181254a7Smrg 
assert(r.front == r.front); 4148181254a7Smrg dchar c = r.front; 4149181254a7Smrg assert(c == replacementDchar); 4150181254a7Smrg } 4151181254a7Smrg } 4152181254a7Smrg { 4153181254a7Smrg auto r = "hello".byDchar(); 4154181254a7Smrg r.popFront(); 4155181254a7Smrg r.popFront(); 4156181254a7Smrg assert(r.front == 'l'); 4157181254a7Smrg } 4158181254a7Smrg 4159181254a7Smrg { 4160181254a7Smrg dchar[8] s; 4161181254a7Smrg int i; 4162181254a7Smrg wstring a = "hello\u07FF\uD7FF\U0010FFFF"w; 4163181254a7Smrg foreach (c; a.byDchar()) 4164181254a7Smrg { 4165181254a7Smrg //writefln("[%d] '%c' x%x", i, c, c); 4166181254a7Smrg s[i++] = c; 4167181254a7Smrg } 4168181254a7Smrg assert(s == "hello\u07FF\uD7FF\U0010FFFF"d); 4169181254a7Smrg } 4170181254a7Smrg { 4171181254a7Smrg foreach (s; invalidUTFstrings!wchar()) 4172181254a7Smrg { 4173181254a7Smrg auto r = s.byDchar(); 4174181254a7Smrg assert(!r.empty); 4175181254a7Smrg assert(r.front == r.front); 4176181254a7Smrg dchar c = r.front; 4177181254a7Smrg assert(c == replacementDchar); 4178181254a7Smrg } 4179181254a7Smrg } 4180181254a7Smrg { 4181181254a7Smrg wchar[2] ws; 4182181254a7Smrg ws[0] = 0xD800; 4183181254a7Smrg ws[1] = 0xDD00; // correct surrogate pair 4184181254a7Smrg auto r = ws[].byDchar(); 4185181254a7Smrg assert(!r.empty); 4186181254a7Smrg assert(r.front == r.front); 4187181254a7Smrg dchar c = r.front; 4188181254a7Smrg assert(c == '\U00010100'); 4189181254a7Smrg } 4190181254a7Smrg { 4191181254a7Smrg auto r = "hello"w.byDchar(); 4192181254a7Smrg r.popFront(); 4193181254a7Smrg r.popFront(); 4194181254a7Smrg assert(r.front == 'l'); 4195181254a7Smrg } 4196181254a7Smrg 4197181254a7Smrg { 4198181254a7Smrg dchar[5] s; 4199181254a7Smrg int i; 4200181254a7Smrg dstring a = "hello"d; 4201181254a7Smrg foreach (c; a.byDchar.byDchar()) 4202181254a7Smrg { 4203181254a7Smrg //writefln("[%d] '%c' x%x", i, c, c); 4204181254a7Smrg s[i++] = c; 4205181254a7Smrg } 4206181254a7Smrg assert(s == "hello"d); 4207181254a7Smrg } 4208181254a7Smrg { 
4209181254a7Smrg auto r = "hello".byDchar(); 4210181254a7Smrg assert(isForwardRange!(typeof(r))); 4211181254a7Smrg auto s = r.save; 4212181254a7Smrg r.popFront(); 4213181254a7Smrg assert(s.front == 'h'); 4214181254a7Smrg } 4215181254a7Smrg { 4216181254a7Smrg auto r = "hello"w.byDchar(); 4217181254a7Smrg assert(isForwardRange!(typeof(r))); 4218181254a7Smrg auto s = r.save; 4219181254a7Smrg r.popFront(); 4220181254a7Smrg assert(s.front == 'h'); 4221181254a7Smrg } 4222181254a7Smrg } 4223181254a7Smrg 4224181254a7Smrg // test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar, 4225181254a7Smrg // which needs to support ranges with and without those attributes 4226181254a7Smrg 4227181254a7Smrg pure @safe nothrow @nogc unittest 4228181254a7Smrg { 4229181254a7Smrg dchar[5] s = "hello"d; 4230181254a7Smrg foreach (c; s[].byChar()) { } 4231181254a7Smrg foreach (c; s[].byWchar()) { } 4232181254a7Smrg foreach (c; s[].byDchar()) { } 4233181254a7Smrg } 4234181254a7Smrg 4235*b1e83836Smrg version (StdUnittest) 4236*b1e83836Smrg private int impureVariable; 4237181254a7Smrg 4238181254a7Smrg @system unittest 4239181254a7Smrg { 4240181254a7Smrg static struct ImpureThrowingSystemRange(Char) 4241181254a7Smrg { 4242181254a7Smrg @property bool empty() const { return true; } 4243181254a7Smrg @property Char front() const { return Char.init; } 4244181254a7Smrg void popFront() 4245181254a7Smrg { 4246181254a7Smrg impureVariable++; 4247181254a7Smrg throw new Exception("only for testing nothrow"); 4248181254a7Smrg } 4249181254a7Smrg } 4250181254a7Smrg 4251181254a7Smrg foreach (Char; AliasSeq!(char, wchar, dchar)) 4252181254a7Smrg { 4253181254a7Smrg ImpureThrowingSystemRange!Char range; 4254181254a7Smrg foreach (c; range.byChar()) { } 4255181254a7Smrg foreach (c; range.byWchar()) { } 4256181254a7Smrg foreach (c; range.byDchar()) { } 4257181254a7Smrg } 4258181254a7Smrg } 4259181254a7Smrg 4260181254a7Smrg /**************************** 4261*b1e83836Smrg * Iterate an $(REF_ALTTEXT input 
range, isInputRange, std,range,primitives)
 * of characters by char type `C` by encoding the elements of the range.
 *
 * UTF sequences that cannot be converted to the specified encoding are either
 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
 * of the Unicode Standard 6.2 or result in a thrown UTFException.
 * Hence byUTF is not symmetric.
 * This algorithm is lazy, and does not allocate memory.
 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
 * `r` parameter.
 *
 * Params:
 *      C = `char`, `wchar`, or `dchar`
 *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
 *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
 *
 * Throws:
 *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
 *
 * GC:
 *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
 *
 * Returns:
 *      A bidirectional range if `R` is a bidirectional range and not auto-decodable,
 *      as defined by $(REF isAutodecodableString, std, traits).
 *
 *      A forward range if `R` is a forward range and not auto-decodable.
 *
 *      Or, if `R` is a range and it is auto-decodable and
 *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
 *      to $(LREF byCodeUnit).
 *
 *      Otherwise, an input range of characters.
4294181254a7Smrg */ 4295*b1e83836Smrg template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar) 4296181254a7Smrg if (isSomeChar!C) 4297181254a7Smrg { 4298*b1e83836Smrg static if (is(immutable C == immutable UC, UC) && !is(C == UC)) 4299*b1e83836Smrg alias byUTF = byUTF!UC; 4300181254a7Smrg else: 4301181254a7Smrg 4302181254a7Smrg auto ref byUTF(R)(R r) 4303181254a7Smrg if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R)) 4304181254a7Smrg { 4305181254a7Smrg return byUTF(r.byCodeUnit()); 4306181254a7Smrg } 4307181254a7Smrg 4308181254a7Smrg auto ref byUTF(R)(R r) 4309181254a7Smrg if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R)) 4310181254a7Smrg { 4311*b1e83836Smrg static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C)) 4312181254a7Smrg { 4313181254a7Smrg return r.byCodeUnit(); 4314181254a7Smrg } 4315*b1e83836Smrg else static if (is(C == dchar)) 4316*b1e83836Smrg { 4317*b1e83836Smrg static struct Result 4318*b1e83836Smrg { 4319*b1e83836Smrg enum Empty = uint.max; // range is empty or just constructed 4320*b1e83836Smrg 4321*b1e83836Smrg this(return scope R r) 4322*b1e83836Smrg { 4323*b1e83836Smrg this.r = r; 4324*b1e83836Smrg } 4325*b1e83836Smrg 4326*b1e83836Smrg this(return scope R r, uint buff) 4327*b1e83836Smrg { 4328*b1e83836Smrg this.r = r; 4329*b1e83836Smrg this.buff = buff; 4330*b1e83836Smrg } 4331*b1e83836Smrg 4332*b1e83836Smrg static if (isBidirectionalRange!R) 4333*b1e83836Smrg { 4334*b1e83836Smrg this(return scope R r, uint frontBuff, uint backBuff) 4335*b1e83836Smrg { 4336*b1e83836Smrg this.r = r; 4337*b1e83836Smrg this.buff = frontBuff; 4338*b1e83836Smrg this.backBuff = backBuff; 4339*b1e83836Smrg } 4340*b1e83836Smrg } 4341*b1e83836Smrg 4342*b1e83836Smrg @property bool empty() 4343*b1e83836Smrg { 4344*b1e83836Smrg static if (isBidirectionalRange!R) 4345*b1e83836Smrg return buff == Empty && backBuff == Empty && r.empty; 4346*b1e83836Smrg 
else 4347*b1e83836Smrg return buff == Empty && r.empty; 4348*b1e83836Smrg } 4349*b1e83836Smrg 4350*b1e83836Smrg @property dchar front() scope // 'scope' required by call to decodeFront() below 4351*b1e83836Smrg { 4352*b1e83836Smrg if (buff == Empty) 4353*b1e83836Smrg { 4354*b1e83836Smrg auto c = r.front; 4355*b1e83836Smrg 4356*b1e83836Smrg static if (is(RC == wchar)) 4357*b1e83836Smrg enum firstMulti = 0xD800; // First high surrogate. 4358*b1e83836Smrg else 4359*b1e83836Smrg enum firstMulti = 0x80; // First non-ASCII. 4360*b1e83836Smrg if (c < firstMulti) 4361*b1e83836Smrg { 4362*b1e83836Smrg r.popFront; 4363*b1e83836Smrg buff = cast(dchar) c; 4364*b1e83836Smrg } 4365*b1e83836Smrg else 4366*b1e83836Smrg { 4367*b1e83836Smrg buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }(); 4368*b1e83836Smrg } 4369*b1e83836Smrg } 4370*b1e83836Smrg return cast(dchar) buff; 4371*b1e83836Smrg } 4372*b1e83836Smrg 4373*b1e83836Smrg void popFront() 4374*b1e83836Smrg { 4375*b1e83836Smrg if (buff == Empty) 4376*b1e83836Smrg front(); 4377*b1e83836Smrg buff = Empty; 4378*b1e83836Smrg } 4379*b1e83836Smrg 4380*b1e83836Smrg static if (isForwardRange!R) 4381*b1e83836Smrg { 4382*b1e83836Smrg @property auto save() 4383*b1e83836Smrg { 4384*b1e83836Smrg static if (isBidirectionalRange!R) 4385*b1e83836Smrg { 4386*b1e83836Smrg return Result(r.save, buff, backBuff); 4387*b1e83836Smrg } 4388*b1e83836Smrg else 4389*b1e83836Smrg { 4390*b1e83836Smrg return Result(r.save, buff); 4391*b1e83836Smrg } 4392*b1e83836Smrg } 4393*b1e83836Smrg } 4394*b1e83836Smrg 4395*b1e83836Smrg static if (isBidirectionalRange!R) 4396*b1e83836Smrg { 4397*b1e83836Smrg @property dchar back() scope // 'scope' required by call to decodeBack() below 4398*b1e83836Smrg { 4399*b1e83836Smrg if (backBuff != Empty) 4400*b1e83836Smrg return cast(dchar) backBuff; 4401*b1e83836Smrg 4402*b1e83836Smrg auto c = r.back; 4403*b1e83836Smrg static if (is(RC == wchar)) 4404*b1e83836Smrg enum firstMulti = 0xD800; // First high 
surrogate. 4405*b1e83836Smrg else 4406*b1e83836Smrg enum firstMulti = 0x80; // First non-ASCII. 4407*b1e83836Smrg if (c < firstMulti) 4408*b1e83836Smrg { 4409*b1e83836Smrg r.popBack; 4410*b1e83836Smrg backBuff = cast(dchar) c; 4411*b1e83836Smrg } 4412*b1e83836Smrg else 4413*b1e83836Smrg { 4414*b1e83836Smrg backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }(); 4415*b1e83836Smrg } 4416*b1e83836Smrg return cast(dchar) backBuff; 4417*b1e83836Smrg 4418*b1e83836Smrg } 4419*b1e83836Smrg 4420*b1e83836Smrg void popBack() 4421*b1e83836Smrg { 4422*b1e83836Smrg if (backBuff == Empty) 4423*b1e83836Smrg back(); 4424*b1e83836Smrg backBuff = Empty; 4425*b1e83836Smrg } 4426*b1e83836Smrg } 4427*b1e83836Smrg 4428*b1e83836Smrg private: 4429*b1e83836Smrg 4430*b1e83836Smrg R r; 4431*b1e83836Smrg uint buff = Empty; // one character lookahead buffer 4432*b1e83836Smrg static if (isBidirectionalRange!R) 4433*b1e83836Smrg uint backBuff = Empty; 4434*b1e83836Smrg } 4435*b1e83836Smrg 4436*b1e83836Smrg return Result(r); 4437*b1e83836Smrg } 4438181254a7Smrg else 4439181254a7Smrg { 4440181254a7Smrg static struct Result 4441181254a7Smrg { 4442*b1e83836Smrg this(return scope R r) 4443*b1e83836Smrg { 4444*b1e83836Smrg this.r = r; 4445*b1e83836Smrg } 4446*b1e83836Smrg 4447*b1e83836Smrg this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf) 4448*b1e83836Smrg { 4449*b1e83836Smrg this.r = r; 4450*b1e83836Smrg this.pos = pos; 4451*b1e83836Smrg this.fill = fill; 4452*b1e83836Smrg this.buf = buf; 4453*b1e83836Smrg } 4454*b1e83836Smrg 4455*b1e83836Smrg static if (isBidirectionalRange!R) 4456*b1e83836Smrg { 4457*b1e83836Smrg this(return scope R r, ushort frontPos, ushort frontFill, 4458*b1e83836Smrg ushort backPos, ushort backFill, C[4 / C.sizeof] buf) 4459*b1e83836Smrg { 4460*b1e83836Smrg this.r = r; 4461*b1e83836Smrg this.pos = frontPos; 4462*b1e83836Smrg this.fill = frontFill; 4463*b1e83836Smrg this.backPos = backPos; 4464*b1e83836Smrg this.backFill = backFill; 
4465*b1e83836Smrg this.buf = buf; 4466*b1e83836Smrg } 4467*b1e83836Smrg } 4468*b1e83836Smrg 4469181254a7Smrg @property bool empty() 4470181254a7Smrg { 4471*b1e83836Smrg static if (isBidirectionalRange!R) 4472*b1e83836Smrg return pos == fill && backPos == backFill && r.empty; 4473*b1e83836Smrg else 4474181254a7Smrg return pos == fill && r.empty; 4475181254a7Smrg } 4476181254a7Smrg 4477181254a7Smrg @property auto front() scope // 'scope' required by call to decodeFront() below 4478181254a7Smrg { 4479181254a7Smrg if (pos == fill) 4480181254a7Smrg { 4481181254a7Smrg pos = 0; 4482181254a7Smrg auto c = r.front; 4483181254a7Smrg 4484*b1e83836Smrg static if (C.sizeof >= 2 && RC.sizeof >= 2) 4485*b1e83836Smrg enum firstMulti = 0xD800; // First high surrogate. 4486*b1e83836Smrg else 4487*b1e83836Smrg enum firstMulti = 0x80; // First non-ASCII. 4488*b1e83836Smrg if (c < firstMulti) 4489181254a7Smrg { 4490181254a7Smrg fill = 1; 4491181254a7Smrg r.popFront; 4492181254a7Smrg buf[pos] = cast(C) c; 4493181254a7Smrg } 4494181254a7Smrg else 4495181254a7Smrg { 4496181254a7Smrg static if (is(RC == dchar)) 4497181254a7Smrg { 4498181254a7Smrg r.popFront; 4499181254a7Smrg dchar dc = c; 4500181254a7Smrg } 4501181254a7Smrg else 4502*b1e83836Smrg dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }(); 4503*b1e83836Smrg fill = cast(ushort) encode!(useReplacementDchar)(buf, dc); 4504181254a7Smrg } 4505181254a7Smrg } 4506181254a7Smrg return buf[pos]; 4507181254a7Smrg } 4508181254a7Smrg 4509181254a7Smrg void popFront() 4510181254a7Smrg { 4511181254a7Smrg if (pos == fill) 4512181254a7Smrg front; 4513181254a7Smrg ++pos; 4514181254a7Smrg } 4515181254a7Smrg 4516181254a7Smrg static if (isForwardRange!R) 4517181254a7Smrg { 4518*b1e83836Smrg @property auto save() 4519181254a7Smrg { 4520*b1e83836Smrg static if (isBidirectionalRange!R) 4521*b1e83836Smrg { 4522*b1e83836Smrg return Result(r.save, pos, fill, backPos, backFill, buf); 4523*b1e83836Smrg } 4524*b1e83836Smrg else 
4525*b1e83836Smrg { 4526*b1e83836Smrg return Result(r.save, pos, fill, buf); 4527*b1e83836Smrg } 4528*b1e83836Smrg } 4529*b1e83836Smrg } 4530*b1e83836Smrg 4531*b1e83836Smrg static if (isBidirectionalRange!R) 4532*b1e83836Smrg { 4533*b1e83836Smrg @property auto back() scope // 'scope' required by call to decodeBack() below 4534*b1e83836Smrg { 4535*b1e83836Smrg if (backPos != backFill) 4536*b1e83836Smrg return buf[cast(ushort) (backFill - backPos - 1)]; 4537*b1e83836Smrg 4538*b1e83836Smrg backPos = 0; 4539*b1e83836Smrg auto c = r.back; 4540*b1e83836Smrg static if (C.sizeof >= 2 && RC.sizeof >= 2) 4541*b1e83836Smrg enum firstMulti = 0xD800; // First high surrogate. 4542*b1e83836Smrg else 4543*b1e83836Smrg enum firstMulti = 0x80; // First non-ASCII. 4544*b1e83836Smrg if (c < firstMulti) 4545*b1e83836Smrg { 4546*b1e83836Smrg backFill = 1; 4547*b1e83836Smrg r.popBack; 4548*b1e83836Smrg buf[cast(ushort) (backFill - backPos - 1)] = cast(C) c; 4549*b1e83836Smrg } 4550*b1e83836Smrg else 4551*b1e83836Smrg { 4552*b1e83836Smrg static if (is(RC == dchar)) 4553*b1e83836Smrg { 4554*b1e83836Smrg r.popBack; 4555*b1e83836Smrg dchar dc = c; 4556*b1e83836Smrg } 4557*b1e83836Smrg else 4558*b1e83836Smrg dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }(); 4559*b1e83836Smrg backFill = cast(ushort) encode!(useReplacementDchar)(buf, dc); 4560*b1e83836Smrg } 4561*b1e83836Smrg return buf[cast(ushort) (backFill - backPos - 1)]; 4562*b1e83836Smrg } 4563*b1e83836Smrg 4564*b1e83836Smrg void popBack() 4565*b1e83836Smrg { 4566*b1e83836Smrg if (backPos == backFill) 4567*b1e83836Smrg back; 4568*b1e83836Smrg ++backPos; 4569181254a7Smrg } 4570181254a7Smrg } 4571181254a7Smrg 4572181254a7Smrg private: 4573181254a7Smrg 4574181254a7Smrg R r; 4575181254a7Smrg ushort pos, fill; 4576*b1e83836Smrg static if (isBidirectionalRange!R) 4577*b1e83836Smrg ushort backPos, backFill; 4578*b1e83836Smrg C[4 / C.sizeof] buf = void; 4579181254a7Smrg } 4580181254a7Smrg 4581181254a7Smrg return Result(r); 
} } } // closes the definition that begins above this span

///
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    // hellö as a range of `char`s, which are UTF-8
    assert("hell\u00F6".byUTF!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
    assert("hell\u00F6".byUTF!wchar().equal(['h', 'e', 'l', 'l', 'ö']));

    // U+10437 (DESERET SMALL LETTER YEE) is four code units in UTF-8,
    // two in UTF-16, and one in UTF-32.
    // It is written as an escape so the literal survives tools that drop
    // characters outside the Basic Multilingual Plane.
    assert("\U00010437".byUTF!char().equal([0xF0, 0x90, 0x90, 0xB7]));
    assert("\U00010437".byUTF!wchar().equal([0xD801, 0xDC37]));
    assert("\U00010437".byUTF!dchar().equal([0x00010437]));
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.exception : assertThrown;

    // With UseReplacementDchar.yes the malformed input is decoded to U+FFFD;
    // with UseReplacementDchar.no iterating the same input throws UTFException.
    assert("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.yes).equal("hello\uFFFDetty"));
    assertThrown!UTFException("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.no).equal("hello betty"));
}

// Backward iteration (back/popBack) over byUTF results for each combination
// of source and target code unit width, plus save() on a partially consumed range.
@safe unittest
{
    {
        // wchar -> char: U+0219 is 0xC8 0x99 in UTF-8, so the trailing
        // code unit 0x99 is seen first when walking from the back.
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!char;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x99);
        r.popBack;
        assert(r.back == 0xc8);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // wchar -> wchar: every element is passed through unchanged.
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!wchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // wchar -> dchar: one code point per BMP code unit.
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!dchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // dchar -> wchar: U+10437 is the surrogate pair D801 DC37 and
        // U+1F601 is D83D DE01; from the back the low surrogate comes first.
        dchar[] s = ['\U00010437', '\U0001F601'];
        auto r = s.byUTF!wchar;
        assert(r.back == 0xde01);
        r.popBack;
        assert(r.back == 0xd83d);
        r.popBack;
        assert(r.back == 0xdc37);
        r.popBack;
        assert(r.back == 0xd801);
    }

    {
        // dchar -> char: U+10437 is F0 90 90 B7 and U+1F601 is F0 9F 98 81;
        // draining from the back yields those eight bytes in reverse order.
        dchar[] s = ['\U00010437', '\U0001F601'];
        auto r = s.byUTF!char;
        char[] res;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        import std.algorithm.comparison : equal;
        assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
    }

    {
        // ASCII-only input drained from the back comes out reversed.
        dchar[] res;
        auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        import std.algorithm.comparison : equal;
        assert(res.equal(['e', 'd', 'c', 'b', 'a']));
    }

    {
        //testing the save() function
        wchar[] s = ['Ă','ț'];

        // A copy taken after one popBack must report the same back element
        // as the range it was saved from.
        auto rc = s.byUTF!char;
        rc.popBack;
        auto rcCopy = rc.save;
        assert(rc.back == rcCopy.back);
        assert(rcCopy.back == 0xc8);

        auto rd = s.byUTF!dchar;
        rd.popBack;
        auto rdCopy = rd.save;
        assert(rd.back == rdCopy.back);
        assert(rdCopy.back == 'Ă');
    }
}

///
@safe pure nothrow unittest
{
    import std.range.primitives;
    wchar[] s = ['ă', 'î'];

    // As UTF-8, 'ă' is 0xC4 0x83 and 'î' is 0xC3 0xAE; walking from the
    // back yields the code units in reverse order.
    auto rc = s.byUTF!char;
    static assert(isBidirectionalRange!(typeof(rc)));
    assert(rc.back == 0xae);
    rc.popBack;
    assert(rc.back == 0xc3);
    rc.popBack;
    assert(rc.back == 0x83);
    rc.popBack;
    assert(rc.back == 0xc4);

    // As UTF-16 each character is a single code unit.
    auto rw = s.byUTF!wchar;
    static assert(isBidirectionalRange!(typeof(rw)));
    assert(rw.back == 'î');
    rw.popBack;
    assert(rw.back == 'ă');

    // As UTF-32 each character is a single code point.
    auto rd = s.byUTF!dchar;
    static assert(isBidirectionalRange!(typeof(rd)));
    assert(rd.back == 'î');
    rd.popBack;
    assert(rd.back == 'ă');
}