xref: /netbsd-src/external/gpl3/gcc/dist/libphobos/src/std/utf.d (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1181254a7Smrg // Written in the D programming language.
2181254a7Smrg 
3181254a7Smrg /++
4181254a7Smrg     Encode and decode UTF-8, UTF-16 and UTF-32 strings.
5181254a7Smrg 
6181254a7Smrg     UTF character support is restricted to
7181254a7Smrg     $(D '\u0000' <= character <= '\U0010FFFF').
8181254a7Smrg 
9181254a7Smrg $(SCRIPT inhibitQuickIndex = 1;)
10*b1e83836Smrg $(DIVC quickindex,
11181254a7Smrg $(BOOKTABLE,
12181254a7Smrg $(TR $(TH Category) $(TH Functions))
13181254a7Smrg $(TR $(TD Decode) $(TD
14181254a7Smrg     $(LREF decode)
15181254a7Smrg     $(LREF decodeFront)
16181254a7Smrg ))
17181254a7Smrg $(TR $(TD Lazy decode) $(TD
18181254a7Smrg     $(LREF byCodeUnit)
19181254a7Smrg     $(LREF byChar)
20181254a7Smrg     $(LREF byWchar)
21181254a7Smrg     $(LREF byDchar)
22181254a7Smrg     $(LREF byUTF)
23181254a7Smrg ))
24181254a7Smrg $(TR $(TD Encode) $(TD
25181254a7Smrg     $(LREF encode)
26181254a7Smrg     $(LREF toUTF8)
27181254a7Smrg     $(LREF toUTF16)
28181254a7Smrg     $(LREF toUTF32)
29181254a7Smrg     $(LREF toUTFz)
30181254a7Smrg     $(LREF toUTF16z)
31181254a7Smrg ))
32181254a7Smrg $(TR $(TD Length) $(TD
33181254a7Smrg     $(LREF codeLength)
34181254a7Smrg     $(LREF count)
35181254a7Smrg     $(LREF stride)
36181254a7Smrg     $(LREF strideBack)
37181254a7Smrg ))
38181254a7Smrg $(TR $(TD Index) $(TD
39181254a7Smrg     $(LREF toUCSindex)
40181254a7Smrg     $(LREF toUTFindex)
41181254a7Smrg ))
42181254a7Smrg $(TR $(TD Validation) $(TD
43181254a7Smrg     $(LREF isValidDchar)
44*b1e83836Smrg     $(LREF isValidCodepoint)
45181254a7Smrg     $(LREF validate)
46181254a7Smrg ))
47181254a7Smrg $(TR $(TD Miscellaneous) $(TD
48181254a7Smrg     $(LREF replacementDchar)
49181254a7Smrg     $(LREF UseReplacementDchar)
50181254a7Smrg     $(LREF UTFException)
51181254a7Smrg ))
52*b1e83836Smrg ))
53181254a7Smrg     See_Also:
54181254a7Smrg         $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
55181254a7Smrg         $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
56181254a7Smrg         $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
57*b1e83836Smrg     Copyright: Copyright The D Language Foundation 2000 - 2012.
58181254a7Smrg     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
59*b1e83836Smrg     Authors:   $(HTTP digitalmars.com, Walter Bright) and
60*b1e83836Smrg                $(HTTP jmdavisprog.com, Jonathan M Davis)
61*b1e83836Smrg     Source:    $(PHOBOSSRC std/utf.d)
62181254a7Smrg    +/
63181254a7Smrg module std.utf;
64181254a7Smrg 
65*b1e83836Smrg import std.exception : basicExceptionCtors;
66*b1e83836Smrg import core.exception : UnicodeException;
67*b1e83836Smrg import std.meta : AliasSeq;
68*b1e83836Smrg import std.range;
69*b1e83836Smrg import std.traits : isAutodecodableString, isConvertibleToString, isPointer,
70*b1e83836Smrg     isSomeChar, isSomeString, isStaticArray, Unqual;
71*b1e83836Smrg import std.typecons : Flag, Yes, No;
72181254a7Smrg 
73181254a7Smrg 
74181254a7Smrg /++
75181254a7Smrg     Exception thrown on errors in std.utf functions.
76181254a7Smrg   +/
class UTFException : UnicodeException
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    // Code units of the offending sequence; only the first `len` are meaningful.
    uint[4] sequence;
    // Number of entries of `sequence` that have been filled in by setSequence.
    size_t  len;

    // Stores up to four code units of the invalid sequence on this exception
    // and returns `this` so the call can be chained onto construction.
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...) return
    {
        assert(data.length <= 4);

        // Clamp as well as assert, so the copy below cannot overrun
        // `sequence` when asserts are compiled out.
        if (data.length < 4)
            len = data.length;
        else
            len = 4;
        sequence[0 .. len] = data[0 .. len];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once
    // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed

    /**
    Standard exception constructors.
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        // No position is known for this overload; index 0 is passed to the base.
        super(msg, 0, file, line, next);
    }
    /// ditto
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        // The index is rendered into a stack buffer and appended to the message.
        UnsignedStringBuf scratch = void;
        msg ~= " (at index " ~ unsignedToTempString(index, scratch) ~ ")";
        super(msg, index, file, line, next);
    }

    /**
    Returns:
        A `string` detailing the invalid UTF sequence.
     */
    override string toString() const
    {
        if (len != 0)
        {
            string text = "Invalid UTF sequence:";

            // Each stored unit is printed in hex, padded to at least two
            // digits and followed by an 'x'.
            foreach (unit; sequence[0 .. len])
            {
                UnsignedStringBuf scratch = void;
                auto hex = unsignedToTempString!16(unit, scratch);
                text ~= ' ';
                if (hex.length == 1)
                    text ~= '0';
                text ~= hex;
                text ~= 'x';
            }

            if (super.msg.length > 0)
            {
                text ~= " - ";
                text ~= super.msg;
            }

            return text;
        }

        /* No sequence recorded: defer to the inherited formatting.
         * Exception.toString() is not marked as const, although
         * it is const-compatible, hence the trusted cast.
         */
        auto base = () @trusted { return cast(Exception) super; } ();
        return base.toString();
    }
}
153181254a7Smrg 
///
@safe unittest
{
    import std.exception : assertThrown;

    char[4] buf;
    // Both ends of the high- and low-surrogate ranges, then the first value
    // beyond U+10FFFF: encode must reject every one of them.
    foreach (dchar invalid; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                             cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                             cast(dchar) 0x110000])
    {
        assertThrown!UTFException(encode(buf, invalid));
    }
}
166*b1e83836Smrg 
/*
   Supplies a fixed table of invalidly encoded UTF strings for the requested
   code unit type. Useful as input when testing decoders and validators.

   Params:
        Char = char, wchar, or dchar (unqualified; any other type hits the
               final static assert)

   Returns:
        the invalid strings: a slice of a static table for char and wchar,
        and the static array itself (by value) for dchar
 */

package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    static if (is(Char == dchar))
    {
        static immutable dstring[3] result =
        [
            [ cast(dchar) 0x110000 ],
            [ cast(dchar) 0x00D800 ],
            [ cast(dchar) 0x00DFFF ],
        ];

        // Unlike the other branches this hands back the static array, not a slice.
        return result;
    }
    else static if (is(Char == wchar))
    {
        static immutable wstring[5] result =
        [
            [ cast(wchar) 0xDC00 ],
            [ cast(wchar) 0xDFFF ],
            [ cast(wchar) 0xDBFF, cast(wchar) 0xDBFF ],
            [ cast(wchar) 0xDBFF, cast(wchar) 0xE000 ],
            [ cast(wchar) 0xD800 ],
        ];

        return result[];
    }
    else static if (is(Char == char))
    {
        enum surrogate = 0xDC00;     // invalid surrogate value
        enum beyond = 0x110000;      // out of range

        static immutable string[8] result =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong
            // `surrogate` laid out in the 3-byte pattern
            [
              0xE0 | (surrogate >> 12),
              0x80 | ((surrogate >> 6) & 0x3F),
              0x80 | (surrogate & 0x3F)
            ],
            // `beyond` laid out in the 4-byte pattern
            [
              cast(char)(0xF0 | (beyond >> 18)),
              cast(char)(0x80 | ((beyond >> 12) & 0x3F)),
              cast(char)(0x80 | ((beyond >> 6) & 0x3F)),
              cast(char)(0x80 | (beyond & 0x3F))
            ],
            // 5 byte encoding
            [
              cast(char)(0xF8 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
            // 6 byte encoding
            [
              cast(char)(0xFC | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
        ];

        return result[];
    }
    else
        static assert(0);
}
260181254a7Smrg 
261181254a7Smrg /++
262181254a7Smrg     Check whether the given Unicode code point is valid.
263181254a7Smrg 
264181254a7Smrg     Params:
265181254a7Smrg         c = code point to check
266181254a7Smrg 
267181254a7Smrg     Returns:
268*b1e83836Smrg         `true` if and only if `c` is a valid Unicode code point
269181254a7Smrg 
270181254a7Smrg     Note:
271*b1e83836Smrg     `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
272181254a7Smrg     as they are permitted for internal use by an application, but they are
273181254a7Smrg     not allowed for interchange by the Unicode standard.
274181254a7Smrg   +/
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Anything above U+10FFFF is rejected outright.
    if (c > 0x10FFFF)
        return false;

    // Within range, only the surrogate block U+D800 .. U+DFFF is rejected.
    return !(c >= 0xD800 && c <= 0xDFFF);
}
279181254a7Smrg 
///
@safe @nogc pure nothrow unittest
{
    // Ordinary code points, including NUL, are accepted.
    immutable dchar letter = cast(dchar) 0x41;
    immutable dchar nul = cast(dchar) 0x00;
    assert(isValidDchar(letter));
    assert(isValidDchar(nul));

    // A surrogate and a value beyond U+10FFFF are rejected.
    immutable dchar surrogate = cast(dchar) 0xD800;
    immutable dchar tooLarge = cast(dchar) 0x11FFFF;
    assert(!isValidDchar(surrogate));
    assert(!isValidDchar(tooLarge));
}
288*b1e83836Smrg 
pure nothrow @safe @nogc unittest
{
    import std.exception;

    assertCTFEable!(
    {
    // Accepted: a plain letter, U+FFFE / U+FFFF, and values up to and
    // including U+10FFFF.
    static immutable dchar[5] accepted =
    [
        cast(dchar)'a',
        cast(dchar) 0x00FFFE,
        cast(dchar) 0x00FFFF,
        cast(dchar) 0x01FFFF,
        cast(dchar) 0x10FFFF,
    ];
    // Rejected: both ends of each surrogate range and values past U+10FFFF.
    static immutable dchar[6] rejected =
    [
        cast(dchar) 0x1FFFFF,
        cast(dchar) 0x00D800,
        cast(dchar) 0x00DBFF,
        cast(dchar) 0x00DC00,
        cast(dchar) 0x00DFFF,
        cast(dchar) 0x110000,
    ];

    foreach (cp; accepted)
        assert(isValidDchar(cp));
    foreach (cp; rejected)
        assert(!isValidDchar(cp));
    });
}
309181254a7Smrg 
310*b1e83836Smrg /**
311*b1e83836Smrg Checks if a single character forms a valid code point.
312181254a7Smrg 
313*b1e83836Smrg When standing alone, some characters are invalid code points. For
314*b1e83836Smrg example the `wchar` `0xD800` is a so called high surrogate, which can
315*b1e83836Smrg only be interpreted together with a low surrogate following it. As a
316*b1e83836Smrg standalone character it is considered invalid.
317*b1e83836Smrg 
318*b1e83836Smrg See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
319*b1e83836Smrg Unicode Standard, D90, D91 and D92) for more details.
320181254a7Smrg 
321181254a7Smrg Params:
322*b1e83836Smrg     c = character to test
323*b1e83836Smrg     Char = character type of `c`
324*b1e83836Smrg 
325*b1e83836Smrg Returns:
326*b1e83836Smrg     `true`, if `c` forms a valid code point.
327*b1e83836Smrg  */
bool isValidCodepoint(Char)(Char c)
if (isSomeChar!Char)
{
    // Qualifiers (const/immutable) are stripped before dispatching on the type.
    alias Plain = Unqual!Char;

    static if (is(Plain == dchar))
    {
        // A UTF-32 code unit is the code point itself.
        return isValidDchar(c);
    }
    else static if (is(Plain == wchar))
    {
        // Any UTF-16 unit outside the surrogate block 0xD800 .. 0xDFFF stands alone.
        return !(c >= 0xD800 && c <= 0xDFFF);
    }
    else static if (is(Plain == char))
    {
        // Only values up to 0x7F are accepted for a single UTF-8 unit.
        return c < 0x80;
    }
    else
        static assert(false, "unknown character type: `" ~ Char.stringof ~ "`");
}
347*b1e83836Smrg 
///
@safe pure nothrow unittest
{
    // char: 0x40 stands alone, 0x80 does not
    char ascii = cast(char) 0x40;
    char highBit = cast(char) 0x80;
    assert( isValidCodepoint(ascii));
    assert(!isValidCodepoint(highBit));

    // wchar: a surrogate is not a code point on its own
    wchar ordinary = cast(wchar) 0x1234;
    wchar surrogate = cast(wchar) 0xD800;
    assert( isValidCodepoint(ordinary));
    assert(!isValidCodepoint(surrogate));

    // dchar: U+10FFFF is accepted, 0x12345678 is not
    dchar last = cast(dchar) 0x0010FFFF;
    dchar tooLarge = cast(dchar) 0x12345678;
    assert( isValidCodepoint(last));
    assert(!isValidCodepoint(tooLarge));
}
358*b1e83836Smrg 
359*b1e83836Smrg /++
360*b1e83836Smrg     Calculate the length of the UTF sequence starting at `index`
361*b1e83836Smrg     in `str`.
362*b1e83836Smrg 
363*b1e83836Smrg     Params:
364*b1e83836Smrg         str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
365*b1e83836Smrg         of UTF code units. Must be random access if `index` is passed
366*b1e83836Smrg         index = starting index of UTF sequence (default: `0`)
367181254a7Smrg 
368181254a7Smrg     Returns:
369181254a7Smrg         The number of code units in the UTF sequence. For UTF-8, this is a
370181254a7Smrg         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
371181254a7Smrg         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
372181254a7Smrg 
373181254a7Smrg     Throws:
374*b1e83836Smrg         May throw a `UTFException` if `str[index]` is not the start of a
375181254a7Smrg         valid UTF sequence.
376181254a7Smrg 
377181254a7Smrg     Note:
378*b1e83836Smrg         `stride` will only analyze the first `str[index]` element. It
379181254a7Smrg         will not fully verify the validity of the UTF sequence, nor even verify
380181254a7Smrg         the presence of the sequence: it will not actually guarantee that
381181254a7Smrg         $(D index + stride(str, index) <= str.length).
382181254a7Smrg   +/
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // The bounds check is only possible when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    immutable lead = str[index];

    // A unit below 0x80 is a whole sequence; anything else is classified by
    // strideImpl from this one unit alone.
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
396181254a7Smrg 
/// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(immutable ElementType!S == immutable char)))
{
    // Arrays are indexed directly; other ranges are read through `front`.
    static if (is(S : const char[]))
        immutable lead = str[0];
    else
        immutable lead = str.front;

    // A unit below 0x80 is a whole sequence; anything else goes to strideImpl,
    // with 0 as the index it reports on failure.
    return lead < 0x80 ? 1 : strideImpl(lead, 0);
}
412181254a7Smrg 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks that every UTF-8 stride overload reports codeLength!char(c) for
    // the sequence starting at s[i], and that by-reference ranges keep their
    // length across the call.
    static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!char(c);

        // Only evaluated (lazily, by enforce) when a check actually fails.
        AssertError failure(string what)
        {
            return new AssertError(format(what, s), __FILE__, line);
        }

        enforce(stride(s, i) == expected,
                failure("Unit test failure string: %s"));

        enforce(stride(RandomCU!char(s), i) == expected,
                failure("Unit test failure range: %s"));

        auto randomRef = new RefRandomCU!char(s);
        immutable lengthBefore = randomRef.length;
        enforce(stride(randomRef, i) == expected,
                failure("Unit test failure rand ref range: %s"));
        enforce(randomRef.length == lengthBefore,
                failure("Unit test failure rand ref range length: %s"));

        // The index-free overloads can only be compared at position 0.
        if (i != 0)
            return;

        enforce(stride(s) == expected,
                failure("Unit test failure string 0: %s"));

        enforce(stride(InputCU!char(s)) == expected,
                failure("Unit test failure range 0: %s"));

        auto bidirRef = new RefBidirCU!char(s);
        immutable bidirBefore = bidirRef.length;
        enforce(stride(bidirRef) == expected,
                failure("Unit test failure bidir ref range code length: %s"));
        enforce(bidirRef.length == bidirBefore,
                failure("Unit test failure bidir ref range length: %s"));
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');

    // In UTF-8 the mixed string has a 4-unit sequence at 5, a 2-unit one at 9
    // and another 4-unit one at 11.
    enum string mixed = "hello\U00010143\u0100\U00010143";
    test(mixed, 'h', 0);
    test(mixed, 'e', 1);
    test(mixed, 'l', 2);
    test(mixed, 'l', 3);
    test(mixed, 'o', 4);
    test(mixed, '\U00010143', 5);
    test(mixed, '\u0100', 9);
    test(mixed, '\U00010143', 11);

    // Both overloads must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ stride(str, 0); }));
        static assert(isSafe!({ stride(str);    }));
        static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
480181254a7Smrg 
@safe unittest // invalid start bytes
{
    import std.exception : assertThrown;

    // Bytes that cannot begin a UTF-8 sequence: the 5-, 6-, 7- and 8-byte
    // length markers, then a bare continuation byte.
    immutable char[] badLeads = [0xF8, 0xFC, 0xFE, 0xFF, 0x80];

    foreach (lead; badLeads)
    {
        assertThrown!UTFException(stride([lead]));
    }
}
494181254a7Smrg 
/// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The bounds check is only possible when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    immutable uint unit = str[index];

    // A unit in 0xD800 .. 0xDBFF yields 2; every other unit yields 1.
    immutable bool inLeadRange = unit >= 0xD800 && unit <= 0xDBFF;
    return inLeadRange ? 2u : 1u;
}
505181254a7Smrg 
/// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Arrays simply reuse the indexed overload at the first code unit.
    immutable size_t firstUnit = 0;
    return stride(str, firstUnit);
}
512181254a7Smrg 
/// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
    !is(S : const wchar[]))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    immutable uint unit = str.front;

    // A unit in 0xD800 .. 0xDBFF yields 2; every other unit yields 1.
    immutable bool inLeadRange = unit >= 0xD800 && unit <= 0xDBFF;
    return inLeadRange ? 2u : 1u;
}
522181254a7Smrg 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks that every UTF-16 stride overload reports codeLength!wchar(c) for
    // the sequence starting at s[i], and that by-reference ranges keep their
    // length across the call.
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!wchar(c);

        // Only evaluated (lazily, by enforce) when a check actually fails.
        AssertError failure(string what)
        {
            return new AssertError(format(what, s), __FILE__, line);
        }

        enforce(stride(s, i) == expected,
                failure("Unit test failure string: %s"));

        enforce(stride(RandomCU!wchar(s), i) == expected,
                failure("Unit test failure range: %s"));

        auto randomRef = new RefRandomCU!wchar(s);
        immutable lengthBefore = randomRef.length;
        enforce(stride(randomRef, i) == expected,
                failure("Unit test failure rand ref range: %s"));
        enforce(randomRef.length == lengthBefore,
                failure("Unit test failure rand ref range length: %s"));

        // The index-free overloads can only be compared at position 0.
        if (i != 0)
            return;

        enforce(stride(s) == expected,
                failure("Unit test failure string 0: %s"));

        enforce(stride(InputCU!wchar(s)) == expected,
                failure("Unit test failure range 0: %s"));

        auto bidirRef = new RefBidirCU!wchar(s);
        immutable bidirBefore = bidirRef.length;
        enforce(stride(bidirRef) == expected,
                failure("Unit test failure bidir ref range code length: %s"));
        enforce(bidirRef.length == bidirBefore,
                failure("Unit test failure bidir ref range length: %s"));
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');

    // In UTF-16 the mixed string has a surrogate pair at 5, a single unit at 7
    // and another pair at 8.
    enum wstring mixed = "hello\U00010143\u0100\U00010143";
    test(mixed, 'h', 0);
    test(mixed, 'e', 1);
    test(mixed, 'l', 2);
    test(mixed, 'l', 3);
    test(mixed, 'o', 4);
    test(mixed, '\U00010143', 5);
    test(mixed, '\u0100', 7);
    test(mixed, '\U00010143', 8);

    // Both overloads must be @safe and pure for every array flavour.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
590181254a7Smrg 
/// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // This overload always reports a stride of one code unit.
    enum uint unitsPerCodePoint = 1;

    // Only the precondition differs: an index check when a length is
    // available, otherwise a non-empty check.
    static if (is(typeof(str.length) : ulong))
    {
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    }
    else
    {
        assert(!str.empty, "UTF-32 sequence is empty.");
    }

    return unitsPerCodePoint;
}
602181254a7Smrg 
///
@safe unittest
{
    // ASCII is a single code unit; "λ" takes two UTF-8 code units.
    assert("a".stride == 1);
    assert("λ".stride == 2);
    // Without an index the first sequence is measured; with one, the sequence
    // starting at that code unit.
    assert("aλ".stride == 1);
    assert("aλ".stride(1) == 2);
    // A code point outside the Basic Multilingual Plane takes four UTF-8 code
    // units. It is written as an escape so the example survives lossy
    // re-encoding of this file: the literal here had been replaced by two
    // U+FFFD characters, whose stride is 3, not 4.
    assert("\U00010437".stride == 4);
}
612*b1e83836Smrg 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks that every UTF-32 stride overload reports codeLength!dchar(c) for
    // the element at s[i], and that by-reference ranges keep their length
    // across the call.
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!dchar(c);

        // Only evaluated (lazily, by enforce) when a check actually fails.
        AssertError failure(string what)
        {
            return new AssertError(format(what, s), __FILE__, line);
        }

        enforce(stride(s, i) == expected,
                failure("Unit test failure string: %s"));

        enforce(stride(RandomCU!dchar(s), i) == expected,
                failure("Unit test failure range: %s"));

        auto randomRef = new RefRandomCU!dchar(s);
        immutable lengthBefore = randomRef.length;
        enforce(stride(randomRef, i) == expected,
                failure("Unit test failure rand ref range: %s"));
        enforce(randomRef.length == lengthBefore,
                failure("Unit test failure rand ref range length: %s"));

        // The index-free overloads can only be compared at position 0.
        if (i != 0)
            return;

        enforce(stride(s) == expected,
                failure("Unit test failure string 0: %s"));

        enforce(stride(InputCU!dchar(s)) == expected,
                failure("Unit test failure range 0: %s"));

        auto bidirRef = new RefBidirCU!dchar(s);
        immutable bidirBefore = bidirRef.length;
        enforce(stride(bidirRef) == expected,
                failure("Unit test failure bidir ref range code length: %s"));
        enforce(bidirRef.length == bidirBefore,
                failure("Unit test failure bidir ref range length: %s"));
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');

    // In UTF-32 every code point is one unit, so the indices are consecutive.
    enum dstring mixed = "hello\U00010143\u0100\U00010143";
    test(mixed, 'h', 0);
    test(mixed, 'e', 1);
    test(mixed, 'l', 2);
    test(mixed, 'l', 3);
    test(mixed, 'o', 4);
    test(mixed, '\U00010143', 5);
    test(mixed, '\u0100', 6);
    test(mixed, '\U00010143', 7);

    // The overload must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
680181254a7Smrg 
// Derives the length of a multi-byte UTF-8 sequence from its lead byte `c`.
// `index` is only used to report the position in the thrown exception.
// The caller must pass a byte with the high bit set (see the `in` contract).
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
do
{
    import core.bitop : bsr;

    // bsr on the inverted byte locates the highest zero bit of `c`, so
    // `7 - bsr(...)` is the number of leading one bits in the lead byte.
    immutable leadingOnes = 7 - bsr((~uint(c)) & 0xFF);

    // Only 2, 3 or 4 leading ones are accepted; 0xFF has no zero bit at all.
    immutable bool malformed = c == 0xFF || leadingOnes < 2 || leadingOnes > 4;
    if (malformed)
        throw new UTFException("Invalid UTF-8 sequence", index);

    return leadingOnes;
}
691*b1e83836Smrg 
/++
    Calculate the length of the UTF sequence ending one code unit before
    `index` in `str`.

    Params:
        str = bidirectional range of UTF code units. Must be random access if
        `index` is passed
        index = index one past end of UTF sequence (default: `str.length`)

    Returns:
        The number of code units in the UTF sequence. For UTF-8, this is a
        value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a `UTFException` if `index` is not one past the
        end of a valid UTF sequence.

    Note:
        `strideBack` will only analyze the code units just before `index`
        (at most four for UTF-8, only $(D str[index - 1]) for UTF-16).
        It will not fully verify the validity of the UTF sequence, nor
        even verify the presence of the sequence: it will not actually
        guarantee that $(D strideBack(str, index) <= index).
  +/
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    // A UTF-8 continuation byte has the bit pattern 10xx_xxxx.
    enum ubyte topTwoBits = 0b1100_0000;
    enum ubyte continuation = 0b1000_0000;

    // Anything that is not a continuation byte is treated as a 1-unit sequence.
    if ((str[index - 1] & topTwoBits) != continuation)
        return 1;

    immutable bool roomForFour = index >= 4;
    if (roomForFour)
    {
        // Common case: all candidate positions are in bounds, so the
        // per-step bounds test can be skipped.
        static foreach (len; 2 .. 5)
        {
            if ((str[index - len] & topTwoBits) != continuation)
                return len;
        }
    }
    else
    {
        // Fewer than four code units precede `index`: guard every access.
        static foreach (len; 2 .. 4)
        {
            if (len <= index && (str[index - len] & topTwoBits) != continuation)
                return len;
        }
    }

    // Only continuation bytes were found in the inspected window.
    throw new UTFException("Not the end of the UTF sequence", index);
}
745181254a7Smrg 
/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
{
    // No index given: measure the sequence that ends at the end of `str`
    // by forwarding to the indexed overload.
    immutable onePastEnd = str.length;
    return strideBack(str, onePastEnd);
}
753181254a7Smrg 
/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Walk backwards on a saved copy so that the caller's range is untouched.
    auto probe = str.save;
    for (uint len = 1; len <= 4; ++len)
    {
        // The first byte (from the back) that is not a 10xx_xxxx continuation
        // byte marks the start of the sequence.
        immutable bool isContinuation = (probe.back & 0b1100_0000) == 0b1000_0000;
        if (!isContinuation)
            return len;

        probe.popBack();
        if (probe.empty)
            break;
    }
    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}
770181254a7Smrg 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Compares every UTF-8 strideBack overload with codeLength!char(c), where
    // `c` is the code point expected to end at index `i` of `s`.
    // Leaving `i` at size_t.max means "the end of s" and additionally
    // exercises the overloads that take no index.
    static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable bool atEnd = i == size_t.max;
        immutable size_t end = atEnd ? s.length : i;
        immutable expected = codeLength!char(c);

        enforce(strideBack(s, end) == expected,
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!char(s), end) == expected,
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // The length of the `new`-allocated range must be the same after the call.
        auto randomRef = new RefRandomCU!char(s);
        immutable lengthBefore = randomRef.length;
        enforce(strideBack(randomRef, end) == expected,
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(randomRef.length == lengthBefore,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (atEnd)
        {
            enforce(strideBack(s) == expected,
                    new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

            enforce(strideBack(BidirCU!char(s)) == expected,
                    new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

            auto bidirRef = new RefBidirCU!char(s);
            immutable bidirLengthBefore = bidirRef.length;
            enforce(strideBack(bidirRef) == expected,
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(bidirRef.length == bidirLengthBefore,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    // Whole-string cases: the last code point of the string is measured.
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');

    // Indexed cases: the index is one past the code point being measured.
    test("\U00010143\u0100\U00010143hello", 'o', 15);
    test("\U00010143\u0100\U00010143hello", 'l', 14);
    test("\U00010143\u0100\U00010143hello", 'l', 13);
    test("\U00010143\u0100\U00010143hello", 'e', 12);
    test("\U00010143\u0100\U00010143hello", 'h', 11);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
    test("\U00010143\u0100\U00010143hello", '\u0100', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 4);

    // Both overloads must be inferred @safe and pure for every string flavour.
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ strideBack(str, 0); }));
        static assert(isSafe!({ strideBack(str);    }));
        static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
838181254a7Smrg 
//UTF-16 is self synchronizing: The length of strideBack can be found from
//the value of a single wchar
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    // A code unit in [0xDC00, 0xE000) is the second half of a surrogate
    // pair, so the sequence ending here is two code units long.
    immutable last = str[index - 1];
    immutable bool endsWithLowSurrogate = last >= 0xDC00 && last < 0xE000;
    return endsWithLowSurrogate ? 2 : 1;
}
853181254a7Smrg 
/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly so that the raw code unit is read;
    // other ranges expose their last code unit through `back`.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Low (trailing) surrogates occupy 0xDC00 .. 0xDFFF inclusive. The upper
    // bound is exclusive: U+E000 is an ordinary private-use BMP character that
    // takes a single code unit, exactly as in the indexed overload.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
868181254a7Smrg 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Compares every UTF-16 strideBack overload with codeLength!wchar(c), where
    // `c` is the code point expected to end at index `i` of `s`.
    // Leaving `i` at size_t.max means "the end of s" and additionally
    // exercises the overloads that take no index.
    static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable bool atEnd = i == size_t.max;
        immutable size_t end = atEnd ? s.length : i;
        immutable expected = codeLength!wchar(c);

        enforce(strideBack(s, end) == expected,
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!wchar(s), end) == expected,
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // The length of the `new`-allocated range must be the same after the call.
        auto randomRef = new RefRandomCU!wchar(s);
        immutable lengthBefore = randomRef.length;
        enforce(strideBack(randomRef, end) == expected,
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(randomRef.length == lengthBefore,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (atEnd)
        {
            enforce(strideBack(s) == expected,
                    new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

            enforce(strideBack(BidirCU!wchar(s)) == expected,
                    new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

            auto bidirRef = new RefBidirCU!wchar(s);
            immutable bidirLengthBefore = bidirRef.length;
            enforce(strideBack(bidirRef) == expected,
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(bidirRef.length == bidirLengthBefore,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    // Whole-string cases: the last code point of the string is measured.
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');

    // Indexed cases: the index is one past the code point being measured.
    test("\U00010143\u0100\U00010143hello", 'o', 10);
    test("\U00010143\u0100\U00010143hello", 'l', 9);
    test("\U00010143\u0100\U00010143hello", 'l', 8);
    test("\U00010143\u0100\U00010143hello", 'e', 7);
    test("\U00010143\u0100\U00010143hello", 'h', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
    test("\U00010143\u0100\U00010143hello", '\u0100', 3);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 2);

    // Both overloads must be inferred @safe and pure for every string flavour.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
936181254a7Smrg 
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // Only the index is sanity-checked; no code unit is inspected.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    assert(index > 0, "Not the end of the UTF-32 sequence");

    // In UTF-32 every code point is exactly one code unit.
    enum uint utf32Stride = 1;
    return utf32Stride;
}
946181254a7Smrg 
/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    assert(!str.empty, "Empty UTF-32 sequence");

    // In UTF-32 every code point is exactly one code unit.
    enum uint utf32Stride = 1;
    return utf32Stride;
}
954181254a7Smrg 
///
@safe unittest
{
    // ASCII occupies a single UTF-8 code unit.
    assert("a".strideBack == 1);
    // U+03BB (λ) is encoded with two UTF-8 code units.
    assert("λ".strideBack == 2);
    assert("aλ".strideBack == 2);
    // With an explicit index, the sequence ending just before it is measured.
    assert("aλ".strideBack(1) == 1);
    // A supplementary-plane code point needs four UTF-8 code units. It is
    // written as an escape so the example survives re-encoding of the source.
    assert("\U00010437".strideBack == 4);
}
964*b1e83836Smrg 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Compares every UTF-32 strideBack overload with codeLength!dchar(c), where
    // `c` is the code point expected to end at index `i` of `s`.
    // Leaving `i` at size_t.max means "the end of s" and additionally
    // exercises the overloads that take no index.
    static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable bool atEnd = i == size_t.max;
        immutable size_t end = atEnd ? s.length : i;
        immutable expected = codeLength!dchar(c);

        enforce(strideBack(s, end) == expected,
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!dchar(s), end) == expected,
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // The length of the `new`-allocated range must be the same after the call.
        auto randomRef = new RefRandomCU!dchar(s);
        immutable lengthBefore = randomRef.length;
        enforce(strideBack(randomRef, end) == expected,
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(randomRef.length == lengthBefore,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (atEnd)
        {
            enforce(strideBack(s) == expected,
                    new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

            enforce(strideBack(BidirCU!dchar(s)) == expected,
                    new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

            auto bidirRef = new RefBidirCU!dchar(s);
            immutable bidirLengthBefore = bidirRef.length;
            enforce(strideBack(bidirRef) == expected,
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(bidirRef.length == bidirLengthBefore,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    // Whole-string cases: the last code point of the string is measured.
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');

    // Indexed cases: the index is one past the code point being measured.
    test("\U00010143\u0100\U00010143hello", 'o', 8);
    test("\U00010143\u0100\U00010143hello", 'l', 7);
    test("\U00010143\u0100\U00010143hello", 'l', 6);
    test("\U00010143\u0100\U00010143hello", 'e', 5);
    test("\U00010143\u0100\U00010143hello", 'h', 4);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
    test("\U00010143\u0100\U00010143hello", '\u0100', 2);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 1);

    // Both overloads must be inferred @safe and pure for every string flavour.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
1032181254a7Smrg 
1033181254a7Smrg 
/++
    Given `index` into `str` and assuming that `index` is at the start
    of a UTF sequence, `toUCSindex` determines the number of UCS characters
    up to `index`. In other words, `index` is the index of the code unit at
    the beginning of a code point, and the return value is the position of
    that code point counted in code points from the start of the string.

    A `UTFException` is thrown if `index` turns out to lie in the middle of
    a UTF-8 or UTF-16 sequence.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // UTF-32: code unit index and code point index coincide.
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t offset = 0;

        // Hop from sequence start to sequence start until `index` is reached.
        while (offset < index)
        {
            offset += stride(str, offset);
            ++codePoints;
        }

        // Overshooting means `index` was not at the start of a sequence.
        if (offset > index)
        {
            static if (is(immutable C == immutable char))
                throw new UTFException("Invalid UTF-8 sequence", index);
            else
                throw new UTFException("Invalid UTF-16 sequence", index);
        }

        return codePoints;
    }
}
1065181254a7Smrg 
///
@safe unittest
{
    // Pure ASCII: the two kinds of index agree in every encoding.
    assert(`hello world`.toUCSindex(7) == 7);
    assert(`hello world`w.toUCSindex(7) == 7);
    assert(`hello world`d.toUCSindex(7) == 7);

    // `é` takes two UTF-8 code units, so code unit 7 is code point 6 there.
    assert(`Ma Chérie`.toUCSindex(7) == 6);
    assert(`Ma Chérie`w.toUCSindex(7) == 7);
    assert(`Ma Chérie`d.toUCSindex(7) == 7);

    // Each of these characters takes three UTF-8 code units.
    assert(`さいごの果実 / ミツバチと科学者`.toUCSindex(9) == 3);
    assert(`さいごの果実 / ミツバチと科学者`w.toUCSindex(9) == 9);
    assert(`さいごの果実 / ミツバチと科学者`d.toUCSindex(9) == 9);
}
1081181254a7Smrg 
1082181254a7Smrg 
/++
    Given a UCS index `n` into `str`, returns the UTF index.
    That is, `n` says how many code points into the string a code point is,
    and the array index of its first code unit is returned.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // UTF-32: code point index and code unit index coincide.
        return n;
    }
    else
    {
        // Skip `n` whole sequences, accumulating their lengths in code units.
        size_t offset = 0;
        foreach (_; 0 .. n)
            offset += stride(str, offset);
        return offset;
    }
}
1105181254a7Smrg 
///
@safe unittest
{
    // Pure ASCII: the two kinds of index agree in every encoding.
    assert(`hello world`.toUTFindex(7) == 7);
    assert(`hello world`w.toUTFindex(7) == 7);
    assert(`hello world`d.toUTFindex(7) == 7);

    // `é` takes two UTF-8 code units, so code point 6 starts at code unit 7 there.
    assert(`Ma Chérie`.toUTFindex(6) == 7);
    assert(`Ma Chérie`w.toUTFindex(7) == 7);
    assert(`Ma Chérie`d.toUTFindex(7) == 7);

    // Each of these characters takes three UTF-8 code units.
    assert(`さいごの果実 / ミツバチと科学者`.toUTFindex(3) == 9);
    assert(`さいごの果実 / ミツバチと科学者`w.toUTFindex(9) == 9);
    assert(`さいごの果実 / ミツバチと科学者`d.toUTFindex(9) == 9);
}
1121181254a7Smrg 
1122181254a7Smrg 
1123181254a7Smrg /* =================== Decode ======================= */
1124181254a7Smrg 
1125181254a7Smrg /// Whether or not to replace invalid UTF with $(LREF replacementDchar)
1126181254a7Smrg alias UseReplacementDchar = Flag!"useReplacementDchar";
1127181254a7Smrg 
/++
    Decodes and returns the code point starting at `str[index]`. `index`
    is advanced to one past the decoded code point. If the code point is not
    well-formed, then a `UTFException` is thrown and `index` remains
    unchanged.

    decode will only work with strings and random access ranges of code units
    with length and slicing, whereas $(LREF decodeFront) will work with any
    input range of code units.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        index = starting index into `str`; incremented by number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str[index]` is not the start of a valid UTF
        sequence and useReplacementDchar is `No.useReplacementDchar`
  +/
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
if (!isSomeString!S &&
    isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Code units at or above the limit are handed to the general decoder.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Fast path: a code unit below the limit is returned as the code point.
    return str[index++];
}
1168181254a7Smrg 
/// ditto
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
auto ref scope S str, ref size_t index) @trusted pure
if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable unit = str[index];

    if (unit < codeUnitLimit!S)
    {
        // Fast path: a code unit below the limit is returned as the code point.
        ++index;
        return unit;
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // The general decoder is given the string viewed as `const(C)[]`.
        return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index);
    }
}
1188*b1e83836Smrg 
///
@safe pure unittest
{
    size_t pos;

    // An ASCII character is a single code unit.
    assert(decode("a", pos) == 'a');
    assert(pos == 1);

    // `å` occupies two UTF-8 code units.
    pos = 0;
    assert(decode("å", pos) == 'å');
    assert(pos == 2);

    // Decoding may start in the middle of the string.
    pos = 1;
    assert(decode("aå", pos) == 'å');
    assert(pos == 3);

    // The same character is a single UTF-16 code unit.
    pos = 0;
    assert(decode("å"w, pos) == 'å');
    assert(pos == 1);

    // ë as a multi-code point grapheme
    pos = 0;
    assert(decode("e\u0308", pos) == 'e');
    assert(pos == 1);

    // ë as a single code point grapheme
    pos = 0;
    assert(decode("ë", pos) == 'ë');
    assert(pos == 2);

    pos = 0;
    assert(decode("ë"w, pos) == 'ë');
    assert(pos == 1);
}
1211*b1e83836Smrg 
@safe pure unittest // https://issues.dlang.org/show_bug.cgi?id=22867
{
    import std.conv : hexString;

    string data = hexString!"f787a598";
    size_t offset = 0;

    // If decoding fails, the index passed by reference must be left untouched.
    try
    {
        data.decode(offset);
    }
    catch (UTFException ex)
    {
        assert(offset == 0);
    }
}
1220181254a7Smrg 
/++
    `decodeFront` is a variant of $(LREF decode) which specifically decodes
    the first code point. Unlike $(LREF decode), `decodeFront` accepts any
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
    of code units (rather than just a string or random access
    range). It also takes the range by `ref` and pops off the elements as it
    decodes them. If `numCodeUnits` is passed in, it gets set to the number
    of code units which were in the code point which was decoded.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or input range of code units
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str.front` is not the start of a valid UTF
        sequence. If an exception is thrown, then there is no guarantee as to
        the number of code units which were popped off, as it depends on the
        type of range being used and how many code units had to be popped off
        before the code point was determined to be invalid.
  +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable first = str.front;

    if (first >= codeUnitLimit!S)
    {
        // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
        // done outside of decodeImpl, which is undesirable, since not all
        // overloads of decodeImpl need it. So, it should be moved back into
        // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
        // has been fixed.
        enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);
        immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);

        // The other range types were already popped by decodeImpl.
        static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
            str = str[numCodeUnits .. str.length];

        return decoded;
    }

    // Fast path: a code unit below the limit is returned as the code point.
    str.popFront();
    numCodeUnits = 1;
    return first;
}
1283181254a7Smrg 
1284*b1e83836Smrg /// ditto
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref scope S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable first = str[0];

    if (first >= codeUnitLimit!S)
    {
        // Multi-unit (or invalid) sequence: decode through a const view of the
        // string and then drop exactly the code units decodeImpl reported.
        static if (is(immutable S == immutable C[], C))
        {
            immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits);
            str = str[numCodeUnits .. $];
            return decoded;
        }
    }
    else
    {
        // A single code unit below the limit is a complete code point.
        numCodeUnits = 1;
        str = str[1 .. $];
        return first;
    }
}
1312181254a7Smrg 
1313181254a7Smrg /++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // Convenience form for callers that do not need the code unit count:
    // forward to the two-argument overload and discard the count.
    size_t discardedCount;
    return decodeFront!useReplacementDchar(str, discardedCount);
}
1320181254a7Smrg 
1321*b1e83836Smrg ///
@safe pure unittest
{
    import std.range.primitives;

    // A single-unit (ASCII) code point: exactly one code unit is removed.
    string text = "Hello, World!";
    assert(text.decodeFront == 'H' && text == "ello, World!");

    // A multi-unit code point: the string holding only that code point is emptied.
    text = "å";
    assert(text.decodeFront == 'å' && text.empty);

    // The two-argument form also reports how many code units were consumed.
    text = "å";
    size_t consumed;
    assert(text.decodeFront(consumed) == 'å' && consumed == 2 && text.empty);
}
1334*b1e83836Smrg 
/++
    `decodeBack` is the counterpart of $(LREF decode) for the end of a range:
    it decodes the last code point. In contrast to $(LREF decode), `decodeBack`
    works on any bidirectional range of code units (not only on strings and
    random access ranges). The range is taken by `ref`, and the code units
    that are decoded are popped off it. When `numCodeUnits` is passed in, it
    receives the number of code units that made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.

    Throws:
        $(LREF UTFException) if `str.back` is not the end of a valid UTF
        sequence. If an exception is thrown, the `str` itself remains unchanged,
        but there is no guarantee as to the value of `numCodeUnits` (when passed).
  +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable last = str[$ - 1];

    if (last >= codeUnitLimit!S)
    {
        static if (is(immutable S == immutable C[], C))
        {
            // strideBack tells how many trailing code units form the last code
            // point; decode forward from where that sequence starts.
            numCodeUnits = strideBack(str);
            immutable keep = str.length - numCodeUnits;
            size_t pos = keep;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, pos);

            // The string is shortened only after decoding succeeded.
            str = str[0 .. keep];
            return decoded;
        }
    }
    else
    {
        // A single code unit below the limit is a complete code point.
        numCodeUnits = 1;
        str = str[0 .. $ - 1];
        return last;
    }
}
1386181254a7Smrg 
1387181254a7Smrg /++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
    && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str.back < codeUnitLimit!S)
    {
        // Fast path: the last code unit is a complete code point by itself.
        numCodeUnits = 1;
        immutable retval = str.back;
        str.popBack();
        return retval;
    }
    else
    {
        // Number of trailing code units that make up the last code point.
        numCodeUnits = strideBack(str);

        // decodeImpl can only decode in place, starting at an offset, for
        // ranges it is able to slice: for other ranges its UTF-8 and UTF-16
        // overloads read from the front of the range instead of from `index`.
        // So the in-place path is limited to sliceable random-access ranges
        // and everything else is decoded from a small local copy.
        static if (isRandomAccessRange!S && hasSlicing!S)
        {
            size_t index = str.length - numCodeUnits;
            immutable retval = decodeImpl!(true, useReplacementDchar)(str, index);
            str.popBackExactly(numCodeUnits);
            return retval;
        }
        else
        {
            alias Char = Unqual!(ElementType!S);
            Char[4] codeUnits;

            static if (isRandomAccessRange!S)
            {
                // Random access without slicing: copy the tail by index.
                immutable start = str.length - numCodeUnits;
                foreach (i; 0 .. numCodeUnits)
                    codeUnits[i] = str[start + i];
            }
            else
            {
                // Bidirectional only: walk a saved copy backwards so that
                // `str` stays untouched if decoding throws.
                S tmp = str.save;
                for (size_t i = numCodeUnits; i > 0; )
                {
                    codeUnits[--i] = tmp.back;
                    tmp.popBack();
                }
            }

            const Char[] codePoint = codeUnits[0 .. numCodeUnits];
            size_t index = 0;
            immutable retval = decodeImpl!(true, useReplacementDchar)(codePoint, index);

            // Commit the pop only after decoding succeeded.
            static if (isRandomAccessRange!S)
                str.popBackExactly(numCodeUnits);
            else
                str = tmp;
            return retval;
        }
    }
}
1437181254a7Smrg 
1438181254a7Smrg /++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isSomeString!S
    || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
    || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Convenience form for callers that do not need the code unit count:
    // forward to the two-argument overload and discard the count.
    size_t discardedCount;
    return decodeBack!useReplacementDchar(str, discardedCount);
}
1456181254a7Smrg 
1457*b1e83836Smrg ///
@system pure unittest
{
    import std.range.primitives;

    // A single-unit (ASCII) code point: exactly one code unit is removed.
    string text = "Hello, World!";
    assert(text.decodeBack == '!' && text == "Hello, World");

    // A multi-unit code point: the string holding only that code point is emptied.
    text = "å";
    assert(text.decodeBack == 'å' && text.empty);

    // The two-argument form also reports how many code units were consumed.
    text = "å";
    size_t consumed;
    assert(text.decodeBack(consumed) == 'å' && consumed == 2 && text.empty);
}
1470*b1e83836Smrg 
// Exclusive upper bound for "trivial" code units of the given range type:
// any code unit below this value is guaranteed to be a valid encoding of a
// single code point all by itself.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    static if (is(immutable ElementEncodingType!S == immutable dchar))
        enum dchar codeUnitLimit = 0xD800;
    else static if (is(immutable ElementEncodingType!S == immutable wchar))
        enum wchar codeUnitLimit = 0xD800;
    else
        enum char codeUnitLimit = 0x80;
}
1483181254a7Smrg 
/*
 * UTF-8 overload of the decoding work-horse: decodes one code point whose
 * first code unit is at `index` (indexable input) or at the front of the
 * range (non-indexable input).
 *
 * For strings, this function does its own bounds checking to give a
 * more useful error message when attempting to decode past the end of a string.
 * Subsequently it uses a pointer instead of an array to avoid
 * redundant bounds checking.
 *
 * The three overloads of this operate on chars, wchars, and dchars.
 *
 * Params:
 *      canIndex = if S is indexable; when false the code units are read with
 *                 front/popFront and are therefore consumed from `str`
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into str[]; incremented by number of code units processed
 *
 * Returns:
 *      decoded character, or replacementDchar for an invalid sequence when
 *      useReplacementDchar is set
 *
 * Throws:
 *      UTFException for an invalid or truncated sequence when
 *      useReplacementDchar is not set. `index` is not modified on the
 *      throwing paths; a non-indexable range has already had the inspected
 *      code units popped.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
{
    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /* Dchar bitmask for different numbers of UTF-8 code units.
     * bitMask[n - 1] keeps exactly the payload bits of an n-unit sequence
     * (7, 11, 16 and 21 bits respectively).
     */
    alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    // pstr is a view that starts at the code unit to decode: a raw pointer for
    // strings, a slice for sliceable random-access ranges, and otherwise the
    // range itself.
    static if (is(S : const char[]))
        auto pstr = str.ptr + index;    // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        // Number of code units available from `index` to the end of the input.
        immutable length = str.length - index;
        ubyte fst = pstr[0];
    }
    else
    {
        // Non-indexable input: reading a code unit consumes it.
        ubyte fst = pstr.front;
        pstr.popFront();
    }

    // Exception factories; only needed when errors are reported by throwing.
    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds a UTFException that carries the offending code units: the
            // first unit plus up to three following continuation bytes
            // (10xxxxxx), limited by the length of `str`.
            static UTFException exception(S)(S str, string msg)
            {
                uint[4] sequence = void;
                size_t i;

                do
                {
                    sequence[i] = str[i];
                } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);

                return new UTFException(msg, i).setSequence(sequence[0 .. i]);
            }
        }

        UTFException invalidUTF()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
            else
            {
                //We can't include the invalid sequence with input strings without
                //saving each of the code units along the way, and we can't do it with
                //forward ranges without saving the entire range. Both would incur a
                //cost for the decoding of every character just to provide a better
                //error message for the (hopefully) rare case when an invalid UTF-8
                //sequence is encountered, so we don't bother trying to include the
                //invalid sequence here, unlike with strings and sliceable ranges.
               return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException outOfBounds()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
            else
               return new UTFException("Attempted to decode past the end of a string");
        }
    }

    // A lead byte of a multi-unit sequence starts with the bits 11. Anything
    // else arriving here is rejected.
    if ((fst & 0b1100_0000) != 0b1100_0000)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw invalidUTF(); // starter must have at least 2 first bits set
    }
    ubyte tmp = void;
    dchar d = fst; // upper control bits are masked out later
    // From here on fst is used as a shift register over the lead byte's length
    // marker: after each shift, bit 7 tells whether one more continuation byte
    // is expected.
    fst <<= 1;

    // Unrolled at compile time; i is the offset of the continuation byte
    // handled in the iteration.
    foreach (i; AliasSeq!(1, 2, 3))
    {

        // The input ends before the sequence is complete.
        static if (canIndex)
        {
            if (i == length)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }
        else
        {
            if (pstr.empty)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }

        static if (canIndex)
            tmp = pstr[i];
        else
        {
            tmp = pstr.front;
            pstr.popFront();
        }

        // Every continuation byte must have the form 10xxxxxx.
        if ((tmp & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw invalidUTF();
        }

        // Append the six payload bits of the continuation byte.
        d = (d << 6) | (tmp & 0x3F);
        fst <<= 1;

        if (!(fst & 0x80)) // no more bytes
        {
            d &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            if ((d & ~bitMask[i - 1]) == 0)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw invalidUTF();
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                if (!isValidDchar(d))
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw invalidUTF();
                }
            }

            // A 4-unit sequence can encode values larger than dchar.max.
            static if (i == 3)
            {
                if (d > dchar.max)
                {
                    static if (useReplacementDchar)
                        d = replacementDchar;
                    else
                        throw invalidUTF();
                }
            }

            // Success (or replaced out-of-range value): i + 1 code units consumed.
            index += i + 1;
            return d;
        }
    }

    // Reached only when the lead byte still asks for more code units after
    // four have been read, i.e. for the 5 and 6 byte forms listed above.
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw invalidUTF();
}
1707181254a7Smrg 
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path on a non-indexable input range.

    static struct CharInput
    {
      @safe pure @nogc nothrow:
        this(string s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property char front() { return s[idx]; }
        void popFront() { ++idx; }
        size_t idx;
        string s;
    }

    foreach (bad; invalidUTFstrings!char())
    {
        size_t consumed;
        auto input = CharInput(bad);
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character and always makes
        // progress without claiming more code units than exist.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1733181254a7Smrg 
/*
 * UTF-16 overload: decodes one code point whose first code unit is at `index`
 * (indexable input) or at the front of the range (non-indexable input, which
 * is consumed as it is read). `index` is advanced by the number of code units
 * processed. The caller must already have handled code units below 0xD800.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
{
    // View starting at the code unit to decode: pointer for strings, slice for
    // sliceable random-access ranges, otherwise the range itself.
    static if (is(S : const wchar[]))
        auto pstr = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        immutable remaining = str.length - index;
        uint unit = pstr[0];
    }
    else
    {
        uint unit = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Builds the exception; the first code unit is attached when the input
        // can be indexed.
        UTFException failure(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(pstr[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(unit >= 0xD800);

    if (unit > 0xDBFF)
    {
        // Not a high surrogate: either a stray low surrogate, which is
        // invalid, or an ordinary code unit that is returned as is.
        if (unit >= 0xDC00 && unit <= 0xDFFF)
        {
            static if (useReplacementDchar)
                unit = replacementDchar;
            else
                throw failure("unpaired surrogate UTF-16 value");
        }
        ++index;

        // Note: u+FFFE and u+FFFF are specifically permitted by the
        // Unicode standard for application internal use (see isValidDchar)

        return cast(dchar) unit;
    }

    // High surrogate: a low surrogate has to follow.
    static if (canIndex)
        immutable noSecondUnit = remaining == 1;
    else
        immutable noSecondUnit = pstr.empty;

    if (noSecondUnit)
    {
        static if (useReplacementDchar)
        {
            ++index;
            return replacementDchar;
        }
        else
            throw failure("surrogate UTF-16 high value past end of string");
    }

    static if (canIndex)
        immutable uint low = pstr[1];
    else
    {
        immutable uint low = pstr.front;
        pstr.popFront();
    }

    if (low >= 0xDC00 && low <= 0xDFFF)
    {
        // Combine the surrogate pair into a single code point.
        unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
    }
    else
    {
        static if (useReplacementDchar)
            unit = replacementDchar;
        else
            throw failure("surrogate UTF-16 low value out of range");
    }

    // Both code units count as processed, also when the pair was replaced.
    index += 2;
    return cast(dchar) unit;
}
1825181254a7Smrg 
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path on a non-indexable input range.

    static struct WcharInput
    {
      @safe pure @nogc nothrow:
        this(wstring s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property wchar front() { return s[idx]; }
        void popFront() { ++idx; }
        size_t idx;
        wstring s;
    }

    foreach (bad; invalidUTFstrings!wchar())
    {
        size_t consumed;
        auto input = WcharInput(bad);
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character and always makes
        // progress without claiming more code units than exist.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1851181254a7Smrg 
/*
 * UTF-32 overload: every code unit is a code point, so decoding only has to
 * validate a single value.
 *
 * Params:
 *      canIndex = if true, the code unit is read as str[index] and `str` is
 *                 left untouched; if false, it is read from the front of the
 *                 range and popped off. Arrays are always indexed.
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = position of the code unit for indexed input; incremented by one
 *              on success or replacement
 *
 * Returns:
 *      decoded character
 *
 * Throws:
 *      UTFException for an invalid value when useReplacementDchar is not set;
 *      neither `index` nor `str` is modified in that case.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    static if (is(S : const dchar[]))
        auto pstr = str.ptr;
    else
        alias pstr = str;

    // Follow what the caller asked for rather than what the range happens to
    // support: decodeFront passes canIndex == false for random-access ranges
    // without slicing or length and relies on the code unit being popped here.
    // Indexing such a range instead would leave it unconsumed.
    enum readByIndex = is(S : const dchar[]) || canIndex;

    static if (readByIndex)
        dchar dc = pstr[index];
    else
        dchar dc = pstr.front;

    if (!isValidDchar(dc))
    {
        static if (useReplacementDchar)
            dc = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(dc);
    }

    ++index;
    static if (!readByIndex)
        pstr.popFront();
    return dc;
}
1889181254a7Smrg 
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path on a non-indexable input range.

    static struct DcharInput
    {
      @safe pure @nogc nothrow:
        this(dstring s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property dchar front() { return s[idx]; }
        void popFront() { ++idx; }
        size_t idx;
        dstring s;
    }

    foreach (bad; invalidUTFstrings!dchar())
    {
        size_t consumed;
        auto input = DcharInput(bad);
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character and always makes
        // progress without claiming more code units than exist.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1915181254a7Smrg 
1916181254a7Smrg 
// Unittest helper: checks that decode(range, index) yields expectedChar,
// advances index to expectedIndex and leaves the length of the range alone.
// Failures are raised as AssertError pointing at the caller's line. Only
// random-access ranges that are not narrow strings are checked.
version (StdUnittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;
    import std.traits : isNarrowString;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R && !isNarrowString!R)
    {
        immutable decoded = decode(range, index);

        if (decoded != expectedChar)
            throw new AssertError(format("decode: Wrong character: %s", decoded), __FILE__, line);

        if (index != expectedIndex)
            throw new AssertError(format("decode: Wrong index: %s", index), __FILE__, line);

        static if (hasLength!R)
        {
            // decode must not consume anything from the range itself.
            if (range.length != lenBefore)
                throw new AssertError(format("decode: length changed: %s", range.length), __FILE__, line);
        }
    }
}
1947181254a7Smrg 
// Unittest helper: checks that decodeFront(range, n) yields expectedChar,
// reports expectedNumCodeUnits and, for ranges with a length, shortens the
// range by exactly that many code units. Failures are raised as AssertError
// pointing at the caller's line.
version (StdUnittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    size_t consumed;
    immutable decoded = decodeFront(range, consumed);

    if (decoded != expectedChar)
        throw new AssertError(format("decodeFront: Wrong character: %s", decoded), __FILE__, line);

    if (consumed != expectedNumCodeUnits)
        throw new AssertError(format("decodeFront: Wrong numCodeUnits: %s", consumed), __FILE__, line);

    static if (hasLength!R)
    {
        if (range.length != lenBefore - consumed)
            throw new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line);
    }
}
1973181254a7Smrg 
/+
    Unit-test helper: calls `decodeBack` on `range` and verifies the decoded
    character, the reported number of code units and, when `R` has a length,
    that the range shrank by exactly that many code units.

    For ranges that are not bidirectional this function does nothing, so that
    callers can exercise all `decode` functions with the same list of ranges.

    Throws:
        `AssertError` carrying `__FILE__` and `line` when a check fails.
  +/
version (StdUnittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    // Only bidirectional ranges can be decoded from the back; anything else
    // falls straight through to the end of the function.
    static if (isBidirectionalRange!R)
    {
        import core.exception : AssertError;
        import std.exception : enforce;
        import std.string : format;

        // Builds the error object; evaluated lazily by `enforce`.
        AssertError failure(string msg)
        {
            return new AssertError(msg, __FILE__, line);
        }

        static if (hasLength!R)
            immutable originalLength = range.length;

        size_t consumed;
        immutable decoded = decodeBack(range, consumed);

        enforce(decoded == expectedChar,
                failure(format("decodeBack: Wrong character: %s", decoded)));
        enforce(consumed == expectedNumCodeUnits,
                failure(format("decodeBack: Wrong numCodeUnits: %s", consumed)));

        static if (hasLength!R)
        {
            enforce(range.length == originalLength - consumed,
                    failure(format("decodeBack: wrong length: %s", range.length)));
        }
    }
}
2005181254a7Smrg 
/+
    Unit-test helper: runs `testDecode` (at index 0), `testDecodeBack` (on a
    saved copy, bidirectional ranges only) and `testDecodeFront` against the
    same input, expecting `expectedChar` each time. `expectedIndex` is passed
    both as the expected index for `testDecode` and as the expected number of
    code units for the front/back helpers.
  +/
version (StdUnittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    // Indexed decode first.
    testDecode(range, 0, expectedChar, expectedIndex, line);

    // Decode from the back on a saved copy; the copy is taken before the
    // decodeFront check below is run on `range` itself.
    static if (isBidirectionalRange!R)
    {
        auto backCopy = range.save;
        testDecodeBack(backCopy, expectedChar, expectedIndex, line);
    }

    // Finally decode from the front.
    testDecodeFront(range, expectedChar, expectedIndex, line);
}
2019181254a7Smrg 
/+
    Unit-test helper: verifies that decoding invalid input fails cleanly.

    For random-access ranges, `decode(range, index)` must throw `UTFException`
    while leaving `index` (and, when `R` has a length, `range.length`)
    unchanged. When the starting index is 0, `decodeFront` must throw
    `UTFException` as well.

    Params:
        range = range holding the invalid sequence
        index = position at which `decode` is attempted
        line  = line attributed to any failure that is reported

    Throws:
        `AssertError` carrying `__FILE__` and `line` when a check fails.
  +/
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a `%s` for `range.length`; without it,
            // `format` itself throws on the unused argument and the intended
            // AssertError is never produced.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    // `index` doubles as the out parameter for the code unit count here.
    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
2046181254a7Smrg 
/+
    Unit-test helper: verifies that `decodeBack` rejects invalid input.

    For random-access ranges, `decodeBack(range)` must throw `UTFException`
    and, when `R` has a length, leave `range.length` unchanged. Nothing is
    checked for ranges that are not bidirectional, or that are bidirectional
    but not random-access.

    Throws:
        `AssertError` carrying `__FILE__` and `line` when a check fails.
  +/
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a `%s` for `range.length`; without
                // it, `format` itself throws on the unused argument and the
                // intended AssertError is never produced.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
2072181254a7Smrg 
// decode / decodeFront / decodeBack over UTF-8 input held in several kinds
// of ranges.
@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (Wrap; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                             (string s) => new RefBidirCU!char(s),
                             (string s) => new RefRandomCU!char(s)))
    {
        // Not referenced below.
        enum sHasLength = hasLength!(typeof(Wrap("abcd")));

        // ASCII, from the front: one code unit per character.
        {
            auto src = Wrap("abcd");
            testDecode(src, 0, 'a', 1);
            testDecode(src, 1, 'b', 2);
            testDecodeFront(src, 'a', 1);
            testDecodeFront(src, 'b', 1);
            assert(decodeFront(src) == 'c');
            assert(decodeFront(src) == 'd');
        }

        // Katakana, from the front: three code units per character.
        {
            auto src = Wrap("ウェブサイト");
            testDecode(src, 0, 'ウ', 3);
            testDecode(src, 3, 'ェ', 6);
            testDecodeFront(src, 'ウ', 3);
            testDecodeFront(src, 'ェ', 3);
            assert(decodeFront(src) == 'ブ');
            assert(decodeFront(src) == 'サ');
        }

        // ASCII, from the back.
        {
            auto src = Wrap("abcd");
            testDecodeBack(src, 'd', 1);
            testDecodeBack(src, 'c', 1);
            testDecodeBack(src, 'b', 1);
            testDecodeBack(src, 'a', 1);
        }

        // Katakana, from the back.
        {
            auto src = Wrap("ウェブサイト");
            testDecodeBack(src, 'ト', 3);
            testDecodeBack(src, 'イ', 3);
            testDecodeBack(src, 'サ', 3);
            testDecodeBack(src, 'ブ', 3);
        }

        // Single two- and three-unit characters through every decode flavour.
        testAllDecode(Wrap("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(Wrap("\xE2\x89\xA0"), '\u2260', 3);

        // A truncated sequence followed by non-shortest / 5- and 6-unit
        // forms; all must be rejected, both at the lead unit and at index 1.
        foreach (bad; ["\xE2\x89", // too short
                       "\xC0\x8A",
                       "\xE0\x80\x8A",
                       "\xF0\x80\x80\x8A",
                       "\xF8\x80\x80\x80\x8A",
                       "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(Wrap(bad), 0);
            testBadDecode(Wrap(bad), 1);
            testBadDecodeBack(Wrap(bad));
        }

        // The encodings of U+FFFE and U+FFFF are expected to decode.
        testAllDecode(Wrap("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(Wrap("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        // Three-unit sequences that spell surrogate code points
        // (U+D800 .. U+DFFF) must be rejected.
        foreach (bad; ["\xED\xA0\x80",
                       "\xED\xAD\xBF",
                       "\xED\xAE\x80",
                       "\xED\xAF\xBF",
                       "\xED\xB0\x80",
                       "\xED\xBE\x80",
                       "\xED\xBF\xBF"])
        {
            testBadDecode(Wrap(bad), 0);
            testBadDecodeBack(Wrap(bad));
        }
    }
    });
}
2156181254a7Smrg 
// decode / decodeFront / decodeBack over UTF-16 input held in several kinds
// of ranges.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (Wrap; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
                             (wstring s) => new RefBidirCU!wchar(s),
                             (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Single units and surrogate pairs that must decode.
        testAllDecode(Wrap([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(Wrap([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(Wrap([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(Wrap([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(Wrap([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // A lone lead surrogate, and a lead surrogate followed by a
        // non-surrogate unit, must be rejected from the front.
        testBadDecode(Wrap([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(Wrap([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        // The same kind of input must be rejected from the back.
        testBadDecodeBack(Wrap([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(Wrap([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        // Katakana, from the front: one code unit per character.
        {
            auto src = Wrap("ウェブサイト");
            testDecode(src, 0, 'ウ', 1);
            testDecode(src, 1, 'ェ', 2);
            testDecodeFront(src, 'ウ', 1);
            testDecodeFront(src, 'ェ', 1);
            assert(decodeFront(src) == 'ブ');
            assert(decodeFront(src) == 'サ');
        }

        // Katakana, from the back.
        {
            auto src = Wrap("ウェブサイト");
            testDecodeBack(src, 'ト', 1);
            testDecodeBack(src, 'イ', 1);
            testDecodeBack(src, 'サ', 1);
            testDecodeBack(src, 'ブ', 1);
        }
    }

    // Mixed pair / single / pair input, random-access sources only.
    foreach (Wrap; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        auto units = Wrap([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                           cast(wchar) 0x1400,
                           cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(units, 0, cast(dchar) 0x10000, 2);
        testDecode(units, 2, cast(dchar) 0x1400, 3);
        testDecode(units, 3, cast(dchar) 0xB9DDE, 5);
        testDecodeBack(units, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(units, cast(dchar) 0x1400, 1);
        testDecodeBack(units, cast(dchar) 0x10000, 2);
    }
    });
}
2211181254a7Smrg 
// decode / decodeFront / decodeBack over UTF-32 input held in several kinds
// of ranges.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (Wrap; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
                             (dstring s) => new RefBidirCU!dchar(s),
                             (dstring s) => new RefRandomCU!dchar(s)))
    {
        // Values that must decode to themselves.
        testAllDecode(Wrap([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(Wrap([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(Wrap([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(Wrap([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(Wrap([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Surrogate values and a value above 0x10FFFF must be rejected
        // from the front ...
        testBadDecode(Wrap([cast(dchar) 0xD800]), 0);
        testBadDecode(Wrap([cast(dchar) 0xDFFE]), 0);
        testBadDecode(Wrap([cast(dchar) 0x110000]), 0);

        // ... and from the back.
        testBadDecodeBack(Wrap([cast(dchar) 0xD800]));
        testBadDecodeBack(Wrap([cast(dchar) 0xDFFE]));
        testBadDecodeBack(Wrap([cast(dchar) 0x110000]));

        // Katakana, from the front: one code unit per character.
        {
            auto src = Wrap("ウェブサイト");
            testDecode(src, 0, 'ウ', 1);
            testDecode(src, 1, 'ェ', 2);
            testDecodeFront(src, 'ウ', 1);
            testDecodeFront(src, 'ェ', 1);
            assert(decodeFront(src) == 'ブ');
            assert(decodeFront(src) == 'サ');
        }

        // Katakana, from the back.
        {
            auto src = Wrap("ウェブサイト");
            testDecodeBack(src, 'ト', 1);
            testDecodeBack(src, 'イ', 1);
            testDecodeBack(src, 'サ', 1);
            testDecodeBack(src, 'ブ', 1);
        }
    }

    // Three-element input, random-access sources only.
    foreach (Wrap; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto units = Wrap([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(units, 0, 0x10000, 1);
        testDecode(units, 1, 0x1400, 2);
        testDecode(units, 2, 0xB9DDE, 3);
        testDecodeBack(units, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(units, cast(dchar) 0x1400, 1);
        testDecodeBack(units, cast(dchar) 0x10000, 1);
    }
    });
}
2266181254a7Smrg 
// Compile-time checks of the attributes inferred for the decode functions
// when called with every built-in string type.
@safe unittest
{
    import std.exception;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    assertCTFEable!(
    {
    foreach (Str; AliasSeq!( char[], const( char)[],  string,
                            wchar[], const(wchar)[], wstring,
                            dchar[], const(dchar)[], dstring))
    {
        // decode and both decodeFront overloads must be callable from @safe code.
        static assert(isSafe!({ Str s; size_t n = 0; decode(s, n); }));
        static assert(isSafe!({ Str s; size_t n = 0; decodeFront(s, n); }));
        static assert(isSafe!({ Str s; decodeFront(s); }));

        // decode, decodeFront and decodeBack must all be inferred pure.
        static assert(0 != (functionAttributes!({
            Str s; size_t n = 0; decode(s, n);
        }) & FunctionAttribute.pure_));
        static assert(0 != (functionAttributes!({
            Str s; size_t n = 0; decodeFront(s, n);
        }) & FunctionAttribute.pure_));
        static assert(0 != (functionAttributes!({
            Str s; decodeFront(s);
        }) & FunctionAttribute.pure_));
        static assert(0 != (functionAttributes!({
            Str s; size_t n = 0; decodeBack(s, n);
        }) & FunctionAttribute.pure_));
        static assert(0 != (functionAttributes!({
            Str s; decodeBack(s);
        }) & FunctionAttribute.pure_));
    }
    });
}
2292181254a7Smrg 
// A four-unit sequence with lead byte 0xF7 and three 0xBF continuation
// bytes must make decode throw.
@safe unittest
{
    import std.exception;

    char[4] val = [cast(char) 0b1111_0111,
                   cast(char) 0b1011_1111,
                   cast(char) 0b1011_1111,
                   cast(char) 0b1011_1111];
    size_t i = 0;
    assertThrown!UTFException(decode(val[], i));
}
2304181254a7Smrg /* =================== Encode ======================= */
2305181254a7Smrg 
// Shared failure path of the `encode` overloads. With
// `No.useReplacementDchar` it throws a `UTFException` built from `msg`, with
// `c` recorded through `setSequence`; otherwise it returns `replacementDchar`
// for the caller to encode in place of `c`.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
        throw new UTFException(msg).setSequence(c);
    else
        return replacementDchar;
}
2313181254a7Smrg 
/++
    Encodes `c` into the static array `buf` and returns the number of code
    units that were written (between `1` and `4` for `char[4]` buffers,
    between `1` and `2` for `wchar[2]` buffers).

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    // Two code units.
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    if (c > 0xFFFF)
    {
        // Four code units.
        if (c <= 0x10FFFF)
        {
            assert(isValidDchar(c));
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            return 4;
        }

        // Above 0x10FFFF: throws, or yields the replacement character,
        // which is then written by the three-unit code below.
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    else
    {
        // Surrogates are rejected (or replaced) before being written.
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    }

    // Three code units.
    buf[0] = cast(char)(0xE0 | (c >> 12));
    buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[2] = cast(char)(0x80 | (c & 0x3F));
    return 3;
}
2365181254a7Smrg 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    char[4] buf;

    // The return value is the number of code units written to `buf`.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");

    assert(encode(buf, '\u007F') == 1);
    assert(buf[0 .. 1] == "\u007F");

    assert(encode(buf, '\u0080') == 2);
    assert(buf[0 .. 2] == "\u0080");

    assert(encode(buf, '\uE000') == 3);
    assert(buf[0 .. 3] == "\uE000");

    assert(encode(buf, 0xFFFE) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBE");

    // Invalid code points throw by default ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // ... or are replaced when `Yes.useReplacementDchar` is given.
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto encoded = buf[];
    assert(encoded.decodeFront == replacementDchar);
}
2385*b1e83836Smrg 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    wchar[2] buf;

    // The return value is the number of code units written to `buf`.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");

    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0 .. 1] == "\uD7FF");

    assert(encode(buf, '\uE000') == 1);
    assert(buf[0 .. 1] == "\uE000");

    assert(encode(buf, '\U00010000') == 2);
    assert(buf[0 .. 2] == "\U00010000");

    assert(encode(buf, '\U0010FFFF') == 2);
    assert(buf[0 .. 2] == "\U0010FFFF");

    // Invalid code points throw by default ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // ... or are replaced when `Yes.useReplacementDchar` is given.
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto encoded = buf[];
    assert(encoded.decodeFront == replacementDchar);
}
2405*b1e83836Smrg 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    dchar[1] buf;

    // A valid code point is stored unchanged and the result is 1.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0] == '\u0000');

    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0] == '\uD7FF');

    assert(encode(buf, '\uE000') == 1);
    assert(buf[0] == '\uE000');

    assert(encode(buf, '\U0010FFFF') == 1);
    assert(buf[0] == '\U0010FFFF');

    // Invalid code points throw by default ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // ... or are replaced when `Yes.useReplacementDchar` is given.
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[0] == replacementDchar);
}
2423*b1e83836Smrg 
// encode into char[4]: the boundaries of every encoded length, rejection of
// surrogates and of 0x110000, and the replacement-character mode.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[4] buf;

    // One code unit.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1);
    assert(buf[0 .. 1] == "\u007F");

    // Two code units.
    assert(encode(buf, '\u0080') == 2);
    assert(buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\u07FF') == 2);
    assert(buf[0 .. 2] == "\u07FF");

    // Three code units.
    assert(encode(buf, '\u0800') == 3);
    assert(buf[0 .. 3] == "\u0800");
    assert(encode(buf, '\uD7FF') == 3);
    assert(buf[0 .. 3] == "\uD7FF");
    assert(encode(buf, '\uE000') == 3);
    assert(buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(buf, 0xFFFF) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBF");

    // Four code units.
    assert(encode(buf, '\U00010000') == 4);
    assert(buf[0 .. 4] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 4);
    assert(buf[0 .. 4] == "\U0010FFFF");

    // Surrogates and a value above 0x10FFFF are rejected.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With replacement enabled, the buffer receives the encoding of U+FFFD.
    immutable written = encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(written == buf.stride);
    enum replacementDcharString = "\uFFFD";
    assert(buf[0 .. replacementDcharString.length] == replacementDcharString);
    });
}
2454181254a7Smrg 
2455181254a7Smrg 
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    if (c > 0xFFFF)
    {
        // Two code units: a surrogate pair built from `c - 0x10000`.
        if (c <= 0x10FFFF)
        {
            assert(isValidDchar(c));
            immutable uint offset = c - 0x10000;
            buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
            return 2;
        }

        // Above 0x10FFFF: throws, or yields the replacement character,
        // which is then written as a single unit below.
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        // A surrogate value cannot be written on its own.
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    // One code unit.
    buf[0] = cast(wchar) c;
    return 1;
}
2481181254a7Smrg 
// encode into wchar[2]: single units, surrogate pairs, rejection of
// surrogates and of 0x110000, and the replacement-character mode.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[2] buf;

    // One code unit.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1);
    assert(buf[0 .. 1] == "\uE000");
    assert(encode(buf, 0xFFFE) == 1);
    assert(buf[0] == 0xFFFE);
    assert(encode(buf, 0xFFFF) == 1);
    assert(buf[0] == 0xFFFF);

    // Two code units.
    assert(encode(buf, '\U00010000') == 2);
    assert(buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2);
    assert(buf[0 .. 2] == "\U0010FFFF");

    // Surrogates and a value above 0x10FFFF are rejected.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With replacement enabled, the buffer receives the replacement character.
    immutable written = encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(written == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2507181254a7Smrg 
2508181254a7Smrg 
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    // Surrogates and values above 0x10FFFF are rejected (or replaced).
    immutable bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    if (isSurrogate || c > 0x10FFFF)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    else
        assert(isValidDchar(c));

    // A UTF-32 code unit is the code point itself.
    buf[0] = c;
    return 1;
}
2520181254a7Smrg 
// encode into dchar[1]: valid values are stored unchanged; surrogates and
// 0x110000 are rejected; replacement-character mode.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[1] buf;

    // Valid values are stored unchanged.
    encode(buf, '\u0000');
    assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF');
    assert(buf[0] == '\uD7FF');
    encode(buf, '\uE000');
    assert(buf[0] == '\uE000');
    encode(buf, 0xFFFE );
    assert(buf[0] == 0xFFFE);
    encode(buf, 0xFFFF );
    assert(buf[0] == 0xFFFF);
    encode(buf, '\U0010FFFF');
    assert(buf[0] == '\U0010FFFF');

    // Surrogates and a value above 0x10FFFF are rejected.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With replacement enabled, the buffer receives the replacement character.
    immutable written = encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(written == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2545181254a7Smrg 
2546181254a7Smrg 
/++
    Encodes `c` in `str`'s encoding and appends it to `str`.

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope char[] str, dchar c) @safe pure
{
    // ASCII is appended directly as a single code unit.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    // Everything else goes through the fixed-buffer overload, which performs
    // the validity checks (throwing or substituting the replacement
    // character) and reports how many code units it wrote.
    char[4] buf;
    immutable len = encode!useReplacementDchar(buf, c);
    str ~= buf[0 .. len];
}
2603*b1e83836Smrg 
///
@safe unittest
{
    char[] buffer = "abcd".dup;
    immutable dchar ascii = 'a';
    immutable dchar nonAscii = 'ø';

    // An ASCII code point grows the buffer by a single code unit ...
    encode(buffer, ascii);
    assert(buffer == "abcda");
    assert(buffer.length == 5);

    // ... while 'ø' grows it by two.
    encode(buffer, nonAscii);
    assert(buffer == "abcdaø");
    assert(buffer.length == 7);
}
2618181254a7Smrg 
@safe unittest
{
    import std.exception;

    // The same appends must work at run time and during CTFE.
    assertCTFEable!(
    {
    char[] text = "abcd".dup;

    // one code unit
    encode(text, cast(dchar)'a');
    assert(text == "abcda");
    assert(text.length == 5);

    // two code units
    encode(text, cast(dchar)'\u00A9');
    assert(text == "abcda\xC2\xA9");
    assert(text.length == 7);
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    // three code units
    encode(text, cast(dchar)'\u2260');
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
    assert(text.length == 10);
    });
}
2640181254a7Smrg 
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[] sink;

    // Boundary code points for each sequence length. Every slice starts at
    // the number of code units appended by the preceding calls.
    encode(sink, '\u0000'); assert(sink[0 .. $] == "\u0000");
    encode(sink, '\u007F'); assert(sink[1 .. $] == "\u007F");
    encode(sink, '\u0080'); assert(sink[2 .. $] == "\u0080");
    encode(sink, '\u07FF'); assert(sink[4 .. $] == "\u07FF");
    encode(sink, '\u0800'); assert(sink[6 .. $] == "\u0800");
    encode(sink, '\uD7FF'); assert(sink[9 .. $] == "\uD7FF");
    encode(sink, '\uE000'); assert(sink[12 .. $] == "\uE000");
    encode(sink, 0xFFFE); assert(sink[15 .. $] == "\xEF\xBF\xBE");
    encode(sink, 0xFFFF); assert(sink[18 .. $] == "\xEF\xBF\xBF");
    encode(sink, '\U00010000'); assert(sink[21 .. $] == "\U00010000");
    encode(sink, '\U0010FFFF'); assert(sink[25 .. $] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected by default.
    assertThrown!UTFException(encode(sink, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(sink, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(sink, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(sink, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(sink, cast(dchar) 0x110000));

    // With Yes.useReplacementDchar, U+FFFD is appended instead of throwing.
    enum encodedReplacement = "\uFFFD";
    enum tailLength = encodedReplacement.length;
    assert(sink[$ - tailLength .. $] != encodedReplacement);
    encode!(Yes.useReplacementDchar)(sink, cast(dchar) 0x110000);
    assert(sink[$ - tailLength .. $] == encodedReplacement);
    });
}
2673181254a7Smrg 
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope wchar[] str, dchar c) @safe pure
{
    // Code points above the BMP are split into a high/low surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));

        wchar[2] pair;
        pair[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        str ~= pair;
        return;
    }

    // Everything else ends up as exactly one code unit. Values that cannot
    // be encoded are first routed through _utfException.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    str ~= cast(wchar) c;
}
2703181254a7Smrg 
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[] sink;

    // BMP code points occupy one code unit each ...
    encode(sink, '\u0000'); assert(sink[0] == '\u0000');
    encode(sink, '\uD7FF'); assert(sink[1] == '\uD7FF');
    encode(sink, '\uE000'); assert(sink[2] == '\uE000');
    encode(sink, 0xFFFE); assert(sink[3] == 0xFFFE);
    encode(sink, 0xFFFF); assert(sink[4] == 0xFFFF);
    // ... and supplementary code points occupy two.
    encode(sink, '\U00010000'); assert(sink[5 .. $] == "\U00010000");
    encode(sink, '\U0010FFFF'); assert(sink[7 .. $] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected by default.
    assertThrown!UTFException(encode(sink, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(sink, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(sink, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(sink, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(sink, cast(dchar) 0x110000));

    // With Yes.useReplacementDchar the replacement character is appended.
    assert(sink.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(sink, cast(dchar) 0x110000);
    assert(sink.back == replacementDchar);
    });
}
2730181254a7Smrg 
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope dchar[] str, dchar c) @safe pure
{
    // Encodable means: not a surrogate and not above U+10FFFF.
    immutable bool encodable = !(0xD800 <= c && c <= 0xDFFF) && c <= 0x10FFFF;

    if (encodable)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    // UTF-32 always uses a single code unit.
    str ~= c;
}
2741181254a7Smrg 
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[] sink;

    // Every accepted code point is stored as one code unit.
    encode(sink, '\u0000'); assert(sink[0] == '\u0000');
    encode(sink, '\uD7FF'); assert(sink[1] == '\uD7FF');
    encode(sink, '\uE000'); assert(sink[2] == '\uE000');
    encode(sink, 0xFFFE ); assert(sink[3] == 0xFFFE);
    encode(sink, 0xFFFF ); assert(sink[4] == 0xFFFF);
    encode(sink, '\U0010FFFF'); assert(sink[5] == '\U0010FFFF');

    // Surrogates and values above U+10FFFF are rejected by default.
    assertThrown!UTFException(encode(sink, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(sink, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(sink, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(sink, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(sink, cast(dchar) 0x110000));

    // With Yes.useReplacementDchar the replacement character is appended.
    assert(sink.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(sink, cast(dchar) 0x110000);
    assert(sink.back == replacementDchar);
    });
}
2767181254a7Smrg 
2768181254a7Smrg 
/++
    Computes how many code units of character type `C` the single code point
    `c` occupies once encoded.

    Params:
        C = the character type of the target encoding
        c = the code point to measure
    Returns:
        1 to 4 when `C` is one byte wide, 1 or 2 when it is two bytes wide,
        and always 1 when it is four bytes wide. For a one-byte `C`, a value
        above `0x10FFFF` trips `assert(false)`.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // anything above the BMP needs a surrogate pair
        if (c > 0xFFFF)
            return 2;
        return 1;
    }
    else
    {
        static assert(C.sizeof == 1);

        // walk the thresholds from the widest sequence down
        if (c > 0x10FFFF) assert(false);
        if (c > 0xFFFF) return 4;
        if (c > 0x7FF) return 3;
        if (c > 0x7F) return 2;
        return 1;
    }
}
2794181254a7Smrg 
///
@safe pure nothrow @nogc unittest
{
    // An ASCII code point is one code unit in every encoding.
    immutable dchar ascii = 'a';
    assert(codeLength!char(ascii) == 1);
    assert(codeLength!wchar(ascii) == 1);
    assert(codeLength!dchar(ascii) == 1);

    // The highest code point: four, two and one code units respectively.
    immutable dchar highest = '\U0010FFFF';
    assert(codeLength!char(highest) == 4);
    assert(codeLength!wchar(highest) == 2);
    assert(codeLength!dchar(highest) == 1);
}
2806181254a7Smrg 
2807181254a7Smrg 
/++
    Computes how many code units are needed to encode `input` as a string
    whose character type is `C`. This is handy when one string has to be
    sliced by the length of another string that uses a different character
    type.

    Params:
        C = the character type to get the encoding length for
        input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isSomeFiniteCharInputRange!InputRange)
{
    alias EncType = Unqual!(ElementEncodingType!InputRange);

    // A string already stored with code units of type C can report its
    // length directly.
    enum alreadyEncoded = isSomeString!InputRange && is(EncType == C) && is(typeof(input.length));

    static if (alreadyEncoded)
    {
        return input.length;
    }
    else
    {
        // Otherwise walk the code points and add up their individual costs.
        size_t unitCount = 0;

        foreach (codePoint; input.byDchar)
            unitCount += codeLength!C(codePoint);

        return unitCount;
    }
}
2837181254a7Smrg 
///
@safe unittest
{
    // ASCII-only text
    assert("hello world".length == codeLength!char("hello world"));
    assert("hello world"w.length == codeLength!wchar("hello world"));
    assert("hello world"d.length == codeLength!dchar("hello world"));

    // non-ASCII text
    assert(`プログラミング`.length == codeLength!char(`プログラミング`));
    assert(`プログラミング`w.length == codeLength!wchar(`プログラミング`));
    assert(`プログラミング`d.length == codeLength!dchar(`プログラミング`));

    // Slice a UTF-8 string by the length of a UTF-16 prefix.
    wstring prefix = `Être sans la verité`;
    string sentence = `Être sans la verité, ça, ce ne serait pas bien.`;
    immutable prefixUnits = codeLength!char(prefix);
    assert(sentence[prefixUnits .. $] ==
           `, ça, ce ne serait pas bien.`);
}
2860181254a7Smrg 
@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    enum mixedText = `ウェブサイト@La_Verité.com`;

    // Every source string type is measured against every target encoding.
    foreach (Source; AliasSeq!( char[], const  char[],  string,
                               wchar[], const wchar[], wstring,
                               dchar[], const dchar[], dstring))
    {
        foreach (Target; AliasSeq!(char, wchar, dchar))
        {
            assert(codeLength!Target(to!Source("Walter Bright")) == to!(Target[])("Walter Bright").length);
            assert(codeLength!Target(to!Source(`言語`)) == to!(Target[])(`言語`).length);
            assert(codeLength!Target(to!Source(mixedText)) ==
                   to!(Target[])(mixedText).length);
            // same text, but passed as a non-array range
            assert(codeLength!Target(to!Source(mixedText).filter!(x => true)()) ==
                   to!(Target[])(mixedText).length);
        }
    }
    });
}
2885181254a7Smrg 
/+
Internal helper function:

Tells whether the code point `c` may be searched for directly among code
units of type `C`, without decoding.

This is a runtime check that is used as an optimization in various functions,
particularly in `std.string`.
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
        return true;
    else static if (C.sizeof == 2)
        // the BMP, minus the surrogate range
        return c <= 0xFFFF && !(0xD800 <= c && c <= 0xDFFF);
    else static if (C.sizeof == 1)
        // ASCII only
        return c <= 0x7F;
    else
        static assert(0);
}
@safe unittest
{
    // ASCII can be searched for in every encoding.
    assert( canSearchInCodeUnits! char('a'));
    assert( canSearchInCodeUnits!wchar('a'));
    assert( canSearchInCodeUnits!dchar('a'));
    assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF
    assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
    assert( canSearchInCodeUnits!wchar('ö'));
    assert( canSearchInCodeUnits!dchar('ö'));
    // A BMP code point well above 0xFF, written as an escape so that the
    // literal survives re-encoding of this file.
    assert(!canSearchInCodeUnits! char('\u65E5'));
    assert( canSearchInCodeUnits!wchar('\u65E5'));
    assert( canSearchInCodeUnits!dchar('\u65E5'));
    // A lone surrogate value is only searchable as a UTF-32 code unit.
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
    // A supplementary-plane code point is only searchable in UTF-32.
    assert(!canSearchInCodeUnits! char('\U00010001'));
    assert(!canSearchInCodeUnits!wchar('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}
2925181254a7Smrg 
2926181254a7Smrg /* =================== Validation ======================= */
2927181254a7Smrg 
/++
    Verifies that `str` is well-formed unicode by decoding it from start to
    end.

    Params:
        str = the string to check

    Throws:
        `UTFException` if `str` is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    size_t index = 0;
    immutable end = str.length;

    // decode advances index past each sequence it reads; only its
    // error reporting matters here, the decoded value is discarded.
    while (index < end)
        decode(str, index);
}
2943181254a7Smrg 
///
@safe unittest
{
    import std.exception : assertThrown;

    // these three code units do not form well-formed UTF-8
    char[] malformed = [167, 133, 175];
    assertThrown!UTFException(validate(malformed));
}
2951181254a7Smrg 
// https://issues.dlang.org/show_bug.cgi?id=12923
@safe unittest
{
    import std.exception;

    // The malformed data lives in a static array and is passed as a slice.
    auto validateStackBuffer = ()
    {
        char[3] raw = [167, 133, 175];
        validate(raw[]);
    };
    assertThrown(validateStackBuffer());
}
2961181254a7Smrg 
/**
 * Transcodes the elements of `s` to UTF-8 and returns them as a newly
 * allocated `string`.
 *
 * Params:
 *     s = the string to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    alias Utf8 = string;
    return toUTFImpl!Utf8(s);
}
2978181254a7Smrg 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø is represented by two UTF-8 code units
    assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // U+10437 is four code units in UTF-8
    assert("\U00010437"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2990181254a7Smrg 
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // Same expectations as the string-based example, but the input is a
    // reference-type input range instead of an array.
    alias RT = ReferenceInputRange!(ElementType!(string));
    auto r1 = new RT("Hellø");
    auto r2 = new RT("\U00010437");

    assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
    assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
3003181254a7Smrg 
/**
 * Transcodes the elements of `s` to UTF-16 and returns them as a newly GC
 * allocated `wstring`.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    alias Utf16 = wstring;
    return toUTFImpl!Utf16(s);
}
3020181254a7Smrg 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // U+24B62 and U+10437 are two code units in UTF-16 and one in UTF-32
    assert("\U00024B62"d.length == 1);
    assert("\U00010437"d.length == 1);

    assert("\U00024B62"d.toUTF16.equal([0xD852, 0xDF62]));
    assert("\U00010437"d.toUTF16.equal([0xD801, 0xDC37]));
}
3033181254a7Smrg 
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // Same expectations as the string-based example, but the input is a
    // reference-type input range instead of an array.
    alias RT = ReferenceInputRange!(ElementType!(string));
    auto r1 = new RT("\U00024B62");
    auto r2 = new RT("\U00010437");

    assert(r1.toUTF16.equal([0xD852, 0xDF62]));
    assert(r2.toUTF16.equal([0xD801, 0xDC37]));
}
3046181254a7Smrg 
3047181254a7Smrg 
/**
 * Transcodes the elements of `s` to UTF-32 and returns them as a newly GC
 * allocated `dstring`.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(scope S s)
if (isSomeFiniteCharInputRange!S)
{
    alias Utf32 = dstring;
    return toUTFImpl!Utf32(s);
}
3064181254a7Smrg 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // U+24B62 and U+10437 are two code units in UTF-16 and one in UTF-32
    assert("\U00024B62"w.length == 2);
    assert("\U00010437"w.length == 2);

    assert("\U00024B62"w.toUTF32.equal([0x00024B62]));
    assert("\U00010437"w.toUTF32.equal([0x00010437]));
}
3077*b1e83836Smrg 
// Shared implementation behind toUTF8, toUTF16 and toUTF32: produces a string
// of type T from the character range s.
private T toUTFImpl(T, S)(scope S s)
{
    static if (is(S : T))
    {
        // The input already converts to T, so an immutable copy is enough.
        return s.idup;
    }
    else
    {
        import std.array : appender;

        alias TargetChar = Unqual!(ElementEncodingType!T);
        auto result = appender!T();

        // Reserve up front whenever the source can report a length.
        static if (is(S == C[], C) || hasLength!S)
            result.reserve(s.length);

        foreach (unit; s.byUTF!TargetChar)
            result.put(unit);

        return result.data;
    }
}
3098181254a7Smrg 
3099181254a7Smrg /* =================== toUTFz ======================= */
3100181254a7Smrg 
/++
    Produces a C-style zero-terminated string equivalent to `str`. `str`
    must not contain embedded `'\0'`'s, because any C function treats the
    first `'\0'` it sees as the end of the string. If `str.empty` is `true`,
    a string containing only `'\0'` is returned.

    `toUTFz` accepts any type of string and is templated on the type of
    character pointer that you wish to convert to. It avoids allocating a new
    string when it can, but there's a decent chance that it will end up
    having to allocate one - particularly when dealing with character types
    other than `char`.

    $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then the
    string stops being zero-terminated as soon as anything alters the
    character one past the end of `str` (which is the `'\0'` character
    terminating the string). The most likely scenarios for that are
    appending to `str` without a reallocation taking place, or `str` being a
    slice of a larger array whose character one past the end of `str` gets
    altered. Another case where it could occur would be if you had a mutable
    character array immediately after `str` in memory (for example, if
    they're member variables in a user-defined type with one declared right
    after the other) and that character array happened to start with `'\0'`.
    Such scenarios never occur if you use the zero-terminated string
    immediately after calling `toUTFz` and the C function using it doesn't
    keep a reference to it. They are also unlikely to occur even if you save
    the zero-terminated string (the cases above would be among the few
    examples of where it could happen). However, if you save the
    zero-terminated string and want to be absolutely certain that it stays
    zero-terminated, then simply append a `'\0'` to the string and use its
    `ptr` property rather than calling `toUTFz`.

    $(RED Warning 2:) When passing a character pointer to a C function, and the
    C function keeps it around for any reason, make sure that you keep a
    reference to it in your D code. Otherwise, it may go away during a garbage
    collection cycle and cause a nasty bug when the C code tries to use it.
  +/
template toUTFz(P)
if (isPointer!P && isSomeChar!(typeof(*P.init)))
{
    // The pointer type P is fixed by the outer template; the string type S
    // is deduced from the argument.
    P toUTFz(S)(S str) @safe pure
    if (isSomeString!S)
    {
        return toUTFzImpl!(P, S)(str);
    }
}
3147181254a7Smrg 
///
@safe pure unittest
{
    // The requested pointer type may be mutable, const or immutable ...
    auto mutablePtr = toUTFz!(char*)("hello world");
    auto constPtr = toUTFz!(const(char)*)("hello world");
    auto immutablePtr = toUTFz!(immutable(char)*)("hello world");

    // ... and its character type may differ from that of the source string.
    auto narrowed = toUTFz!(char*)("hello world"d);
    auto widened = toUTFz!(const(wchar)*)("hello world");
    auto widest = toUTFz!(immutable(dchar)*)("hello world"w);
}
3158181254a7Smrg 
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(immutable typeof(*P.init) == typeof(str[0])))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias Target = typeof(*P.init);

    // An empty string becomes a fresh one-element array holding only '\0'.
    if (str.empty)
    {
        Target[] terminatorOnly = ['\0'];

        auto trustedPtr() @trusted { return terminatorOnly.ptr; }
        return trustedPtr();
    }

    alias C = Unqual!(ElementEncodingType!S);

    //If the P is mutable, then we have to make a copy.
    enum needsMutableCopy = is(Unqual!Target == Target);

    static if (!needsMutableCopy)
    {
        if (!__ctfe)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            immutable pastEnd = trustedPtrAdd(str);

            // Peek past end of str, if it's 0, no conversion necessary.
            // Note that the compiler will put a 0 past the end of static
            // strings, and the storage allocator will put a 0 past the end
            // of newly allocated char[]'s.
            // Is pastEnd dereferenceable? A simple test: if it points to an
            // address multiple of 4, then conservatively assume the pointer
            // might be pointing to a new block of memory, which might be
            // unreadable. Otherwise, it's definitely pointing to valid
            // memory.
            if ((cast(size_t) pastEnd & 3) && *pastEnd == '\0')
                return &str[0];
        }
    }

    // Every remaining case is delegated to the overload taking const(C)[].
    return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
}
3201181254a7Smrg 
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias InChar  = typeof(str[0]);
    alias OutChar = typeof(*P.init);

    enum inIsConst      = is(const(Unqual!InChar) == InChar);
    enum outIsConst     = is(const(Unqual!OutChar) == OutChar);
    enum outIsImmutable = is(immutable(Unqual!OutChar) == OutChar);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if ((inIsConst && outIsConst) || (!inIsConst && !outIsImmutable))
    {
        if (!__ctfe)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            auto pastEnd = trustedPtrAdd(str);

            // Reuse str as is when the element right behind it already is a
            // terminator. The peek is skipped for addresses that are a
            // multiple of 4.
            if ((cast(size_t) pastEnd & 3) && *pastEnd == '\0')
                return &str[0];
        }

        // Otherwise append the terminator to str itself.
        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        import std.array : uninitializedArray;

        // The qualifiers are incompatible: build a separate terminated copy.
        auto duplicate = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
        duplicate[$ - 1] = '\0';
        duplicate[0 .. $ - 1] = str[];

        auto trustedCast(typeof(duplicate) c) @trusted { return cast(P) c.ptr; }
        return trustedCast(duplicate);
    }
}
3239181254a7Smrg 
private P toUTFzImpl(P, S)(S str) @safe pure
if (!is(immutable typeof(*P.init) == immutable typeof(str[0])))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;

    alias OutChar = typeof(*P.init);
    auto sink = appender!(OutChar[])();

    // Iterate code point by code point and let the appender store each one
    // in the target character type.
    foreach (dchar codePoint; str)
        sink.put(codePoint);

    // terminate the result
    sink.put('\0');

    auto trustedPtr() @trusted { return cast(P) sink.data.ptr; }
    return trustedPtr();
}
3253181254a7Smrg 
@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Pointer types grouped by the code unit they point to.
    alias CharPtrs  = AliasSeq!( char*, const( char)*, immutable( char)*);
    alias WcharPtrs = AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*);
    alias DcharPtrs = AliasSeq!(dchar*, const(dchar)*, immutable(dchar)*);

    // Same-encoding conversions (S -> pointer to the code unit type of S).
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        alias C = Unqual!(ElementEncodingType!S);

        auto plain = to!S("hello\U00010143\u0100\U00010143");

        // Build an equal string whose backing buffer holds '\n' (not '\0')
        // right after the last character.
        auto buffer = new C[](plain.length + 1);
        buffer[0 .. $ - 1] = plain[0 .. $];
        buffer[$ - 1] = '\n';
        --buffer.length;
        auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
        auto sentinelBacked = trustedAssumeUnique(buffer);
        assert(plain == sentinelBacked);

        // The result must reproduce the input and carry a zero terminator.
        void trustedCStringAssert(P, S)(S s) @trusted
        {
            auto ptr = toUTFz!P(s);
            assert(ptr[0 .. s.length] == s);
            assert(ptr[s.length] == '\0');
        }

        foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
        {
            trustedCStringAssert!P(plain);
            trustedCStringAssert!P(sentinelBacked);
        }
    }
    });

    // Converts s to P and compares the zero-terminated result with s code
    // point by code point; reports failures at the caller's line.
    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        // Number of code units in front of the first zero.
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t units = 0;
            for (; *ptr != '\0'; ++ptr)
                ++units;
            return units;
        }

        auto converted = toUTFz!P(s);
        immutable units = zeroLen(converted);
        if (cmp(s, converted[0 .. units]) != 0)
        {
            throw new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                  __FILE__, line);
        }
    }

    // Cross-encoding conversions.
    assertCTFEable!(
    {
    foreach (P; AliasSeq!(WcharPtrs, DcharPtrs))
    {
        test!P("hello\U00010143\u0100\U00010143");
    }
    foreach (P; AliasSeq!(CharPtrs, DcharPtrs))
    {
        test!P("hello\U00010143\u0100\U00010143"w);
    }
    foreach (P; AliasSeq!(CharPtrs, WcharPtrs))
    {
        test!P("hello\U00010143\u0100\U00010143"d);
    }
    // Mutable and const sources into every pointer type.
    foreach (S; AliasSeq!( char[], const( char)[],
                          wchar[], const(wchar)[],
                          dchar[], const(dchar)[]))
    {
        auto source = to!S("hello\U00010143\u0100\U00010143");

        foreach (P; AliasSeq!(CharPtrs, WcharPtrs, DcharPtrs))
        {
            test!P(source);
        }
    }
    });
}
3340181254a7Smrg 
3341181254a7Smrg 
/++
    Convenience wrapper around `toUTFz!(const(wchar)*)`.

    Encodes `str` as UTF-16 and returns a pointer to the encoded string.
    The result is suitable for the 'W' functions of the Win32 API that take
    an `LPCWSTR` argument.

    Params:
        str = the string to encode; its code units may be of any character type

    Returns:
        Whatever `toUTFz!(const(wchar)*)` returns for `str`.
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    alias WideCString = const(wchar)*;
    return toUTFz!WideCString(str);
}
3354181254a7Smrg 
///
@system unittest
{
    string text = "Hello, World!";
    const(wchar)* wide = toUTF16z(text);
    // The text is pure ASCII, so it occupies `text.length` UTF-16 code units
    // and the terminator follows directly after them.
    assert(wide[text.length] == '\0');
}
3362*b1e83836Smrg 
@safe pure unittest
{
    import std.conv : to;
    // toUTFz has thorough tests of its own; this only checks that toUTF16z
    // instantiates and runs for each of the built-in string types.
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        auto input = to!S("hello world");
        assert(toUTF16z(input) !is null);
    }
}
3371181254a7Smrg 
3372181254a7Smrg 
3373181254a7Smrg /* ================================ tests ================================== */
3374181254a7Smrg 
@safe pure unittest
{
    import std.exception;

    assertCTFEable!(
    {
        // Targets UTF-8, from UTF-16 and UTF-32 sources.
        assert(toUTF8("hello"w) == "hello"c);
        assert(toUTF8("hello"d) == "hello"c);
        assert(toUTF8("hel\u1234o"w) == "hel\u1234o"c);
        assert(toUTF8("hel\u1234o"d) == "hel\u1234o"c);
        assert(toUTF8("he\U0010AAAAllo"w) == "he\U0010AAAAllo"c);
        assert(toUTF8("he\U0010AAAAllo"d) == "he\U0010AAAAllo"c);

        // Targets UTF-16, from UTF-8 and UTF-32 sources.
        assert(toUTF16("hello"c) == "hello"w);
        assert(toUTF16("hello"d) == "hello"w);
        assert(toUTF16("hel\u1234o"c) == "hel\u1234o"w);
        assert(toUTF16("hel\u1234o"d) == "hel\u1234o"w);
        assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo"w);
        assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo"w);

        // Targets UTF-32, from UTF-8 and UTF-16 sources.
        assert(toUTF32("hello"c) == "hello"d);
        assert(toUTF32("hello"w) == "hello"d);
        assert(toUTF32("hel\u1234o"c) == "hel\u1234o"d);
        assert(toUTF32("hel\u1234o"w) == "hel\u1234o"d);
        assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo"d);
        assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo"d);
    });
}
3403181254a7Smrg 
3404181254a7Smrg 
/++
    Returns the total number of code points encoded in `str`.

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the UTF-8, UTF-16 or UTF-32 encoded string to measure

    Returns:
        The number of elements obtained by iterating `str` with
        $(LREF byDchar).

    Note:
        This function is `nothrow`, so it never throws a `UTFException`,
        even when `str` is not well-formed. Invalid sequences are counted
        the way $(LREF byDchar) decodes them (per its documentation, as
        $(LREF replacementDchar)).
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    // byDchar decodes lazily without allocating or throwing, which is what
    // allows the nothrow @nogc signature.
    return str.byDchar.walkLength;
}
3420*b1e83836Smrg 
///
@safe pure nothrow @nogc unittest
{
    // an empty string holds no code points
    assert(count("") == 0);
    // ASCII: one code point per code unit
    assert(count("a") == 1);
    assert(count("abc") == 3);
    // the euro sign is a single code point, followed by three digits
    assert(count("\u20AC100") == 4);
}
3429181254a7Smrg 
@safe pure nothrow @nogc unittest
{
    import std.exception;

    // Same checks as the documented example, additionally run through CTFE.
    assertCTFEable!(
    {
        assert("".count == 0);
        assert("a".count == 1);
        assert("abc".count == 3);
        assert("\u20AC100".count == 4);
    });
}
3441181254a7Smrg 
3442181254a7Smrg 
3443181254a7Smrg // Ranges of code units for testing.
3444*b1e83836Smrg version (StdUnittest)
3445181254a7Smrg {
3446*b1e83836Smrg private:
3447181254a7Smrg     struct InputCU(C)
3448181254a7Smrg     {
3449181254a7Smrg         import std.conv : to;
3450181254a7Smrg         @property bool empty() { return _str.empty; }
3451181254a7Smrg         @property C front() { return _str[0]; }
3452181254a7Smrg         void popFront() { _str = _str[1 .. $]; }
3453181254a7Smrg 
3454181254a7Smrg         this(inout(C)[] str)
3455181254a7Smrg         {
3456181254a7Smrg             _str = to!(C[])(str);
3457181254a7Smrg         }
3458181254a7Smrg 
3459181254a7Smrg         C[] _str;
3460181254a7Smrg     }
3461181254a7Smrg 
3462181254a7Smrg     struct BidirCU(C)
3463181254a7Smrg     {
3464181254a7Smrg         import std.conv : to;
3465181254a7Smrg         @property bool empty() { return _str.empty; }
3466181254a7Smrg         @property C front() { return _str[0]; }
3467181254a7Smrg         void popFront() { _str = _str[1 .. $]; }
3468181254a7Smrg         @property C back() { return _str[$ - 1]; }
3469181254a7Smrg         void popBack() { _str = _str[0 .. $ - 1]; }
3470181254a7Smrg         @property auto save() { return BidirCU(_str); }
3471181254a7Smrg         @property size_t length() { return _str.length; }
3472181254a7Smrg 
3473181254a7Smrg         this(inout(C)[] str)
3474181254a7Smrg         {
3475181254a7Smrg             _str = to!(C[])(str);
3476181254a7Smrg         }
3477181254a7Smrg 
3478181254a7Smrg         C[] _str;
3479181254a7Smrg     }
3480181254a7Smrg 
3481181254a7Smrg     struct RandomCU(C)
3482181254a7Smrg     {
3483181254a7Smrg         import std.conv : to;
3484181254a7Smrg         @property bool empty() { return _str.empty; }
3485181254a7Smrg         @property C front() { return _str[0]; }
3486181254a7Smrg         void popFront() { _str = _str[1 .. $]; }
3487181254a7Smrg         @property C back() { return _str[$ - 1]; }
3488181254a7Smrg         void popBack() { _str = _str[0 .. $ - 1]; }
3489181254a7Smrg         @property auto save() { return RandomCU(_str); }
3490181254a7Smrg         @property size_t length() { return _str.length; }
3491181254a7Smrg         C opIndex(size_t i) { return _str[i]; }
3492181254a7Smrg         auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }
3493181254a7Smrg 
3494181254a7Smrg         this(inout(C)[] str)
3495181254a7Smrg         {
3496181254a7Smrg             _str = to!(C[])(str);
3497181254a7Smrg         }
3498181254a7Smrg 
3499181254a7Smrg         C[] _str;
3500181254a7Smrg     }
3501181254a7Smrg 
3502181254a7Smrg     class RefBidirCU(C)
3503181254a7Smrg     {
3504181254a7Smrg         import std.conv : to;
3505181254a7Smrg         @property bool empty() { return _str.empty; }
3506181254a7Smrg         @property C front() { return _str[0]; }
3507181254a7Smrg         void popFront() { _str = _str[1 .. $]; }
3508181254a7Smrg         @property C back() { return _str[$ - 1]; }
3509181254a7Smrg         void popBack() { _str = _str[0 .. $ - 1]; }
3510181254a7Smrg         @property auto save() { return new RefBidirCU(_str); }
3511181254a7Smrg         @property size_t length() { return _str.length; }
3512181254a7Smrg 
3513181254a7Smrg         this(inout(C)[] str)
3514181254a7Smrg         {
3515181254a7Smrg             _str = to!(C[])(str);
3516181254a7Smrg         }
3517181254a7Smrg 
3518181254a7Smrg         C[] _str;
3519181254a7Smrg     }
3520181254a7Smrg 
3521181254a7Smrg     class RefRandomCU(C)
3522181254a7Smrg     {
3523181254a7Smrg         import std.conv : to;
3524181254a7Smrg         @property bool empty() { return _str.empty; }
3525181254a7Smrg         @property C front() { return _str[0]; }
3526181254a7Smrg         void popFront() { _str = _str[1 .. $]; }
3527181254a7Smrg         @property C back() { return _str[$ - 1]; }
3528181254a7Smrg         void popBack() { _str = _str[0 .. $ - 1]; }
3529181254a7Smrg         @property auto save() { return new RefRandomCU(_str); }
3530181254a7Smrg         @property size_t length() { return _str.length; }
3531181254a7Smrg         C opIndex(size_t i) { return _str[i]; }
3532181254a7Smrg         auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }
3533181254a7Smrg 
3534181254a7Smrg         this(inout(C)[] str)
3535181254a7Smrg         {
3536181254a7Smrg             _str = to!(C[])(str);
3537181254a7Smrg         }
3538181254a7Smrg 
3539181254a7Smrg         C[] _str;
3540181254a7Smrg     }
3541181254a7Smrg }
3542181254a7Smrg 
3543181254a7Smrg 
/**
 * The Unicode replacement character (U+FFFD), inserted in place of invalid
 * UTF sequences.
 *
 * References:
 *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
 */
enum dchar replacementDchar = '\U0000FFFD';
3551181254a7Smrg 
/********************************************
 * Iterate a range of `char`, `wchar`, or `dchar`s by code unit.
 *
 * This bypasses the special-case decoding that
 * $(REF front, std,range,primitives) applies to character arrays. As a
 * consequence, code using `byCodeUnit` can be `nothrow`, whereas
 * $(REF front, std,range,primitives) throws when it meets an invalid Unicode
 * sequence.
 *
 * A code unit is a building block of the UTF encodings. In general a single
 * code unit does not represent what is perceived as a full character
 * (a.k.a. a grapheme cluster in Unicode terminology); many characters are
 * encoded with several code units. For example, the UTF-8 code units for
 * `ø` are `0xC3 0xB8`. An individual element of `byCodeUnit` therefore often
 * does not form a character on its own, and treating it as one while
 * iterating over the resulting range gives nonsensical results.
 *
 * Params:
 *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 *      of characters (including strings) or a type that implicitly converts to a string type.
 * Returns:
 *      If `r` is not an auto-decodable string (i.e. a narrow string or a
 *      user-defined type that implicitly converts to a string type), then `r`
 *      is returned.
 *
 *      Otherwise, `r` is converted to its corresponding string type (if it is
 *      not already a string) and wrapped in a random-access range whose
 *      element type is the element encoding type of the string (its code
 *      unit), and that range is returned. The range has slicing.
 *
 *      If `r` is quirky enough to be a struct or class which is an input range
 *      of characters on its own (i.e. it has the input range API as member
 *      functions), $(I and) it is implicitly convertible to a string type,
 *      then `r` is returned, and no implicit conversion takes place.
 *
 *      If `r` is wrapped in a new range, then that range has a `source`
 *      property for returning the string that is currently contained within
 *      that range.
 *
 * See_Also:
 *      Refer to the $(MREF std, uni) docs for a reference on Unicode
 *      terminology.
 *
 *      For a range that iterates by grapheme cluster (written character) see
 *      $(REF byGrapheme, std,uni).
 */
auto byCodeUnit(R)(R r)
if ((isConvertibleToString!R && !isStaticArray!R) ||
    (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
{
    import std.traits : StringTypeOf;

    // True when R does not itself declare any of the input range primitives,
    // i.e. it can only be a range through an implicit conversion.
    // This would be cleaner if we had a way to check whether a type
    // was a range without any implicit conversions.
    enum bool lacksRangeMembers = !(__traits(hasMember, R, "empty") ||
                                    __traits(hasMember, R, "front") ||
                                    __traits(hasMember, R, "popFront"));

    static if (isAutodecodableString!R && lacksRangeMembers)
    {
        // Random-access view over the code units of the underlying string.
        static struct ByCodeUnitImpl
        {
            // The string currently covered by this range.
            StringTypeOf!R source;

        @safe pure nothrow @nogc:

            // Input range primitives.
            @property bool empty() const
            {
                return source.length == 0;
            }

            @property auto ref front() inout
            {
                return source[0];
            }

            void popFront()
            {
                source = source[1 .. $];
            }

            // Forward range primitive.
            @property auto save()
            {
                return ByCodeUnitImpl(source.save);
            }

            // Bidirectional range primitives.
            @property auto ref back() inout
            {
                return source[$ - 1];
            }

            void popBack()
            {
                source = source[0 .. $-1];
            }

            // Random access, slicing and length.
            auto ref opIndex(size_t index) inout
            {
                return source[index];
            }

            auto opSlice(size_t lower, size_t upper)
            {
                return ByCodeUnitImpl(source[lower .. upper]);
            }

            @property size_t length() const
            {
                return source.length;
            }

            alias opDollar = length;
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (!isInputRange!R || (is(R : const dchar[]) && lacksRangeMembers))
    {
        // Not a range by its own members: hand back the underlying string.
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}
3646181254a7Smrg 
///
@safe unittest
{
    import std.range.primitives;
    import std.traits : isAutodecodableString;

    auto r = "Hello, World!".byCodeUnit();
    alias CodeUnits = typeof(r);
    static assert(hasLength!CodeUnits);
    static assert(hasSlicing!CodeUnits);
    static assert(isRandomAccessRange!CodeUnits);
    static assert(is(ElementType!CodeUnits == immutable char));

    // contrast with the range capabilities of standard strings (with or
    // without autodecoding enabled).
    auto s = "Hello, World!";
    alias Str = typeof(s);
    static assert(isBidirectionalRange!CodeUnits);
    static if (isAutodecodableString!Str)
    {
        // with autodecoding enabled, strings are non-random-access ranges of
        // dchar.
        static assert(is(ElementType!Str == dchar));
        static assert(!isRandomAccessRange!Str);
        static assert(!hasSlicing!Str);
        static assert(!hasLength!Str);
    }
    else
    {
        // without autodecoding, strings are normal arrays.
        static assert(is(ElementType!Str == immutable char));
        static assert(isRandomAccessRange!Str);
        static assert(hasSlicing!Str);
        static assert(hasLength!Str);
    }
}
3681181254a7Smrg 
/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    // noël spelled with e + combining diaeresis: index 2 is the plain 'e'
    string noel1 = "noe\u0308l";
    auto decomposed = noel1.byCodeUnit;
    assert(decomposed[2] != 'ë');
    assert(decomposed[2] == 'e');

    // noël spelled with a precomposed ë character: because string is UTF-8,
    // the code unit at index 2 is only the first of the sequence that
    // encodes 'ë'
    string noel2 = "no\u00EBl";
    auto precomposed = noel2.byCodeUnit;
    assert(precomposed[2] != 'ë');
}
3694181254a7Smrg 
/// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : popFrontN;
    import std.traits : isAutodecodableString;

    // a narrow string: the remaining text is reachable through `source`
    {
        auto narrow = "hello world".byCodeUnit;
        popFrontN(narrow, 3);
        assert(narrow.save.equal("lo world"));
        static if (isAutodecodableString!string) // only enabled with autodecoding
        {
            string remaining = narrow.source;
            assert(remaining == "lo world");
        }
    }

    // source only exists if the range was wrapped
    {
        auto wide = "hello world"d.byCodeUnit;
        static assert(!__traits(compiles, wide.source));
    }
}
3717*b1e83836Smrg 
3718181254a7Smrg @safe pure nothrow @nogc unittest
3719181254a7Smrg {
3720181254a7Smrg     import std.range;
3721181254a7Smrg     {
3722181254a7Smrg         enum testStr = "������ hello ディラン";
3723181254a7Smrg         char[testStr.length] s;
3724181254a7Smrg         int i;
3725181254a7Smrg         foreach (c; testStr.byCodeUnit().byCodeUnit())
3726181254a7Smrg         {
3727181254a7Smrg             s[i++] = c;
3728181254a7Smrg         }
3729181254a7Smrg         assert(s == testStr);
3730181254a7Smrg     }
3731181254a7Smrg     {
3732181254a7Smrg         enum testStr = "������ hello ディラン"w;
3733181254a7Smrg         wchar[testStr.length] s;
3734181254a7Smrg         int i;
3735181254a7Smrg         foreach (c; testStr.byCodeUnit().byCodeUnit())
3736181254a7Smrg         {
3737181254a7Smrg             s[i++] = c;
3738181254a7Smrg         }
3739181254a7Smrg         assert(s == testStr);
3740181254a7Smrg     }
3741181254a7Smrg     {
3742181254a7Smrg         enum testStr = "������ hello ディラン"d;
3743181254a7Smrg         dchar[testStr.length] s;
3744181254a7Smrg         int i;
3745181254a7Smrg         foreach (c; testStr.byCodeUnit().byCodeUnit())
3746181254a7Smrg         {
3747181254a7Smrg             s[i++] = c;
3748181254a7Smrg         }
3749181254a7Smrg         assert(s == testStr);
3750181254a7Smrg     }
3751181254a7Smrg     {
3752181254a7Smrg         auto bcu = "hello".byCodeUnit();
3753181254a7Smrg         assert(bcu.length == 5);
3754181254a7Smrg         assert(bcu[3] == 'l');
3755181254a7Smrg         assert(bcu[2 .. 4][1] == 'l');
3756181254a7Smrg     }
3757181254a7Smrg     {
3758181254a7Smrg         char[5] orig = "hello";
3759181254a7Smrg         auto bcu = orig[].byCodeUnit();
3760181254a7Smrg         bcu.front = 'H';
3761181254a7Smrg         assert(bcu.front == 'H');
3762181254a7Smrg         bcu[1] = 'E';
3763181254a7Smrg         assert(bcu[1] == 'E');
3764181254a7Smrg     }
3765181254a7Smrg     {
3766181254a7Smrg         auto bcu = "hello".byCodeUnit().byCodeUnit();
3767181254a7Smrg         static assert(isForwardRange!(typeof(bcu)));
3768*b1e83836Smrg         static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
3769181254a7Smrg         auto s = bcu.save;
3770181254a7Smrg         bcu.popFront();
3771181254a7Smrg         assert(s.front == 'h');
3772181254a7Smrg     }
3773181254a7Smrg     {
3774181254a7Smrg         auto bcu = "hello".byCodeUnit();
3775181254a7Smrg         static assert(hasSlicing!(typeof(bcu)));
3776181254a7Smrg         static assert(isBidirectionalRange!(typeof(bcu)));
3777*b1e83836Smrg         static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
3778181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3779181254a7Smrg         auto ret = bcu.retro;
3780181254a7Smrg         assert(ret.front == 'o');
3781181254a7Smrg         ret.popFront();
3782181254a7Smrg         assert(ret.front == 'l');
3783181254a7Smrg     }
3784181254a7Smrg     {
3785181254a7Smrg         auto bcu = "κόσμε"w.byCodeUnit();
3786181254a7Smrg         static assert(hasSlicing!(typeof(bcu)));
3787181254a7Smrg         static assert(isBidirectionalRange!(typeof(bcu)));
3788*b1e83836Smrg         static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring);
3789181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3790181254a7Smrg         auto ret = bcu.retro;
3791181254a7Smrg         assert(ret.front == 'ε');
3792181254a7Smrg         ret.popFront();
3793181254a7Smrg         assert(ret.front == 'μ');
3794181254a7Smrg     }
3795181254a7Smrg     {
3796181254a7Smrg         static struct Stringish
3797181254a7Smrg         {
3798181254a7Smrg             string s;
3799181254a7Smrg             alias s this;
3800181254a7Smrg         }
3801181254a7Smrg 
3802181254a7Smrg         auto orig = Stringish("\U0010fff8 �� foo ��");
3803181254a7Smrg         auto bcu = orig.byCodeUnit();
3804181254a7Smrg         static assert(is(typeof(bcu) == struct));
3805*b1e83836Smrg         static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish);
3806181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3807181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == immutable char));
3808181254a7Smrg         assert(bcu.front == cast(char) 244);
3809181254a7Smrg     }
3810181254a7Smrg     {
3811181254a7Smrg         static struct WStringish
3812181254a7Smrg         {
3813181254a7Smrg             wstring s;
3814181254a7Smrg             alias s this;
3815181254a7Smrg         }
3816181254a7Smrg 
3817181254a7Smrg         auto orig = WStringish("\U0010fff8 �� foo ��"w);
3818181254a7Smrg         auto bcu = orig.byCodeUnit();
3819181254a7Smrg         static assert(is(typeof(bcu) == struct));
3820*b1e83836Smrg         static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish);
3821181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3822181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
3823181254a7Smrg         assert(bcu.front == cast(wchar) 56319);
3824181254a7Smrg     }
3825181254a7Smrg     {
3826181254a7Smrg         static struct DStringish
3827181254a7Smrg         {
3828181254a7Smrg             dstring s;
3829181254a7Smrg             alias s this;
3830181254a7Smrg         }
3831181254a7Smrg 
3832181254a7Smrg         auto orig = DStringish("\U0010fff8 �� foo ��"d);
3833181254a7Smrg         auto bcu = orig.byCodeUnit();
3834181254a7Smrg         static assert(is(typeof(bcu) == dstring));
3835181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3836181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
3837181254a7Smrg         assert(bcu.front == cast(dchar) 1114104);
3838181254a7Smrg     }
3839181254a7Smrg     {
3840181254a7Smrg         static struct FuncStringish
3841181254a7Smrg         {
3842181254a7Smrg             string str;
3843181254a7Smrg             string s() pure nothrow @nogc { return str; }
3844181254a7Smrg             alias s this;
3845181254a7Smrg         }
3846181254a7Smrg 
3847181254a7Smrg         auto orig = FuncStringish("\U0010fff8 �� foo ��");
3848181254a7Smrg         auto bcu = orig.byCodeUnit();
3849*b1e83836Smrg         static if (isAutodecodableString!FuncStringish)
3850181254a7Smrg             static assert(is(typeof(bcu) == struct));
3851*b1e83836Smrg         else
3852*b1e83836Smrg             static assert(is(typeof(bcu) == string));
3853181254a7Smrg         static assert(!is(typeof(bcu) == FuncStringish));
3854181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3855181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == immutable char));
3856181254a7Smrg         assert(bcu.front == cast(char) 244);
3857181254a7Smrg     }
3858181254a7Smrg     {
3859181254a7Smrg         static struct Range
3860181254a7Smrg         {
3861181254a7Smrg             string data;
3862181254a7Smrg             bool empty() pure nothrow @nogc { return data.empty; }
3863181254a7Smrg             char front() pure nothrow @nogc { return data[0]; }
3864181254a7Smrg             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3865181254a7Smrg         }
3866181254a7Smrg 
3867181254a7Smrg         auto orig = Range("\U0010fff8 �� foo ��");
3868181254a7Smrg         auto bcu = orig.byCodeUnit();
3869181254a7Smrg         static assert(is(typeof(bcu) == Range));
3870181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3871181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == char));
3872181254a7Smrg         assert(bcu.front == cast(char) 244);
3873181254a7Smrg     }
3874181254a7Smrg     {
3875181254a7Smrg         static struct WRange
3876181254a7Smrg         {
3877181254a7Smrg             wstring data;
3878181254a7Smrg             bool empty() pure nothrow @nogc { return data.empty; }
3879181254a7Smrg             wchar front() pure nothrow @nogc { return data[0]; }
3880181254a7Smrg             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3881181254a7Smrg         }
3882181254a7Smrg 
3883181254a7Smrg         auto orig = WRange("\U0010fff8 �� foo ��"w);
3884181254a7Smrg         auto bcu = orig.byCodeUnit();
3885181254a7Smrg         static assert(is(typeof(bcu) == WRange));
3886181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3887181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == wchar));
3888181254a7Smrg         assert(bcu.front == 56319);
3889181254a7Smrg     }
3890181254a7Smrg     {
3891181254a7Smrg         static struct DRange
3892181254a7Smrg         {
3893181254a7Smrg             dstring data;
3894181254a7Smrg             bool empty() pure nothrow @nogc { return data.empty; }
3895181254a7Smrg             dchar front() pure nothrow @nogc { return data[0]; }
3896181254a7Smrg             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3897181254a7Smrg         }
3898181254a7Smrg 
3899181254a7Smrg         auto orig = DRange("\U0010fff8 �� foo ��"d);
3900181254a7Smrg         auto bcu = orig.byCodeUnit();
3901181254a7Smrg         static assert(is(typeof(bcu) == DRange));
3902181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3903181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == dchar));
3904181254a7Smrg         assert(bcu.front == 1114104);
3905181254a7Smrg     }
3906181254a7Smrg     {
3907181254a7Smrg         static struct RangeAndStringish
3908181254a7Smrg         {
3909181254a7Smrg             bool empty() pure nothrow @nogc { return data.empty; }
3910181254a7Smrg             char front() pure nothrow @nogc { return data[0]; }
3911181254a7Smrg             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3912181254a7Smrg 
3913181254a7Smrg             string data;
3914181254a7Smrg             string s;
3915181254a7Smrg             alias s this;
3916181254a7Smrg         }
3917181254a7Smrg 
3918181254a7Smrg         auto orig = RangeAndStringish("test.d", "other");
3919181254a7Smrg         auto bcu = orig.byCodeUnit();
3920181254a7Smrg         static assert(is(typeof(bcu) == RangeAndStringish));
3921181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3922181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == char));
3923181254a7Smrg         assert(bcu.front == 't');
3924181254a7Smrg     }
3925181254a7Smrg     {
3926181254a7Smrg         static struct WRangeAndStringish
3927181254a7Smrg         {
3928181254a7Smrg             bool empty() pure nothrow @nogc { return data.empty; }
3929181254a7Smrg             wchar front() pure nothrow @nogc { return data[0]; }
3930181254a7Smrg             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3931181254a7Smrg 
3932181254a7Smrg             wstring data;
3933181254a7Smrg             wstring s;
3934181254a7Smrg             alias s this;
3935181254a7Smrg         }
3936181254a7Smrg 
3937181254a7Smrg         auto orig = WRangeAndStringish("test.d"w, "other"w);
3938181254a7Smrg         auto bcu = orig.byCodeUnit();
3939181254a7Smrg         static assert(is(typeof(bcu) == WRangeAndStringish));
3940181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3941181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == wchar));
3942181254a7Smrg         assert(bcu.front == 't');
3943181254a7Smrg     }
3944181254a7Smrg     {
3945181254a7Smrg         static struct DRangeAndStringish
3946181254a7Smrg         {
3947181254a7Smrg             bool empty() pure nothrow @nogc { return data.empty; }
3948181254a7Smrg             dchar front() pure nothrow @nogc { return data[0]; }
3949181254a7Smrg             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3950181254a7Smrg 
3951181254a7Smrg             dstring data;
3952181254a7Smrg             dstring s;
3953181254a7Smrg             alias s this;
3954181254a7Smrg         }
3955181254a7Smrg 
3956181254a7Smrg         auto orig = DRangeAndStringish("test.d"d, "other"d);
3957181254a7Smrg         auto bcu = orig.byCodeUnit();
3958181254a7Smrg         static assert(is(typeof(bcu) == DRangeAndStringish));
3959181254a7Smrg         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3960181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == dchar));
3961181254a7Smrg         assert(bcu.front == 't');
3962181254a7Smrg     }
3963181254a7Smrg     {
3964181254a7Smrg         enum Enum : string { a = "test.d" }
3965181254a7Smrg 
3966181254a7Smrg         auto orig = Enum.a;
3967181254a7Smrg         auto bcu = orig.byCodeUnit();
3968181254a7Smrg         static assert(!is(typeof(bcu) == Enum));
3969*b1e83836Smrg         static if (isAutodecodableString!Enum)
3970181254a7Smrg             static assert(is(typeof(bcu) == struct));
3971*b1e83836Smrg         else
3972*b1e83836Smrg             static assert(is(typeof(bcu) == string));
3973181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == immutable char));
3974181254a7Smrg         assert(bcu.front == 't');
3975181254a7Smrg     }
3976181254a7Smrg     {
3977181254a7Smrg         enum WEnum : wstring { a = "test.d"w }
3978181254a7Smrg 
3979181254a7Smrg         auto orig = WEnum.a;
3980181254a7Smrg         auto bcu = orig.byCodeUnit();
3981181254a7Smrg         static assert(!is(typeof(bcu) == WEnum));
3982*b1e83836Smrg         static if (isAutodecodableString!WEnum)
3983181254a7Smrg             static assert(is(typeof(bcu) == struct));
3984*b1e83836Smrg         else
3985*b1e83836Smrg             static assert(is(typeof(bcu) == wstring));
3986181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
3987181254a7Smrg         assert(bcu.front == 't');
3988181254a7Smrg     }
3989181254a7Smrg     {
3990181254a7Smrg         enum DEnum : dstring { a = "test.d"d }
3991181254a7Smrg 
3992181254a7Smrg         auto orig = DEnum.a;
3993181254a7Smrg         auto bcu = orig.byCodeUnit();
3994181254a7Smrg         static assert(is(typeof(bcu) == dstring));
3995181254a7Smrg         static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
3996181254a7Smrg         assert(bcu.front == 't');
3997181254a7Smrg     }
3998181254a7Smrg 
3999*b1e83836Smrg     static if (autodecodeStrings)
4000*b1e83836Smrg     {
4001181254a7Smrg         static assert(!is(typeof(byCodeUnit("hello")) == string));
4002181254a7Smrg         static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
4003*b1e83836Smrg     }
4004*b1e83836Smrg     else
4005*b1e83836Smrg     {
4006*b1e83836Smrg         static assert(is(typeof(byCodeUnit("hello")) == string));
4007*b1e83836Smrg         static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
4008*b1e83836Smrg     }
4009181254a7Smrg     static assert(is(typeof(byCodeUnit("hello"d)) == dstring));
4010181254a7Smrg 
4011181254a7Smrg     static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
4012181254a7Smrg     static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
4013181254a7Smrg     static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));
4014181254a7Smrg 
4015181254a7Smrg     enum SEnum : char[5] { a = "hello" }
4016181254a7Smrg     enum WSEnum : wchar[5] { a = "hello"w }
4017181254a7Smrg     enum DSEnum : dchar[5] { a = "hello"d }
4018181254a7Smrg 
4019181254a7Smrg     static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
4020181254a7Smrg     static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
4021181254a7Smrg     static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
4022181254a7Smrg }
4023181254a7Smrg 
/****************************
 * Convenience aliases of $(LREF byUTF), one per character type.
 *
 * Each alias iterates an
 * $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of
 * characters by `char`, `wchar`, or `dchar` respectively. They add nothing
 * of their own: each one is $(LREF byUTF) instantiated with the matching
 * `C` argument and that template's default `useReplacementDchar`.
 *
 * Params:
 *      r = input range of characters, or array of characters
 */
alias byChar = byUTF!(char);

/// Ditto
alias byWchar = byUTF!(wchar);

/// Ditto
alias byDchar = byUTF!(dchar);
4040181254a7Smrg 
@safe pure nothrow @nogc unittest
{
    // char -> char: chaining byChar twice must hand back the same code units.
    {
        char[5] copied;
        size_t n;
        for (auto units = "hello".byChar.byChar(); !units.empty; units.popFront())
            copied[n++] = units.front;
        assert(copied == "hello");
    }

    // dchar -> char: 1-, 2-, 3- and 4-byte encodings, followed by two values
    // that are not valid code points and must each become U+FFFD (3 bytes).
    {
        dchar[10] input;
        input[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        input[8] = 0xD800;                  // invalid
        input[9] = cast(dchar) 0x110000;    // invalid

        char[5+2+3+4+3+3] encoded;
        size_t n;
        foreach (unit; input[].byChar())
            encoded[n++] = unit;
        assert(encoded == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
    }

    // wchar -> char: the third code unit is reachable after two pops.
    {
        auto units = "hello"w.byChar();
        foreach (_; 0 .. 2)
            units.popFront();
        assert(units.front == 'l');
    }

    // dchar -> char: same check for a UTF-32 source.
    {
        auto units = "hello"d.byChar();
        foreach (_; 0 .. 2)
            units.popFront();
        assert(units.front == 'l');
    }

    // A saved copy is unaffected by advancing the range it was taken from.
    {
        auto units = "hello"d.byChar();
        assert(isForwardRange!(typeof(units)));
        auto checkpoint = units.save;
        units.popFront();
        assert(checkpoint.front == 'h');
    }
}
4087181254a7Smrg 
@safe pure nothrow @nogc unittest
{
    // dchar -> wchar: BMP code points take one unit, U+10FFFF takes a
    // surrogate pair, and the two invalid values each become U+FFFD;
    // 5 + 1 + 1 + 2 + 1 + 1 == 11 code units in total.
    {
        wchar[11] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byWchar())
        {
            s[i++] = c;
        }
        // (A leftover debug loop that only walked the expected literal and
        // did nothing with it used to sit here; it has been dropped.)
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
    }

    // char -> wchar: the third code unit is reachable after two pops.
    {
        auto r = "hello".byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // dchar -> wchar: same check for a UTF-32 source.
    {
        auto r = "hello"d.byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // A saved copy is unaffected by advancing the range it was taken from.
    {
        auto r = "hello"d.byWchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
4129181254a7Smrg 
@safe pure nothrow @nogc unittest
{
    // ---- UTF-8 sources ----------------------------------------------------

    // 1-, 2-, 3- and 4-byte sequences decode to the expected code points.
    {
        string utf8 = "hello\u07FF\uD7FF\U00010000\U0010FFFF";
        dchar[9] decoded;
        size_t n;
        foreach (cp; utf8.byDchar())
            decoded[n++] = cp;
        assert(decoded == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
    }

    // Every malformed sample starts with the replacement character, and
    // asking for `front` repeatedly gives the same answer.
    {
        foreach (bad; invalidUTFstrings!char())
        {
            auto decoder = bad.byDchar();
            assert(!decoder.empty);
            assert(decoder.front == decoder.front);
            dchar first = decoder.front;
            assert(first == replacementDchar);
        }
    }

    // The third code point is reachable after two pops.
    {
        auto decoder = "hello".byDchar();
        foreach (_; 0 .. 2)
            decoder.popFront();
        assert(decoder.front == 'l');
    }

    // ---- UTF-16 sources ---------------------------------------------------

    // BMP code points and one surrogate pair decode to the expected values.
    {
        wstring utf16 = "hello\u07FF\uD7FF\U0010FFFF"w;
        dchar[8] decoded;
        size_t n;
        foreach (cp; utf16.byDchar())
            decoded[n++] = cp;
        assert(decoded == "hello\u07FF\uD7FF\U0010FFFF"d);
    }

    // Malformed UTF-16 samples behave like the malformed UTF-8 ones.
    {
        foreach (bad; invalidUTFstrings!wchar())
        {
            auto decoder = bad.byDchar();
            assert(!decoder.empty);
            assert(decoder.front == decoder.front);
            dchar first = decoder.front;
            assert(first == replacementDchar);
        }
    }

    // A well-formed surrogate pair combines into a single code point.
    {
        wchar[2] pair;
        pair[0] = 0xD800;
        pair[1] = 0xDD00;             // correct surrogate pair
        auto decoder = pair[].byDchar();
        assert(!decoder.empty);
        assert(decoder.front == decoder.front);
        dchar combined = decoder.front;
        assert(combined == '\U00010100');
    }

    // The third code point is reachable after two pops.
    {
        auto decoder = "hello"w.byDchar();
        foreach (_; 0 .. 2)
            decoder.popFront();
        assert(decoder.front == 'l');
    }

    // ---- UTF-32 source ----------------------------------------------------

    // dchar -> dchar: chaining byDchar twice hands back the same code points.
    {
        dstring utf32 = "hello"d;
        dchar[5] copied;
        size_t n;
        foreach (cp; utf32.byDchar.byDchar())
            copied[n++] = cp;
        assert(copied == "hello"d);
    }

    // ---- save -------------------------------------------------------------

    // A saved copy is unaffected by advancing the original (UTF-8 source).
    {
        auto decoder = "hello".byDchar();
        assert(isForwardRange!(typeof(decoder)));
        auto checkpoint = decoder.save;
        decoder.popFront();
        assert(checkpoint.front == 'h');
    }

    // Same for a UTF-16 source.
    {
        auto decoder = "hello"w.byDchar();
        assert(isForwardRange!(typeof(decoder)));
        auto checkpoint = decoder.save;
        decoder.popFront();
        assert(checkpoint.front == 'h');
    }
}
4223181254a7Smrg 
// byChar/byWchar/byDchar have to work both for ranges that are pure, @safe,
// nothrow and @nogc and for ranges that are not. This unittest covers the
// fully attributed side: it must compile under all four attributes.

pure @safe nothrow @nogc unittest
{
    dchar[5] text = "hello"d;

    foreach (unit; text[].byChar())
    {
    }
    foreach (unit; text[].byWchar())
    {
    }
    foreach (unit; text[].byDchar())
    {
    }
}
4234181254a7Smrg 
// Mutable module-level counter, declared only for StdUnittest builds. The
// test range below increments it from popFront.
version (StdUnittest) private int impureVariable;

@system unittest
{
    // A range whose popFront touches global state and throws. It is always
    // empty, so the loops below never run; the test passes by compiling.
    static struct ImpureThrowingSystemRange(Char)
    {
        @property bool empty() const { return true; }
        @property Char front() const { return Char.init; }

        void popFront()
        {
            ++impureVariable;
            throw new Exception("only for testing nothrow");
        }
    }

    // Instantiate every source/target character type combination.
    static foreach (Char; AliasSeq!(char, wchar, dchar))
    {{
        ImpureThrowingSystemRange!Char source;
        foreach (unit; source.byChar())  { }
        foreach (unit; source.byWchar()) { }
        foreach (unit; source.byDchar()) { }
    }}
}
4259181254a7Smrg 
/****************************
 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * of characters by char type `C` by encoding the elements of the range.
 *
 * UTF sequences that cannot be converted to the specified encoding are either
 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
 * of the Unicode Standard 6.2 or result in a thrown UTFException.
 * Hence byUTF is not symmetric.
 * This algorithm is lazy, and does not allocate memory.
 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
 * `r` parameter.
 *
 * Params:
 *      C = `char`, `wchar`, or `dchar`
 *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
 *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
 *      r = input range of characters, or array of characters
 *
 * Throws:
 *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
 *
 * GC:
 *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
 *
 * Returns:
 *      A bidirectional range if `R` is a bidirectional range and not auto-decodable,
 *      as defined by $(REF isAutodecodableString, std, traits).
 *
 *      A forward range if `R` is a forward range and not auto-decodable.
 *
 *      Or, if `R` is a range and it is auto-decodable and
 *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
 *      to $(LREF byCodeUnit).
 *
 *      Otherwise, an input range of characters.
 */
template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar)
if (isSomeChar!C)
{
    // Strip qualifiers from `C` (e.g. `const char` -> `char`) by
    // re-instantiating on the unqualified type. `useReplacementDchar` has to
    // be forwarded explicitly: otherwise the nested instantiation uses the
    // default and a caller's `UseReplacementDchar.no` is silently discarded.
    static if (is(immutable C == immutable UC, UC) && !is(C == UC))
        alias byUTF = byUTF!(UC, useReplacementDchar);
    else:

    // Auto-decodable strings are viewed through byCodeUnit and re-dispatched.
    auto ref byUTF(R)(R r)
        if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        return byUTF(r.byCodeUnit());
    }

    auto ref byUTF(R)(R r)
        if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C))
        {
            // Source and target code unit types match: nothing to transcode.
            return r.byCodeUnit();
        }
        else static if (is(C == dchar))
        {
            // Target is UTF-32: each decoded code point is one output element,
            // so a single-slot lookahead per end is all the state required.
            static struct Result
            {
                enum Empty = uint.max;  // range is empty or just constructed

                this(return scope R r)
                {
                    this.r = r;
                }

                this(return scope R r, uint buff)
                {
                    this.r = r;
                    this.buff = buff;
                }

                static if (isBidirectionalRange!R)
                {
                    this(return scope R r, uint frontBuff, uint backBuff)
                    {
                        this.r = r;
                        this.buff = frontBuff;
                        this.backBuff = backBuff;
                    }
                }

                @property bool empty()
                {
                    // A buffered code point has already been removed from `r`,
                    // so it still counts as content.
                    static if (isBidirectionalRange!R)
                        return buff == Empty && backBuff == Empty && r.empty;
                    else
                        return buff == Empty && r.empty;
                }

                @property dchar front() scope // 'scope' required by call to decodeFront() below
                {
                    if (buff == Empty)
                    {
                        auto c = r.front;

                        // Code units below this threshold are complete code
                        // points on their own and skip the decoder.
                        static if (is(RC == wchar))
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            r.popFront;
                            buff = cast(dchar) c;
                        }
                        else
                        {
                            // decodeFront advances `r` past the sequence.
                            buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                        }
                    }
                    return cast(dchar) buff;
                }

                void popFront()
                {
                    // Decode first if nothing is buffered, so that `r` is
                    // advanced past the element being dropped.
                    if (buff == Empty)
                        front();
                    buff = Empty;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        static if (isBidirectionalRange!R)
                        {
                            return Result(r.save, buff, backBuff);
                        }
                        else
                        {
                            return Result(r.save, buff);
                        }
                    }
                }

                static if (isBidirectionalRange!R)
                {
                    // NOTE(review): `back` reads `r.back` without checking
                    // whether `front` has already buffered the last remaining
                    // element (leaving `r` empty), and `front` is symmetric.
                    // Callers that look at both ends of a one-element range
                    // before popping would reach an exhausted `r` - confirm
                    // whether that case needs handling.
                    @property dchar back() scope // 'scope' required by call to decodeBack() below
                    {
                        if (backBuff != Empty)
                            return cast(dchar) backBuff;

                        auto c = r.back;
                        static if (is(RC == wchar))
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            r.popBack;
                            backBuff = cast(dchar) c;
                        }
                        else
                        {
                            backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }();
                        }
                        return cast(dchar) backBuff;

                    }

                    void popBack()
                    {
                        if (backBuff == Empty)
                            back();
                        backBuff = Empty;
                    }
                }

            private:

                R r;
                uint buff = Empty;      // one character lookahead buffer
                static if (isBidirectionalRange!R)
                    uint backBuff = Empty;
            }

            return Result(r);
        }
        else
        {
            // Target is UTF-8 or UTF-16: one source code point may expand to
            // several output code units, which are staged in a small buffer
            // and handed out one at a time.
            static struct Result
            {
                this(return scope R r)
                {
                    this.r = r;
                }

                this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf)
                {
                    this.r = r;
                    this.pos = pos;
                    this.fill = fill;
                    this.buf = buf;
                }

                static if (isBidirectionalRange!R)
                {
                    this(return scope R r, ushort frontPos, ushort frontFill,
                         ushort backPos, ushort backFill,
                         C[4 / C.sizeof] buf, C[4 / C.sizeof] backBuf)
                    {
                        this.r = r;
                        this.pos = frontPos;
                        this.fill = frontFill;
                        this.backPos = backPos;
                        this.backFill = backFill;
                        this.buf = buf;
                        this.backBuf = backBuf;
                    }
                }

                @property bool empty()
                {
                    // pos == fill (and backPos == backFill) means every staged
                    // code unit at that end has been consumed.
                    static if (isBidirectionalRange!R)
                        return pos == fill && backPos == backFill && r.empty;
                    else
                        return pos == fill && r.empty;
                }

                @property auto front() scope // 'scope' required by call to decodeFront() below
                {
                    if (pos == fill)
                    {
                        pos = 0;
                        auto c = r.front;

                        // Code units below this threshold have the same value
                        // in the target encoding and are copied straight over.
                        static if (C.sizeof >= 2 && RC.sizeof >= 2)
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            fill = 1;
                            r.popFront;
                            buf[pos] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                // UTF-32 source: the element already is the code point.
                                r.popFront;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                            fill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
                        }
                    }
                    return buf[pos];
                }

                void popFront()
                {
                    // Stage the next code point first if nothing is pending.
                    if (pos == fill)
                        front;
                    ++pos;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        static if (isBidirectionalRange!R)
                        {
                            return Result(r.save, pos, fill, backPos, backFill, buf, backBuf);
                        }
                        else
                        {
                            return Result(r.save, pos, fill, buf);
                        }
                    }
                }

                static if (isBidirectionalRange!R)
                {
                    // The back end stages its code units in `backBuf`, not in
                    // `buf`: with a single shared buffer, a call to `back`
                    // overwrote units that `front` had staged but not yet
                    // handed out, so the next `front` returned the wrong value.
                    //
                    // NOTE(review): as with `front`, `r.back` is read without
                    // checking whether the other end has already staged the
                    // last remaining element - confirm whether that case
                    // needs handling.
                    @property auto back() scope // 'scope' required by call to decodeBack() below
                    {
                        // Staged units are handed out from the highest index down.
                        if (backPos != backFill)
                            return backBuf[cast(ushort) (backFill - backPos - 1)];

                        backPos = 0;
                        auto c = r.back;
                        static if (C.sizeof >= 2 && RC.sizeof >= 2)
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            backFill = 1;
                            r.popBack;
                            backBuf[cast(ushort) (backFill - backPos - 1)] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                r.popBack;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }();
                            backFill = cast(ushort) encode!(useReplacementDchar)(backBuf, dc);
                        }
                        return backBuf[cast(ushort) (backFill - backPos - 1)];
                    }

                    void popBack()
                    {
                        if (backPos == backFill)
                            back;
                        ++backPos;
                    }
                }

            private:

                R r;
                ushort pos, fill;           // read cursor / unit count for `buf`
                static if (isBidirectionalRange!R)
                    ushort backPos, backFill; // same, for `backBuf`
                C[4 / C.sizeof] buf = void; // units staged by `front`
                static if (isBidirectionalRange!R)
                    C[4 / C.sizeof] backBuf = void; // units staged by `back`
            }

            return Result(r);
        }
    }
}
4585181254a7Smrg 
///
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    // hellö as a range of `char`s, which are UTF-8
    assert("hell\u00F6".byUTF!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
    assert("hell\u00F6".byUTF!wchar().equal(['h', 'e', 'l', 'l', 'ö']));

    // 𐐷 (U+10437) is four code units in UTF-8, two in UTF-16, and one in UTF-32.
    // The code point is spelled as an escape so the example survives any
    // re-encoding of this source file.
    assert("\U00010437".byUTF!char().equal([0xF0, 0x90, 0x90, 0xB7]));
    assert("\U00010437".byUTF!wchar().equal([0xD801, 0xDC37]));
    assert("\U00010437".byUTF!dchar().equal([0x00010437]));
}
4602*b1e83836Smrg 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.algorithm.comparison : equal;

    // With UseReplacementDchar.yes the malformed input is decoded leniently:
    // the expected text has U+FFFD in place of the bytes `\xF0` and `b`.
    auto lenient = "hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.yes);
    assert(lenient.equal("hello\uFFFDetty"));

    // With UseReplacementDchar.no the same input makes iteration throw.
    assertThrown!UTFException(
        "hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.no).equal("hello betty"));
}
4612*b1e83836Smrg 
// Exercises backward iteration (`back` / `popBack`) and `save` on byUTF
// results for several source/target encoding combinations.
@safe unittest
{
    import std.algorithm.comparison : equal;

    {
        // wchar -> char: U+0219 is encoded as the two UTF-8 code units C8 99
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!char;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x99);
        r.popBack;
        assert(r.back == 0xc8);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // wchar -> wchar: code units pass through unchanged
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!wchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // wchar -> dchar
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!dchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // dchar -> wchar: both code points lie outside the BMP, so each one
        // becomes a surrogate pair that is walked low surrogate first.
        // U+10437 -> D801 DC37, U+1F601 -> D83D DE01
        dchar[] s = ['\U00010437', '\U0001F601'];
        auto r = s.byUTF!wchar;
        assert(r.back == 0xde01);
        r.popBack;
        assert(r.back == 0xd83d);
        r.popBack;
        assert(r.back == 0xdc37);
        r.popBack;
        assert(r.back == 0xd801);
    }

    {
        // dchar -> char: U+10437 -> F0 90 90 B7, U+1F601 -> F0 9F 98 81;
        // collecting from the back yields the whole byte sequence reversed.
        dchar[] s = ['\U00010437', '\U0001F601'];
        auto r = s.byUTF!char;
        char[] res;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
    }

    {
        // char -> dchar on pure ASCII input
        dchar[] res;
        auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        assert(res.equal(['e', 'd', 'c', 'b', 'a']));
    }

    {
        // testing the save() function: a saved copy taken after popBack
        // must report the same `back` as the range it was copied from
        wchar[] s = ['Ă','ț'];

        auto rc = s.byUTF!char;
        rc.popBack;
        auto rcCopy = rc.save;
        assert(rc.back == rcCopy.back);
        assert(rcCopy.back == 0xc8);

        auto rd = s.byUTF!dchar;
        rd.popBack;
        auto rdCopy = rd.save;
        assert(rd.back == rdCopy.back);
        assert(rdCopy.back == 'Ă');
    }
}
4700*b1e83836Smrg 
///
@safe pure nothrow unittest
{
    import std.range.primitives;

    // Two code points that each need a single UTF-16 code unit
    // but two UTF-8 code units: U+0103 (C4 83) and U+00EE (C3 AE).
    wchar[] source = ['ă', 'î'];

    // As UTF-8, read from the end: the code units of 'î' come first,
    // last unit first, followed by those of 'ă'.
    {
        auto utf8 = source.byUTF!char;
        static assert(isBidirectionalRange!(typeof(utf8)));

        assert(utf8.back == 0xae);
        utf8.popBack;
        assert(utf8.back == 0xc3);
        utf8.popBack;
        assert(utf8.back == 0x83);
        utf8.popBack;
        assert(utf8.back == 0xc4);
    }

    // As UTF-16, every element is one whole character.
    {
        auto utf16 = source.byUTF!wchar;
        static assert(isBidirectionalRange!(typeof(utf16)));

        assert(utf16.back == 'î');
        utf16.popBack;
        assert(utf16.back == 'ă');
    }

    // As UTF-32, likewise one element per character.
    {
        auto utf32 = source.byUTF!dchar;
        static assert(isBidirectionalRange!(typeof(utf32)));

        assert(utf32.back == 'î');
        utf32.popBack;
        assert(utf32.back == 'ă');
    }
}
4729